summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/devicetree/bindings/i2c/i2c-arb-gpio-challenge.txt80
-rw-r--r--Documentation/virtual/00-INDEX3
-rw-r--r--Documentation/virtual/virtio-spec.txt3210
-rw-r--r--MAINTAINERS2
-rw-r--r--arch/arm/mach-s3c24xx/mach-rx1950.c1
-rw-r--r--arch/arm/mach-tegra/tegra.c2
-rw-r--r--arch/arm/plat-samsung/devs.c1
-rw-r--r--arch/arm/plat-samsung/include/plat/regs-iic.h56
-rw-r--r--arch/x86/include/asm/lguest.h17
-rw-r--r--block/blk-integrity.c2
-rw-r--r--block/blk-merge.c2
-rw-r--r--drivers/Makefile2
-rw-r--r--drivers/block/virtio_blk.c148
-rw-r--r--drivers/char/hw_random/virtio-rng.c2
-rw-r--r--drivers/char/virtio_console.c14
-rw-r--r--drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c3
-rw-r--r--drivers/i2c/busses/i2c-amd756-s4882.c6
-rw-r--r--drivers/i2c/busses/i2c-at91.c58
-rw-r--r--drivers/i2c/busses/i2c-cbus-gpio.c4
-rw-r--r--drivers/i2c/busses/i2c-davinci.c77
-rw-r--r--drivers/i2c/busses/i2c-designware-core.c39
-rw-r--r--drivers/i2c/busses/i2c-designware-pcidrv.c73
-rw-r--r--drivers/i2c/busses/i2c-designware-platdrv.c83
-rw-r--r--drivers/i2c/busses/i2c-gpio.c75
-rw-r--r--drivers/i2c/busses/i2c-intel-mid.c3
-rw-r--r--drivers/i2c/busses/i2c-ismt.c2
-rw-r--r--drivers/i2c/busses/i2c-mv64xxx.c5
-rw-r--r--drivers/i2c/busses/i2c-mxs.c104
-rw-r--r--drivers/i2c/busses/i2c-nforce2-s4985.c6
-rw-r--r--drivers/i2c/busses/i2c-octeon.c10
-rw-r--r--drivers/i2c/busses/i2c-powermac.c10
-rw-r--r--drivers/i2c/busses/i2c-puv3.c10
-rw-r--r--drivers/i2c/busses/i2c-pxa.c20
-rw-r--r--drivers/i2c/busses/i2c-s3c2410.c48
-rw-r--r--drivers/i2c/busses/i2c-tegra.c26
-rw-r--r--drivers/i2c/busses/i2c-viperboard.c5
-rw-r--r--drivers/i2c/busses/i2c-xiic.c6
-rw-r--r--drivers/i2c/i2c-core.c246
-rw-r--r--drivers/i2c/i2c-mux.c9
-rw-r--r--drivers/i2c/muxes/Kconfig12
-rw-r--r--drivers/i2c/muxes/Makefile2
-rw-r--r--drivers/i2c/muxes/i2c-arb-gpio-challenge.c251
-rw-r--r--drivers/i2c/muxes/i2c-mux-gpio.c17
-rw-r--r--drivers/i2c/muxes/i2c-mux-pca954x.c6
-rw-r--r--drivers/lguest/Kconfig5
-rw-r--r--drivers/lguest/core.c67
-rw-r--r--drivers/lguest/lg.h6
-rw-r--r--drivers/lguest/lguest_user.c6
-rw-r--r--drivers/lguest/page_tables.c567
-rw-r--r--drivers/lguest/x86/core.c7
-rw-r--r--drivers/media/pci/bt8xx/bttv-i2c.c6
-rw-r--r--drivers/media/pci/mantis/mantis_i2c.c4
-rw-r--r--drivers/net/caif/Kconfig14
-rw-r--r--drivers/net/caif/Makefile3
-rw-r--r--drivers/net/caif/caif_virtio.c790
-rw-r--r--drivers/net/ethernet/sfc/falcon.c6
-rw-r--r--drivers/net/virtio_net.c77
-rw-r--r--drivers/pinctrl/core.c2
-rw-r--r--drivers/pinctrl/spear/pinctrl-plgpio.c6
-rw-r--r--drivers/rpmsg/virtio_rpmsg_bus.c8
-rw-r--r--drivers/scsi/virtio_scsi.c487
-rw-r--r--drivers/staging/media/go7007/go7007-driver.c7
-rw-r--r--drivers/vfio/pci/vfio_pci.c44
-rw-r--r--drivers/vfio/pci/vfio_pci_config.c172
-rw-r--r--drivers/vfio/pci/vfio_pci_intrs.c67
-rw-r--r--drivers/vfio/pci/vfio_pci_private.h1
-rw-r--r--drivers/vfio/vfio.c117
-rw-r--r--drivers/vhost/Kconfig18
-rw-r--r--drivers/vhost/Kconfig.tcm6
-rw-r--r--drivers/vhost/Makefile5
-rw-r--r--drivers/vhost/net.c255
-rw-r--r--drivers/vhost/scsi.c (renamed from drivers/vhost/tcm_vhost.c)255
-rw-r--r--drivers/vhost/tcm_vhost.h128
-rw-r--r--drivers/vhost/test.c13
-rw-r--r--drivers/vhost/vhost.c153
-rw-r--r--drivers/vhost/vhost.h32
-rw-r--r--drivers/vhost/vringh.c1007
-rw-r--r--drivers/virtio/virtio_balloon.c6
-rw-r--r--drivers/virtio/virtio_ring.c297
-rw-r--r--include/linux/i2c-mux.h2
-rw-r--r--include/linux/i2c-tegra.h25
-rw-r--r--include/linux/i2c.h50
-rw-r--r--include/linux/scatterlist.h16
-rw-r--r--include/linux/vfio.h3
-rw-r--r--include/linux/virtio.h20
-rw-r--r--include/linux/virtio_caif.h24
-rw-r--r--include/linux/virtio_ring.h57
-rw-r--r--include/linux/vringh.h225
-rw-r--r--include/uapi/linux/vfio.h1
-rw-r--r--include/uapi/linux/vhost.h28
-rw-r--r--include/uapi/linux/virtio_balloon.h4
-rw-r--r--include/uapi/linux/virtio_ids.h1
-rw-r--r--net/9p/trans_virtio.c48
-rw-r--r--tools/lguest/lguest.txt2
-rw-r--r--tools/virtio/Makefile10
-rw-r--r--tools/virtio/asm/barrier.h14
-rw-r--r--tools/virtio/linux/bug.h10
-rw-r--r--tools/virtio/linux/err.h26
-rw-r--r--tools/virtio/linux/export.h5
-rw-r--r--tools/virtio/linux/irqreturn.h1
-rw-r--r--tools/virtio/linux/kernel.h112
-rw-r--r--tools/virtio/linux/module.h1
-rw-r--r--tools/virtio/linux/printk.h4
-rw-r--r--tools/virtio/linux/ratelimit.h4
-rw-r--r--tools/virtio/linux/scatterlist.h189
-rw-r--r--tools/virtio/linux/types.h28
-rw-r--r--tools/virtio/linux/uaccess.h50
-rw-r--r--tools/virtio/linux/uio.h3
-rw-r--r--tools/virtio/linux/virtio.h171
-rw-r--r--tools/virtio/linux/virtio_config.h6
-rw-r--r--tools/virtio/linux/virtio_ring.h1
-rw-r--r--tools/virtio/linux/vringh.h1
-rw-r--r--tools/virtio/uapi/linux/uio.h1
-rw-r--r--tools/virtio/uapi/linux/virtio_config.h1
-rw-r--r--tools/virtio/uapi/linux/virtio_ring.h4
-rw-r--r--tools/virtio/virtio_test.c13
-rw-r--r--tools/virtio/vringh_test.c741
117 files changed, 6276 insertions, 5028 deletions
diff --git a/Documentation/devicetree/bindings/i2c/i2c-arb-gpio-challenge.txt b/Documentation/devicetree/bindings/i2c/i2c-arb-gpio-challenge.txt
new file mode 100644
index 0000000..1ac8ea8
--- /dev/null
+++ b/Documentation/devicetree/bindings/i2c/i2c-arb-gpio-challenge.txt
@@ -0,0 +1,80 @@
+GPIO-based I2C Arbitration Using a Challenge & Response Mechanism
+=================================================================
+This uses GPIO lines and a challenge & response mechanism to arbitrate who is
+the master of an I2C bus in a multimaster situation.
+
+In many cases using GPIOs to arbitrate is not needed and a design can use
+the standard I2C multi-master rules. Using GPIOs is generally useful in
+the case where there is a device on the bus that has errata and/or bugs
+that makes standard multimaster mode not feasible.
+
+
+Algorithm:
+
+All masters on the bus have a 'bus claim' line which is an output that the
+others can see. These are all active low with pull-ups enabled. We'll
+describe these lines as:
+
+- OUR_CLAIM: output from us signaling to other hosts that we want the bus
+- THEIR_CLAIMS: output from others signaling that they want the bus
+
+The basic algorithm is to assert your line when you want the bus, then make
+sure that the other side doesn't want it also. A detailed explanation is best
+done with an example.
+
+Let's say we want to claim the bus. We:
+1. Assert OUR_CLAIM.
+2. Waits a little bit for the other sides to notice (slew time, say 10
+ microseconds).
+3. Check THEIR_CLAIMS. If none are asserted then the we have the bus and we are
+ done.
+4. Otherwise, wait for a few milliseconds and see if THEIR_CLAIMS are released.
+5. If not, back off, release the claim and wait for a few more milliseconds.
+6. Go back to 1 (until retry time has expired).
+
+
+Required properties:
+- compatible: i2c-arb-gpio-challenge
+- our-claim-gpio: The GPIO that we use to claim the bus.
+- their-claim-gpios: The GPIOs that the other sides use to claim the bus.
+ Note that some implementations may only support a single other master.
+- Standard I2C mux properties. See mux.txt in this directory.
+- Single I2C child bus node at reg 0. See mux.txt in this directory.
+
+Optional properties:
+- slew-delay-us: microseconds to wait for a GPIO to go high. Default is 10 us.
+- wait-retry-us: we'll attempt another claim after this many microseconds.
+ Default is 3000 us.
+- wait-free-us: we'll give up after this many microseconds. Default is 50000 us.
+
+
+Example:
+ i2c@12CA0000 {
+ compatible = "acme,some-i2c-device";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+
+ i2c-arbitrator {
+ compatible = "i2c-arb-gpio-challenge";
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ i2c-parent = <&{/i2c@12CA0000}>;
+
+ our-claim-gpio = <&gpf0 3 1>;
+ their-claim-gpios = <&gpe0 4 1>;
+ slew-delay-us = <10>;
+ wait-retry-us = <3000>;
+ wait-free-us = <50000>;
+
+ i2c@0 {
+ reg = <0>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ i2c@52 {
+ // Normal I2C device
+ };
+ };
+ };
diff --git a/Documentation/virtual/00-INDEX b/Documentation/virtual/00-INDEX
index 924bd46..e952d30 100644
--- a/Documentation/virtual/00-INDEX
+++ b/Documentation/virtual/00-INDEX
@@ -6,6 +6,3 @@ kvm/
- Kernel Virtual Machine. See also http://linux-kvm.org
uml/
- User Mode Linux, builds/runs Linux kernel as a userspace program.
-virtio.txt
- - Text version of draft virtio spec.
- See http://ozlabs.org/~rusty/virtio-spec
diff --git a/Documentation/virtual/virtio-spec.txt b/Documentation/virtual/virtio-spec.txt
deleted file mode 100644
index eb09403..0000000
--- a/Documentation/virtual/virtio-spec.txt
+++ /dev/null
@@ -1,3210 +0,0 @@
-[Generated file: see http://ozlabs.org/~rusty/virtio-spec/]
-Virtio PCI Card Specification
-v0.9.5 DRAFT
--
-
-Rusty Russell <rusty@rustcorp.com.au> IBM Corporation (Editor)
-
-2012 May 7.
-
-Purpose and Description
-
-This document describes the specifications of the “virtio” family
-of PCI[LaTeX Command: nomenclature] devices. These are devices
-are found in virtual environments[LaTeX Command: nomenclature],
-yet by design they are not all that different from physical PCI
-devices, and this document treats them as such. This allows the
-guest to use standard PCI drivers and discovery mechanisms.
-
-The purpose of virtio and this specification is that virtual
-environments and guests should have a straightforward, efficient,
-standard and extensible mechanism for virtual devices, rather
-than boutique per-environment or per-OS mechanisms.
-
- Straightforward: Virtio PCI devices use normal PCI mechanisms
- of interrupts and DMA which should be familiar to any device
- driver author. There is no exotic page-flipping or COW
- mechanism: it's just a PCI device.[footnote:
-This lack of page-sharing implies that the implementation of the
-device (e.g. the hypervisor or host) needs full access to the
-guest memory. Communication with untrusted parties (i.e.
-inter-guest communication) requires copying.
-]
-
- Efficient: Virtio PCI devices consist of rings of descriptors
- for input and output, which are neatly separated to avoid cache
- effects from both guest and device writing to the same cache
- lines.
-
- Standard: Virtio PCI makes no assumptions about the environment
- in which it operates, beyond supporting PCI. In fact the virtio
- devices specified in the appendices do not require PCI at all:
- they have been implemented on non-PCI buses.[footnote:
-The Linux implementation further separates the PCI virtio code
-from the specific virtio drivers: these drivers are shared with
-the non-PCI implementations (currently lguest and S/390).
-]
-
- Extensible: Virtio PCI devices contain feature bits which are
- acknowledged by the guest operating system during device setup.
- This allows forwards and backwards compatibility: the device
- offers all the features it knows about, and the driver
- acknowledges those it understands and wishes to use.
-
- Virtqueues
-
-The mechanism for bulk data transport on virtio PCI devices is
-pretentiously called a virtqueue. Each device can have zero or
-more virtqueues: for example, the network device has one for
-transmit and one for receive.
-
-Each virtqueue occupies two or more physically-contiguous pages
-(defined, for the purposes of this specification, as 4096 bytes),
-and consists of three parts:
-
-
-+-------------------+-----------------------------------+-----------+
-| Descriptor Table | Available Ring (padding) | Used Ring |
-+-------------------+-----------------------------------+-----------+
-
-
-When the driver wants to send a buffer to the device, it fills in
-a slot in the descriptor table (or chains several together), and
-writes the descriptor index into the available ring. It then
-notifies the device. When the device has finished a buffer, it
-writes the descriptor into the used ring, and sends an interrupt.
-
-Specification
-
- PCI Discovery
-
-Any PCI device with Vendor ID 0x1AF4, and Device ID 0x1000
-through 0x103F inclusive is a virtio device[footnote:
-The actual value within this range is ignored
-]. The device must also have a Revision ID of 0 to match this
-specification.
-
-The Subsystem Device ID indicates which virtio device is
-supported by the device. The Subsystem Vendor ID should reflect
-the PCI Vendor ID of the environment (it's currently only used
-for informational purposes by the guest).
-
-
-+----------------------+--------------------+---------------+
-| Subsystem Device ID | Virtio Device | Specification |
-+----------------------+--------------------+---------------+
-+----------------------+--------------------+---------------+
-| 1 | network card | Appendix C |
-+----------------------+--------------------+---------------+
-| 2 | block device | Appendix D |
-+----------------------+--------------------+---------------+
-| 3 | console | Appendix E |
-+----------------------+--------------------+---------------+
-| 4 | entropy source | Appendix F |
-+----------------------+--------------------+---------------+
-| 5 | memory ballooning | Appendix G |
-+----------------------+--------------------+---------------+
-| 6 | ioMemory | - |
-+----------------------+--------------------+---------------+
-| 7 | rpmsg | Appendix H |
-+----------------------+--------------------+---------------+
-| 8 | SCSI host | Appendix I |
-+----------------------+--------------------+---------------+
-| 9 | 9P transport | - |
-+----------------------+--------------------+---------------+
-| 10 | mac80211 wlan | - |
-+----------------------+--------------------+---------------+
-
-
- Device Configuration
-
-To configure the device, we use the first I/O region of the PCI
-device. This contains a virtio header followed by a
-device-specific region.
-
-There may be different widths of accesses to the I/O region; the “
-natural” access method for each field in the virtio header must
-be used (i.e. 32-bit accesses for 32-bit fields, etc), but the
-device-specific region can be accessed using any width accesses,
-and should obtain the same results.
-
-Note that this is possible because while the virtio header is PCI
-(i.e. little) endian, the device-specific region is encoded in
-the native endian of the guest (where such distinction is
-applicable).
-
- Device Initialization Sequence<sub:Device-Initialization-Sequence>
-
-We start with an overview of device initialization, then expand
-on the details of the device and how each step is preformed.
-
- Reset the device. This is not required on initial start up.
-
- The ACKNOWLEDGE status bit is set: we have noticed the device.
-
- The DRIVER status bit is set: we know how to drive the device.
-
- Device-specific setup, including reading the Device Feature
- Bits, discovery of virtqueues for the device, optional MSI-X
- setup, and reading and possibly writing the virtio
- configuration space.
-
- The subset of Device Feature Bits understood by the driver is
- written to the device.
-
- The DRIVER_OK status bit is set.
-
- The device can now be used (ie. buffers added to the
- virtqueues)[footnote:
-Historically, drivers have used the device before steps 5 and 6.
-This is only allowed if the driver does not use any features
-which would alter this early use of the device.
-]
-
-If any of these steps go irrecoverably wrong, the guest should
-set the FAILED status bit to indicate that it has given up on the
-device (it can reset the device later to restart if desired).
-
-We now cover the fields required for general setup in detail.
-
- Virtio Header
-
-The virtio header looks as follows:
-
-
-+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
-| Bits || 32 | 32 | 32 | 16 | 16 | 16 | 8 | 8 |
-+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
-| Read/Write || R | R+W | R+W | R | R+W | R+W | R+W | R |
-+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
-| Purpose || Device | Guest | Queue | Queue | Queue | Queue | Device | ISR |
-| || Features bits 0:31 | Features bits 0:31 | Address | Size | Select | Notify | Status | Status |
-+------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
-
-
-If MSI-X is enabled for the device, two additional fields
-immediately follow this header:[footnote:
-ie. once you enable MSI-X on the device, the other fields move.
-If you turn it off again, they move back!
-]
-
-
-+------------++----------------+--------+
-| Bits || 16 | 16 |
- +----------------+--------+
-+------------++----------------+--------+
-| Read/Write || R+W | R+W |
-+------------++----------------+--------+
-| Purpose || Configuration | Queue |
-| (MSI-X) || Vector | Vector |
-+------------++----------------+--------+
-
-
-Immediately following these general headers, there may be
-device-specific headers:
-
-
-+------------++--------------------+
-| Bits || Device Specific |
- +--------------------+
-+------------++--------------------+
-| Read/Write || Device Specific |
-+------------++--------------------+
-| Purpose || Device Specific... |
-| || |
-+------------++--------------------+
-
-
- Device Status
-
-The Device Status field is updated by the guest to indicate its
-progress. This provides a simple low-level diagnostic: it's most
-useful to imagine them hooked up to traffic lights on the console
-indicating the status of each device.
-
-The device can be reset by writing a 0 to this field, otherwise
-at least one bit should be set:
-
- ACKNOWLEDGE (1) Indicates that the guest OS has found the
- device and recognized it as a valid virtio device.
-
- DRIVER (2) Indicates that the guest OS knows how to drive the
- device. Under Linux, drivers can be loadable modules so there
- may be a significant (or infinite) delay before setting this
- bit.
-
- DRIVER_OK (4) Indicates that the driver is set up and ready to
- drive the device.
-
- FAILED (128) Indicates that something went wrong in the guest,
- and it has given up on the device. This could be an internal
- error, or the driver didn't like the device for some reason, or
- even a fatal error during device operation. The device must be
- reset before attempting to re-initialize.
-
- Feature Bits<sub:Feature-Bits>
-
-Thefirst configuration field indicates the features that the
-device supports. The bits are allocated as follows:
-
- 0 to 23 Feature bits for the specific device type
-
- 24 to 32 Feature bits reserved for extensions to the queue and
- feature negotiation mechanisms
-
-For example, feature bit 0 for a network device (i.e. Subsystem
-Device ID 1) indicates that the device supports checksumming of
-packets.
-
-The feature bits are negotiated: the device lists all the
-features it understands in the Device Features field, and the
-guest writes the subset that it understands into the Guest
-Features field. The only way to renegotiate is to reset the
-device.
-
-In particular, new fields in the device configuration header are
-indicated by offering a feature bit, so the guest can check
-before accessing that part of the configuration space.
-
-This allows for forwards and backwards compatibility: if the
-device is enhanced with a new feature bit, older guests will not
-write that feature bit back to the Guest Features field and it
-can go into backwards compatibility mode. Similarly, if a guest
-is enhanced with a feature that the device doesn't support, it
-will not see that feature bit in the Device Features field and
-can go into backwards compatibility mode (or, for poor
-implementations, set the FAILED Device Status bit).
-
- Configuration/Queue Vectors
-
-When MSI-X capability is present and enabled in the device
-(through standard PCI configuration space) 4 bytes at byte offset
-20 are used to map configuration change and queue interrupts to
-MSI-X vectors. In this case, the ISR Status field is unused, and
-device specific configuration starts at byte offset 24 in virtio
-header structure. When MSI-X capability is not enabled, device
-specific configuration starts at byte offset 20 in virtio header.
-
-Writing a valid MSI-X Table entry number, 0 to 0x7FF, to one of
-Configuration/Queue Vector registers, maps interrupts triggered
-by the configuration change/selected queue events respectively to
-the corresponding MSI-X vector. To disable interrupts for a
-specific event type, unmap it by writing a special NO_VECTOR
-value:
-
-/* Vector value used to disable MSI for queue */
-
-#define VIRTIO_MSI_NO_VECTOR 0xffff
-
-Reading these registers returns vector mapped to a given event,
-or NO_VECTOR if unmapped. All queue and configuration change
-events are unmapped by default.
-
-Note that mapping an event to vector might require allocating
-internal device resources, and might fail. Devices report such
-failures by returning the NO_VECTOR value when the relevant
-Vector field is read. After mapping an event to vector, the
-driver must verify success by reading the Vector field value: on
-success, the previously written value is returned, and on
-failure, NO_VECTOR is returned. If a mapping failure is detected,
-the driver can retry mapping with fewervectors, or disable MSI-X.
-
- Virtqueue Configuration<sec:Virtqueue-Configuration>
-
-As a device can have zero or more virtqueues for bulk data
-transport (for example, the network driver has two), the driver
-needs to configure them as part of the device-specific
-configuration.
-
-This is done as follows, for each virtqueue a device has:
-
- Write the virtqueue index (first queue is 0) to the Queue
- Select field.
-
- Read the virtqueue size from the Queue Size field, which is
- always a power of 2. This controls how big the virtqueue is
- (see below). If this field is 0, the virtqueue does not exist.
-
- Allocate and zero virtqueue in contiguous physical memory, on a
- 4096 byte alignment. Write the physical address, divided by
- 4096 to the Queue Address field.[footnote:
-The 4096 is based on the x86 page size, but it's also large
-enough to ensure that the separate parts of the virtqueue are on
-separate cache lines.
-]
-
- Optionally, if MSI-X capability is present and enabled on the
- device, select a vector to use to request interrupts triggered
- by virtqueue events. Write the MSI-X Table entry number
- corresponding to this vector in Queue Vector field. Read the
- Queue Vector field: on success, previously written value is
- returned; on failure, NO_VECTOR value is returned.
-
-The Queue Size field controls the total number of bytes required
-for the virtqueue according to the following formula:
-
-#define ALIGN(x) (((x) + 4095) & ~4095)
-
-static inline unsigned vring_size(unsigned int qsz)
-
-{
-
- return ALIGN(sizeof(struct vring_desc)*qsz + sizeof(u16)*(2
-+ qsz))
-
- + ALIGN(sizeof(struct vring_used_elem)*qsz);
-
-}
-
-This currently wastes some space with padding, but also allows
-future extensions. The virtqueue layout structure looks like this
-(qsz is the Queue Size field, which is a variable, so this code
-won't compile):
-
-struct vring {
-
- /* The actual descriptors (16 bytes each) */
-
- struct vring_desc desc[qsz];
-
-
-
- /* A ring of available descriptor heads with free-running
-index. */
-
- struct vring_avail avail;
-
-
-
- // Padding to the next 4096 boundary.
-
- char pad[];
-
-
-
- // A ring of used descriptor heads with free-running index.
-
- struct vring_used used;
-
-};
-
- A Note on Virtqueue Endianness
-
-Note that the endian of these fields and everything else in the
-virtqueue is the native endian of the guest, not little-endian as
-PCI normally is. This makes for simpler guest code, and it is
-assumed that the host already has to be deeply aware of the guest
-endian so such an “endian-aware” device is not a significant
-issue.
-
- Descriptor Table
-
-The descriptor table refers to the buffers the guest is using for
-the device. The addresses are physical addresses, and the buffers
-can be chained via the next field. Each descriptor describes a
-buffer which is read-only or write-only, but a chain of
-descriptors can contain both read-only and write-only buffers.
-
-No descriptor chain may be more than 2^32 bytes long in total.struct vring_desc {
-
- /* Address (guest-physical). */
-
- u64 addr;
-
- /* Length. */
-
- u32 len;
-
-/* This marks a buffer as continuing via the next field. */
-
-#define VRING_DESC_F_NEXT 1
-
-/* This marks a buffer as write-only (otherwise read-only). */
-
-#define VRING_DESC_F_WRITE 2
-
-/* This means the buffer contains a list of buffer descriptors.
-*/
-
-#define VRING_DESC_F_INDIRECT 4
-
- /* The flags as indicated above. */
-
- u16 flags;
-
- /* Next field if flags & NEXT */
-
- u16 next;
-
-};
-
-The number of descriptors in the table is specified by the Queue
-Size field for this virtqueue.
-
- <sub:Indirect-Descriptors>Indirect Descriptors
-
-Some devices benefit by concurrently dispatching a large number
-of large requests. The VIRTIO_RING_F_INDIRECT_DESC feature can be
-used to allow this (see [cha:Reserved-Feature-Bits]). To increase
-ring capacity it is possible to store a table of indirect
-descriptors anywhere in memory, and insert a descriptor in main
-virtqueue (with flags&INDIRECT on) that refers to memory buffer
-containing this indirect descriptor table; fields addr and len
-refer to the indirect table address and length in bytes,
-respectively. The indirect table layout structure looks like this
-(len is the length of the descriptor that refers to this table,
-which is a variable, so this code won't compile):
-
-struct indirect_descriptor_table {
-
- /* The actual descriptors (16 bytes each) */
-
- struct vring_desc desc[len / 16];
-
-};
-
-The first indirect descriptor is located at start of the indirect
-descriptor table (index 0), additional indirect descriptors are
-chained by next field. An indirect descriptor without next field
-(with flags&NEXT off) signals the end of the indirect descriptor
-table, and transfers control back to the main virtqueue. An
-indirect descriptor can not refer to another indirect descriptor
-table (flags&INDIRECT must be off). A single indirect descriptor
-table can include both read-only and write-only descriptors;
-write-only flag (flags&WRITE) in the descriptor that refers to it
-is ignored.
-
- Available Ring
-
-The available ring refers to what descriptors we are offering the
-device: it refers to the head of a descriptor chain. The “flags”
-field is currently 0 or 1: 1 indicating that we do not need an
-interrupt when the device consumes a descriptor from the
-available ring. Alternatively, the guest can ask the device to
-delay interrupts until an entry with an index specified by the “
-used_event” field is written in the used ring (equivalently,
-until the idx field in the used ring will reach the value
-used_event + 1). The method employed by the device is controlled
-by the VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits]
-). This interrupt suppression is merely an optimization; it may
-not suppress interrupts entirely.
-
-The “idx” field indicates where we would put the next descriptor
-entry (modulo the ring size). This starts at 0, and increases.
-
-struct vring_avail {
-
-#define VRING_AVAIL_F_NO_INTERRUPT 1
-
- u16 flags;
-
- u16 idx;
-
- u16 ring[qsz]; /* qsz is the Queue Size field read from device
-*/
-
- u16 used_event;
-
-};
-
- Used Ring
-
-The used ring is where the device returns buffers once it is done
-with them. The flags field can be used by the device to hint that
-no notification is necessary when the guest adds to the available
-ring. Alternatively, the “avail_event” field can be used by the
-device to hint that no notification is necessary until an entry
-with an index specified by the “avail_event” is written in the
-available ring (equivalently, until the idx field in the
-available ring will reach the value avail_event + 1). The method
-employed by the device is controlled by the guest through the
-VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits]
-). [footnote:
-These fields are kept here because this is the only part of the
-virtqueue written by the device
-].
-
-Each entry in the ring is a pair: the head entry of the
-descriptor chain describing the buffer (this matches an entry
-placed in the available ring by the guest earlier), and the total
-of bytes written into the buffer. The latter is extremely useful
-for guests using untrusted buffers: if you do not know exactly
-how much has been written by the device, you usually have to zero
-the buffer to ensure no data leakage occurs.
-
-/* u32 is used here for ids for padding reasons. */
-
-struct vring_used_elem {
-
- /* Index of start of used descriptor chain. */
-
- u32 id;
-
- /* Total length of the descriptor chain which was used
-(written to) */
-
- u32 len;
-
-};
-
-
-
-struct vring_used {
-
-#define VRING_USED_F_NO_NOTIFY 1
-
- u16 flags;
-
- u16 idx;
-
- struct vring_used_elem ring[qsz];
-
- u16 avail_event;
-
-};
-
- Helpers for Managing Virtqueues
-
-The Linux Kernel Source code contains the definitions above and
-helper routines in a more usable form, in
-include/linux/virtio_ring.h. This was explicitly licensed by IBM
-and Red Hat under the (3-clause) BSD license so that it can be
-freely used by all other projects, and is reproduced (with slight
-variation to remove Linux assumptions) in Appendix A.
-
- Device Operation<sec:Device-Operation>
-
-There are two parts to device operation: supplying new buffers to
-the device, and processing used buffers from the device. As an
-example, the virtio network device has two virtqueues: the
-transmit virtqueue and the receive virtqueue. The driver adds
-outgoing (read-only) packets to the transmit virtqueue, and then
-frees them after they are used. Similarly, incoming (write-only)
-buffers are added to the receive virtqueue, and processed after
-they are used.
-
- Supplying Buffers to The Device
-
-Actual transfer of buffers from the guest OS to the device
-operates as follows:
-
- Place the buffer(s) into free descriptor(s).
-
- If there are no free descriptors, the guest may choose to
- notify the device even if notifications are suppressed (to
- reduce latency).[footnote:
-The Linux drivers do this only for read-only buffers: for
-write-only buffers, it is assumed that the driver is merely
-trying to keep the receive buffer ring full, and no notification
-of this expected condition is necessary.
-]
-
- Place the id of the buffer in the next ring entry of the
- available ring.
-
- The steps (1) and (2) may be performed repeatedly if batching
- is possible.
-
- A memory barrier should be executed to ensure the device sees
- the updated descriptor table and available ring before the next
- step.
-
- The available “idx” field should be increased by the number of
- entries added to the available ring.
-
- A memory barrier should be executed to ensure that we update
- the idx field before checking for notification suppression.
-
- If notifications are not suppressed, the device should be
- notified of the new buffers.
-
-Note that the above code does not take precautions against the
-available ring buffer wrapping around: this is not possible since
-the ring buffer is the same size as the descriptor table, so step
-(1) will prevent such a condition.
-
-In addition, the maximum queue size is 32768 (it must be a power
-of 2 which fits in 16 bits), so the 16-bit “idx” value can always
-distinguish between a full and empty buffer.
-
-Here is a description of each stage in more detail.
-
- Placing Buffers Into The Descriptor Table
-
-A buffer consists of zero or more read-only physically-contiguous
-elements followed by zero or more physically-contiguous
-write-only elements (it must have at least one element). This
-algorithm maps it into the descriptor table:
-
- for each buffer element, b:
-
- Get the next free descriptor table entry, d
-
- Set d.addr to the physical address of the start of b
-
- Set d.len to the length of b.
-
- If b is write-only, set d.flags to VRING_DESC_F_WRITE,
- otherwise 0.
-
- If there is a buffer element after this:
-
- Set d.next to the index of the next free descriptor element.
-
- Set the VRING_DESC_F_NEXT bit in d.flags.
-
-In practice, the d.next fields are usually used to chain free
-descriptors, and a separate count kept to check there are enough
-free descriptors before beginning the mappings.
-
- Updating The Available Ring
-
-The head of the buffer we mapped is the first d in the algorithm
-above. A naive implementation would do the following:
-
-avail->ring[avail->idx % qsz] = head;
-
-However, in general we can add many descriptors before we update
-the “idx” field (at which point they become visible to the
-device), so we keep a counter of how many we've added:
-
-avail->ring[(avail->idx + added++) % qsz] = head;
-
- Updating The Index Field
-
-Once the idx field of the virtqueue is updated, the device will
-be able to access the descriptor entries we've created and the
-memory they refer to. This is why a memory barrier is generally
-used before the idx update, to ensure it sees the most up-to-date
-copy.
-
-The idx field always increments, and we let it wrap naturally at
-65536:
-
-avail->idx += added;
-
- <sub:Notifying-The-Device>Notifying The Device
-
-Device notification occurs by writing the 16-bit virtqueue index
-of this virtqueue to the Queue Notify field of the virtio header
-in the first I/O region of the PCI device. This can be expensive,
-however, so the device can suppress such notifications if it
-doesn't need them. We have to be careful to expose the new idx
-value before checking the suppression flag: it's OK to notify
-gratuitously, but not to omit a required notification. So again,
-we use a memory barrier here before reading the flags or the
-avail_event field.
-
-If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated, and if
-the VRING_USED_F_NOTIFY flag is not set, we go ahead and write to
-the PCI configuration space.
-
-If the VIRTIO_F_RING_EVENT_IDX feature is negotiated, we read the
-avail_event field in the available ring structure. If the
-available index crossed_the avail_event field value since the
-last notification, we go ahead and write to the PCI configuration
-space. The avail_event field wraps naturally at 65536 as well:
-
-(u16)(new_idx - avail_event - 1) < (u16)(new_idx - old_idx)
-
- <sub:Receiving-Used-Buffers>Receiving Used Buffers From The
- Device
-
-Once the device has used a buffer (read from or written to it, or
-parts of both, depending on the nature of the virtqueue and the
-device), it sends an interrupt, following an algorithm very
-similar to the algorithm used for the driver to send the device a
-buffer:
-
- Write the head descriptor number to the next field in the used
- ring.
-
- Update the used ring idx.
-
- Determine whether an interrupt is necessary:
-
- If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated: check
- if f the VRING_AVAIL_F_NO_INTERRUPT flag is not set in avail-
- >flags
-
- If the VIRTIO_F_RING_EVENT_IDX feature is negotiated: check
- whether the used index crossed the used_event field value
- since the last update. The used_event field wraps naturally
- at 65536 as well:(u16)(new_idx - used_event - 1) < (u16)(new_idx - old_idx)
-
- If an interrupt is necessary:
-
- If MSI-X capability is disabled:
-
- Set the lower bit of the ISR Status field for the device.
-
- Send the appropriate PCI interrupt for the device.
-
- If MSI-X capability is enabled:
-
- Request the appropriate MSI-X interrupt message for the
- device, Queue Vector field sets the MSI-X Table entry
- number.
-
- If Queue Vector field value is NO_VECTOR, no interrupt
- message is requested for this event.
-
-The guest interrupt handler should:
-
- If MSI-X capability is disabled: read the ISR Status field,
- which will reset it to zero. If the lower bit is zero, the
- interrupt was not for this device. Otherwise, the guest driver
- should look through the used rings of each virtqueue for the
- device, to see if any progress has been made by the device
- which requires servicing.
-
- If MSI-X capability is enabled: look through the used rings of
- each virtqueue mapped to the specific MSI-X vector for the
- device, to see if any progress has been made by the device
- which requires servicing.
-
-For each ring, guest should then disable interrupts by writing
-VRING_AVAIL_F_NO_INTERRUPT flag in avail structure, if required.
-It can then process used ring entries finally enabling interrupts
-by clearing the VRING_AVAIL_F_NO_INTERRUPT flag or updating the
-EVENT_IDX field in the available structure, Guest should then
-execute a memory barrier, and then recheck the ring empty
-condition. This is necessary to handle the case where, after the
-last check and before enabling interrupts, an interrupt has been
-suppressed by the device:
-
-vring_disable_interrupts(vq);
-
-for (;;) {
-
- if (vq->last_seen_used != vring->used.idx) {
-
- vring_enable_interrupts(vq);
-
- mb();
-
- if (vq->last_seen_used != vring->used.idx)
-
- break;
-
- }
-
- struct vring_used_elem *e =
-vring.used->ring[vq->last_seen_used%vsz];
-
- process_buffer(e);
-
- vq->last_seen_used++;
-
-}
-
- Dealing With Configuration Changes<sub:Dealing-With-Configuration>
-
-Some virtio PCI devices can change the device configuration
-state, as reflected in the virtio header in the PCI configuration
-space. In this case:
-
- If MSI-X capability is disabled: an interrupt is delivered and
- the second highest bit is set in the ISR Status field to
- indicate that the driver should re-examine the configuration
- space.Note that a single interrupt can indicate both that one
- or more virtqueue has been used and that the configuration
- space has changed: even if the config bit is set, virtqueues
- must be scanned.
-
- If MSI-X capability is enabled: an interrupt message is
- requested. The Configuration Vector field sets the MSI-X Table
- entry number to use. If Configuration Vector field value is
- NO_VECTOR, no interrupt message is requested for this event.
-
-Creating New Device Types
-
-Various considerations are necessary when creating a new device
-type:
-
- How Many Virtqueues?
-
-It is possible that a very simple device will operate entirely
-through its configuration space, but most will need at least one
-virtqueue in which it will place requests. A device with both
-input and output (eg. console and network devices described here)
-need two queues: one which the driver fills with buffers to
-receive input, and one which the driver places buffers to
-transmit output.
-
- What Configuration Space Layout?
-
-Configuration space is generally used for rarely-changing or
-initialization-time parameters. But it is a limited resource, so
-it might be better to use a virtqueue to update configuration
-information (the network device does this for filtering,
-otherwise the table in the config space could potentially be very
-large).
-
-Note that this space is generally the guest's native endian,
-rather than PCI's little-endian.
-
- What Device Number?
-
-Currently device numbers are assigned quite freely: a simple
-request mail to the author of this document or the Linux
-virtualization mailing list[footnote:
-
-https://lists.linux-foundation.org/mailman/listinfo/virtualization
-] will be sufficient to secure a unique one.
-
-Meanwhile for experimental drivers, use 65535 and work backwards.
-
- How many MSI-X vectors?
-
-Using the optional MSI-X capability devices can speed up
-interrupt processing by removing the need to read ISR Status
-register by guest driver (which might be an expensive operation),
-reducing interrupt sharing between devices and queues within the
-device, and handling interrupts from multiple CPUs. However, some
-systems impose a limit (which might be as low as 256) on the
-total number of MSI-X vectors that can be allocated to all
-devices. Devices and/or device drivers should take this into
-account, limiting the number of vectors used unless the device is
-expected to cause a high volume of interrupts. Devices can
-control the number of vectors used by limiting the MSI-X Table
-Size or not presenting MSI-X capability in PCI configuration
-space. Drivers can control this by mapping events to as small
-number of vectors as possible, or disabling MSI-X capability
-altogether.
-
- Message Framing
-
-The descriptors used for a buffer should not effect the semantics
-of the message, except for the total length of the buffer. For
-example, a network buffer consists of a 10 byte header followed
-by the network packet. Whether this is presented in the ring
-descriptor chain as (say) a 10 byte buffer and a 1514 byte
-buffer, or a single 1524 byte buffer, or even three buffers,
-should have no effect.
-
-In particular, no implementation should use the descriptor
-boundaries to determine the size of any header in a request.[footnote:
-The current qemu device implementations mistakenly insist that
-the first descriptor cover the header in these cases exactly, so
-a cautious driver should arrange it so.
-]
-
- Device Improvements
-
-Any change to configuration space, or new virtqueues, or
-behavioural changes, should be indicated by negotiation of a new
-feature bit. This establishes clarity[footnote:
-Even if it does mean documenting design or implementation
-mistakes!
-] and avoids future expansion problems.
-
-Clusters of functionality which are always implemented together
-can use a single bit, but if one feature makes sense without the
-others they should not be gratuitously grouped together to
-conserve feature bits. We can always extend the spec when the
-first person needs more than 24 feature bits for their device.
-
-[LaTeX Command: printnomenclature]
-
-Appendix A: virtio_ring.h
-
-#ifndef VIRTIO_RING_H
-
-#define VIRTIO_RING_H
-
-/* An interface for efficient virtio implementation.
-
- *
-
- * This header is BSD licensed so anyone can use the definitions
-
- * to implement compatible drivers/servers.
-
- *
-
- * Copyright 2007, 2009, IBM Corporation
-
- * Copyright 2011, Red Hat, Inc
-
- * All rights reserved.
-
- *
-
- * Redistribution and use in source and binary forms, with or
-without
-
- * modification, are permitted provided that the following
-conditions
-
- * are met:
-
- * 1. Redistributions of source code must retain the above
-copyright
-
- * notice, this list of conditions and the following
-disclaimer.
-
- * 2. Redistributions in binary form must reproduce the above
-copyright
-
- * notice, this list of conditions and the following
-disclaimer in the
-
- * documentation and/or other materials provided with the
-distribution.
-
- * 3. Neither the name of IBM nor the names of its contributors
-
- * may be used to endorse or promote products derived from
-this software
-
- * without specific prior written permission.
-
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
-CONTRIBUTORS ``AS IS'' AND
-
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
-TO, THE
-
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
-PARTICULAR PURPOSE
-
- * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE
-LIABLE
-
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
-CONSEQUENTIAL
-
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
-SUBSTITUTE GOODS
-
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION)
-
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, STRICT
-
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
-IN ANY WAY
-
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-POSSIBILITY OF
-
- * SUCH DAMAGE.
-
- */
-
-
-
-/* This marks a buffer as continuing via the next field. */
-
-#define VRING_DESC_F_NEXT 1
-
-/* This marks a buffer as write-only (otherwise read-only). */
-
-#define VRING_DESC_F_WRITE 2
-
-
-
-/* The Host uses this in used->flags to advise the Guest: don't
-kick me
-
- * when you add a buffer. It's unreliable, so it's simply an
-
- * optimization. Guest will still kick if it's out of buffers.
-*/
-
-#define VRING_USED_F_NO_NOTIFY 1
-
-/* The Guest uses this in avail->flags to advise the Host: don't
-
- * interrupt me when you consume a buffer. It's unreliable, so
-it's
-
- * simply an optimization. */
-
-#define VRING_AVAIL_F_NO_INTERRUPT 1
-
-
-
-/* Virtio ring descriptors: 16 bytes.
-
- * These can chain together via "next". */
-
-struct vring_desc {
-
- /* Address (guest-physical). */
-
- uint64_t addr;
-
- /* Length. */
-
- uint32_t len;
-
- /* The flags as indicated above. */
-
- uint16_t flags;
-
- /* We chain unused descriptors via this, too */
-
- uint16_t next;
-
-};
-
-
-
-struct vring_avail {
-
- uint16_t flags;
-
- uint16_t idx;
-
- uint16_t ring[];
-
- uint16_t used_event;
-
-};
-
-
-
-/* u32 is used here for ids for padding reasons. */
-
-struct vring_used_elem {
-
- /* Index of start of used descriptor chain. */
-
- uint32_t id;
-
- /* Total length of the descriptor chain which was written
-to. */
-
- uint32_t len;
-
-};
-
-
-
-struct vring_used {
-
- uint16_t flags;
-
- uint16_t idx;
-
- struct vring_used_elem ring[];
-
- uint16_t avail_event;
-
-};
-
-
-
-struct vring {
-
- unsigned int num;
-
-
-
- struct vring_desc *desc;
-
- struct vring_avail *avail;
-
- struct vring_used *used;
-
-};
-
-
-
-/* The standard layout for the ring is a continuous chunk of
-memory which
-
- * looks like this. We assume num is a power of 2.
-
- *
-
- * struct vring {
-
- * // The actual descriptors (16 bytes each)
-
- * struct vring_desc desc[num];
-
- *
-
- * // A ring of available descriptor heads with free-running
-index.
-
- * __u16 avail_flags;
-
- * __u16 avail_idx;
-
- * __u16 available[num];
-
- *
-
- * // Padding to the next align boundary.
-
- * char pad[];
-
- *
-
- * // A ring of used descriptor heads with free-running
-index.
-
- * __u16 used_flags;
-
- * __u16 EVENT_IDX;
-
- * struct vring_used_elem used[num];
-
- * };
-
- * Note: for virtio PCI, align is 4096.
-
- */
-
-static inline void vring_init(struct vring *vr, unsigned int num,
-void *p,
-
- unsigned long align)
-
-{
-
- vr->num = num;
-
- vr->desc = p;
-
- vr->avail = p + num*sizeof(struct vring_desc);
-
- vr->used = (void *)(((unsigned long)&vr->avail->ring[num]
-
- + align-1)
-
- & ~(align - 1));
-
-}
-
-
-
-static inline unsigned vring_size(unsigned int num, unsigned long
-align)
-
-{
-
- return ((sizeof(struct vring_desc)*num +
-sizeof(uint16_t)*(2+num)
-
- + align - 1) & ~(align - 1))
-
- + sizeof(uint16_t)*3 + sizeof(struct
-vring_used_elem)*num;
-
-}
-
-
-
-static inline int vring_need_event(uint16_t event_idx, uint16_t
-new_idx, uint16_t old_idx)
-
-{
-
- return (uint16_t)(new_idx - event_idx - 1) <
-(uint16_t)(new_idx - old_idx);
-
-}
-
-#endif /* VIRTIO_RING_H */
-
-<cha:Reserved-Feature-Bits>Appendix B: Reserved Feature Bits
-
-Currently there are five device-independent feature bits defined:
-
- VIRTIO_F_NOTIFY_ON_EMPTY (24) Negotiating this feature
- indicates that the driver wants an interrupt if the device runs
- out of available descriptors on a virtqueue, even though
- interrupts are suppressed using the VRING_AVAIL_F_NO_INTERRUPT
- flag or the used_event field. An example of this is the
- networking driver: it doesn't need to know every time a packet
- is transmitted, but it does need to free the transmitted
- packets a finite time after they are transmitted. It can avoid
- using a timer if the device interrupts it when all the packets
- are transmitted.
-
- VIRTIO_F_RING_INDIRECT_DESC (28) Negotiating this feature
- indicates that the driver can use descriptors with the
- VRING_DESC_F_INDIRECT flag set, as described in [sub:Indirect-Descriptors]
- .
-
- VIRTIO_F_RING_EVENT_IDX(29) This feature enables the used_event
- and the avail_event fields. If set, it indicates that the
- device should ignore the flags field in the available ring
- structure. Instead, the used_event field in this structure is
- used by guest to suppress device interrupts. Further, the
- driver should ignore the flags field in the used ring
- structure. Instead, the avail_event field in this structure is
- used by the device to suppress notifications. If unset, the
- driver should ignore the used_event field; the device should
- ignore the avail_event field; the flags field is used
-
-Appendix C: Network Device
-
-The virtio network device is a virtual ethernet card, and is the
-most complex of the devices supported so far by virtio. It has
-enhanced rapidly and demonstrates clearly how support for new
-features should be added to an existing device. Empty buffers are
-placed in one virtqueue for receiving packets, and outgoing
-packets are enqueued into another for transmission in that order.
-A third command queue is used to control advanced filtering
-features.
-
- Configuration
-
- Subsystem Device ID 1
-
- Virtqueues 0:receiveq. 1:transmitq. 2:controlq[footnote:
-Only if VIRTIO_NET_F_CTRL_VQ set
-]
-
- Feature bits
-
- VIRTIO_NET_F_CSUM (0) Device handles packets with partial
- checksum
-
- VIRTIO_NET_F_GUEST_CSUM (1) Guest handles packets with partial
- checksum
-
- VIRTIO_NET_F_MAC (5) Device has given MAC address.
-
- VIRTIO_NET_F_GSO (6) (Deprecated) device handles packets with
- any GSO type.[footnote:
-It was supposed to indicate segmentation offload support, but
-upon further investigation it became clear that multiple bits
-were required.
-]
-
- VIRTIO_NET_F_GUEST_TSO4 (7) Guest can receive TSOv4.
-
- VIRTIO_NET_F_GUEST_TSO6 (8) Guest can receive TSOv6.
-
- VIRTIO_NET_F_GUEST_ECN (9) Guest can receive TSO with ECN.
-
- VIRTIO_NET_F_GUEST_UFO (10) Guest can receive UFO.
-
- VIRTIO_NET_F_HOST_TSO4 (11) Device can receive TSOv4.
-
- VIRTIO_NET_F_HOST_TSO6 (12) Device can receive TSOv6.
-
- VIRTIO_NET_F_HOST_ECN (13) Device can receive TSO with ECN.
-
- VIRTIO_NET_F_HOST_UFO (14) Device can receive UFO.
-
- VIRTIO_NET_F_MRG_RXBUF (15) Guest can merge receive buffers.
-
- VIRTIO_NET_F_STATUS (16) Configuration status field is
- available.
-
- VIRTIO_NET_F_CTRL_VQ (17) Control channel is available.
-
- VIRTIO_NET_F_CTRL_RX (18) Control channel RX mode support.
-
- VIRTIO_NET_F_CTRL_VLAN (19) Control channel VLAN filtering.
-
- VIRTIO_NET_F_GUEST_ANNOUNCE(21) Guest can send gratuitous
- packets.
-
- Device configuration layout Two configuration fields are
- currently defined. The mac address field always exists (though
- is only valid if VIRTIO_NET_F_MAC is set), and the status field
- only exists if VIRTIO_NET_F_STATUS is set. Two read-only bits
- are currently defined for the status field:
- VIRTIO_NET_S_LINK_UP and VIRTIO_NET_S_ANNOUNCE. #define VIRTIO_NET_S_LINK_UP 1
-
-#define VIRTIO_NET_S_ANNOUNCE 2
-
-
-
-struct virtio_net_config {
-
- u8 mac[6];
-
- u16 status;
-
-};
-
- Device Initialization
-
- The initialization routine should identify the receive and
- transmission virtqueues.
-
- If the VIRTIO_NET_F_MAC feature bit is set, the configuration
- space “mac” entry indicates the “physical” address of the the
- network card, otherwise a private MAC address should be
- assigned. All guests are expected to negotiate this feature if
- it is set.
-
- If the VIRTIO_NET_F_CTRL_VQ feature bit is negotiated, identify
- the control virtqueue.
-
- If the VIRTIO_NET_F_STATUS feature bit is negotiated, the link
- status can be read from the bottom bit of the “status” config
- field. Otherwise, the link should be assumed active.
-
- The receive virtqueue should be filled with receive buffers.
- This is described in detail below in “Setting Up Receive
- Buffers”.
-
- A driver can indicate that it will generate checksumless
- packets by negotating the VIRTIO_NET_F_CSUM feature. This “
- checksum offload” is a common feature on modern network cards.
-
- If that feature is negotiated[footnote:
-ie. VIRTIO_NET_F_HOST_TSO* and VIRTIO_NET_F_HOST_UFO are
-dependent on VIRTIO_NET_F_CSUM; a dvice which offers the offload
-features must offer the checksum feature, and a driver which
-accepts the offload features must accept the checksum feature.
-Similar logic applies to the VIRTIO_NET_F_GUEST_TSO4 features
-depending on VIRTIO_NET_F_GUEST_CSUM.
-], a driver can use TCP or UDP segmentation offload by
- negotiating the VIRTIO_NET_F_HOST_TSO4 (IPv4 TCP),
- VIRTIO_NET_F_HOST_TSO6 (IPv6 TCP) and VIRTIO_NET_F_HOST_UFO
- (UDP fragmentation) features. It should not send TCP packets
- requiring segmentation offload which have the Explicit
- Congestion Notification bit set, unless the
- VIRTIO_NET_F_HOST_ECN feature is negotiated.[footnote:
-This is a common restriction in real, older network cards.
-]
-
- The converse features are also available: a driver can save the
- virtual device some work by negotiating these features.[footnote:
-For example, a network packet transported between two guests on
-the same system may not require checksumming at all, nor
-segmentation, if both guests are amenable.
-] The VIRTIO_NET_F_GUEST_CSUM feature indicates that partially
- checksummed packets can be received, and if it can do that then
- the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
- VIRTIO_NET_F_GUEST_UFO and VIRTIO_NET_F_GUEST_ECN are the input
- equivalents of the features described above. See “Receiving
- Packets” below.
-
- Device Operation
-
-Packets are transmitted by placing them in the transmitq, and
-buffers for incoming packets are placed in the receiveq. In each
-case, the packet itself is preceded by a header:
-
-struct virtio_net_hdr {
-
-#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1
-
- u8 flags;
-
-#define VIRTIO_NET_HDR_GSO_NONE 0
-
-#define VIRTIO_NET_HDR_GSO_TCPV4 1
-
-#define VIRTIO_NET_HDR_GSO_UDP 3
-
-#define VIRTIO_NET_HDR_GSO_TCPV6 4
-
-#define VIRTIO_NET_HDR_GSO_ECN 0x80
-
- u8 gso_type;
-
- u16 hdr_len;
-
- u16 gso_size;
-
- u16 csum_start;
-
- u16 csum_offset;
-
-/* Only if VIRTIO_NET_F_MRG_RXBUF: */
-
- u16 num_buffers
-
-};
-
-The controlq is used to control device features such as
-filtering.
-
- Packet Transmission
-
-Transmitting a single packet is simple, but varies depending on
-the different features the driver negotiated.
-
- If the driver negotiated VIRTIO_NET_F_CSUM, and the packet has
- not been fully checksummed, then the virtio_net_hdr's fields
- are set as follows. Otherwise, the packet must be fully
- checksummed, and flags is zero.
-
- flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set,
-
- <ite:csum_start-is-set>csum_start is set to the offset within
- the packet to begin checksumming, and
-
- csum_offset indicates how many bytes after the csum_start the
- new (16 bit ones' complement) checksum should be placed.[footnote:
-For example, consider a partially checksummed TCP (IPv4) packet.
-It will have a 14 byte ethernet header and 20 byte IP header
-followed by the TCP header (with the TCP checksum field 16 bytes
-into that header). csum_start will be 14+20 = 34 (the TCP
-checksum includes the header), and csum_offset will be 16. The
-value in the TCP checksum field should be initialized to the sum
-of the TCP pseudo header, so that replacing it by the ones'
-complement checksum of the TCP header and body will give the
-correct result.
-]
-
- <enu:If-the-driver>If the driver negotiated
- VIRTIO_NET_F_HOST_TSO4, TSO6 or UFO, and the packet requires
- TCP segmentation or UDP fragmentation, then the “gso_type”
- field is set to VIRTIO_NET_HDR_GSO_TCPV4, TCPV6 or UDP.
- (Otherwise, it is set to VIRTIO_NET_HDR_GSO_NONE). In this
- case, packets larger than 1514 bytes can be transmitted: the
- metadata indicates how to replicate the packet header to cut it
- into smaller packets. The other gso fields are set:
-
- hdr_len is a hint to the device as to how much of the header
- needs to be kept to copy into each packet, usually set to the
- length of the headers, including the transport header.[footnote:
-Due to various bugs in implementations, this field is not useful
-as a guarantee of the transport header size.
-]
-
- gso_size is the maximum size of each packet beyond that header
- (ie. MSS).
-
- If the driver negotiated the VIRTIO_NET_F_HOST_ECN feature, the
- VIRTIO_NET_HDR_GSO_ECN bit may be set in “gso_type” as well,
- indicating that the TCP packet has the ECN bit set.[footnote:
-This case is not handled by some older hardware, so is called out
-specifically in the protocol.
-]
-
- If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature,
- the num_buffers field is set to zero.
-
- The header and packet are added as one output buffer to the
- transmitq, and the device is notified of the new entry (see [sub:Notifying-The-Device]
- ).[footnote:
-Note that the header will be two bytes longer for the
-VIRTIO_NET_F_MRG_RXBUF case.
-]
-
- Packet Transmission Interrupt
-
-Often a driver will suppress transmission interrupts using the
-VRING_AVAIL_F_NO_INTERRUPT flag (see [sub:Receiving-Used-Buffers]
-) and check for used packets in the transmit path of following
-packets. However, it will still receive interrupts if the
-VIRTIO_F_NOTIFY_ON_EMPTY feature is negotiated, indicating that
-the transmission queue is completely emptied.
-
-The normal behavior in this interrupt handler is to retrieve and
-new descriptors from the used ring and free the corresponding
-headers and packets.
-
- Setting Up Receive Buffers
-
-It is generally a good idea to keep the receive virtqueue as
-fully populated as possible: if it runs out, network performance
-will suffer.
-
-If the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6 or
-VIRTIO_NET_F_GUEST_UFO features are used, the Guest will need to
-accept packets of up to 65550 bytes long (the maximum size of a
-TCP or UDP packet, plus the 14 byte ethernet header), otherwise
-1514 bytes. So unless VIRTIO_NET_F_MRG_RXBUF is negotiated, every
-buffer in the receive queue needs to be at least this length [footnote:
-Obviously each one can be split across multiple descriptor
-elements.
-].
-
-If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at
-least the size of the struct virtio_net_hdr.
-
- Packet Receive Interrupt
-
-When a packet is copied into a buffer in the receiveq, the
-optimal path is to disable further interrupts for the receiveq
-(see [sub:Receiving-Used-Buffers]) and process packets until no
-more are found, then re-enable them.
-
-Processing packet involves:
-
- If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature,
- then the “num_buffers” field indicates how many descriptors
- this packet is spread over (including this one). This allows
- receipt of large packets without having to allocate large
- buffers. In this case, there will be at least “num_buffers” in
- the used ring, and they should be chained together to form a
- single packet. The other buffers will not begin with a struct
- virtio_net_hdr.
-
- If the VIRTIO_NET_F_MRG_RXBUF feature was not negotiated, or
- the “num_buffers” field is one, then the entire packet will be
- contained within this buffer, immediately following the struct
- virtio_net_hdr.
-
- If the VIRTIO_NET_F_GUEST_CSUM feature was negotiated, the
- VIRTIO_NET_HDR_F_NEEDS_CSUM bit in the “flags” field may be
- set: if so, the checksum on the packet is incomplete and the “
- csum_start” and “csum_offset” fields indicate how to calculate
- it (see [ite:csum_start-is-set]).
-
- If the VIRTIO_NET_F_GUEST_TSO4, TSO6 or UFO options were
- negotiated, then the “gso_type” may be something other than
- VIRTIO_NET_HDR_GSO_NONE, and the “gso_size” field indicates the
- desired MSS (see [enu:If-the-driver]).
-
- Control Virtqueue
-
-The driver uses the control virtqueue (if VIRTIO_NET_F_VTRL_VQ is
-negotiated) to send commands to manipulate various features of
-the device which would not easily map into the configuration
-space.
-
-All commands are of the following form:
-
-struct virtio_net_ctrl {
-
- u8 class;
-
- u8 command;
-
- u8 command-specific-data[];
-
- u8 ack;
-
-};
-
-
-
-/* ack values */
-
-#define VIRTIO_NET_OK 0
-
-#define VIRTIO_NET_ERR 1
-
-The class, command and command-specific-data are set by the
-driver, and the device sets the ack byte. There is little it can
-do except issue a diagnostic if the ack byte is not
-VIRTIO_NET_OK.
-
- Packet Receive Filtering
-
-If the VIRTIO_NET_F_CTRL_RX feature is negotiated, the driver can
-send control commands for promiscuous mode, multicast receiving,
-and filtering of MAC addresses.
-
-Note that in general, these commands are best-effort: unwanted
-packets may still arrive.
-
- Setting Promiscuous Mode
-
-#define VIRTIO_NET_CTRL_RX 0
-
- #define VIRTIO_NET_CTRL_RX_PROMISC 0
-
- #define VIRTIO_NET_CTRL_RX_ALLMULTI 1
-
-The class VIRTIO_NET_CTRL_RX has two commands:
-VIRTIO_NET_CTRL_RX_PROMISC turns promiscuous mode on and off, and
-VIRTIO_NET_CTRL_RX_ALLMULTI turns all-multicast receive on and
-off. The command-specific-data is one byte containing 0 (off) or
-1 (on).
-
- Setting MAC Address Filtering
-
-struct virtio_net_ctrl_mac {
-
- u32 entries;
-
- u8 macs[entries][ETH_ALEN];
-
-};
-
-
-
-#define VIRTIO_NET_CTRL_MAC 1
-
- #define VIRTIO_NET_CTRL_MAC_TABLE_SET 0
-
-The device can filter incoming packets by any number of
-destination MAC addresses.[footnote:
-Since there are no guarantees, it can use a hash filter
-orsilently switch to allmulti or promiscuous mode if it is given
-too many addresses.
-] This table is set using the class VIRTIO_NET_CTRL_MAC and the
-command VIRTIO_NET_CTRL_MAC_TABLE_SET. The command-specific-data
-is two variable length tables of 6-byte MAC addresses. The first
-table contains unicast addresses, and the second contains
-multicast addresses.
-
- VLAN Filtering
-
-If the driver negotiates the VIRTION_NET_F_CTRL_VLAN feature, it
-can control a VLAN filter table in the device.
-
-#define VIRTIO_NET_CTRL_VLAN 2
-
- #define VIRTIO_NET_CTRL_VLAN_ADD 0
-
- #define VIRTIO_NET_CTRL_VLAN_DEL 1
-
-Both the VIRTIO_NET_CTRL_VLAN_ADD and VIRTIO_NET_CTRL_VLAN_DEL
-command take a 16-bit VLAN id as the command-specific-data.
-
- Gratuitous Packet Sending
-
-If the driver negotiates the VIRTIO_NET_F_GUEST_ANNOUNCE (depends
-on VIRTIO_NET_F_CTRL_VQ), it can ask the guest to send gratuitous
-packets; this is usually done after the guest has been physically
-migrated, and needs to announce its presence on the new network
-links. (As hypervisor does not have the knowledge of guest
-network configuration (eg. tagged vlan) it is simplest to prod
-the guest in this way).
-
-#define VIRTIO_NET_CTRL_ANNOUNCE 3
-
- #define VIRTIO_NET_CTRL_ANNOUNCE_ACK 0
-
-The Guest needs to check VIRTIO_NET_S_ANNOUNCE bit in status
-field when it notices the changes of device configuration. The
-command VIRTIO_NET_CTRL_ANNOUNCE_ACK is used to indicate that
-driver has recevied the notification and device would clear the
-VIRTIO_NET_S_ANNOUNCE bit in the status filed after it received
-this command.
-
-Processing this notification involves:
-
- Sending the gratuitous packets or marking there are pending
- gratuitous packets to be sent and letting deferred routine to
- send them.
-
- Sending VIRTIO_NET_CTRL_ANNOUNCE_ACK command through control
- vq.
-
- .
-
-Appendix D: Block Device
-
-The virtio block device is a simple virtual block device (ie.
-disk). Read and write requests (and other exotic requests) are
-placed in the queue, and serviced (probably out of order) by the
-device except where noted.
-
- Configuration
-
- Subsystem Device ID 2
-
- Virtqueues 0:requestq.
-
- Feature bits
-
- VIRTIO_BLK_F_BARRIER (0) Host supports request barriers.
-
- VIRTIO_BLK_F_SIZE_MAX (1) Maximum size of any single segment is
- in “size_max”.
-
- VIRTIO_BLK_F_SEG_MAX (2) Maximum number of segments in a
- request is in “seg_max”.
-
- VIRTIO_BLK_F_GEOMETRY (4) Disk-style geometry specified in “
- geometry”.
-
- VIRTIO_BLK_F_RO (5) Device is read-only.
-
- VIRTIO_BLK_F_BLK_SIZE (6) Block size of disk is in “blk_size”.
-
- VIRTIO_BLK_F_SCSI (7) Device supports scsi packet commands.
-
- VIRTIO_BLK_F_FLUSH (9) Cache flush command support.
-
- Device configuration layout The capacity of the device
- (expressed in 512-byte sectors) is always present. The
- availability of the others all depend on various feature bits
- as indicated above. struct virtio_blk_config {
-
- u64 capacity;
-
- u32 size_max;
-
- u32 seg_max;
-
- struct virtio_blk_geometry {
-
- u16 cylinders;
-
- u8 heads;
-
- u8 sectors;
-
- } geometry;
-
- u32 blk_size;
-
-
-
-};
-
- Device Initialization
-
- The device size should be read from the “capacity”
- configuration field. No requests should be submitted which goes
- beyond this limit.
-
- If the VIRTIO_BLK_F_BLK_SIZE feature is negotiated, the
- blk_size field can be read to determine the optimal sector size
- for the driver to use. This does not effect the units used in
- the protocol (always 512 bytes), but awareness of the correct
- value can effect performance.
-
- If the VIRTIO_BLK_F_RO feature is set by the device, any write
- requests will fail.
-
- Device Operation
-
-The driver queues requests to the virtqueue, and they are used by
-the device (not necessarily in order). Each request is of form:
-
-struct virtio_blk_req {
-
-
-
- u32 type;
-
- u32 ioprio;
-
- u64 sector;
-
- char data[][512];
-
- u8 status;
-
-};
-
-If the device has VIRTIO_BLK_F_SCSI feature, it can also support
-scsi packet command requests, each of these requests is of form:struct virtio_scsi_pc_req {
-
- u32 type;
-
- u32 ioprio;
-
- u64 sector;
-
- char cmd[];
-
- char data[][512];
-
-#define SCSI_SENSE_BUFFERSIZE 96
-
- u8 sense[SCSI_SENSE_BUFFERSIZE];
-
- u32 errors;
-
- u32 data_len;
-
- u32 sense_len;
-
- u32 residual;
-
- u8 status;
-
-};
-
-The type of the request is either a read (VIRTIO_BLK_T_IN), a
-write (VIRTIO_BLK_T_OUT), a scsi packet command
-(VIRTIO_BLK_T_SCSI_CMD or VIRTIO_BLK_T_SCSI_CMD_OUT[footnote:
-the SCSI_CMD and SCSI_CMD_OUT types are equivalent, the device
-does not distinguish between them
-]) or a flush (VIRTIO_BLK_T_FLUSH or VIRTIO_BLK_T_FLUSH_OUT[footnote:
-the FLUSH and FLUSH_OUT types are equivalent, the device does not
-distinguish between them
-]). If the device has VIRTIO_BLK_F_BARRIER feature the high bit
-(VIRTIO_BLK_T_BARRIER) indicates that this request acts as a
-barrier and that all preceding requests must be complete before
-this one, and all following requests must not be started until
-this is complete. Note that a barrier does not flush caches in
-the underlying backend device in host, and thus does not serve as
-data consistency guarantee. Driver must use FLUSH request to
-flush the host cache.
-
-#define VIRTIO_BLK_T_IN 0
-
-#define VIRTIO_BLK_T_OUT 1
-
-#define VIRTIO_BLK_T_SCSI_CMD 2
-
-#define VIRTIO_BLK_T_SCSI_CMD_OUT 3
-
-#define VIRTIO_BLK_T_FLUSH 4
-
-#define VIRTIO_BLK_T_FLUSH_OUT 5
-
-#define VIRTIO_BLK_T_BARRIER 0x80000000
-
-The ioprio field is a hint about the relative priorities of
-requests to the device: higher numbers indicate more important
-requests.
-
-The sector number indicates the offset (multiplied by 512) where
-the read or write is to occur. This field is unused and set to 0
-for scsi packet commands and for flush commands.
-
-The cmd field is only present for scsi packet command requests,
-and indicates the command to perform. This field must reside in a
-single, separate read-only buffer; command length can be derived
-from the length of this buffer.
-
-Note that these first three (four for scsi packet commands)
-fields are always read-only: the data field is either read-only
-or write-only, depending on the request. The size of the read or
-write can be derived from the total size of the request buffers.
-
-The sense field is only present for scsi packet command requests,
-and indicates the buffer for scsi sense data.
-
-The data_len field is only present for scsi packet command
-requests, this field is deprecated, and should be ignored by the
-driver. Historically, devices copied data length there.
-
-The sense_len field is only present for scsi packet command
-requests and indicates the number of bytes actually written to
-the sense buffer.
-
-The residual field is only present for scsi packet command
-requests and indicates the residual size, calculated as data
-length - number of bytes actually transferred.
-
-The final status byte is written by the device: either
-VIRTIO_BLK_S_OK for success, VIRTIO_BLK_S_IOERR for host or guest
-error or VIRTIO_BLK_S_UNSUPP for a request unsupported by host:#define VIRTIO_BLK_S_OK 0
-
-#define VIRTIO_BLK_S_IOERR 1
-
-#define VIRTIO_BLK_S_UNSUPP 2
-
-Historically, devices assumed that the fields type, ioprio and
-sector reside in a single, separate read-only buffer; the fields
-errors, data_len, sense_len and residual reside in a single,
-separate write-only buffer; the sense field in a separate
-write-only buffer of size 96 bytes, by itself; the fields errors,
-data_len, sense_len and residual in a single write-only buffer;
-and the status field is a separate read-only buffer of size 1
-byte, by itself.
-
-Appendix E: Console Device
-
-The virtio console device is a simple device for data input and
-output. A device may have one or more ports. Each port has a pair
-of input and output virtqueues. Moreover, a device has a pair of
-control IO virtqueues. The control virtqueues are used to
-communicate information between the device and the driver about
-ports being opened and closed on either side of the connection,
-indication from the host about whether a particular port is a
-console port, adding new ports, port hot-plug/unplug, etc., and
-indication from the guest about whether a port or a device was
-successfully added, port open/close, etc.. For data IO, one or
-more empty buffers are placed in the receive queue for incoming
-data and outgoing characters are placed in the transmit queue.
-
- Configuration
-
- Subsystem Device ID 3
-
- Virtqueues 0:receiveq(port0). 1:transmitq(port0), 2:control
- receiveq[footnote:
-Ports 2 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set
-], 3:control transmitq, 4:receiveq(port1), 5:transmitq(port1),
- ...
-
- Feature bits
-
- VIRTIO_CONSOLE_F_SIZE (0) Configuration cols and rows fields
- are valid.
-
- VIRTIO_CONSOLE_F_MULTIPORT(1) Device has support for multiple
- ports; configuration fields nr_ports and max_nr_ports are
- valid and control virtqueues will be used.
-
- Device configuration layout The size of the console is supplied
- in the configuration space if the VIRTIO_CONSOLE_F_SIZE feature
- is set. Furthermore, if the VIRTIO_CONSOLE_F_MULTIPORT feature
- is set, the maximum number of ports supported by the device can
- be fetched.struct virtio_console_config {
-
- u16 cols;
-
- u16 rows;
-
-
-
- u32 max_nr_ports;
-
-};
-
- Device Initialization
-
- If the VIRTIO_CONSOLE_F_SIZE feature is negotiated, the driver
- can read the console dimensions from the configuration fields.
-
- If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the
- driver can spawn multiple ports, not all of which may be
- attached to a console. Some could be generic ports. In this
- case, the control virtqueues are enabled and according to the
- max_nr_ports configuration-space value, the appropriate number
- of virtqueues are created. A control message indicating the
- driver is ready is sent to the host. The host can then send
- control messages for adding new ports to the device. After
- creating and initializing each port, a
- VIRTIO_CONSOLE_PORT_READY control message is sent to the host
- for that port so the host can let us know of any additional
- configuration options set for that port.
-
- The receiveq for each port is populated with one or more
- receive buffers.
-
- Device Operation
-
- For output, a buffer containing the characters is placed in the
- port's transmitq.[footnote:
-Because this is high importance and low bandwidth, the current
-Linux implementation polls for the buffer to be used, rather than
-waiting for an interrupt, simplifying the implementation
-significantly. However, for generic serial ports with the
-O_NONBLOCK flag set, the polling limitation is relaxed and the
-consumed buffers are freed upon the next write or poll call or
-when a port is closed or hot-unplugged.
-]
-
- When a buffer is used in the receiveq (signalled by an
- interrupt), the contents is the input to the port associated
- with the virtqueue for which the notification was received.
-
- If the driver negotiated the VIRTIO_CONSOLE_F_SIZE feature, a
- configuration change interrupt may occur. The updated size can
- be read from the configuration fields.
-
- If the driver negotiated the VIRTIO_CONSOLE_F_MULTIPORT
- feature, active ports are announced by the host using the
- VIRTIO_CONSOLE_PORT_ADD control message. The same message is
- used for port hot-plug as well.
-
- If the host specified a port `name', a sysfs attribute is
- created with the name filled in, so that udev rules can be
- written that can create a symlink from the port's name to the
- char device for port discovery by applications in the guest.
-
- Changes to ports' state are effected by control messages.
- Appropriate action is taken on the port indicated in the
- control message. The layout of the structure of the control
- buffer and the events associated are:struct virtio_console_control {
-
- uint32_t id; /* Port number */
-
- uint16_t event; /* The kind of control event */
-
- uint16_t value; /* Extra information for the event */
-
-};
-
-
-
-/* Some events for the internal messages (control packets) */
-
-
-
-#define VIRTIO_CONSOLE_DEVICE_READY 0
-
-#define VIRTIO_CONSOLE_PORT_ADD 1
-
-#define VIRTIO_CONSOLE_PORT_REMOVE 2
-
-#define VIRTIO_CONSOLE_PORT_READY 3
-
-#define VIRTIO_CONSOLE_CONSOLE_PORT 4
-
-#define VIRTIO_CONSOLE_RESIZE 5
-
-#define VIRTIO_CONSOLE_PORT_OPEN 6
-
-#define VIRTIO_CONSOLE_PORT_NAME 7
-
-Appendix F: Entropy Device
-
-The virtio entropy device supplies high-quality randomness for
-guest use.
-
- Configuration
-
- Subsystem Device ID 4
-
- Virtqueues 0:requestq.
-
- Feature bits None currently defined
-
- Device configuration layout None currently defined.
-
- Device Initialization
-
- The virtqueue is initialized
-
- Device Operation
-
-When the driver requires random bytes, it places the descriptor
-of one or more buffers in the queue. It will be completely filled
-by random data by the device.
-
-Appendix G: Memory Balloon Device
-
-The virtio memory balloon device is a primitive device for
-managing guest memory: the device asks for a certain amount of
-memory, and the guest supplies it (or withdraws it, if the device
-has more than it asks for). This allows the guest to adapt to
-changes in allowance of underlying physical memory. If the
-feature is negotiated, the device can also be used to communicate
-guest memory statistics to the host.
-
- Configuration
-
- Subsystem Device ID 5
-
- Virtqueues 0:inflateq. 1:deflateq. 2:statsq.[footnote:
-Only if VIRTIO_BALLON_F_STATS_VQ set
-]
-
- Feature bits
-
- VIRTIO_BALLOON_F_MUST_TELL_HOST (0) Host must be told before
- pages from the balloon are used.
-
- VIRTIO_BALLOON_F_STATS_VQ (1) A virtqueue for reporting guest
- memory statistics is present.
-
- Device configuration layout Both fields of this configuration
- are always available. Note that they are little endian, despite
- convention that device fields are guest endian:struct virtio_balloon_config {
-
- u32 num_pages;
-
- u32 actual;
-
-};
-
- Device Initialization
-
- The inflate and deflate virtqueues are identified.
-
- If the VIRTIO_BALLOON_F_STATS_VQ feature bit is negotiated:
-
- Identify the stats virtqueue.
-
- Add one empty buffer to the stats virtqueue and notify the
- host.
-
-Device operation begins immediately.
-
- Device Operation
-
- Memory Ballooning The device is driven by the receipt of a
- configuration change interrupt.
-
- The “num_pages” configuration field is examined. If this is
- greater than the “actual” number of pages, memory must be given
- to the balloon. If it is less than the “actual” number of
- pages, memory may be taken back from the balloon for general
- use.
-
- To supply memory to the balloon (aka. inflate):
-
- The driver constructs an array of addresses of unused memory
- pages. These addresses are divided by 4096[footnote:
-This is historical, and independent of the guest page size
-] and the descriptor describing the resulting 32-bit array is
- added to the inflateq.
-
- To remove memory from the balloon (aka. deflate):
-
- The driver constructs an array of addresses of memory pages it
- has previously given to the balloon, as described above. This
- descriptor is added to the deflateq.
-
- If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is set, the
- guest may not use these requested pages until that descriptor
- in the deflateq has been used by the device.
-
- Otherwise, the guest may begin to re-use pages previously given
- to the balloon before the device has acknowledged their
- withdrawl. [footnote:
-In this case, deflation advice is merely a courtesy
-]
-
- In either case, once the device has completed the inflation or
- deflation, the “actual” field of the configuration should be
- updated to reflect the new number of pages in the balloon.[footnote:
-As updates to configuration space are not atomic, this field
-isn't particularly reliable, but can be used to diagnose buggy
-guests.
-]
-
- Memory Statistics
-
-The stats virtqueue is atypical because communication is driven
-by the device (not the driver). The channel becomes active at
-driver initialization time when the driver adds an empty buffer
-and notifies the device. A request for memory statistics proceeds
-as follows:
-
- The device pushes the buffer onto the used ring and sends an
- interrupt.
-
- The driver pops the used buffer and discards it.
-
- The driver collects memory statistics and writes them into a
- new buffer.
-
- The driver adds the buffer to the virtqueue and notifies the
- device.
-
- The device pops the buffer (retaining it to initiate a
- subsequent request) and consumes the statistics.
-
- Memory Statistics Format Each statistic consists of a 16 bit
- tag and a 64 bit value. Both quantities are represented in the
- native endian of the guest. All statistics are optional and the
- driver may choose which ones to supply. To guarantee backwards
- compatibility, unsupported statistics should be omitted.
-
- struct virtio_balloon_stat {
-
-#define VIRTIO_BALLOON_S_SWAP_IN 0
-
-#define VIRTIO_BALLOON_S_SWAP_OUT 1
-
-#define VIRTIO_BALLOON_S_MAJFLT 2
-
-#define VIRTIO_BALLOON_S_MINFLT 3
-
-#define VIRTIO_BALLOON_S_MEMFREE 4
-
-#define VIRTIO_BALLOON_S_MEMTOT 5
-
- u16 tag;
-
- u64 val;
-
-} __attribute__((packed));
-
- Tags
-
- VIRTIO_BALLOON_S_SWAP_IN The amount of memory that has been
- swapped in (in bytes).
-
- VIRTIO_BALLOON_S_SWAP_OUT The amount of memory that has been
- swapped out to disk (in bytes).
-
- VIRTIO_BALLOON_S_MAJFLT The number of major page faults that
- have occurred.
-
- VIRTIO_BALLOON_S_MINFLT The number of minor page faults that
- have occurred.
-
- VIRTIO_BALLOON_S_MEMFREE The amount of memory not being used
- for any purpose (in bytes).
-
- VIRTIO_BALLOON_S_MEMTOT The total amount of memory available
- (in bytes).
-
-Appendix H: Rpmsg: Remote Processor Messaging
-
-Virtio rpmsg devices represent remote processors on the system
-which run in asymmetric multi-processing (AMP) configuration, and
-which are usually used to offload cpu-intensive tasks from the
-main application processor (a typical SoC methodology).
-
-Virtio is being used to communicate with those remote processors;
-empty buffers are placed in one virtqueue for receiving messages,
-and non-empty buffers, containing outbound messages, are enqueued
-in a second virtqueue for transmission.
-
-Numerous communication channels can be multiplexed over those two
-virtqueues, so different entities, running on the application and
-remote processor, can directly communicate in a point-to-point
-fashion.
-
- Configuration
-
- Subsystem Device ID 7
-
- Virtqueues 0:receiveq. 1:transmitq.
-
- Feature bits
-
- VIRTIO_RPMSG_F_NS (0) Device sends (and capable of receiving)
- name service messages announcing the creation (or
- destruction) of a channel:/**
-
- * struct rpmsg_ns_msg - dynamic name service announcement
-message
-
- * @name: name of remote service that is published
-
- * @addr: address of remote service that is published
-
- * @flags: indicates whether service is created or destroyed
-
- *
-
- * This message is sent across to publish a new service (or
-announce
-
- * about its removal). When we receives these messages, an
-appropriate
-
- * rpmsg channel (i.e device) is created/destroyed.
-
- */
-
-struct rpmsg_ns_msgoon_config {
-
- char name[RPMSG_NAME_SIZE];
-
- u32 addr;
-
- u32 flags;
-
-} __packed;
-
-
-
-/**
-
- * enum rpmsg_ns_flags - dynamic name service announcement flags
-
- *
-
- * @RPMSG_NS_CREATE: a new remote service was just created
-
- * @RPMSG_NS_DESTROY: a remote service was just destroyed
-
- */
-
-enum rpmsg_ns_flags {
-
- RPMSG_NS_CREATE = 0,
-
- RPMSG_NS_DESTROY = 1,
-
-};
-
- Device configuration layout
-
-At his point none currently defined.
-
- Device Initialization
-
- The initialization routine should identify the receive and
- transmission virtqueues.
-
- The receive virtqueue should be filled with receive buffers.
-
- Device Operation
-
-Messages are transmitted by placing them in the transmitq, and
-buffers for inbound messages are placed in the receiveq. In any
-case, messages are always preceded by the following header: /**
-
- * struct rpmsg_hdr - common header for all rpmsg messages
-
- * @src: source address
-
- * @dst: destination address
-
- * @reserved: reserved for future use
-
- * @len: length of payload (in bytes)
-
- * @flags: message flags
-
- * @data: @len bytes of message payload data
-
- *
-
- * Every message sent(/received) on the rpmsg bus begins with
-this header.
-
- */
-
-struct rpmsg_hdr {
-
- u32 src;
-
- u32 dst;
-
- u32 reserved;
-
- u16 len;
-
- u16 flags;
-
- u8 data[0];
-
-} __packed;
-
-Appendix I: SCSI Host Device
-
-The virtio SCSI host device groups together one or more virtual
-logical units (such as disks), and allows communicating to them
-using the SCSI protocol. An instance of the device represents a
-SCSI host to which many targets and LUNs are attached.
-
-The virtio SCSI device services two kinds of requests:
-
- command requests for a logical unit;
-
- task management functions related to a logical unit, target or
- command.
-
-The device is also able to send out notifications about added and
-removed logical units. Together, these capabilities provide a
-SCSI transport protocol that uses virtqueues as the transfer
-medium. In the transport protocol, the virtio driver acts as the
-initiator, while the virtio SCSI host provides one or more
-targets that receive and process the requests.
-
- Configuration
-
- Subsystem Device ID 8
-
- Virtqueues 0:controlq; 1:eventq; 2..n:request queues.
-
- Feature bits
-
- VIRTIO_SCSI_F_INOUT (0) A single request can include both
- read-only and write-only data buffers.
-
- VIRTIO_SCSI_F_HOTPLUG (1) The host should enable
- hot-plug/hot-unplug of new LUNs and targets on the SCSI bus.
-
- Device configuration layout All fields of this configuration
- are always available. sense_size and cdb_size are writable by
- the guest.struct virtio_scsi_config {
-
- u32 num_queues;
-
- u32 seg_max;
-
- u32 max_sectors;
-
- u32 cmd_per_lun;
-
- u32 event_info_size;
-
- u32 sense_size;
-
- u32 cdb_size;
-
- u16 max_channel;
-
- u16 max_target;
-
- u32 max_lun;
-
-};
-
- num_queues is the total number of request virtqueues exposed by
- the device. The driver is free to use only one request queue,
- or it can use more to achieve better performance.
-
- seg_max is the maximum number of segments that can be in a
- command. A bidirectional command can include seg_max input
- segments and seg_max output segments.
-
- max_sectors is a hint to the guest about the maximum transfer
- size it should use.
-
- cmd_per_lun is a hint to the guest about the maximum number of
- linked commands it should send to one LUN. The actual value
- to be used is the minimum of cmd_per_lun and the virtqueue
- size.
-
- event_info_size is the maximum size that the device will fill
- for buffers that the driver places in the eventq. The driver
- should always put buffers at least of this size. It is
- written by the device depending on the set of negotated
- features.
-
- sense_size is the maximum size of the sense data that the
- device will write. The default value is written by the device
- and will always be 96, but the driver can modify it. It is
- restored to the default when the device is reset.
-
- cdb_size is the maximum size of the CDB that the driver will
- write. The default value is written by the device and will
- always be 32, but the driver can likewise modify it. It is
- restored to the default when the device is reset.
-
- max_channel, max_target and max_lun can be used by the driver
- as hints to constrain scanning the logical units on the
- host.h
-
- Device Initialization
-
-The initialization routine should first of all discover the
-device's virtqueues.
-
-If the driver uses the eventq, it should then place at least a
-buffer in the eventq.
-
-The driver can immediately issue requests (for example, INQUIRY
-or REPORT LUNS) or task management functions (for example, I_T
-RESET).
-
- Device Operation: request queues
-
-The driver queues requests to an arbitrary request queue, and
-they are used by the device on that same queue. It is the
-responsibility of the driver to ensure strict request ordering
-for commands placed on different queues, because they will be
-consumed with no order constraints.
-
-Requests have the following format:
-
-struct virtio_scsi_req_cmd {
-
- // Read-only
-
- u8 lun[8];
-
- u64 id;
-
- u8 task_attr;
-
- u8 prio;
-
- u8 crn;
-
- char cdb[cdb_size];
-
- char dataout[];
-
- // Write-only part
-
- u32 sense_len;
-
- u32 residual;
-
- u16 status_qualifier;
-
- u8 status;
-
- u8 response;
-
- u8 sense[sense_size];
-
- char datain[];
-
-};
-
-
-
-/* command-specific response values */
-
-#define VIRTIO_SCSI_S_OK 0
-
-#define VIRTIO_SCSI_S_OVERRUN 1
-
-#define VIRTIO_SCSI_S_ABORTED 2
-
-#define VIRTIO_SCSI_S_BAD_TARGET 3
-
-#define VIRTIO_SCSI_S_RESET 4
-
-#define VIRTIO_SCSI_S_BUSY 5
-
-#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6
-
-#define VIRTIO_SCSI_S_TARGET_FAILURE 7
-
-#define VIRTIO_SCSI_S_NEXUS_FAILURE 8
-
-#define VIRTIO_SCSI_S_FAILURE 9
-
-
-
-/* task_attr */
-
-#define VIRTIO_SCSI_S_SIMPLE 0
-
-#define VIRTIO_SCSI_S_ORDERED 1
-
-#define VIRTIO_SCSI_S_HEAD 2
-
-#define VIRTIO_SCSI_S_ACA 3
-
-The lun field addresses a target and logical unit in the
-virtio-scsi device's SCSI domain. The only supported format for
-the LUN field is: first byte set to 1, second byte set to target,
-third and fourth byte representing a single level LUN structure,
-followed by four zero bytes. With this representation, a
-virtio-scsi device can serve up to 256 targets and 16384 LUNs per
-target.
-
-The id field is the command identifier (“tag”).
-
-task_attr, prio and crn should be left to zero. task_attr defines
-the task attribute as in the table above, but all task attributes
-may be mapped to SIMPLE by the device; crn may also be provided
-by clients, but is generally expected to be 0. The maximum CRN
-value defined by the protocol is 255, since CRN is stored in an
-8-bit integer.
-
-All of these fields are defined in SAM. They are always
-read-only, as are the cdb and dataout field. The cdb_size is
-taken from the configuration space.
-
-sense and subsequent fields are always write-only. The sense_len
-field indicates the number of bytes actually written to the sense
-buffer. The residual field indicates the residual size,
-calculated as “data_length - number_of_transferred_bytes”, for
-read or write operations. For bidirectional commands, the
-number_of_transferred_bytes includes both read and written bytes.
-A residual field that is less than the size of datain means that
-the dataout field was processed entirely. A residual field that
-exceeds the size of datain means that the dataout field was
-processed partially and the datain field was not processed at
-all.
-
-The status byte is written by the device to be the status code as
-defined in SAM.
-
-The response byte is written by the device to be one of the
-following:
-
- VIRTIO_SCSI_S_OK when the request was completed and the status
- byte is filled with a SCSI status code (not necessarily
- "GOOD").
-
- VIRTIO_SCSI_S_OVERRUN if the content of the CDB requires
- transferring more data than is available in the data buffers.
-
- VIRTIO_SCSI_S_ABORTED if the request was cancelled due to an
- ABORT TASK or ABORT TASK SET task management function.
-
- VIRTIO_SCSI_S_BAD_TARGET if the request was never processed
- because the target indicated by the lun field does not exist.
-
- VIRTIO_SCSI_S_RESET if the request was cancelled due to a bus
- or device reset (including a task management function).
-
- VIRTIO_SCSI_S_TRANSPORT_FAILURE if the request failed due to a
- problem in the connection between the host and the target
- (severed link).
-
- VIRTIO_SCSI_S_TARGET_FAILURE if the target is suffering a
- failure and the guest should not retry on other paths.
-
- VIRTIO_SCSI_S_NEXUS_FAILURE if the nexus is suffering a failure
- but retrying on other paths might yield a different result.
-
- VIRTIO_SCSI_S_BUSY if the request failed but retrying on the
- same path should work.
-
- VIRTIO_SCSI_S_FAILURE for other host or guest error. In
- particular, if neither dataout nor datain is empty, and the
- VIRTIO_SCSI_F_INOUT feature has not been negotiated, the
- request will be immediately returned with a response equal to
- VIRTIO_SCSI_S_FAILURE.
-
- Device Operation: controlq
-
-The controlq is used for other SCSI transport operations.
-Requests have the following format:
-
-struct virtio_scsi_ctrl {
-
- u32 type;
-
- ...
-
- u8 response;
-
-};
-
-
-
-/* response values valid for all commands */
-
-#define VIRTIO_SCSI_S_OK 0
-
-#define VIRTIO_SCSI_S_BAD_TARGET 3
-
-#define VIRTIO_SCSI_S_BUSY 5
-
-#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6
-
-#define VIRTIO_SCSI_S_TARGET_FAILURE 7
-
-#define VIRTIO_SCSI_S_NEXUS_FAILURE 8
-
-#define VIRTIO_SCSI_S_FAILURE 9
-
-#define VIRTIO_SCSI_S_INCORRECT_LUN 12
-
-The type identifies the remaining fields.
-
-The following commands are defined:
-
- Task management function
-#define VIRTIO_SCSI_T_TMF 0
-
-
-
-#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0
-
-#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1
-
-#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2
-
-#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3
-
-#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4
-
-#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5
-
-#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6
-
-#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7
-
-
-
-struct virtio_scsi_ctrl_tmf
-
-{
-
- // Read-only part
-
- u32 type;
-
- u32 subtype;
-
- u8 lun[8];
-
- u64 id;
-
- // Write-only part
-
- u8 response;
-
-}
-
-
-
-/* command-specific response values */
-
-#define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0
-
-#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10
-
-#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11
-
- The type is VIRTIO_SCSI_T_TMF; the subtype field defines. All
- fields except response are filled by the driver. The subtype
- field must always be specified and identifies the requested
- task management function.
-
- Other fields may be irrelevant for the requested TMF; if so,
- they are ignored but they should still be present. The lun
- field is in the same format specified for request queues; the
- single level LUN is ignored when the task management function
- addresses a whole I_T nexus. When relevant, the value of the id
- field is matched against the id values passed on the requestq.
-
- The outcome of the task management function is written by the
- device in the response field. The command-specific response
- values map 1-to-1 with those defined in SAM.
-
- Asynchronous notification query
-#define VIRTIO_SCSI_T_AN_QUERY 1
-
-
-
-struct virtio_scsi_ctrl_an {
-
- // Read-only part
-
- u32 type;
-
- u8 lun[8];
-
- u32 event_requested;
-
- // Write-only part
-
- u32 event_actual;
-
- u8 response;
-
-}
-
-
-
-#define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2
-
-#define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4
-
-#define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8
-
-#define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16
-
-#define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32
-
-#define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64
-
- By sending this command, the driver asks the device which
- events the given LUN can report, as described in paragraphs 6.6
- and A.6 of the SCSI MMC specification. The driver writes the
- events it is interested in into the event_requested; the device
- responds by writing the events that it supports into
- event_actual.
-
- The type is VIRTIO_SCSI_T_AN_QUERY. The lun and event_requested
- fields are written by the driver. The event_actual and response
- fields are written by the device.
-
- No command-specific values are defined for the response byte.
-
- Asynchronous notification subscription
-#define VIRTIO_SCSI_T_AN_SUBSCRIBE 2
-
-
-
-struct virtio_scsi_ctrl_an {
-
- // Read-only part
-
- u32 type;
-
- u8 lun[8];
-
- u32 event_requested;
-
- // Write-only part
-
- u32 event_actual;
-
- u8 response;
-
-}
-
- By sending this command, the driver asks the specified LUN to
- report events for its physical interface, again as described in
- the SCSI MMC specification. The driver writes the events it is
- interested in into the event_requested; the device responds by
- writing the events that it supports into event_actual.
-
- Event types are the same as for the asynchronous notification
- query message.
-
- The type is VIRTIO_SCSI_T_AN_SUBSCRIBE. The lun and
- event_requested fields are written by the driver. The
- event_actual and response fields are written by the device.
-
- No command-specific values are defined for the response byte.
-
- Device Operation: eventq
-
-The eventq is used by the device to report information on logical
-units that are attached to it. The driver should always leave a
-few buffers ready in the eventq. In general, the device will not
-queue events to cope with an empty eventq, and will end up
-dropping events if it finds no buffer ready. However, when
-reporting events for many LUNs (e.g. when a whole target
-disappears), the device can throttle events to avoid dropping
-them. For this reason, placing 10-15 buffers on the event queue
-should be enough.
-
-Buffers are placed in the eventq and filled by the device when
-interesting events occur. The buffers should be strictly
-write-only (device-filled) and the size of the buffers should be
-at least the value given in the device's configuration
-information.
-
-Buffers returned by the device on the eventq will be referred to
-as "events" in the rest of this section. Events have the
-following format:
-
-#define VIRTIO_SCSI_T_EVENTS_MISSED 0x80000000
-
-
-
-struct virtio_scsi_event {
-
- // Write-only part
-
- u32 event;
-
- ...
-
-}
-
-If bit 31 is set in the event field, the device failed to report
-an event due to missing buffers. In this case, the driver should
-poll the logical units for unit attention conditions, and/or do
-whatever form of bus scan is appropriate for the guest operating
-system.
-
-Other data that the device writes to the buffer depends on the
-contents of the event field. The following events are defined:
-
- No event
-#define VIRTIO_SCSI_T_NO_EVENT 0
-
- This event is fired in the following cases:
-
- When the device detects in the eventq a buffer that is shorter
- than what is indicated in the configuration field, it might
- use it immediately and put this dummy value in the event
- field. A well-written driver will never observe this
- situation.
-
- When events are dropped, the device may signal this event as
- soon as the drivers makes a buffer available, in order to
- request action from the driver. In this case, of course, this
- event will be reported with the VIRTIO_SCSI_T_EVENTS_MISSED
- flag.
-
- Transport reset
-#define VIRTIO_SCSI_T_TRANSPORT_RESET 1
-
-
-
-struct virtio_scsi_event_reset {
-
- // Write-only part
-
- u32 event;
-
- u8 lun[8];
-
- u32 reason;
-
-}
-
-
-
-#define VIRTIO_SCSI_EVT_RESET_HARD 0
-
-#define VIRTIO_SCSI_EVT_RESET_RESCAN 1
-
-#define VIRTIO_SCSI_EVT_RESET_REMOVED 2
-
- By sending this event, the device signals that a logical unit
- on a target has been reset, including the case of a new device
- appearing or disappearing on the bus.The device fills in all
- fields. The event field is set to
- VIRTIO_SCSI_T_TRANSPORT_RESET. The lun field addresses a
- logical unit in the SCSI host.
-
- The reason value is one of the three #define values appearing
- above:
-
- VIRTIO_SCSI_EVT_RESET_REMOVED (“LUN/target removed”) is used if
- the target or logical unit is no longer able to receive
- commands.
-
- VIRTIO_SCSI_EVT_RESET_HARD (“LUN hard reset”) is used if the
- logical unit has been reset, but is still present.
-
- VIRTIO_SCSI_EVT_RESET_RESCAN (“rescan LUN/target”) is used if a
- target or logical unit has just appeared on the device.
-
- The “removed” and “rescan” events, when sent for LUN 0, may
- apply to the entire target. After receiving them the driver
- should ask the initiator to rescan the target, in order to
- detect the case when an entire target has appeared or
- disappeared. These two events will never be reported unless the
- VIRTIO_SCSI_F_HOTPLUG feature was negotiated between the host
- and the guest.
-
- Events will also be reported via sense codes (this obviously
- does not apply to newly appeared buses or targets, since the
- application has never discovered them):
-
- “LUN/target removed” maps to sense key ILLEGAL REQUEST, asc
- 0x25, ascq 0x00 (LOGICAL UNIT NOT SUPPORTED)
-
- “LUN hard reset” maps to sense key UNIT ATTENTION, asc 0x29
- (POWER ON, RESET OR BUS DEVICE RESET OCCURRED)
-
- “rescan LUN/target” maps to sense key UNIT ATTENTION, asc 0x3f,
- ascq 0x0e (REPORTED LUNS DATA HAS CHANGED)
-
- The preferred way to detect transport reset is always to use
- events, because sense codes are only seen by the driver when it
- sends a SCSI command to the logical unit or target. However, in
- case events are dropped, the initiator will still be able to
- synchronize with the actual state of the controller if the
- driver asks the initiator to rescan of the SCSI bus. During the
- rescan, the initiator will be able to observe the above sense
- codes, and it will process them as if it the driver had
- received the equivalent event.
-
- Asynchronous notification
-#define VIRTIO_SCSI_T_ASYNC_NOTIFY 2
-
-
-
-struct virtio_scsi_event_an {
-
- // Write-only part
-
- u32 event;
-
- u8 lun[8];
-
- u32 reason;
-
-}
-
- By sending this event, the device signals that an asynchronous
- event was fired from a physical interface.
-
- All fields are written by the device. The event field is set to
- VIRTIO_SCSI_T_ASYNC_NOTIFY. The lun field addresses a logical
- unit in the SCSI host. The reason field is a subset of the
- events that the driver has subscribed to via the "Asynchronous
- notification subscription" command.
-
- When dropped events are reported, the driver should poll for
- asynchronous events manually using SCSI commands.
-
-Appendix X: virtio-mmio
-
-Virtual environments without PCI support (a common situation in
-embedded devices models) might use simple memory mapped device (“
-virtio-mmio”) instead of the PCI device.
-
-The memory mapped virtio device behaviour is based on the PCI
-device specification. Therefore most of operations like device
-initialization, queues configuration and buffer transfers are
-nearly identical. Existing differences are described in the
-following sections.
-
- Device Initialization
-
-Instead of using the PCI IO space for virtio header, the “
-virtio-mmio” device provides a set of memory mapped control
-registers, all 32 bits wide, followed by device-specific
-configuration space. The following list presents their layout:
-
- Offset from the device base address | Direction | Name
- Description
-
- 0x000 | R | MagicValue
- “virt” string.
-
- 0x004 | R | Version
- Device version number. Currently must be 1.
-
- 0x008 | R | DeviceID
- Virtio Subsystem Device ID (ie. 1 for network card).
-
- 0x00c | R | VendorID
- Virtio Subsystem Vendor ID.
-
- 0x010 | R | HostFeatures
- Flags representing features the device supports.
- Reading from this register returns 32 consecutive flag bits,
- first bit depending on the last value written to
- HostFeaturesSel register. Access to this register returns bits HostFeaturesSel*32
-
- to (HostFeaturesSel*32)+31
-, eg. feature bits 0 to 31 if
- HostFeaturesSel is set to 0 and features bits 32 to 63 if
- HostFeaturesSel is set to 1. Also see [sub:Feature-Bits]
-
- 0x014 | W | HostFeaturesSel
- Device (Host) features word selection.
- Writing to this register selects a set of 32 device feature bits
- accessible by reading from HostFeatures register. Device driver
- must write a value to the HostFeaturesSel register before
- reading from the HostFeatures register.
-
- 0x020 | W | GuestFeatures
- Flags representing device features understood and activated by
- the driver.
- Writing to this register sets 32 consecutive flag bits, first
- bit depending on the last value written to GuestFeaturesSel
- register. Access to this register sets bits GuestFeaturesSel*32
-
- to (GuestFeaturesSel*32)+31
-, eg. feature bits 0 to 31 if
- GuestFeaturesSel is set to 0 and features bits 32 to 63 if
- GuestFeaturesSel is set to 1. Also see [sub:Feature-Bits]
-
- 0x024 | W | GuestFeaturesSel
- Activated (Guest) features word selection.
- Writing to this register selects a set of 32 activated feature
- bits accessible by writing to the GuestFeatures register.
- Device driver must write a value to the GuestFeaturesSel
- register before writing to the GuestFeatures register.
-
- 0x028 | W | GuestPageSize
- Guest page size.
- Device driver must write the guest page size in bytes to the
- register during initialization, before any queues are used.
- This value must be a power of 2 and is used by the Host to
- calculate Guest address of the first queue page (see QueuePFN).
-
- 0x030 | W | QueueSel
- Virtual queue index (first queue is 0).
- Writing to this register selects the virtual queue that the
- following operations on QueueNum, QueueAlign and QueuePFN apply
- to.
-
- 0x034 | R | QueueNumMax
- Maximum virtual queue size.
- Reading from the register returns the maximum size of the queue
- the Host is ready to process or zero (0x0) if the queue is not
- available. This applies to the queue selected by writing to
- QueueSel and is allowed only when QueuePFN is set to zero
- (0x0), so when the queue is not actively used.
-
- 0x038 | W | QueueNum
- Virtual queue size.
- Queue size is a number of elements in the queue, therefore size
- of the descriptor table and both available and used rings.
- Writing to this register notifies the Host what size of the
- queue the Guest will use. This applies to the queue selected by
- writing to QueueSel.
-
- 0x03c | W | QueueAlign
- Used Ring alignment in the virtual queue.
- Writing to this register notifies the Host about alignment
- boundary of the Used Ring in bytes. This value must be a power
- of 2 and applies to the queue selected by writing to QueueSel.
-
- 0x040 | RW | QueuePFN
- Guest physical page number of the virtual queue.
- Writing to this register notifies the host about location of the
- virtual queue in the Guest's physical address space. This value
- is the index number of a page starting with the queue
- Descriptor Table. Value zero (0x0) means physical address zero
- (0x00000000) and is illegal. When the Guest stops using the
- queue it must write zero (0x0) to this register.
- Reading from this register returns the currently used page
- number of the queue, therefore a value other than zero (0x0)
- means that the queue is in use.
- Both read and write accesses apply to the queue selected by
- writing to QueueSel.
-
- 0x050 | W | QueueNotify
- Queue notifier.
- Writing a queue index to this register notifies the Host that
- there are new buffers to process in the queue.
-
- 0x60 | R | InterruptStatus
-Interrupt status.
-Reading from this register returns a bit mask of interrupts
- asserted by the device. An interrupt is asserted if the
- corresponding bit is set, ie. equals one (1).
-
- Bit 0 | Used Ring Update
-This interrupt is asserted when the Host has updated the Used
- Ring in at least one of the active virtual queues.
-
- Bit 1 | Configuration change
-This interrupt is asserted when configuration of the device has
- changed.
-
- 0x064 | W | InterruptACK
- Interrupt acknowledge.
- Writing to this register notifies the Host that the Guest
- finished handling interrupts. Set bits in the value clear the
- corresponding bits of the InterruptStatus register.
-
- 0x070 | RW | Status
- Device status.
- Reading from this register returns the current device status
- flags.
- Writing non-zero values to this register sets the status flags,
- indicating the Guest progress. Writing zero (0x0) to this
- register triggers a device reset.
- Also see [sub:Device-Initialization-Sequence]
-
- 0x100+ | RW | Config
- Device-specific configuration space starts at an offset 0x100
- and is accessed with byte alignment. Its meaning and size
- depends on the device and the driver.
-
-Virtual queue size is a number of elements in the queue,
-therefore size of the descriptor table and both available and
-used rings.
-
-The endianness of the registers follows the native endianness of
-the Guest. Writing to registers described as “R” and reading from
-registers described as “W” is not permitted and can cause
-undefined behavior.
-
-The device initialization is performed as described in [sub:Device-Initialization-Sequence]
- with one exception: the Guest must notify the Host about its
-page size, writing the size in bytes to GuestPageSize register
-before the initialization is finished.
-
-The memory mapped virtio devices generate single interrupt only,
-therefore no special configuration is required.
-
- Virtqueue Configuration
-
-The virtual queue configuration is performed in a similar way to
-the one described in [sec:Virtqueue-Configuration] with a few
-additional operations:
-
- Select the queue writing its index (first queue is 0) to the
- QueueSel register.
-
- Check if the queue is not already in use: read QueuePFN
- register, returned value should be zero (0x0).
-
- Read maximum queue size (number of elements) from the
- QueueNumMax register. If the returned value is zero (0x0) the
- queue is not available.
-
- Allocate and zero the queue pages in contiguous virtual memory,
- aligning the Used Ring to an optimal boundary (usually page
- size). Size of the allocated queue may be smaller than or equal
- to the maximum size returned by the Host.
-
- Notify the Host about the queue size by writing the size to
- QueueNum register.
-
- Notify the Host about the used alignment by writing its value
- in bytes to QueueAlign register.
-
- Write the physical number of the first page of the queue to the
- QueuePFN register.
-
-The queue and the device are ready to begin normal operations
-now.
-
- Device Operation
-
-The memory mapped virtio device behaves in the same way as
-described in [sec:Device-Operation], with the following
-exceptions:
-
- The device is notified about new buffers available in a queue
- by writing the queue index to register QueueNum instead of the
- virtio header in PCI I/O space ([sub:Notifying-The-Device]).
-
- The memory mapped virtio device is using single, dedicated
- interrupt signal, which is raised when at least one of the
- interrupts described in the InterruptStatus register
- description is asserted. After receiving an interrupt, the
- driver must read the InterruptStatus register to check what
- caused the interrupt (see the register description). After the
- interrupt is handled, the driver must acknowledge it by writing
- a bit mask corresponding to the serviced interrupt to the
- InterruptACK register.
-
diff --git a/MAINTAINERS b/MAINTAINERS
index ee468fa..59d708f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3919,7 +3919,6 @@ F: drivers/i2c/i2c-stub.c
I2C SUBSYSTEM
M: Wolfram Sang <wsa@the-dreams.de>
-M: "Ben Dooks (embedded platforms)" <ben-linux@fluff.org>
L: linux-i2c@vger.kernel.org
W: http://i2c.wiki.kernel.org/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/wsa/linux.git
@@ -8743,6 +8742,7 @@ F: drivers/virtio/
F: drivers/net/virtio_net.c
F: drivers/block/virtio_blk.c
F: include/linux/virtio_*.h
+F: include/uapi/linux/virtio_*.h
VIRTIO HOST (VHOST)
M: "Michael S. Tsirkin" <mst@redhat.com>
diff --git a/arch/arm/mach-s3c24xx/mach-rx1950.c b/arch/arm/mach-s3c24xx/mach-rx1950.c
index 1f9ba2a..43f3ac5 100644
--- a/arch/arm/mach-s3c24xx/mach-rx1950.c
+++ b/arch/arm/mach-s3c24xx/mach-rx1950.c
@@ -56,7 +56,6 @@
#include <plat/cpu.h>
#include <plat/devs.h>
#include <plat/pm.h>
-#include <plat/regs-iic.h>
#include <plat/regs-serial.h>
#include "common.h"
diff --git a/arch/arm/mach-tegra/tegra.c b/arch/arm/mach-tegra/tegra.c
index 84deeab..9cf1ab1 100644
--- a/arch/arm/mach-tegra/tegra.c
+++ b/arch/arm/mach-tegra/tegra.c
@@ -31,8 +31,6 @@
#include <linux/pda_power.h>
#include <linux/platform_data/tegra_usb.h>
#include <linux/io.h>
-#include <linux/i2c.h>
-#include <linux/i2c-tegra.h>
#include <linux/slab.h>
#include <linux/sys_soc.h>
#include <linux/usb/tegra_usb_phy.h>
diff --git a/arch/arm/plat-samsung/devs.c b/arch/arm/plat-samsung/devs.c
index f55d3f9..8f456be 100644
--- a/arch/arm/plat-samsung/devs.c
+++ b/arch/arm/plat-samsung/devs.c
@@ -63,7 +63,6 @@
#include <linux/platform_data/usb-s3c2410_udc.h>
#include <linux/platform_data/usb-ohci-s3c2410.h>
#include <plat/usb-phy.h>
-#include <plat/regs-iic.h>
#include <plat/regs-serial.h>
#include <plat/regs-spi.h>
#include <linux/platform_data/spi-s3c64xx.h>
diff --git a/arch/arm/plat-samsung/include/plat/regs-iic.h b/arch/arm/plat-samsung/include/plat/regs-iic.h
deleted file mode 100644
index 2f7c17d..0000000
--- a/arch/arm/plat-samsung/include/plat/regs-iic.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* arch/arm/mach-s3c2410/include/mach/regs-iic.h
- *
- * Copyright (c) 2004 Simtec Electronics <linux@simtec.co.uk>
- * http://www.simtec.co.uk/products/SWLINUX/
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * S3C2410 I2C Controller
-*/
-
-#ifndef __ASM_ARCH_REGS_IIC_H
-#define __ASM_ARCH_REGS_IIC_H __FILE__
-
-/* see s3c2410x user guide, v1.1, section 9 (p447) for more info */
-
-#define S3C2410_IICREG(x) (x)
-
-#define S3C2410_IICCON S3C2410_IICREG(0x00)
-#define S3C2410_IICSTAT S3C2410_IICREG(0x04)
-#define S3C2410_IICADD S3C2410_IICREG(0x08)
-#define S3C2410_IICDS S3C2410_IICREG(0x0C)
-#define S3C2440_IICLC S3C2410_IICREG(0x10)
-
-#define S3C2410_IICCON_ACKEN (1<<7)
-#define S3C2410_IICCON_TXDIV_16 (0<<6)
-#define S3C2410_IICCON_TXDIV_512 (1<<6)
-#define S3C2410_IICCON_IRQEN (1<<5)
-#define S3C2410_IICCON_IRQPEND (1<<4)
-#define S3C2410_IICCON_SCALE(x) ((x)&15)
-#define S3C2410_IICCON_SCALEMASK (0xf)
-
-#define S3C2410_IICSTAT_MASTER_RX (2<<6)
-#define S3C2410_IICSTAT_MASTER_TX (3<<6)
-#define S3C2410_IICSTAT_SLAVE_RX (0<<6)
-#define S3C2410_IICSTAT_SLAVE_TX (1<<6)
-#define S3C2410_IICSTAT_MODEMASK (3<<6)
-
-#define S3C2410_IICSTAT_START (1<<5)
-#define S3C2410_IICSTAT_BUSBUSY (1<<5)
-#define S3C2410_IICSTAT_TXRXEN (1<<4)
-#define S3C2410_IICSTAT_ARBITR (1<<3)
-#define S3C2410_IICSTAT_ASSLAVE (1<<2)
-#define S3C2410_IICSTAT_ADDR0 (1<<1)
-#define S3C2410_IICSTAT_LASTBIT (1<<0)
-
-#define S3C2410_IICLC_SDA_DELAY0 (0 << 0)
-#define S3C2410_IICLC_SDA_DELAY5 (1 << 0)
-#define S3C2410_IICLC_SDA_DELAY10 (2 << 0)
-#define S3C2410_IICLC_SDA_DELAY15 (3 << 0)
-#define S3C2410_IICLC_SDA_DELAY_MASK (3 << 0)
-
-#define S3C2410_IICLC_FILTER_ON (1<<2)
-
-#endif /* __ASM_ARCH_REGS_IIC_H */
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 0d97deb..e2d4a4a 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -11,18 +11,11 @@
#define GUEST_PL 1
-/* Every guest maps the core switcher code. */
-#define SHARED_SWITCHER_PAGES \
- DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
-/* Pages for switcher itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids)
-
-/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */
-#ifdef CONFIG_X86_PAE
-#define SWITCHER_ADDR 0xFFE00000
-#else
-#define SWITCHER_ADDR 0xFFC00000
-#endif
+/* Page for Switcher text itself, then two pages per cpu */
+#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids)
+
+/* Where we map the Switcher, in both Host and Guest. */
+extern unsigned long switcher_addr;
/* Found in switcher.S */
extern unsigned long default_idt_entries[];
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index dabd221..03cf717 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -110,7 +110,7 @@ new_segment:
if (!sg)
sg = sglist;
else {
- sg->page_link &= ~0x02;
+ sg_unmark_end(sg);
sg = sg_next(sg);
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 936a110..5f24482 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -143,7 +143,7 @@ new_segment:
* termination bit to avoid doing a full
* sg_init_table() in drivers for each command.
*/
- (*sg)->page_link &= ~0x02;
+ sg_unmark_end(*sg);
*sg = sg_next(*sg);
}
diff --git a/drivers/Makefile b/drivers/Makefile
index 33360de..8e57688 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -124,7 +124,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/
obj-$(CONFIG_OF) += of/
obj-$(CONFIG_SSB) += ssb/
obj-$(CONFIG_BCMA) += bcma/
-obj-$(CONFIG_VHOST_NET) += vhost/
+obj-$(CONFIG_VHOST_RING) += vhost/
obj-$(CONFIG_VLYNQ) += vlynq/
obj-$(CONFIG_STAGING) += staging/
obj-y += platform/
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 8ad21a2..6472395 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -100,96 +100,103 @@ static inline struct virtblk_req *virtblk_alloc_req(struct virtio_blk *vblk,
return vbr;
}
-static void virtblk_add_buf_wait(struct virtio_blk *vblk,
- struct virtblk_req *vbr,
- unsigned long out,
- unsigned long in)
+static int __virtblk_add_req(struct virtqueue *vq,
+ struct virtblk_req *vbr,
+ struct scatterlist *data_sg,
+ bool have_data)
{
- DEFINE_WAIT(wait);
+ struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6];
+ unsigned int num_out = 0, num_in = 0;
+ int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT;
- for (;;) {
- prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
- TASK_UNINTERRUPTIBLE);
+ sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
+ sgs[num_out++] = &hdr;
- spin_lock_irq(vblk->disk->queue->queue_lock);
- if (virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
- GFP_ATOMIC) < 0) {
- spin_unlock_irq(vblk->disk->queue->queue_lock);
- io_schedule();
- } else {
- virtqueue_kick(vblk->vq);
- spin_unlock_irq(vblk->disk->queue->queue_lock);
- break;
- }
+ /*
+ * If this is a packet command we need a couple of additional headers.
+ * Behind the normal outhdr we put a segment with the scsi command
+ * block, and before the normal inhdr we put the sense data and the
+ * inhdr with additional status information.
+ */
+ if (type == VIRTIO_BLK_T_SCSI_CMD) {
+ sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len);
+ sgs[num_out++] = &cmd;
+ }
+ if (have_data) {
+ if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT)
+ sgs[num_out++] = data_sg;
+ else
+ sgs[num_out + num_in++] = data_sg;
}
- finish_wait(&vblk->queue_wait, &wait);
+ if (type == VIRTIO_BLK_T_SCSI_CMD) {
+ sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
+ sgs[num_out + num_in++] = &sense;
+ sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr));
+ sgs[num_out + num_in++] = &inhdr;
+ }
+
+ sg_init_one(&status, &vbr->status, sizeof(vbr->status));
+ sgs[num_out + num_in++] = &status;
+
+ return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
}
-static inline void virtblk_add_req(struct virtblk_req *vbr,
- unsigned int out, unsigned int in)
+static void virtblk_add_req(struct virtblk_req *vbr, bool have_data)
{
struct virtio_blk *vblk = vbr->vblk;
+ DEFINE_WAIT(wait);
+ int ret;
spin_lock_irq(vblk->disk->queue->queue_lock);
- if (unlikely(virtqueue_add_buf(vblk->vq, vbr->sg, out, in, vbr,
- GFP_ATOMIC) < 0)) {
+ while (unlikely((ret = __virtblk_add_req(vblk->vq, vbr, vbr->sg,
+ have_data)) < 0)) {
+ prepare_to_wait_exclusive(&vblk->queue_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
spin_unlock_irq(vblk->disk->queue->queue_lock);
- virtblk_add_buf_wait(vblk, vbr, out, in);
- return;
+ io_schedule();
+ spin_lock_irq(vblk->disk->queue->queue_lock);
+
+ finish_wait(&vblk->queue_wait, &wait);
}
+
virtqueue_kick(vblk->vq);
spin_unlock_irq(vblk->disk->queue->queue_lock);
}
-static int virtblk_bio_send_flush(struct virtblk_req *vbr)
+static void virtblk_bio_send_flush(struct virtblk_req *vbr)
{
- unsigned int out = 0, in = 0;
-
vbr->flags |= VBLK_IS_FLUSH;
vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
vbr->out_hdr.sector = 0;
vbr->out_hdr.ioprio = 0;
- sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
- sg_set_buf(&vbr->sg[out + in++], &vbr->status, sizeof(vbr->status));
-
- virtblk_add_req(vbr, out, in);
- return 0;
+ virtblk_add_req(vbr, false);
}
-static int virtblk_bio_send_data(struct virtblk_req *vbr)
+static void virtblk_bio_send_data(struct virtblk_req *vbr)
{
struct virtio_blk *vblk = vbr->vblk;
- unsigned int num, out = 0, in = 0;
struct bio *bio = vbr->bio;
+ bool have_data;
vbr->flags &= ~VBLK_IS_FLUSH;
vbr->out_hdr.type = 0;
vbr->out_hdr.sector = bio->bi_sector;
vbr->out_hdr.ioprio = bio_prio(bio);
- sg_set_buf(&vbr->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-
- num = blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg + out);
-
- sg_set_buf(&vbr->sg[num + out + in++], &vbr->status,
- sizeof(vbr->status));
-
- if (num) {
- if (bio->bi_rw & REQ_WRITE) {
+ if (blk_bio_map_sg(vblk->disk->queue, bio, vbr->sg)) {
+ have_data = true;
+ if (bio->bi_rw & REQ_WRITE)
vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
- out += num;
- } else {
+ else
vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
- in += num;
- }
- }
+ } else
+ have_data = false;
- virtblk_add_req(vbr, out, in);
-
- return 0;
+ virtblk_add_req(vbr, have_data);
}
static void virtblk_bio_send_data_work(struct work_struct *work)
@@ -298,7 +305,7 @@ static void virtblk_done(struct virtqueue *vq)
static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
struct request *req)
{
- unsigned long num, out = 0, in = 0;
+ unsigned int num;
struct virtblk_req *vbr;
vbr = virtblk_alloc_req(vblk, GFP_ATOMIC);
@@ -335,40 +342,15 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
}
}
- sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
-
- /*
- * If this is a packet command we need a couple of additional headers.
- * Behind the normal outhdr we put a segment with the scsi command
- * block, and before the normal inhdr we put the sense data and the
- * inhdr with additional status information before the normal inhdr.
- */
- if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC)
- sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
-
- num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
-
- if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) {
- sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE);
- sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
- sizeof(vbr->in_hdr));
- }
-
- sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
- sizeof(vbr->status));
-
+ num = blk_rq_map_sg(q, vbr->req, vblk->sg);
if (num) {
- if (rq_data_dir(vbr->req) == WRITE) {
+ if (rq_data_dir(vbr->req) == WRITE)
vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
- out += num;
- } else {
+ else
vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
- in += num;
- }
}
- if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr,
- GFP_ATOMIC) < 0) {
+ if (__virtblk_add_req(vblk->vq, vbr, vblk->sg, num) < 0) {
mempool_free(vbr, vblk->pool);
return false;
}
@@ -539,6 +521,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
struct virtio_device *vdev = vblk->vdev;
struct request_queue *q = vblk->disk->queue;
char cap_str_2[10], cap_str_10[10];
+ char *envp[] = { "RESIZE=1", NULL };
u64 capacity, size;
mutex_lock(&vblk->config_lock);
@@ -568,6 +551,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
set_capacity(vblk->disk, capacity);
revalidate_disk(vblk->disk);
+ kobject_uevent_env(&disk_to_dev(vblk->disk)->kobj, KOBJ_CHANGE, envp);
done:
mutex_unlock(&vblk->config_lock);
}
diff --git a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
index 6bf4d47..ef46a9c 100644
--- a/drivers/char/hw_random/virtio-rng.c
+++ b/drivers/char/hw_random/virtio-rng.c
@@ -47,7 +47,7 @@ static void register_buffer(u8 *buf, size_t size)
sg_init_one(&sg, buf, size);
/* There should always be room for one buffer. */
- if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0)
+ if (virtqueue_add_inbuf(vq, &sg, 1, buf, GFP_KERNEL) < 0)
BUG();
virtqueue_kick(vq);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index ce5f3fc..1b456fe 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -78,8 +78,8 @@ struct ports_driver_data {
};
static struct ports_driver_data pdrvdata;
-DEFINE_SPINLOCK(pdrvdata_lock);
-DECLARE_COMPLETION(early_console_added);
+static DEFINE_SPINLOCK(pdrvdata_lock);
+static DECLARE_COMPLETION(early_console_added);
/* This struct holds information that's relevant only for console ports */
struct console {
@@ -503,7 +503,7 @@ static int add_inbuf(struct virtqueue *vq, struct port_buffer *buf)
sg_init_one(sg, buf->buf, buf->size);
- ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC);
+ ret = virtqueue_add_inbuf(vq, sg, 1, buf, GFP_ATOMIC);
virtqueue_kick(vq);
if (!ret)
ret = vq->num_free;
@@ -572,7 +572,7 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
sg_init_one(sg, &cpkt, sizeof(cpkt));
spin_lock(&portdev->c_ovq_lock);
- if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) == 0) {
+ if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) {
virtqueue_kick(vq);
while (!virtqueue_get_buf(vq, &len))
cpu_relax();
@@ -622,7 +622,7 @@ static ssize_t __send_to_port(struct port *port, struct scatterlist *sg,
reclaim_consumed_buffers(port);
- err = virtqueue_add_buf(out_vq, sg, nents, 0, data, GFP_ATOMIC);
+ err = virtqueue_add_outbuf(out_vq, sg, nents, data, GFP_ATOMIC);
/* Tell Host to go! */
virtqueue_kick(out_vq);
@@ -1040,7 +1040,7 @@ static int port_fops_open(struct inode *inode, struct file *filp)
spin_lock_irq(&port->inbuf_lock);
if (port->guest_connected) {
spin_unlock_irq(&port->inbuf_lock);
- ret = -EMFILE;
+ ret = -EBUSY;
goto out;
}
@@ -1202,7 +1202,7 @@ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int))
return hvc_instantiate(0, 0, &hv_ops);
}
-int init_port_console(struct port *port)
+static int init_port_console(struct port *port)
{
int ret;
diff --git a/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c b/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c
index 88627e3..1eb86c7 100644
--- a/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c
+++ b/drivers/gpu/drm/gma500/oaktrail_hdmi_i2c.c
@@ -319,8 +319,7 @@ void oaktrail_hdmi_i2c_exit(struct pci_dev *dev)
struct hdmi_i2c_dev *i2c_dev;
hdmi_dev = pci_get_drvdata(dev);
- if (i2c_del_adapter(&oaktrail_hdmi_i2c_adapter))
- DRM_DEBUG_DRIVER("Failed to delete hdmi-i2c adapter\n");
+ i2c_del_adapter(&oaktrail_hdmi_i2c_adapter);
i2c_dev = hdmi_dev->i2c_dev;
kfree(i2c_dev);
diff --git a/drivers/i2c/busses/i2c-amd756-s4882.c b/drivers/i2c/busses/i2c-amd756-s4882.c
index 378fcb5..07f01ac 100644
--- a/drivers/i2c/busses/i2c-amd756-s4882.c
+++ b/drivers/i2c/busses/i2c-amd756-s4882.c
@@ -169,11 +169,7 @@ static int __init amd756_s4882_init(void)
}
/* Unregister physical bus */
- error = i2c_del_adapter(&amd756_smbus);
- if (error) {
- dev_err(&amd756_smbus.dev, "Physical bus removal failed\n");
- goto ERROR0;
- }
+ i2c_del_adapter(&amd756_smbus);
printk(KERN_INFO "Enabling SMBus multiplexing for Tyan S4882\n");
/* Define the 5 virtual adapters and algorithms structures */
diff --git a/drivers/i2c/busses/i2c-at91.c b/drivers/i2c/busses/i2c-at91.c
index 75195e3..6bb839b 100644
--- a/drivers/i2c/busses/i2c-at91.c
+++ b/drivers/i2c/busses/i2c-at91.c
@@ -603,15 +603,18 @@ static const struct of_device_id atmel_twi_dt_ids[] = {
}
};
MODULE_DEVICE_TABLE(of, atmel_twi_dt_ids);
-#else
-#define atmel_twi_dt_ids NULL
#endif
-static bool filter(struct dma_chan *chan, void *slave)
+static bool filter(struct dma_chan *chan, void *pdata)
{
- struct at_dma_slave *sl = slave;
+ struct at91_twi_pdata *sl_pdata = pdata;
+ struct at_dma_slave *sl;
+
+ if (!sl_pdata)
+ return false;
- if (sl->dma_dev == chan->device->dev) {
+ sl = &sl_pdata->dma_slave;
+ if (sl && (sl->dma_dev == chan->device->dev)) {
chan->private = sl;
return true;
} else {
@@ -622,11 +625,10 @@ static bool filter(struct dma_chan *chan, void *slave)
static int at91_twi_configure_dma(struct at91_twi_dev *dev, u32 phy_addr)
{
int ret = 0;
- struct at_dma_slave *sdata;
+ struct at91_twi_pdata *pdata = dev->pdata;
struct dma_slave_config slave_config;
struct at91_twi_dma *dma = &dev->dma;
-
- sdata = &dev->pdata->dma_slave;
+ dma_cap_mask_t mask;
memset(&slave_config, 0, sizeof(slave_config));
slave_config.src_addr = (dma_addr_t)phy_addr + AT91_TWI_RHR;
@@ -637,25 +639,22 @@ static int at91_twi_configure_dma(struct at91_twi_dev *dev, u32 phy_addr)
slave_config.dst_maxburst = 1;
slave_config.device_fc = false;
- if (sdata && sdata->dma_dev) {
- dma_cap_mask_t mask;
+ dma_cap_zero(mask);
+ dma_cap_set(DMA_SLAVE, mask);
- dma_cap_zero(mask);
- dma_cap_set(DMA_SLAVE, mask);
- dma->chan_tx = dma_request_channel(mask, filter, sdata);
- if (!dma->chan_tx) {
- dev_err(dev->dev, "no DMA channel available for tx\n");
- ret = -EBUSY;
- goto error;
- }
- dma->chan_rx = dma_request_channel(mask, filter, sdata);
- if (!dma->chan_rx) {
- dev_err(dev->dev, "no DMA channel available for rx\n");
- ret = -EBUSY;
- goto error;
- }
- } else {
- ret = -EINVAL;
+ dma->chan_tx = dma_request_slave_channel_compat(mask, filter, pdata,
+ dev->dev, "tx");
+ if (!dma->chan_tx) {
+ dev_err(dev->dev, "can't get a DMA channel for tx\n");
+ ret = -EBUSY;
+ goto error;
+ }
+
+ dma->chan_rx = dma_request_slave_channel_compat(mask, filter, pdata,
+ dev->dev, "rx");
+ if (!dma->chan_rx) {
+ dev_err(dev->dev, "can't get a DMA channel for rx\n");
+ ret = -EBUSY;
goto error;
}
@@ -785,12 +784,11 @@ static int at91_twi_probe(struct platform_device *pdev)
static int at91_twi_remove(struct platform_device *pdev)
{
struct at91_twi_dev *dev = platform_get_drvdata(pdev);
- int rc;
- rc = i2c_del_adapter(&dev->adapter);
+ i2c_del_adapter(&dev->adapter);
clk_disable_unprepare(dev->clk);
- return rc;
+ return 0;
}
#ifdef CONFIG_PM
@@ -828,7 +826,7 @@ static struct platform_driver at91_twi_driver = {
.driver = {
.name = "at91_i2c",
.owner = THIS_MODULE,
- .of_match_table = atmel_twi_dt_ids,
+ .of_match_table = of_match_ptr(atmel_twi_dt_ids),
.pm = at91_twi_pm_ops,
},
};
diff --git a/drivers/i2c/busses/i2c-cbus-gpio.c b/drivers/i2c/busses/i2c-cbus-gpio.c
index 98386d6..1be13ac 100644
--- a/drivers/i2c/busses/i2c-cbus-gpio.c
+++ b/drivers/i2c/busses/i2c-cbus-gpio.c
@@ -206,7 +206,9 @@ static int cbus_i2c_remove(struct platform_device *pdev)
{
struct i2c_adapter *adapter = platform_get_drvdata(pdev);
- return i2c_del_adapter(adapter);
+ i2c_del_adapter(adapter);
+
+ return 0;
}
static int cbus_i2c_probe(struct platform_device *pdev)
diff --git a/drivers/i2c/busses/i2c-davinci.c b/drivers/i2c/busses/i2c-davinci.c
index 7d1e590..cf20e06 100644
--- a/drivers/i2c/busses/i2c-davinci.c
+++ b/drivers/i2c/busses/i2c-davinci.c
@@ -137,7 +137,7 @@ static inline u16 davinci_i2c_read_reg(struct davinci_i2c_dev *i2c_dev, int reg)
}
/* Generate a pulse on the i2c clock pin. */
-static void generic_i2c_clock_pulse(unsigned int scl_pin)
+static void davinci_i2c_clock_pulse(unsigned int scl_pin)
{
u16 i;
@@ -155,7 +155,7 @@ static void generic_i2c_clock_pulse(unsigned int scl_pin)
/* This routine does i2c bus recovery as specified in the
* i2c protocol Rev. 03 section 3.16 titled "Bus clear"
*/
-static void i2c_recover_bus(struct davinci_i2c_dev *dev)
+static void davinci_i2c_recover_bus(struct davinci_i2c_dev *dev)
{
u32 flag = 0;
struct davinci_i2c_platform_data *pdata = dev->pdata;
@@ -166,7 +166,7 @@ static void i2c_recover_bus(struct davinci_i2c_dev *dev)
flag |= DAVINCI_I2C_MDR_NACK;
/* write the data into mode register */
davinci_i2c_write_reg(dev, DAVINCI_I2C_MDR_REG, flag);
- generic_i2c_clock_pulse(pdata->scl_pin);
+ davinci_i2c_clock_pulse(pdata->scl_pin);
/* Send STOP */
flag = davinci_i2c_read_reg(dev, DAVINCI_I2C_MDR_REG);
flag |= DAVINCI_I2C_MDR_STP;
@@ -289,7 +289,7 @@ static int i2c_davinci_wait_bus_not_busy(struct davinci_i2c_dev *dev,
return -ETIMEDOUT;
} else {
to_cnt = 0;
- i2c_recover_bus(dev);
+ davinci_i2c_recover_bus(dev);
i2c_davinci_init(dev);
}
}
@@ -379,7 +379,7 @@ i2c_davinci_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg, int stop)
dev->adapter.timeout);
if (r == 0) {
dev_err(dev->dev, "controller timed out\n");
- i2c_recover_bus(dev);
+ davinci_i2c_recover_bus(dev);
i2c_davinci_init(dev);
dev->buf_len = 0;
return -ETIMEDOUT;
@@ -643,7 +643,7 @@ static int davinci_i2c_probe(struct platform_device *pdev)
{
struct davinci_i2c_dev *dev;
struct i2c_adapter *adap;
- struct resource *mem, *irq, *ioarea;
+ struct resource *mem, *irq;
int r;
/* NOTE: driver uses the static register mapping */
@@ -659,24 +659,18 @@ static int davinci_i2c_probe(struct platform_device *pdev)
return -ENODEV;
}
- ioarea = request_mem_region(mem->start, resource_size(mem),
- pdev->name);
- if (!ioarea) {
- dev_err(&pdev->dev, "I2C region already claimed\n");
- return -EBUSY;
- }
-
- dev = kzalloc(sizeof(struct davinci_i2c_dev), GFP_KERNEL);
+ dev = devm_kzalloc(&pdev->dev, sizeof(struct davinci_i2c_dev),
+ GFP_KERNEL);
if (!dev) {
- r = -ENOMEM;
- goto err_release_region;
+ dev_err(&pdev->dev, "Memory allocation failed\n");
+ return -ENOMEM;
}
init_completion(&dev->cmd_complete);
#ifdef CONFIG_CPU_FREQ
init_completion(&dev->xfr_complete);
#endif
- dev->dev = get_device(&pdev->dev);
+ dev->dev = &pdev->dev;
dev->irq = irq->start;
dev->pdata = dev->dev->platform_data;
platform_set_drvdata(pdev, dev);
@@ -686,10 +680,9 @@ static int davinci_i2c_probe(struct platform_device *pdev)
dev->pdata = devm_kzalloc(&pdev->dev,
sizeof(struct davinci_i2c_platform_data), GFP_KERNEL);
- if (!dev->pdata) {
- r = -ENOMEM;
- goto err_free_mem;
- }
+ if (!dev->pdata)
+ return -ENOMEM;
+
memcpy(dev->pdata, &davinci_i2c_platform_data_default,
sizeof(struct davinci_i2c_platform_data));
if (!of_property_read_u32(pdev->dev.of_node, "clock-frequency",
@@ -699,22 +692,21 @@ static int davinci_i2c_probe(struct platform_device *pdev)
dev->pdata = &davinci_i2c_platform_data_default;
}
- dev->clk = clk_get(&pdev->dev, NULL);
- if (IS_ERR(dev->clk)) {
- r = -ENODEV;
- goto err_free_mem;
- }
+ dev->clk = devm_clk_get(&pdev->dev, NULL);
+ if (IS_ERR(dev->clk))
+ return -ENODEV;
clk_prepare_enable(dev->clk);
- dev->base = ioremap(mem->start, resource_size(mem));
- if (!dev->base) {
- r = -EBUSY;
- goto err_mem_ioremap;
+ dev->base = devm_ioremap_resource(&pdev->dev, mem);
+ if (IS_ERR(dev->base)) {
+ r = PTR_ERR(dev->base);
+ goto err_unuse_clocks;
}
i2c_davinci_init(dev);
- r = request_irq(dev->irq, i2c_davinci_isr, 0, pdev->name, dev);
+ r = devm_request_irq(&pdev->dev, dev->irq, i2c_davinci_isr, 0,
+ pdev->name, dev);
if (r) {
dev_err(&pdev->dev, "failure requesting irq %i\n", dev->irq);
goto err_unuse_clocks;
@@ -723,7 +715,7 @@ static int davinci_i2c_probe(struct platform_device *pdev)
r = i2c_davinci_cpufreq_register(dev);
if (r) {
dev_err(&pdev->dev, "failed to register cpufreq\n");
- goto err_free_irq;
+ goto err_unuse_clocks;
}
adap = &dev->adapter;
@@ -740,50 +732,31 @@ static int davinci_i2c_probe(struct platform_device *pdev)
r = i2c_add_numbered_adapter(adap);
if (r) {
dev_err(&pdev->dev, "failure adding adapter\n");
- goto err_free_irq;
+ goto err_unuse_clocks;
}
of_i2c_register_devices(adap);
return 0;
-err_free_irq:
- free_irq(dev->irq, dev);
err_unuse_clocks:
- iounmap(dev->base);
-err_mem_ioremap:
clk_disable_unprepare(dev->clk);
- clk_put(dev->clk);
dev->clk = NULL;
-err_free_mem:
- put_device(&pdev->dev);
- kfree(dev);
-err_release_region:
- release_mem_region(mem->start, resource_size(mem));
-
return r;
}
static int davinci_i2c_remove(struct platform_device *pdev)
{
struct davinci_i2c_dev *dev = platform_get_drvdata(pdev);
- struct resource *mem;
i2c_davinci_cpufreq_deregister(dev);
i2c_del_adapter(&dev->adapter);
- put_device(&pdev->dev);
clk_disable_unprepare(dev->clk);
- clk_put(dev->clk);
dev->clk = NULL;
davinci_i2c_write_reg(dev, DAVINCI_I2C_MDR_REG, 0);
- free_irq(dev->irq, dev);
- iounmap(dev->base);
- kfree(dev);
- mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- release_mem_region(mem->start, resource_size(mem));
return 0;
}
diff --git a/drivers/i2c/busses/i2c-designware-core.c b/drivers/i2c/busses/i2c-designware-core.c
index 94fd818..21fbb34 100644
--- a/drivers/i2c/busses/i2c-designware-core.c
+++ b/drivers/i2c/busses/i2c-designware-core.c
@@ -68,6 +68,7 @@
#define DW_IC_TXFLR 0x74
#define DW_IC_RXFLR 0x78
#define DW_IC_TX_ABRT_SOURCE 0x80
+#define DW_IC_ENABLE_STATUS 0x9c
#define DW_IC_COMP_PARAM_1 0xf4
#define DW_IC_COMP_TYPE 0xfc
#define DW_IC_COMP_TYPE_VALUE 0x44570140
@@ -248,6 +249,27 @@ static u32 i2c_dw_scl_lcnt(u32 ic_clk, u32 tLOW, u32 tf, int offset)
return ((ic_clk * (tLOW + tf) + 5000) / 10000) - 1 + offset;
}
+static void __i2c_dw_enable(struct dw_i2c_dev *dev, bool enable)
+{
+ int timeout = 100;
+
+ do {
+ dw_writel(dev, enable, DW_IC_ENABLE);
+ if ((dw_readl(dev, DW_IC_ENABLE_STATUS) & 1) == enable)
+ return;
+
+ /*
+ * Wait 10 times the signaling period of the highest I2C
+ * transfer supported by the driver (for 400KHz this is
+ * 25us) as described in the DesignWare I2C databook.
+ */
+ usleep_range(25, 250);
+ } while (timeout--);
+
+ dev_warn(dev->dev, "timeout in %sabling adapter\n",
+ enable ? "en" : "dis");
+}
+
/**
* i2c_dw_init() - initialize the designware i2c master hardware
* @dev: device private data
@@ -278,7 +300,7 @@ int i2c_dw_init(struct dw_i2c_dev *dev)
}
/* Disable the adapter */
- dw_writel(dev, 0, DW_IC_ENABLE);
+ __i2c_dw_enable(dev, false);
/* set standard and fast speed deviders for high/low periods */
@@ -333,7 +355,7 @@ static int i2c_dw_wait_bus_not_busy(struct dw_i2c_dev *dev)
return -ETIMEDOUT;
}
timeout--;
- mdelay(1);
+ usleep_range(1000, 1100);
}
return 0;
@@ -345,7 +367,7 @@ static void i2c_dw_xfer_init(struct dw_i2c_dev *dev)
u32 ic_con;
/* Disable the adapter */
- dw_writel(dev, 0, DW_IC_ENABLE);
+ __i2c_dw_enable(dev, false);
/* set the slave (target) address */
dw_writel(dev, msgs[dev->msg_write_idx].addr, DW_IC_TAR);
@@ -359,7 +381,7 @@ static void i2c_dw_xfer_init(struct dw_i2c_dev *dev)
dw_writel(dev, ic_con, DW_IC_CON);
/* Enable the adapter */
- dw_writel(dev, 1, DW_IC_ENABLE);
+ __i2c_dw_enable(dev, true);
/* Enable interrupts */
dw_writel(dev, DW_IC_INTR_DEFAULT_MASK, DW_IC_INTR_MASK);
@@ -565,7 +587,7 @@ i2c_dw_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num)
/* no error */
if (likely(!dev->cmd_err)) {
/* Disable the adapter */
- dw_writel(dev, 0, DW_IC_ENABLE);
+ __i2c_dw_enable(dev, false);
ret = num;
goto done;
}
@@ -578,7 +600,8 @@ i2c_dw_xfer(struct i2c_adapter *adap, struct i2c_msg msgs[], int num)
ret = -EIO;
done:
- pm_runtime_put(dev->dev);
+ pm_runtime_mark_last_busy(dev->dev);
+ pm_runtime_put_autosuspend(dev->dev);
mutex_unlock(&dev->lock);
return ret;
@@ -700,7 +723,7 @@ EXPORT_SYMBOL_GPL(i2c_dw_isr);
void i2c_dw_enable(struct dw_i2c_dev *dev)
{
/* Enable the adapter */
- dw_writel(dev, 1, DW_IC_ENABLE);
+ __i2c_dw_enable(dev, true);
}
EXPORT_SYMBOL_GPL(i2c_dw_enable);
@@ -713,7 +736,7 @@ EXPORT_SYMBOL_GPL(i2c_dw_is_enabled);
void i2c_dw_disable(struct dw_i2c_dev *dev)
{
/* Disable controller */
- dw_writel(dev, 0, DW_IC_ENABLE);
+ __i2c_dw_enable(dev, false);
/* Disable all interupts */
dw_writel(dev, 0, DW_IC_INTR_MASK);
diff --git a/drivers/i2c/busses/i2c-designware-pcidrv.c b/drivers/i2c/busses/i2c-designware-pcidrv.c
index 7c5e383..f6ed06c 100644
--- a/drivers/i2c/busses/i2c-designware-pcidrv.c
+++ b/drivers/i2c/busses/i2c-designware-pcidrv.c
@@ -208,68 +208,45 @@ static u32 i2c_dw_get_clk_rate_khz(struct dw_i2c_dev *dev)
}
static int i2c_dw_pci_probe(struct pci_dev *pdev,
-const struct pci_device_id *id)
+ const struct pci_device_id *id)
{
struct dw_i2c_dev *dev;
struct i2c_adapter *adap;
- unsigned long start, len;
- void __iomem *base;
int r;
struct dw_pci_controller *controller;
if (id->driver_data >= ARRAY_SIZE(dw_pci_controllers)) {
- printk(KERN_ERR "dw_i2c_pci_probe: invalid driver data %ld\n",
+ dev_err(&pdev->dev, "%s: invalid driver data %ld\n", __func__,
id->driver_data);
return -EINVAL;
}
controller = &dw_pci_controllers[id->driver_data];
- r = pci_enable_device(pdev);
+ r = pcim_enable_device(pdev);
if (r) {
dev_err(&pdev->dev, "Failed to enable I2C PCI device (%d)\n",
r);
- goto exit;
+ return r;
}
- /* Determine the address of the I2C area */
- start = pci_resource_start(pdev, 0);
- len = pci_resource_len(pdev, 0);
- if (!start || len == 0) {
- dev_err(&pdev->dev, "base address not set\n");
- r = -ENODEV;
- goto exit;
- }
-
- r = pci_request_region(pdev, 0, DRIVER_NAME);
+ r = pcim_iomap_regions(pdev, 1 << 0, pci_name(pdev));
if (r) {
- dev_err(&pdev->dev, "failed to request I2C region "
- "0x%lx-0x%lx\n", start,
- (unsigned long)pci_resource_end(pdev, 0));
- goto exit;
- }
-
- base = ioremap_nocache(start, len);
- if (!base) {
dev_err(&pdev->dev, "I/O memory remapping failed\n");
- r = -ENOMEM;
- goto err_release_region;
+ return r;
}
-
- dev = kzalloc(sizeof(struct dw_i2c_dev), GFP_KERNEL);
- if (!dev) {
- r = -ENOMEM;
- goto err_release_region;
- }
+ dev = devm_kzalloc(&pdev->dev, sizeof(struct dw_i2c_dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
init_completion(&dev->cmd_complete);
mutex_init(&dev->lock);
dev->clk = NULL;
dev->controller = controller;
dev->get_clk_rate_khz = i2c_dw_get_clk_rate_khz;
- dev->base = base;
- dev->dev = get_device(&pdev->dev);
+ dev->base = pcim_iomap_table(pdev)[0];
+ dev->dev = &pdev->dev;
dev->functionality =
I2C_FUNC_I2C |
I2C_FUNC_SMBUS_BYTE |
@@ -284,7 +261,7 @@ const struct pci_device_id *id)
dev->rx_fifo_depth = controller->rx_fifo_depth;
r = i2c_dw_init(dev);
if (r)
- goto err_iounmap;
+ return r;
adap = &dev->adapter;
i2c_set_adapdata(adap, dev);
@@ -296,10 +273,11 @@ const struct pci_device_id *id)
snprintf(adap->name, sizeof(adap->name), "i2c-designware-pci-%d",
adap->nr);
- r = request_irq(pdev->irq, i2c_dw_isr, IRQF_SHARED, adap->name, dev);
+ r = devm_request_irq(&pdev->dev, pdev->irq, i2c_dw_isr, IRQF_SHARED,
+ adap->name, dev);
if (r) {
dev_err(&pdev->dev, "failure requesting irq %i\n", dev->irq);
- goto err_iounmap;
+ return r;
}
i2c_dw_disable_int(dev);
@@ -307,24 +285,14 @@ const struct pci_device_id *id)
r = i2c_add_numbered_adapter(adap);
if (r) {
dev_err(&pdev->dev, "failure adding adapter\n");
- goto err_free_irq;
+ return r;
}
- pm_runtime_put_noidle(&pdev->dev);
+ pm_runtime_set_autosuspend_delay(&pdev->dev, 1000);
+ pm_runtime_use_autosuspend(&pdev->dev);
pm_runtime_allow(&pdev->dev);
return 0;
-
-err_free_irq:
- free_irq(pdev->irq, dev);
-err_iounmap:
- iounmap(dev->base);
- put_device(&pdev->dev);
- kfree(dev);
-err_release_region:
- pci_release_region(pdev, 0);
-exit:
- return r;
}
static void i2c_dw_pci_remove(struct pci_dev *pdev)
@@ -336,11 +304,6 @@ static void i2c_dw_pci_remove(struct pci_dev *pdev)
pm_runtime_get_noresume(&pdev->dev);
i2c_del_adapter(&dev->adapter);
- put_device(&pdev->dev);
-
- free_irq(dev->irq, dev);
- kfree(dev);
- pci_release_region(pdev, 0);
}
/* work with hotplug and coldplug */
diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c
index e3085c4..8ec9133 100644
--- a/drivers/i2c/busses/i2c-designware-platdrv.c
+++ b/drivers/i2c/busses/i2c-designware-platdrv.c
@@ -56,20 +56,11 @@ static u32 i2c_dw_get_clk_rate_khz(struct dw_i2c_dev *dev)
static int dw_i2c_acpi_configure(struct platform_device *pdev)
{
struct dw_i2c_dev *dev = platform_get_drvdata(pdev);
- struct acpi_device *adev;
- int busno, ret;
if (!ACPI_HANDLE(&pdev->dev))
return -ENODEV;
- ret = acpi_bus_get_device(ACPI_HANDLE(&pdev->dev), &adev);
- if (ret)
- return -ENODEV;
-
dev->adapter.nr = -1;
- if (adev->pnp.unique_id && !kstrtoint(adev->pnp.unique_id, 0, &busno))
- dev->adapter.nr = busno;
-
dev->tx_fifo_depth = 32;
dev->rx_fifo_depth = 32;
return 0;
@@ -92,7 +83,7 @@ static int dw_i2c_probe(struct platform_device *pdev)
{
struct dw_i2c_dev *dev;
struct i2c_adapter *adap;
- struct resource *mem, *ioarea;
+ struct resource *mem;
int irq, r;
/* NOTE: driver uses the static register mapping */
@@ -108,32 +99,25 @@ static int dw_i2c_probe(struct platform_device *pdev)
return irq; /* -ENXIO */
}
- ioarea = request_mem_region(mem->start, resource_size(mem),
- pdev->name);
- if (!ioarea) {
- dev_err(&pdev->dev, "I2C region already claimed\n");
- return -EBUSY;
- }
+ dev = devm_kzalloc(&pdev->dev, sizeof(struct dw_i2c_dev), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
- dev = kzalloc(sizeof(struct dw_i2c_dev), GFP_KERNEL);
- if (!dev) {
- r = -ENOMEM;
- goto err_release_region;
- }
+ dev->base = devm_ioremap_resource(&pdev->dev, mem);
+ if (IS_ERR(dev->base))
+ return PTR_ERR(dev->base);
init_completion(&dev->cmd_complete);
mutex_init(&dev->lock);
- dev->dev = get_device(&pdev->dev);
+ dev->dev = &pdev->dev;
dev->irq = irq;
platform_set_drvdata(pdev, dev);
- dev->clk = clk_get(&pdev->dev, NULL);
+ dev->clk = devm_clk_get(&pdev->dev, NULL);
dev->get_clk_rate_khz = i2c_dw_get_clk_rate_khz;
- if (IS_ERR(dev->clk)) {
- r = -ENODEV;
- goto err_free_mem;
- }
+ if (IS_ERR(dev->clk))
+ return PTR_ERR(dev->clk);
clk_prepare_enable(dev->clk);
dev->functionality =
@@ -146,13 +130,6 @@ static int dw_i2c_probe(struct platform_device *pdev)
dev->master_cfg = DW_IC_CON_MASTER | DW_IC_CON_SLAVE_DISABLE |
DW_IC_CON_RESTART_EN | DW_IC_CON_SPEED_FAST;
- dev->base = ioremap(mem->start, resource_size(mem));
- if (dev->base == NULL) {
- dev_err(&pdev->dev, "failure mapping io resources\n");
- r = -EBUSY;
- goto err_unuse_clocks;
- }
-
/* Try first if we can configure the device from ACPI */
r = dw_i2c_acpi_configure(pdev);
if (r) {
@@ -164,13 +141,14 @@ static int dw_i2c_probe(struct platform_device *pdev)
}
r = i2c_dw_init(dev);
if (r)
- goto err_iounmap;
+ return r;
i2c_dw_disable_int(dev);
- r = request_irq(dev->irq, i2c_dw_isr, IRQF_SHARED, pdev->name, dev);
+ r = devm_request_irq(&pdev->dev, dev->irq, i2c_dw_isr, IRQF_SHARED,
+ pdev->name, dev);
if (r) {
dev_err(&pdev->dev, "failure requesting irq %i\n", dev->irq);
- goto err_iounmap;
+ return r;
}
adap = &dev->adapter;
@@ -186,57 +164,32 @@ static int dw_i2c_probe(struct platform_device *pdev)
r = i2c_add_numbered_adapter(adap);
if (r) {
dev_err(&pdev->dev, "failure adding adapter\n");
- goto err_free_irq;
+ return r;
}
of_i2c_register_devices(adap);
acpi_i2c_register_devices(adap);
+ pm_runtime_set_autosuspend_delay(&pdev->dev, 1000);
+ pm_runtime_use_autosuspend(&pdev->dev);
pm_runtime_set_active(&pdev->dev);
pm_runtime_enable(&pdev->dev);
- pm_runtime_put(&pdev->dev);
return 0;
-
-err_free_irq:
- free_irq(dev->irq, dev);
-err_iounmap:
- iounmap(dev->base);
-err_unuse_clocks:
- clk_disable_unprepare(dev->clk);
- clk_put(dev->clk);
- dev->clk = NULL;
-err_free_mem:
- put_device(&pdev->dev);
- kfree(dev);
-err_release_region:
- release_mem_region(mem->start, resource_size(mem));
-
- return r;
}
static int dw_i2c_remove(struct platform_device *pdev)
{
struct dw_i2c_dev *dev = platform_get_drvdata(pdev);
- struct resource *mem;
pm_runtime_get_sync(&pdev->dev);
i2c_del_adapter(&dev->adapter);
- put_device(&pdev->dev);
-
- clk_disable_unprepare(dev->clk);
- clk_put(dev->clk);
- dev->clk = NULL;
i2c_dw_disable(dev);
- free_irq(dev->irq, dev);
- kfree(dev);
pm_runtime_put(&pdev->dev);
pm_runtime_disable(&pdev->dev);
- mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
- release_mem_region(mem->start, resource_size(mem));
return 0;
}
diff --git a/drivers/i2c/busses/i2c-gpio.c b/drivers/i2c/busses/i2c-gpio.c
index f3fa433..bc6e139 100644
--- a/drivers/i2c/busses/i2c-gpio.c
+++ b/drivers/i2c/busses/i2c-gpio.c
@@ -85,23 +85,29 @@ static int i2c_gpio_getscl(void *data)
return gpio_get_value(pdata->scl_pin);
}
-static int of_i2c_gpio_probe(struct device_node *np,
- struct i2c_gpio_platform_data *pdata)
+static int of_i2c_gpio_get_pins(struct device_node *np,
+ unsigned int *sda_pin, unsigned int *scl_pin)
{
- u32 reg;
-
if (of_gpio_count(np) < 2)
return -ENODEV;
- pdata->sda_pin = of_get_gpio(np, 0);
- pdata->scl_pin = of_get_gpio(np, 1);
+ *sda_pin = of_get_gpio(np, 0);
+ *scl_pin = of_get_gpio(np, 1);
- if (!gpio_is_valid(pdata->sda_pin) || !gpio_is_valid(pdata->scl_pin)) {
+ if (!gpio_is_valid(*sda_pin) || !gpio_is_valid(*scl_pin)) {
pr_err("%s: invalid GPIO pins, sda=%d/scl=%d\n",
- np->full_name, pdata->sda_pin, pdata->scl_pin);
+ np->full_name, *sda_pin, *scl_pin);
return -ENODEV;
}
+ return 0;
+}
+
+static void of_i2c_gpio_get_props(struct device_node *np,
+ struct i2c_gpio_platform_data *pdata)
+{
+ u32 reg;
+
of_property_read_u32(np, "i2c-gpio,delay-us", &pdata->udelay);
if (!of_property_read_u32(np, "i2c-gpio,timeout-ms", &reg))
@@ -113,8 +119,6 @@ static int of_i2c_gpio_probe(struct device_node *np,
of_property_read_bool(np, "i2c-gpio,scl-open-drain");
pdata->scl_is_output_only =
of_property_read_bool(np, "i2c-gpio,scl-output-only");
-
- return 0;
}
static int i2c_gpio_probe(struct platform_device *pdev)
@@ -123,31 +127,52 @@ static int i2c_gpio_probe(struct platform_device *pdev)
struct i2c_gpio_platform_data *pdata;
struct i2c_algo_bit_data *bit_data;
struct i2c_adapter *adap;
+ unsigned int sda_pin, scl_pin;
int ret;
- priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
- if (!priv)
- return -ENOMEM;
- adap = &priv->adap;
- bit_data = &priv->bit_data;
- pdata = &priv->pdata;
-
+ /* First get the GPIO pins; if it fails, we'll defer the probe. */
if (pdev->dev.of_node) {
- ret = of_i2c_gpio_probe(pdev->dev.of_node, pdata);
+ ret = of_i2c_gpio_get_pins(pdev->dev.of_node,
+ &sda_pin, &scl_pin);
if (ret)
return ret;
} else {
if (!pdev->dev.platform_data)
return -ENXIO;
- memcpy(pdata, pdev->dev.platform_data, sizeof(*pdata));
+ pdata = pdev->dev.platform_data;
+ sda_pin = pdata->sda_pin;
+ scl_pin = pdata->scl_pin;
}
- ret = gpio_request(pdata->sda_pin, "sda");
- if (ret)
+ ret = gpio_request(sda_pin, "sda");
+ if (ret) {
+ if (ret == -EINVAL)
+ ret = -EPROBE_DEFER; /* Try again later */
goto err_request_sda;
- ret = gpio_request(pdata->scl_pin, "scl");
- if (ret)
+ }
+ ret = gpio_request(scl_pin, "scl");
+ if (ret) {
+ if (ret == -EINVAL)
+ ret = -EPROBE_DEFER; /* Try again later */
goto err_request_scl;
+ }
+
+ priv = devm_kzalloc(&pdev->dev, sizeof(*priv), GFP_KERNEL);
+ if (!priv) {
+ ret = -ENOMEM;
+ goto err_add_bus;
+ }
+ adap = &priv->adap;
+ bit_data = &priv->bit_data;
+ pdata = &priv->pdata;
+
+ if (pdev->dev.of_node) {
+ pdata->sda_pin = sda_pin;
+ pdata->scl_pin = scl_pin;
+ of_i2c_gpio_get_props(pdev->dev.of_node, pdata);
+ } else {
+ memcpy(pdata, pdev->dev.platform_data, sizeof(*pdata));
+ }
if (pdata->sda_is_open_drain) {
gpio_direction_output(pdata->sda_pin, 1);
@@ -211,9 +236,9 @@ static int i2c_gpio_probe(struct platform_device *pdev)
return 0;
err_add_bus:
- gpio_free(pdata->scl_pin);
+ gpio_free(scl_pin);
err_request_scl:
- gpio_free(pdata->sda_pin);
+ gpio_free(sda_pin);
err_request_sda:
return ret;
}
diff --git a/drivers/i2c/busses/i2c-intel-mid.c b/drivers/i2c/busses/i2c-intel-mid.c
index 323fa01..0fb6597 100644
--- a/drivers/i2c/busses/i2c-intel-mid.c
+++ b/drivers/i2c/busses/i2c-intel-mid.c
@@ -1082,8 +1082,7 @@ static void intel_mid_i2c_remove(struct pci_dev *dev)
{
struct intel_mid_i2c_private *mrst = pci_get_drvdata(dev);
intel_mid_i2c_disable(&mrst->adap);
- if (i2c_del_adapter(&mrst->adap))
- dev_err(&dev->dev, "Failed to delete i2c adapter");
+ i2c_del_adapter(&mrst->adap);
free_irq(dev->irq, mrst);
iounmap(mrst->base);
diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c
index 130f02c..cd82eb4 100644
--- a/drivers/i2c/busses/i2c-ismt.c
+++ b/drivers/i2c/busses/i2c-ismt.c
@@ -183,7 +183,7 @@ struct ismt_priv {
/**
* ismt_ids - PCI device IDs supported by this driver
*/
-static const DEFINE_PCI_DEVICE_TABLE(ismt_ids) = {
+static DEFINE_PCI_DEVICE_TABLE(ismt_ids) = {
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_S1200_SMT0) },
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_S1200_SMT1) },
{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AVOTON_SMT) },
diff --git a/drivers/i2c/busses/i2c-mv64xxx.c b/drivers/i2c/busses/i2c-mv64xxx.c
index 8b20ef8..3bbd65d 100644
--- a/drivers/i2c/busses/i2c-mv64xxx.c
+++ b/drivers/i2c/busses/i2c-mv64xxx.c
@@ -701,9 +701,8 @@ static int
mv64xxx_i2c_remove(struct platform_device *dev)
{
struct mv64xxx_i2c_data *drv_data = platform_get_drvdata(dev);
- int rc;
- rc = i2c_del_adapter(&drv_data->adapter);
+ i2c_del_adapter(&drv_data->adapter);
free_irq(drv_data->irq, drv_data);
mv64xxx_i2c_unmap_regs(drv_data);
#if defined(CONFIG_HAVE_CLK)
@@ -715,7 +714,7 @@ mv64xxx_i2c_remove(struct platform_device *dev)
#endif
kfree(drv_data);
- return rc;
+ return 0;
}
static const struct of_device_id mv64xxx_i2c_of_match_table[] = {
diff --git a/drivers/i2c/busses/i2c-mxs.c b/drivers/i2c/busses/i2c-mxs.c
index 120f246..c67d89f 100644
--- a/drivers/i2c/busses/i2c-mxs.c
+++ b/drivers/i2c/busses/i2c-mxs.c
@@ -56,6 +56,7 @@
#define MXS_I2C_CTRL1_SET (0x44)
#define MXS_I2C_CTRL1_CLR (0x48)
+#define MXS_I2C_CTRL1_CLR_GOT_A_NAK 0x10000000
#define MXS_I2C_CTRL1_BUS_FREE_IRQ 0x80
#define MXS_I2C_CTRL1_DATA_ENGINE_CMPLT_IRQ 0x40
#define MXS_I2C_CTRL1_NO_SLAVE_ACK_IRQ 0x20
@@ -65,6 +66,10 @@
#define MXS_I2C_CTRL1_SLAVE_STOP_IRQ 0x02
#define MXS_I2C_CTRL1_SLAVE_IRQ 0x01
+#define MXS_I2C_STAT (0x50)
+#define MXS_I2C_STAT_BUS_BUSY 0x00000800
+#define MXS_I2C_STAT_CLK_GEN_BUSY 0x00000400
+
#define MXS_I2C_DATA (0xa0)
#define MXS_I2C_DEBUG0 (0xb0)
@@ -297,12 +302,10 @@ static int mxs_i2c_pio_wait_dmareq(struct mxs_i2c_dev *i2c)
cond_resched();
}
- writel(MXS_I2C_DEBUG0_DMAREQ, i2c->regs + MXS_I2C_DEBUG0_CLR);
-
return 0;
}
-static int mxs_i2c_pio_wait_cplt(struct mxs_i2c_dev *i2c)
+static int mxs_i2c_pio_wait_cplt(struct mxs_i2c_dev *i2c, int last)
{
unsigned long timeout = jiffies + msecs_to_jiffies(1000);
@@ -323,9 +326,50 @@ static int mxs_i2c_pio_wait_cplt(struct mxs_i2c_dev *i2c)
writel(MXS_I2C_CTRL1_DATA_ENGINE_CMPLT_IRQ,
i2c->regs + MXS_I2C_CTRL1_CLR);
+ /*
+ * When ending a transfer with a stop, we have to wait for the bus to
+ * go idle before we report the transfer as completed. Otherwise the
+ * start of the next transfer may race with the end of the current one.
+ */
+ while (last && (readl(i2c->regs + MXS_I2C_STAT) &
+ (MXS_I2C_STAT_BUS_BUSY | MXS_I2C_STAT_CLK_GEN_BUSY))) {
+ if (time_after(jiffies, timeout))
+ return -ETIMEDOUT;
+ cond_resched();
+ }
+
return 0;
}
+static int mxs_i2c_pio_check_error_state(struct mxs_i2c_dev *i2c)
+{
+ u32 state;
+
+ state = readl(i2c->regs + MXS_I2C_CTRL1_CLR) & MXS_I2C_IRQ_MASK;
+
+ if (state & MXS_I2C_CTRL1_NO_SLAVE_ACK_IRQ)
+ i2c->cmd_err = -ENXIO;
+ else if (state & (MXS_I2C_CTRL1_EARLY_TERM_IRQ |
+ MXS_I2C_CTRL1_MASTER_LOSS_IRQ |
+ MXS_I2C_CTRL1_SLAVE_STOP_IRQ |
+ MXS_I2C_CTRL1_SLAVE_IRQ))
+ i2c->cmd_err = -EIO;
+
+ return i2c->cmd_err;
+}
+
+static void mxs_i2c_pio_trigger_cmd(struct mxs_i2c_dev *i2c, u32 cmd)
+{
+ u32 reg;
+
+ writel(cmd, i2c->regs + MXS_I2C_CTRL0);
+
+ /* readback makes sure the write is latched into hardware */
+ reg = readl(i2c->regs + MXS_I2C_CTRL0);
+ reg |= MXS_I2C_CTRL0_RUN;
+ writel(reg, i2c->regs + MXS_I2C_CTRL0);
+}
+
static int mxs_i2c_pio_setup_xfer(struct i2c_adapter *adap,
struct i2c_msg *msg, uint32_t flags)
{
@@ -341,23 +385,26 @@ static int mxs_i2c_pio_setup_xfer(struct i2c_adapter *adap,
addr_data |= I2C_SMBUS_READ;
/* SELECT command. */
- writel(MXS_I2C_CTRL0_RUN | MXS_CMD_I2C_SELECT,
- i2c->regs + MXS_I2C_CTRL0);
+ mxs_i2c_pio_trigger_cmd(i2c, MXS_CMD_I2C_SELECT);
ret = mxs_i2c_pio_wait_dmareq(i2c);
if (ret)
return ret;
writel(addr_data, i2c->regs + MXS_I2C_DATA);
+ writel(MXS_I2C_DEBUG0_DMAREQ, i2c->regs + MXS_I2C_DEBUG0_CLR);
- ret = mxs_i2c_pio_wait_cplt(i2c);
+ ret = mxs_i2c_pio_wait_cplt(i2c, 0);
if (ret)
return ret;
+ if (mxs_i2c_pio_check_error_state(i2c))
+ goto cleanup;
+
/* READ command. */
- writel(MXS_I2C_CTRL0_RUN | MXS_CMD_I2C_READ | flags |
- MXS_I2C_CTRL0_XFER_COUNT(msg->len),
- i2c->regs + MXS_I2C_CTRL0);
+ mxs_i2c_pio_trigger_cmd(i2c,
+ MXS_CMD_I2C_READ | flags |
+ MXS_I2C_CTRL0_XFER_COUNT(msg->len));
for (i = 0; i < msg->len; i++) {
if ((i & 3) == 0) {
@@ -365,6 +412,8 @@ static int mxs_i2c_pio_setup_xfer(struct i2c_adapter *adap,
if (ret)
return ret;
data = readl(i2c->regs + MXS_I2C_DATA);
+ writel(MXS_I2C_DEBUG0_DMAREQ,
+ i2c->regs + MXS_I2C_DEBUG0_CLR);
}
msg->buf[i] = data & 0xff;
data >>= 8;
@@ -373,9 +422,9 @@ static int mxs_i2c_pio_setup_xfer(struct i2c_adapter *adap,
addr_data |= I2C_SMBUS_WRITE;
/* WRITE command. */
- writel(MXS_I2C_CTRL0_RUN | MXS_CMD_I2C_WRITE | flags |
- MXS_I2C_CTRL0_XFER_COUNT(msg->len + 1),
- i2c->regs + MXS_I2C_CTRL0);
+ mxs_i2c_pio_trigger_cmd(i2c,
+ MXS_CMD_I2C_WRITE | flags |
+ MXS_I2C_CTRL0_XFER_COUNT(msg->len + 1));
/*
* The LSB of data buffer is the first byte blasted across
@@ -391,6 +440,8 @@ static int mxs_i2c_pio_setup_xfer(struct i2c_adapter *adap,
if (ret)
return ret;
writel(data, i2c->regs + MXS_I2C_DATA);
+ writel(MXS_I2C_DEBUG0_DMAREQ,
+ i2c->regs + MXS_I2C_DEBUG0_CLR);
}
}
@@ -401,13 +452,19 @@ static int mxs_i2c_pio_setup_xfer(struct i2c_adapter *adap,
if (ret)
return ret;
writel(data, i2c->regs + MXS_I2C_DATA);
+ writel(MXS_I2C_DEBUG0_DMAREQ,
+ i2c->regs + MXS_I2C_DEBUG0_CLR);
}
}
- ret = mxs_i2c_pio_wait_cplt(i2c);
+ ret = mxs_i2c_pio_wait_cplt(i2c, flags & MXS_I2C_CTRL0_POST_SEND_STOP);
if (ret)
return ret;
+ /* make sure we capture any occurred error into cmd_err */
+ mxs_i2c_pio_check_error_state(i2c);
+
+cleanup:
/* Clear any dangling IRQs and re-enable interrupts. */
writel(MXS_I2C_IRQ_MASK, i2c->regs + MXS_I2C_CTRL1_CLR);
writel(MXS_I2C_IRQ_MASK << 8, i2c->regs + MXS_I2C_CTRL1_SET);
@@ -439,12 +496,12 @@ static int mxs_i2c_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg,
* using PIO mode while longer transfers use DMA. The 8 byte border is
* based on this empirical measurement and a lot of previous frobbing.
*/
+ i2c->cmd_err = 0;
if (msg->len < 8) {
ret = mxs_i2c_pio_setup_xfer(adap, msg, flags);
if (ret)
mxs_i2c_reset(i2c);
} else {
- i2c->cmd_err = 0;
INIT_COMPLETION(i2c->cmd_complete);
ret = mxs_i2c_dma_setup_xfer(adap, msg, flags);
if (ret)
@@ -454,13 +511,19 @@ static int mxs_i2c_xfer_msg(struct i2c_adapter *adap, struct i2c_msg *msg,
msecs_to_jiffies(1000));
if (ret == 0)
goto timeout;
+ }
- if (i2c->cmd_err == -ENXIO)
- mxs_i2c_reset(i2c);
-
- ret = i2c->cmd_err;
+ if (i2c->cmd_err == -ENXIO) {
+ /*
+ * If the transfer fails with a NAK from the slave the
+ * controller halts until it gets told to return to idle state.
+ */
+ writel(MXS_I2C_CTRL1_CLR_GOT_A_NAK,
+ i2c->regs + MXS_I2C_CTRL1_SET);
}
+ ret = i2c->cmd_err;
+
dev_dbg(i2c->dev, "Done with err=%d\n", ret);
return ret;
@@ -686,11 +749,8 @@ static int mxs_i2c_probe(struct platform_device *pdev)
static int mxs_i2c_remove(struct platform_device *pdev)
{
struct mxs_i2c_dev *i2c = platform_get_drvdata(pdev);
- int ret;
- ret = i2c_del_adapter(&i2c->adapter);
- if (ret)
- return -EBUSY;
+ i2c_del_adapter(&i2c->adapter);
if (i2c->dmach)
dma_release_channel(i2c->dmach);
diff --git a/drivers/i2c/busses/i2c-nforce2-s4985.c b/drivers/i2c/busses/i2c-nforce2-s4985.c
index 29015eb..2ca268d 100644
--- a/drivers/i2c/busses/i2c-nforce2-s4985.c
+++ b/drivers/i2c/busses/i2c-nforce2-s4985.c
@@ -164,11 +164,7 @@ static int __init nforce2_s4985_init(void)
}
/* Unregister physical bus */
- error = i2c_del_adapter(nforce2_smbus);
- if (error) {
- dev_err(&nforce2_smbus->dev, "Physical bus removal failed\n");
- goto ERROR0;
- }
+ i2c_del_adapter(nforce2_smbus);
printk(KERN_INFO "Enabling SMBus multiplexing for Tyan S4985\n");
/* Define the 5 virtual adapters and algorithms structures */
diff --git a/drivers/i2c/busses/i2c-octeon.c b/drivers/i2c/busses/i2c-octeon.c
index 935585e..956fe32 100644
--- a/drivers/i2c/busses/i2c-octeon.c
+++ b/drivers/i2c/busses/i2c-octeon.c
@@ -183,7 +183,7 @@ static irqreturn_t octeon_i2c_isr(int irq, void *dev_id)
struct octeon_i2c *i2c = dev_id;
octeon_i2c_int_disable(i2c);
- wake_up_interruptible(&i2c->queue);
+ wake_up(&i2c->queue);
return IRQ_HANDLED;
}
@@ -206,9 +206,9 @@ static int octeon_i2c_wait(struct octeon_i2c *i2c)
octeon_i2c_int_enable(i2c);
- result = wait_event_interruptible_timeout(i2c->queue,
- octeon_i2c_test_iflg(i2c),
- i2c->adap.timeout);
+ result = wait_event_timeout(i2c->queue,
+ octeon_i2c_test_iflg(i2c),
+ i2c->adap.timeout);
octeon_i2c_int_disable(i2c);
@@ -440,7 +440,7 @@ static struct i2c_adapter octeon_i2c_ops = {
.owner = THIS_MODULE,
.name = "OCTEON adapter",
.algo = &octeon_i2c_algo,
- .timeout = 2,
+ .timeout = HZ / 50,
};
/**
diff --git a/drivers/i2c/busses/i2c-powermac.c b/drivers/i2c/busses/i2c-powermac.c
index da54e67..8dc90da 100644
--- a/drivers/i2c/busses/i2c-powermac.c
+++ b/drivers/i2c/busses/i2c-powermac.c
@@ -213,14 +213,8 @@ static const struct i2c_algorithm i2c_powermac_algorithm = {
static int i2c_powermac_remove(struct platform_device *dev)
{
struct i2c_adapter *adapter = platform_get_drvdata(dev);
- int rc;
-
- rc = i2c_del_adapter(adapter);
- /* We aren't that prepared to deal with this... */
- if (rc)
- printk(KERN_WARNING
- "i2c-powermac.c: Failed to remove bus %s !\n",
- adapter->name);
+
+ i2c_del_adapter(adapter);
memset(adapter, 0, sizeof(*adapter));
return 0;
diff --git a/drivers/i2c/busses/i2c-puv3.c b/drivers/i2c/busses/i2c-puv3.c
index 8acef65..37a84c8 100644
--- a/drivers/i2c/busses/i2c-puv3.c
+++ b/drivers/i2c/busses/i2c-puv3.c
@@ -234,21 +234,15 @@ static int puv3_i2c_remove(struct platform_device *pdev)
{
struct i2c_adapter *adapter = platform_get_drvdata(pdev);
struct resource *mem;
- int rc;
- rc = i2c_del_adapter(adapter);
- if (rc) {
- dev_err(&pdev->dev, "Adapter '%s' delete fail\n",
- adapter->name);
- return rc;
- }
+ i2c_del_adapter(adapter);
put_device(&pdev->dev);
mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
release_mem_region(mem->start, resource_size(mem));
- return rc;
+ return 0;
}
#ifdef CONFIG_PM
diff --git a/drivers/i2c/busses/i2c-pxa.c b/drivers/i2c/busses/i2c-pxa.c
index 1e88e8d..ea6d45d 100644
--- a/drivers/i2c/busses/i2c-pxa.c
+++ b/drivers/i2c/busses/i2c-pxa.c
@@ -1053,16 +1053,13 @@ static int i2c_pxa_probe_dt(struct platform_device *pdev, struct pxa_i2c *i2c,
struct device_node *np = pdev->dev.of_node;
const struct of_device_id *of_id =
of_match_device(i2c_pxa_dt_ids, &pdev->dev);
- int ret;
if (!of_id)
return 1;
- ret = of_alias_get_id(np, "i2c");
- if (ret < 0) {
- dev_err(&pdev->dev, "failed to get alias id, errno %d\n", ret);
- return ret;
- }
- pdev->id = ret;
+
+ /* For device tree we always use the dynamic or alias-assigned ID */
+ i2c->adap.nr = -1;
+
if (of_get_property(np, "mrvl,i2c-polling", NULL))
i2c->use_pio = 1;
if (of_get_property(np, "mrvl,i2c-fast-mode", NULL))
@@ -1100,6 +1097,9 @@ static int i2c_pxa_probe(struct platform_device *dev)
goto emalloc;
}
+ /* Default adapter num to device id; i2c_pxa_probe_dt can override. */
+ i2c->adap.nr = dev->id;
+
ret = i2c_pxa_probe_dt(dev, i2c, &i2c_type);
if (ret > 0)
ret = i2c_pxa_probe_pdata(dev, i2c, &i2c_type);
@@ -1124,9 +1124,7 @@ static int i2c_pxa_probe(struct platform_device *dev)
spin_lock_init(&i2c->lock);
init_waitqueue_head(&i2c->wait);
- i2c->adap.nr = dev->id;
- snprintf(i2c->adap.name, sizeof(i2c->adap.name), "pxa_i2c-i2c.%u",
- i2c->adap.nr);
+ strlcpy(i2c->adap.name, "pxa_i2c-i2c", sizeof(i2c->adap.name));
i2c->clk = clk_get(&dev->dev, NULL);
if (IS_ERR(i2c->clk)) {
@@ -1169,7 +1167,7 @@ static int i2c_pxa_probe(struct platform_device *dev)
} else {
i2c->adap.algo = &i2c_pxa_algorithm;
ret = request_irq(irq, i2c_pxa_handler, IRQF_SHARED,
- i2c->adap.name, i2c);
+ dev_name(&dev->dev), i2c);
if (ret)
goto ereqirq;
}
diff --git a/drivers/i2c/busses/i2c-s3c2410.c b/drivers/i2c/busses/i2c-s3c2410.c
index f6b880b..6e8ee92 100644
--- a/drivers/i2c/busses/i2c-s3c2410.c
+++ b/drivers/i2c/busses/i2c-s3c2410.c
@@ -42,9 +42,46 @@
#include <asm/irq.h>
-#include <plat/regs-iic.h>
#include <linux/platform_data/i2c-s3c2410.h>
+/* see s3c2410x user guide, v1.1, section 9 (p447) for more info */
+
+#define S3C2410_IICCON 0x00
+#define S3C2410_IICSTAT 0x04
+#define S3C2410_IICADD 0x08
+#define S3C2410_IICDS 0x0C
+#define S3C2440_IICLC 0x10
+
+#define S3C2410_IICCON_ACKEN (1 << 7)
+#define S3C2410_IICCON_TXDIV_16 (0 << 6)
+#define S3C2410_IICCON_TXDIV_512 (1 << 6)
+#define S3C2410_IICCON_IRQEN (1 << 5)
+#define S3C2410_IICCON_IRQPEND (1 << 4)
+#define S3C2410_IICCON_SCALE(x) ((x) & 0xf)
+#define S3C2410_IICCON_SCALEMASK (0xf)
+
+#define S3C2410_IICSTAT_MASTER_RX (2 << 6)
+#define S3C2410_IICSTAT_MASTER_TX (3 << 6)
+#define S3C2410_IICSTAT_SLAVE_RX (0 << 6)
+#define S3C2410_IICSTAT_SLAVE_TX (1 << 6)
+#define S3C2410_IICSTAT_MODEMASK (3 << 6)
+
+#define S3C2410_IICSTAT_START (1 << 5)
+#define S3C2410_IICSTAT_BUSBUSY (1 << 5)
+#define S3C2410_IICSTAT_TXRXEN (1 << 4)
+#define S3C2410_IICSTAT_ARBITR (1 << 3)
+#define S3C2410_IICSTAT_ASSLAVE (1 << 2)
+#define S3C2410_IICSTAT_ADDR0 (1 << 1)
+#define S3C2410_IICSTAT_LASTBIT (1 << 0)
+
+#define S3C2410_IICLC_SDA_DELAY0 (0 << 0)
+#define S3C2410_IICLC_SDA_DELAY5 (1 << 0)
+#define S3C2410_IICLC_SDA_DELAY10 (2 << 0)
+#define S3C2410_IICLC_SDA_DELAY15 (3 << 0)
+#define S3C2410_IICLC_SDA_DELAY_MASK (3 << 0)
+
+#define S3C2410_IICLC_FILTER_ON (1 << 2)
+
/* Treat S3C2410 as baseline hardware, anything else is supported via quirks */
#define QUIRK_S3C2440 (1 << 0)
#define QUIRK_HDMIPHY (1 << 1)
@@ -309,6 +346,12 @@ static inline int is_lastmsg(struct s3c24xx_i2c *i2c)
static inline int is_msglast(struct s3c24xx_i2c *i2c)
{
+ /* msg->len is always 1 for the first byte of smbus block read.
+ * Actual length will be read from slave. More bytes will be
+ * read according to the length then. */
+ if (i2c->msg->flags & I2C_M_RECV_LEN && i2c->msg->len == 1)
+ return 0;
+
return i2c->msg_ptr == i2c->msg->len-1;
}
@@ -448,6 +491,9 @@ static int i2c_s3c_irq_nextbyte(struct s3c24xx_i2c *i2c, unsigned long iicstat)
byte = readb(i2c->regs + S3C2410_IICDS);
i2c->msg->buf[i2c->msg_ptr++] = byte;
+ /* Add actual length to read for smbus block read */
+ if (i2c->msg->flags & I2C_M_RECV_LEN && i2c->msg->len == 1)
+ i2c->msg->len += byte;
prepare_read:
if (is_msglast(i2c)) {
/* last byte of buffer */
diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c
index b714776..b60ff90 100644
--- a/drivers/i2c/busses/i2c-tegra.c
+++ b/drivers/i2c/busses/i2c-tegra.c
@@ -25,7 +25,6 @@
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/slab.h>
-#include <linux/i2c-tegra.h>
#include <linux/of_i2c.h>
#include <linux/of_device.h>
#include <linux/module.h>
@@ -172,7 +171,7 @@ struct tegra_i2c_dev {
u8 *msg_buf;
size_t msg_buf_remaining;
int msg_read;
- unsigned long bus_clk_rate;
+ u32 bus_clk_rate;
bool is_suspended;
};
@@ -694,7 +693,6 @@ static const struct tegra_i2c_hw_feature tegra114_i2c_hw = {
.clk_divisor_std_fast_mode = 0x19,
};
-#if defined(CONFIG_OF)
/* Match table for of_platform binding */
static const struct of_device_id tegra_i2c_of_match[] = {
{ .compatible = "nvidia,tegra114-i2c", .data = &tegra114_i2c_hw, },
@@ -704,16 +702,13 @@ static const struct of_device_id tegra_i2c_of_match[] = {
{},
};
MODULE_DEVICE_TABLE(of, tegra_i2c_of_match);
-#endif
static int tegra_i2c_probe(struct platform_device *pdev)
{
struct tegra_i2c_dev *i2c_dev;
- struct tegra_i2c_platform_data *pdata = pdev->dev.platform_data;
struct resource *res;
struct clk *div_clk;
struct clk *fast_clk;
- const unsigned int *prop;
void __iomem *base;
int irq;
int ret = 0;
@@ -754,23 +749,16 @@ static int tegra_i2c_probe(struct platform_device *pdev)
i2c_dev->cont_id = pdev->id;
i2c_dev->dev = &pdev->dev;
- i2c_dev->bus_clk_rate = 100000; /* default clock rate */
- if (pdata) {
- i2c_dev->bus_clk_rate = pdata->bus_clk_rate;
-
- } else if (i2c_dev->dev->of_node) { /* if there is a device tree node ... */
- prop = of_get_property(i2c_dev->dev->of_node,
- "clock-frequency", NULL);
- if (prop)
- i2c_dev->bus_clk_rate = be32_to_cpup(prop);
- }
+ ret = of_property_read_u32(i2c_dev->dev->of_node, "clock-frequency",
+ &i2c_dev->bus_clk_rate);
+ if (ret)
+ i2c_dev->bus_clk_rate = 100000; /* default clock rate */
i2c_dev->hw = &tegra20_i2c_hw;
if (pdev->dev.of_node) {
const struct of_device_id *match;
- match = of_match_device(of_match_ptr(tegra_i2c_of_match),
- &pdev->dev);
+ match = of_match_device(tegra_i2c_of_match, &pdev->dev);
i2c_dev->hw = match->data;
i2c_dev->is_dvc = of_device_is_compatible(pdev->dev.of_node,
"nvidia,tegra20-i2c-dvc");
@@ -876,7 +864,7 @@ static struct platform_driver tegra_i2c_driver = {
.driver = {
.name = "tegra-i2c",
.owner = THIS_MODULE,
- .of_match_table = of_match_ptr(tegra_i2c_of_match),
+ .of_match_table = tegra_i2c_of_match,
.pm = TEGRA_I2C_PM,
},
};
diff --git a/drivers/i2c/busses/i2c-viperboard.c b/drivers/i2c/busses/i2c-viperboard.c
index f45c32c..c68450c 100644
--- a/drivers/i2c/busses/i2c-viperboard.c
+++ b/drivers/i2c/busses/i2c-viperboard.c
@@ -421,11 +421,10 @@ error:
static int vprbrd_i2c_remove(struct platform_device *pdev)
{
struct vprbrd_i2c *vb_i2c = platform_get_drvdata(pdev);
- int ret;
- ret = i2c_del_adapter(&vb_i2c->i2c);
+ i2c_del_adapter(&vb_i2c->i2c);
- return ret;
+ return 0;
}
static struct platform_driver vprbrd_i2c_driver = {
diff --git a/drivers/i2c/busses/i2c-xiic.c b/drivers/i2c/busses/i2c-xiic.c
index 332c720..3d0f052 100644
--- a/drivers/i2c/busses/i2c-xiic.c
+++ b/drivers/i2c/busses/i2c-xiic.c
@@ -312,10 +312,8 @@ static void xiic_fill_tx_fifo(struct xiic_i2c *i2c)
/* last message in transfer -> STOP */
data |= XIIC_TX_DYN_STOP_MASK;
dev_dbg(i2c->adap.dev.parent, "%s TX STOP\n", __func__);
-
- xiic_setreg16(i2c, XIIC_DTR_REG_OFFSET, data);
- } else
- xiic_setreg8(i2c, XIIC_DTR_REG_OFFSET, data);
+ }
+ xiic_setreg16(i2c, XIIC_DTR_REG_OFFSET, data);
}
}
diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c
index 991d38d..6b63cc7 100644
--- a/drivers/i2c/i2c-core.c
+++ b/drivers/i2c/i2c-core.c
@@ -27,7 +27,9 @@
#include <linux/module.h>
#include <linux/kernel.h>
+#include <linux/delay.h>
#include <linux/errno.h>
+#include <linux/gpio.h>
#include <linux/slab.h>
#include <linux/i2c.h>
#include <linux/init.h>
@@ -47,7 +49,7 @@
/* core_lock protects i2c_adapter_idr, and guarantees
that device detection, deletion of detected devices, and attach_adapter
- and detach_adapter calls are serialized */
+ calls are serialized */
static DEFINE_MUTEX(core_lock);
static DEFINE_IDR(i2c_adapter_idr);
@@ -91,7 +93,6 @@ static int i2c_device_match(struct device *dev, struct device_driver *drv)
return 0;
}
-#ifdef CONFIG_HOTPLUG
/* uevent helps with hotplug: modprobe -q $(MODALIAS) */
static int i2c_device_uevent(struct device *dev, struct kobj_uevent_env *env)
@@ -105,9 +106,129 @@ static int i2c_device_uevent(struct device *dev, struct kobj_uevent_env *env)
return 0;
}
-#else
-#define i2c_device_uevent NULL
-#endif /* CONFIG_HOTPLUG */
+/* i2c bus recovery routines */
+static int get_scl_gpio_value(struct i2c_adapter *adap)
+{
+ return gpio_get_value(adap->bus_recovery_info->scl_gpio);
+}
+
+static void set_scl_gpio_value(struct i2c_adapter *adap, int val)
+{
+ gpio_set_value(adap->bus_recovery_info->scl_gpio, val);
+}
+
+static int get_sda_gpio_value(struct i2c_adapter *adap)
+{
+ return gpio_get_value(adap->bus_recovery_info->sda_gpio);
+}
+
+static int i2c_get_gpios_for_recovery(struct i2c_adapter *adap)
+{
+ struct i2c_bus_recovery_info *bri = adap->bus_recovery_info;
+ struct device *dev = &adap->dev;
+ int ret = 0;
+
+ ret = gpio_request_one(bri->scl_gpio, GPIOF_OPEN_DRAIN |
+ GPIOF_OUT_INIT_HIGH, "i2c-scl");
+ if (ret) {
+ dev_warn(dev, "Can't get SCL gpio: %d\n", bri->scl_gpio);
+ return ret;
+ }
+
+ if (bri->get_sda) {
+ if (gpio_request_one(bri->sda_gpio, GPIOF_IN, "i2c-sda")) {
+ /* work without SDA polling */
+ dev_warn(dev, "Can't get SDA gpio: %d. Not using SDA polling\n",
+ bri->sda_gpio);
+ bri->get_sda = NULL;
+ }
+ }
+
+ return ret;
+}
+
+static void i2c_put_gpios_for_recovery(struct i2c_adapter *adap)
+{
+ struct i2c_bus_recovery_info *bri = adap->bus_recovery_info;
+
+ if (bri->get_sda)
+ gpio_free(bri->sda_gpio);
+
+ gpio_free(bri->scl_gpio);
+}
+
+/*
+ * We are generating clock pulses. ndelay() determines durating of clk pulses.
+ * We will generate clock with rate 100 KHz and so duration of both clock levels
+ * is: delay in ns = (10^6 / 100) / 2
+ */
+#define RECOVERY_NDELAY 5000
+#define RECOVERY_CLK_CNT 9
+
+static int i2c_generic_recovery(struct i2c_adapter *adap)
+{
+ struct i2c_bus_recovery_info *bri = adap->bus_recovery_info;
+ int i = 0, val = 1, ret = 0;
+
+ if (bri->prepare_recovery)
+ bri->prepare_recovery(bri);
+
+ /*
+ * By this time SCL is high, as we need to give 9 falling-rising edges
+ */
+ while (i++ < RECOVERY_CLK_CNT * 2) {
+ if (val) {
+ /* Break if SDA is high */
+ if (bri->get_sda && bri->get_sda(adap))
+ break;
+ /* SCL shouldn't be low here */
+ if (!bri->get_scl(adap)) {
+ dev_err(&adap->dev,
+ "SCL is stuck low, exit recovery\n");
+ ret = -EBUSY;
+ break;
+ }
+ }
+
+ val = !val;
+ bri->set_scl(adap, val);
+ ndelay(RECOVERY_NDELAY);
+ }
+
+ if (bri->unprepare_recovery)
+ bri->unprepare_recovery(bri);
+
+ return ret;
+}
+
+int i2c_generic_scl_recovery(struct i2c_adapter *adap)
+{
+ adap->bus_recovery_info->set_scl(adap, 1);
+ return i2c_generic_recovery(adap);
+}
+
+int i2c_generic_gpio_recovery(struct i2c_adapter *adap)
+{
+ int ret;
+
+ ret = i2c_get_gpios_for_recovery(adap);
+ if (ret)
+ return ret;
+
+ ret = i2c_generic_recovery(adap);
+ i2c_put_gpios_for_recovery(adap);
+
+ return ret;
+}
+
+int i2c_recover_bus(struct i2c_adapter *adap)
+{
+ if (!adap->bus_recovery_info)
+ return -EOPNOTSUPP;
+
+ dev_dbg(&adap->dev, "Trying i2c bus recovery\n");
+ return adap->bus_recovery_info->recover_bus(adap);
+}
static int i2c_device_probe(struct device *dev)
{
@@ -902,6 +1023,39 @@ static int i2c_register_adapter(struct i2c_adapter *adap)
"Failed to create compatibility class link\n");
#endif
+ /* bus recovery specific initialization */
+ if (adap->bus_recovery_info) {
+ struct i2c_bus_recovery_info *bri = adap->bus_recovery_info;
+
+ if (!bri->recover_bus) {
+ dev_err(&adap->dev, "No recover_bus() found, not using recovery\n");
+ adap->bus_recovery_info = NULL;
+ goto exit_recovery;
+ }
+
+ /* Generic GPIO recovery */
+ if (bri->recover_bus == i2c_generic_gpio_recovery) {
+ if (!gpio_is_valid(bri->scl_gpio)) {
+ dev_err(&adap->dev, "Invalid SCL gpio, not using recovery\n");
+ adap->bus_recovery_info = NULL;
+ goto exit_recovery;
+ }
+
+ if (gpio_is_valid(bri->sda_gpio))
+ bri->get_sda = get_sda_gpio_value;
+ else
+ bri->get_sda = NULL;
+
+ bri->get_scl = get_scl_gpio_value;
+ bri->set_scl = set_scl_gpio_value;
+ } else if (!bri->set_scl || !bri->get_scl) {
+ /* Generic SCL recovery */
+ dev_err(&adap->dev, "No {get|set}_gpio() found, not using recovery\n");
+ adap->bus_recovery_info = NULL;
+ }
+ }
+
+exit_recovery:
/* create pre-declared device nodes */
if (adap->nr < __i2c_first_dynamic_bus_num)
i2c_scan_static_board_info(adap);
@@ -921,13 +1075,35 @@ out_list:
}
/**
+ * __i2c_add_numbered_adapter - i2c_add_numbered_adapter where nr is never -1
+ * @adap: the adapter to register (with adap->nr initialized)
+ * Context: can sleep
+ *
+ * See i2c_add_numbered_adapter() for details.
+ */
+static int __i2c_add_numbered_adapter(struct i2c_adapter *adap)
+{
+ int id;
+
+ mutex_lock(&core_lock);
+ id = idr_alloc(&i2c_adapter_idr, adap, adap->nr, adap->nr + 1,
+ GFP_KERNEL);
+ mutex_unlock(&core_lock);
+ if (id < 0)
+ return id == -ENOSPC ? -EBUSY : id;
+
+ return i2c_register_adapter(adap);
+}
+
+/**
* i2c_add_adapter - declare i2c adapter, use dynamic bus number
* @adapter: the adapter to add
* Context: can sleep
*
* This routine is used to declare an I2C adapter when its bus number
- * doesn't matter. Examples: for I2C adapters dynamically added by
- * USB links or PCI plugin cards.
+ * doesn't matter or when its bus number is specified by an dt alias.
+ * Examples of bases when the bus number doesn't matter: I2C adapters
+ * dynamically added by USB links or PCI plugin cards.
*
* When this returns zero, a new bus number was allocated and stored
* in adap->nr, and the specified adapter became available for clients.
@@ -935,8 +1111,17 @@ out_list:
*/
int i2c_add_adapter(struct i2c_adapter *adapter)
{
+ struct device *dev = &adapter->dev;
int id;
+ if (dev->of_node) {
+ id = of_alias_get_id(dev->of_node, "i2c");
+ if (id >= 0) {
+ adapter->nr = id;
+ return __i2c_add_numbered_adapter(adapter);
+ }
+ }
+
mutex_lock(&core_lock);
id = idr_alloc(&i2c_adapter_idr, adapter,
__i2c_first_dynamic_bus_num, 0, GFP_KERNEL);
@@ -975,26 +1160,17 @@ EXPORT_SYMBOL(i2c_add_adapter);
*/
int i2c_add_numbered_adapter(struct i2c_adapter *adap)
{
- int id;
-
if (adap->nr == -1) /* -1 means dynamically assign bus id */
return i2c_add_adapter(adap);
- mutex_lock(&core_lock);
- id = idr_alloc(&i2c_adapter_idr, adap, adap->nr, adap->nr + 1,
- GFP_KERNEL);
- mutex_unlock(&core_lock);
- if (id < 0)
- return id == -ENOSPC ? -EBUSY : id;
- return i2c_register_adapter(adap);
+ return __i2c_add_numbered_adapter(adap);
}
EXPORT_SYMBOL_GPL(i2c_add_numbered_adapter);
-static int i2c_do_del_adapter(struct i2c_driver *driver,
+static void i2c_do_del_adapter(struct i2c_driver *driver,
struct i2c_adapter *adapter)
{
struct i2c_client *client, *_n;
- int res;
/* Remove the devices we created ourselves as the result of hardware
* probing (using a driver's detect method) */
@@ -1006,16 +1182,6 @@ static int i2c_do_del_adapter(struct i2c_driver *driver,
i2c_unregister_device(client);
}
}
-
- if (!driver->detach_adapter)
- return 0;
- dev_warn(&adapter->dev, "%s: detach_adapter method is deprecated\n",
- driver->driver.name);
- res = driver->detach_adapter(adapter);
- if (res)
- dev_err(&adapter->dev, "detach_adapter failed (%d) "
- "for driver [%s]\n", res, driver->driver.name);
- return res;
}
static int __unregister_client(struct device *dev, void *dummy)
@@ -1036,7 +1202,8 @@ static int __unregister_dummy(struct device *dev, void *dummy)
static int __process_removed_adapter(struct device_driver *d, void *data)
{
- return i2c_do_del_adapter(to_i2c_driver(d), data);
+ i2c_do_del_adapter(to_i2c_driver(d), data);
+ return 0;
}
/**
@@ -1047,9 +1214,8 @@ static int __process_removed_adapter(struct device_driver *d, void *data)
* This unregisters an I2C adapter which was previously registered
* by @i2c_add_adapter or @i2c_add_numbered_adapter.
*/
-int i2c_del_adapter(struct i2c_adapter *adap)
+void i2c_del_adapter(struct i2c_adapter *adap)
{
- int res = 0;
struct i2c_adapter *found;
struct i2c_client *client, *next;
@@ -1060,16 +1226,14 @@ int i2c_del_adapter(struct i2c_adapter *adap)
if (found != adap) {
pr_debug("i2c-core: attempting to delete unregistered "
"adapter [%s]\n", adap->name);
- return -EINVAL;
+ return;
}
/* Tell drivers about this removal */
mutex_lock(&core_lock);
- res = bus_for_each_drv(&i2c_bus_type, NULL, adap,
+ bus_for_each_drv(&i2c_bus_type, NULL, adap,
__process_removed_adapter);
mutex_unlock(&core_lock);
- if (res)
- return res;
/* Remove devices instantiated from sysfs */
mutex_lock_nested(&adap->userspace_clients_lock,
@@ -1088,8 +1252,8 @@ int i2c_del_adapter(struct i2c_adapter *adap)
* we can't remove the dummy devices during the first pass: they
* could have been instantiated by real devices wishing to clean
* them up properly, so we give them a chance to do that first. */
- res = device_for_each_child(&adap->dev, NULL, __unregister_client);
- res = device_for_each_child(&adap->dev, NULL, __unregister_dummy);
+ device_for_each_child(&adap->dev, NULL, __unregister_client);
+ device_for_each_child(&adap->dev, NULL, __unregister_dummy);
#ifdef CONFIG_I2C_COMPAT
class_compat_remove_link(i2c_adapter_compat_class, &adap->dev,
@@ -1114,8 +1278,6 @@ int i2c_del_adapter(struct i2c_adapter *adap)
/* Clear the device structure in case this adapter is ever going to be
added again */
memset(&adap->dev, 0, sizeof(adap->dev));
-
- return 0;
}
EXPORT_SYMBOL(i2c_del_adapter);
@@ -1185,9 +1347,9 @@ EXPORT_SYMBOL(i2c_register_driver);
static int __process_removed_driver(struct device *dev, void *data)
{
- if (dev->type != &i2c_adapter_type)
- return 0;
- return i2c_do_del_adapter(data, to_i2c_adapter(dev));
+ if (dev->type == &i2c_adapter_type)
+ i2c_do_del_adapter(data, to_i2c_adapter(dev));
+ return 0;
}
/**
diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c
index d94e0ce..7409ebb 100644
--- a/drivers/i2c/i2c-mux.c
+++ b/drivers/i2c/i2c-mux.c
@@ -191,17 +191,12 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent,
}
EXPORT_SYMBOL_GPL(i2c_add_mux_adapter);
-int i2c_del_mux_adapter(struct i2c_adapter *adap)
+void i2c_del_mux_adapter(struct i2c_adapter *adap)
{
struct i2c_mux_priv *priv = adap->algo_data;
- int ret;
- ret = i2c_del_adapter(adap);
- if (ret < 0)
- return ret;
+ i2c_del_adapter(adap);
kfree(priv);
-
- return 0;
}
EXPORT_SYMBOL_GPL(i2c_del_mux_adapter);
diff --git a/drivers/i2c/muxes/Kconfig b/drivers/i2c/muxes/Kconfig
index 0be5b83..5faf244 100644
--- a/drivers/i2c/muxes/Kconfig
+++ b/drivers/i2c/muxes/Kconfig
@@ -5,6 +5,18 @@
menu "Multiplexer I2C Chip support"
depends on I2C_MUX
+config I2C_ARB_GPIO_CHALLENGE
+ tristate "GPIO-based I2C arbitration"
+ depends on GENERIC_GPIO && OF
+ help
+ If you say yes to this option, support will be included for an
+ I2C multimaster arbitration scheme using GPIOs and a challenge &
+ response mechanism where masters have to claim the bus by asserting
+ a GPIO.
+
+ This driver can also be built as a module. If so, the module
+ will be called i2c-arb-gpio-challenge.
+
config I2C_MUX_GPIO
tristate "GPIO-based I2C multiplexer"
depends on GENERIC_GPIO
diff --git a/drivers/i2c/muxes/Makefile b/drivers/i2c/muxes/Makefile
index 76da869..465778b 100644
--- a/drivers/i2c/muxes/Makefile
+++ b/drivers/i2c/muxes/Makefile
@@ -1,6 +1,8 @@
#
# Makefile for multiplexer I2C chip drivers.
+obj-$(CONFIG_I2C_ARB_GPIO_CHALLENGE) += i2c-arb-gpio-challenge.o
+
obj-$(CONFIG_I2C_MUX_GPIO) += i2c-mux-gpio.o
obj-$(CONFIG_I2C_MUX_PCA9541) += i2c-mux-pca9541.o
obj-$(CONFIG_I2C_MUX_PCA954x) += i2c-mux-pca954x.o
diff --git a/drivers/i2c/muxes/i2c-arb-gpio-challenge.c b/drivers/i2c/muxes/i2c-arb-gpio-challenge.c
new file mode 100644
index 0000000..210b6f7
--- /dev/null
+++ b/drivers/i2c/muxes/i2c-arb-gpio-challenge.c
@@ -0,0 +1,251 @@
+/*
+ * GPIO-based I2C Arbitration Using a Challenge & Response Mechanism
+ *
+ * Copyright (C) 2012 Google, Inc
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/gpio.h>
+#include <linux/kernel.h>
+#include <linux/i2c.h>
+#include <linux/i2c-mux.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/of_i2c.h>
+#include <linux/of_gpio.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+
+/**
+ * struct i2c_arbitrator_data - Driver data for I2C arbitrator
+ *
+ * @parent: Parent adapter
+ * @child: Child bus
+ * @our_gpio: GPIO we'll use to claim.
+ * @our_gpio_release: 0 if active high; 1 if active low; AKA if the GPIO ==
+ * this then consider it released.
+ * @their_gpio: GPIO that the other side will use to claim.
+ * @their_gpio_release: 0 if active high; 1 if active low; AKA if the GPIO ==
+ * this then consider it released.
+ * @slew_delay_us: microseconds to wait for a GPIO to go high.
+ * @wait_retry_us: we'll attempt another claim after this many microseconds.
+ * @wait_free_us: we'll give up after this many microseconds.
+ */
+
+struct i2c_arbitrator_data {
+ struct i2c_adapter *parent;
+ struct i2c_adapter *child;
+ int our_gpio;
+ int our_gpio_release;
+ int their_gpio;
+ int their_gpio_release;
+ unsigned int slew_delay_us;
+ unsigned int wait_retry_us;
+ unsigned int wait_free_us;
+};
+
+
+/**
+ * i2c_arbitrator_select - claim the I2C bus
+ *
+ * Use the GPIO-based signalling protocol; return -EBUSY if we fail.
+ */
+static int i2c_arbitrator_select(struct i2c_adapter *adap, void *data, u32 chan)
+{
+ const struct i2c_arbitrator_data *arb = data;
+ unsigned long stop_retry, stop_time;
+
+ /* Start a round of trying to claim the bus */
+ stop_time = jiffies + usecs_to_jiffies(arb->wait_free_us) + 1;
+ do {
+ /* Indicate that we want to claim the bus */
+ gpio_set_value(arb->our_gpio, !arb->our_gpio_release);
+ udelay(arb->slew_delay_us);
+
+ /* Wait for the other master to release it */
+ stop_retry = jiffies + usecs_to_jiffies(arb->wait_retry_us) + 1;
+ while (time_before(jiffies, stop_retry)) {
+ int gpio_val = !!gpio_get_value(arb->their_gpio);
+
+ if (gpio_val == arb->their_gpio_release) {
+ /* We got it, so return */
+ return 0;
+ }
+
+ usleep_range(50, 200);
+ }
+
+ /* It didn't release, so give up, wait, and try again */
+ gpio_set_value(arb->our_gpio, arb->our_gpio_release);
+
+ usleep_range(arb->wait_retry_us, arb->wait_retry_us * 2);
+ } while (time_before(jiffies, stop_time));
+
+ /* Give up, release our claim */
+ gpio_set_value(arb->our_gpio, arb->our_gpio_release);
+ udelay(arb->slew_delay_us);
+ dev_err(&adap->dev, "Could not claim bus, timeout\n");
+ return -EBUSY;
+}
+
+/**
+ * i2c_arbitrator_deselect - release the I2C bus
+ *
+ * Release the I2C bus using the GPIO-based signalling protocol.
+ */
+static int i2c_arbitrator_deselect(struct i2c_adapter *adap, void *data,
+ u32 chan)
+{
+ const struct i2c_arbitrator_data *arb = data;
+
+ /* Release the bus and wait for the other master to notice */
+ gpio_set_value(arb->our_gpio, arb->our_gpio_release);
+ udelay(arb->slew_delay_us);
+
+ return 0;
+}
+
+static int i2c_arbitrator_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct device_node *np = dev->of_node;
+ struct device_node *parent_np;
+ struct i2c_arbitrator_data *arb;
+ enum of_gpio_flags gpio_flags;
+ unsigned long out_init;
+ int ret;
+
+ /* We only support probing from device tree; no platform_data */
+ if (!np) {
+ dev_err(dev, "Cannot find device tree node\n");
+ return -ENODEV;
+ }
+ if (dev->platform_data) {
+ dev_err(dev, "Platform data is not supported\n");
+ return -EINVAL;
+ }
+
+ arb = devm_kzalloc(dev, sizeof(*arb), GFP_KERNEL);
+ if (!arb) {
+ dev_err(dev, "Cannot allocate i2c_arbitrator_data\n");
+ return -ENOMEM;
+ }
+ platform_set_drvdata(pdev, arb);
+
+ /* Request GPIOs */
+ ret = of_get_named_gpio_flags(np, "our-claim-gpio", 0, &gpio_flags);
+ if (!gpio_is_valid(ret)) {
+ if (ret != -EPROBE_DEFER)
+ dev_err(dev, "Error getting our-claim-gpio\n");
+ return ret;
+ }
+ arb->our_gpio = ret;
+ arb->our_gpio_release = !!(gpio_flags & OF_GPIO_ACTIVE_LOW);
+ out_init = (gpio_flags & OF_GPIO_ACTIVE_LOW) ?
+ GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW;
+ ret = devm_gpio_request_one(dev, arb->our_gpio, out_init,
+ "our-claim-gpio");
+ if (ret) {
+ if (ret != -EPROBE_DEFER)
+ dev_err(dev, "Error requesting our-claim-gpio\n");
+ return ret;
+ }
+
+ ret = of_get_named_gpio_flags(np, "their-claim-gpios", 0, &gpio_flags);
+ if (!gpio_is_valid(ret)) {
+ if (ret != -EPROBE_DEFER)
+ dev_err(dev, "Error getting their-claim-gpio\n");
+ return ret;
+ }
+ arb->their_gpio = ret;
+ arb->their_gpio_release = !!(gpio_flags & OF_GPIO_ACTIVE_LOW);
+ ret = devm_gpio_request_one(dev, arb->their_gpio, GPIOF_IN,
+ "their-claim-gpio");
+ if (ret) {
+ if (ret != -EPROBE_DEFER)
+ dev_err(dev, "Error requesting their-claim-gpio\n");
+ return ret;
+ }
+
+ /* At the moment we only support a single two master (us + 1 other) */
+ if (gpio_is_valid(of_get_named_gpio(np, "their-claim-gpios", 1))) {
+ dev_err(dev, "Only one other master is supported\n");
+ return -EINVAL;
+ }
+
+ /* Arbitration parameters */
+ if (of_property_read_u32(np, "slew-delay-us", &arb->slew_delay_us))
+ arb->slew_delay_us = 10;
+ if (of_property_read_u32(np, "wait-retry-us", &arb->wait_retry_us))
+ arb->wait_retry_us = 3000;
+ if (of_property_read_u32(np, "wait-free-us", &arb->wait_free_us))
+ arb->wait_free_us = 50000;
+
+ /* Find our parent */
+ parent_np = of_parse_phandle(np, "i2c-parent", 0);
+ if (!parent_np) {
+ dev_err(dev, "Cannot parse i2c-parent\n");
+ return -EINVAL;
+ }
+ arb->parent = of_find_i2c_adapter_by_node(parent_np);
+ if (!arb->parent) {
+ dev_err(dev, "Cannot find parent bus\n");
+ return -EINVAL;
+ }
+
+ /* Actually add the mux adapter */
+ arb->child = i2c_add_mux_adapter(arb->parent, dev, arb, 0, 0, 0,
+ i2c_arbitrator_select,
+ i2c_arbitrator_deselect);
+ if (!arb->child) {
+ dev_err(dev, "Failed to add adapter\n");
+ ret = -ENODEV;
+ i2c_put_adapter(arb->parent);
+ }
+
+ return ret;
+}
+
+static int i2c_arbitrator_remove(struct platform_device *pdev)
+{
+ struct i2c_arbitrator_data *arb = platform_get_drvdata(pdev);
+
+ i2c_del_mux_adapter(arb->child);
+ i2c_put_adapter(arb->parent);
+
+ return 0;
+}
+
+static const struct of_device_id i2c_arbitrator_of_match[] = {
+ { .compatible = "i2c-arb-gpio-challenge", },
+ {},
+};
+MODULE_DEVICE_TABLE(of, i2c_arbitrator_of_match);
+
+static struct platform_driver i2c_arbitrator_driver = {
+ .probe = i2c_arbitrator_probe,
+ .remove = i2c_arbitrator_remove,
+ .driver = {
+ .owner = THIS_MODULE,
+ .name = "i2c-arb-gpio-challenge",
+ .of_match_table = of_match_ptr(i2c_arbitrator_of_match),
+ },
+};
+
+module_platform_driver(i2c_arbitrator_driver);
+
+MODULE_DESCRIPTION("GPIO-based I2C Arbitration");
+MODULE_AUTHOR("Doug Anderson <dianders@chromium.org>");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:i2c-arb-gpio-challenge");
diff --git a/drivers/i2c/muxes/i2c-mux-gpio.c b/drivers/i2c/muxes/i2c-mux-gpio.c
index abc2e55a..5a0ce00 100644
--- a/drivers/i2c/muxes/i2c-mux-gpio.c
+++ b/drivers/i2c/muxes/i2c-mux-gpio.c
@@ -201,10 +201,21 @@ static int i2c_mux_gpio_probe(struct platform_device *pdev)
for (i = 0; i < mux->data.n_gpios; i++) {
ret = gpio_request(gpio_base + mux->data.gpios[i], "i2c-mux-gpio");
- if (ret)
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to request GPIO %d\n",
+ mux->data.gpios[i]);
goto err_request_gpio;
- gpio_direction_output(gpio_base + mux->data.gpios[i],
- initial_state & (1 << i));
+ }
+
+ ret = gpio_direction_output(gpio_base + mux->data.gpios[i],
+ initial_state & (1 << i));
+ if (ret) {
+ dev_err(&pdev->dev,
+ "Failed to set direction of GPIO %d to output\n",
+ mux->data.gpios[i]);
+ i++; /* gpio_request above succeeded, so must free */
+ goto err_request_gpio;
+ }
}
for (i = 0; i < mux->data.n_values; i++) {
diff --git a/drivers/i2c/muxes/i2c-mux-pca954x.c b/drivers/i2c/muxes/i2c-mux-pca954x.c
index 8e43872..a531d80 100644
--- a/drivers/i2c/muxes/i2c-mux-pca954x.c
+++ b/drivers/i2c/muxes/i2c-mux-pca954x.c
@@ -262,13 +262,11 @@ static int pca954x_remove(struct i2c_client *client)
{
struct pca954x *data = i2c_get_clientdata(client);
const struct chip_desc *chip = &chips[data->type];
- int i, err;
+ int i;
for (i = 0; i < chip->nchans; ++i)
if (data->virt_adaps[i]) {
- err = i2c_del_mux_adapter(data->virt_adaps[i]);
- if (err)
- return err;
+ i2c_del_mux_adapter(data->virt_adaps[i]);
data->virt_adaps[i] = NULL;
}
diff --git a/drivers/lguest/Kconfig b/drivers/lguest/Kconfig
index 89875ea..ee035ec 100644
--- a/drivers/lguest/Kconfig
+++ b/drivers/lguest/Kconfig
@@ -5,10 +5,9 @@ config LGUEST
---help---
This is a very simple module which allows you to run
multiple instances of the same Linux kernel, using the
- "lguest" command found in the Documentation/virtual/lguest
- directory.
+ "lguest" command found in the tools/lguest directory.
Note that "lguest" is pronounced to rhyme with "fell quest",
- not "rustyvisor". See Documentation/virtual/lguest/lguest.txt.
+ not "rustyvisor". See tools/lguest/lguest.txt.
If unsure, say N. If curious, say M. If masochistic, say Y.
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index a5ebc00..0bf1e4e 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -20,9 +20,9 @@
#include <asm/asm-offsets.h>
#include "lg.h"
-
+unsigned long switcher_addr;
+struct page **lg_switcher_pages;
static struct vm_struct *switcher_vma;
-static struct page **switcher_page;
/* This One Big lock protects all inter-guest data structures. */
DEFINE_MUTEX(lguest_lock);
@@ -52,13 +52,21 @@ static __init int map_switcher(void)
* easy.
*/
+ /* We assume Switcher text fits into a single page. */
+ if (end_switcher_text - start_switcher_text > PAGE_SIZE) {
+ printk(KERN_ERR "lguest: switcher text too large (%zu)\n",
+ end_switcher_text - start_switcher_text);
+ return -EINVAL;
+ }
+
/*
* We allocate an array of struct page pointers. map_vm_area() wants
* this, rather than just an array of pages.
*/
- switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
- GFP_KERNEL);
- if (!switcher_page) {
+ lg_switcher_pages = kmalloc(sizeof(lg_switcher_pages[0])
+ * TOTAL_SWITCHER_PAGES,
+ GFP_KERNEL);
+ if (!lg_switcher_pages) {
err = -ENOMEM;
goto out;
}
@@ -68,32 +76,29 @@ static __init int map_switcher(void)
* so we make sure they're zeroed.
*/
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
- switcher_page[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
- if (!switcher_page[i]) {
+ lg_switcher_pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+ if (!lg_switcher_pages[i]) {
err = -ENOMEM;
goto free_some_pages;
}
}
/*
- * First we check that the Switcher won't overlap the fixmap area at
- * the top of memory. It's currently nowhere near, but it could have
- * very strange effects if it ever happened.
+ * We place the Switcher underneath the fixmap area, which is the
+ * highest virtual address we can get. This is important, since we
+ * tell the Guest it can't access this memory, so we want its ceiling
+ * as high as possible.
*/
- if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){
- err = -ENOMEM;
- printk("lguest: mapping switcher would thwack fixmap\n");
- goto free_pages;
- }
+ switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
/*
- * Now we reserve the "virtual memory area" we want: 0xFFC00000
- * (SWITCHER_ADDR). We might not get it in theory, but in practice
- * it's worked so far. The end address needs +1 because __get_vm_area
- * allocates an extra guard page, so we need space for that.
+ * Now we reserve the "virtual memory area" we want. We might
+ * not get it in theory, but in practice it's worked so far.
+ * The end address needs +1 because __get_vm_area allocates an
+ * extra guard page, so we need space for that.
*/
switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
- VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR
+ VM_ALLOC, switcher_addr, switcher_addr
+ (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
if (!switcher_vma) {
err = -ENOMEM;
@@ -103,12 +108,12 @@ static __init int map_switcher(void)
/*
* This code actually sets up the pages we've allocated to appear at
- * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the
+ * switcher_addr. map_vm_area() takes the vma we allocated above, the
* kind of pages we're mapping (kernel pages), and a pointer to our
* array of struct pages. It increments that pointer, but we don't
* care.
*/
- pagep = switcher_page;
+ pagep = lg_switcher_pages;
err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
if (err) {
printk("lguest: map_vm_area failed: %i\n", err);
@@ -133,8 +138,8 @@ free_pages:
i = TOTAL_SWITCHER_PAGES;
free_some_pages:
for (--i; i >= 0; i--)
- __free_pages(switcher_page[i], 0);
- kfree(switcher_page);
+ __free_pages(lg_switcher_pages[i], 0);
+ kfree(lg_switcher_pages);
out:
return err;
}
@@ -149,8 +154,8 @@ static void unmap_switcher(void)
vunmap(switcher_vma->addr);
/* Now we just need to free the pages we copied the switcher into */
for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
- __free_pages(switcher_page[i], 0);
- kfree(switcher_page);
+ __free_pages(lg_switcher_pages[i], 0);
+ kfree(lg_switcher_pages);
}
/*H:032
@@ -323,15 +328,10 @@ static int __init init(void)
if (err)
goto out;
- /* Now we set up the pagetable implementation for the Guests. */
- err = init_pagetables(switcher_page, SHARED_SWITCHER_PAGES);
- if (err)
- goto unmap;
-
/* We might need to reserve an interrupt vector. */
err = init_interrupts();
if (err)
- goto free_pgtables;
+ goto unmap;
/* /dev/lguest needs to be registered. */
err = lguest_device_init();
@@ -346,8 +346,6 @@ static int __init init(void)
free_interrupts:
free_interrupts();
-free_pgtables:
- free_pagetables();
unmap:
unmap_switcher();
out:
@@ -359,7 +357,6 @@ static void __exit fini(void)
{
lguest_device_remove();
free_interrupts();
- free_pagetables();
unmap_switcher();
lguest_arch_host_fini();
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 295df06..2eef40b 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -14,11 +14,10 @@
#include <asm/lguest.h>
-void free_pagetables(void);
-int init_pagetables(struct page **switcher_page, unsigned int pages);
-
struct pgdir {
unsigned long gpgdir;
+ bool switcher_mapped;
+ int last_host_cpu;
pgd_t *pgdir;
};
@@ -124,6 +123,7 @@ bool lguest_address_ok(const struct lguest *lg,
unsigned long addr, unsigned long len);
void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
+extern struct page **lg_switcher_pages;
/*H:035
* Using memory-copy operations like that is usually inconvient, so we
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index ff4a0bc..4263f4c 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -250,13 +250,13 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
*/
static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
{
- /* We have a limited number the number of CPUs in the lguest struct. */
+ /* We have a limited number of CPUs in the lguest struct. */
if (id >= ARRAY_SIZE(cpu->lg->cpus))
return -EINVAL;
/* Set up this CPU's id, and pointer back to the lguest struct. */
cpu->id = id;
- cpu->lg = container_of((cpu - id), struct lguest, cpus[0]);
+ cpu->lg = container_of(cpu, struct lguest, cpus[id]);
cpu->lg->nr_cpus++;
/* Each CPU has a timer it can set. */
@@ -270,7 +270,7 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
if (!cpu->regs_page)
return -ENOMEM;
- /* We actually put the registers at the bottom of the page. */
+ /* We actually put the registers at the end of the page. */
cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
/*
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index 864baab..699187a 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -7,7 +7,7 @@
* converted Guest pages when running the Guest.
:*/
-/* Copyright (C) Rusty Russell IBM Corporation 2006.
+/* Copyright (C) Rusty Russell IBM Corporation 2013.
* GPL v2 and any later version */
#include <linux/mm.h>
#include <linux/gfp.h>
@@ -62,22 +62,11 @@
* will need the last pmd entry of the last pmd page.
*/
#ifdef CONFIG_X86_PAE
-#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
-#define RESERVE_MEM 2U
#define CHECK_GPGD_MASK _PAGE_PRESENT
#else
-#define RESERVE_MEM 4U
#define CHECK_GPGD_MASK _PAGE_TABLE
#endif
-/*
- * We actually need a separate PTE page for each CPU. Remember that after the
- * Switcher code itself comes two pages for each CPU, and we don't want this
- * CPU's guest to see the pages of any other CPU.
- */
-static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
-#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
-
/*H:320
* The page table code is curly enough to need helper functions to keep it
* clear and clean. The kernel itself provides many of them; one advantage
@@ -95,13 +84,6 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
{
unsigned int index = pgd_index(vaddr);
-#ifndef CONFIG_X86_PAE
- /* We kill any Guest trying to touch the Switcher addresses. */
- if (index >= SWITCHER_PGD_INDEX) {
- kill_guest(cpu, "attempt to access switcher pages");
- index = 0;
- }
-#endif
/* Return a pointer index'th pgd entry for the i'th page table. */
return &cpu->lg->pgdirs[i].pgdir[index];
}
@@ -117,13 +99,6 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
unsigned int index = pmd_index(vaddr);
pmd_t *page;
- /* We kill any Guest trying to touch the Switcher addresses. */
- if (pgd_index(vaddr) == SWITCHER_PGD_INDEX &&
- index >= SWITCHER_PMD_INDEX) {
- kill_guest(cpu, "attempt to access switcher pages");
- index = 0;
- }
-
/* You should never call this if the PGD entry wasn't valid */
BUG_ON(!(pgd_flags(spgd) & _PAGE_PRESENT));
page = __va(pgd_pfn(spgd) << PAGE_SHIFT);
@@ -275,122 +250,177 @@ static void release_pte(pte_t pte)
}
/*:*/
-static void check_gpte(struct lg_cpu *cpu, pte_t gpte)
+static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
{
if ((pte_flags(gpte) & _PAGE_PSE) ||
- pte_pfn(gpte) >= cpu->lg->pfn_limit)
+ pte_pfn(gpte) >= cpu->lg->pfn_limit) {
kill_guest(cpu, "bad page table entry");
+ return false;
+ }
+ return true;
}
-static void check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
+static bool check_gpgd(struct lg_cpu *cpu, pgd_t gpgd)
{
if ((pgd_flags(gpgd) & ~CHECK_GPGD_MASK) ||
- (pgd_pfn(gpgd) >= cpu->lg->pfn_limit))
+ (pgd_pfn(gpgd) >= cpu->lg->pfn_limit)) {
kill_guest(cpu, "bad page directory entry");
+ return false;
+ }
+ return true;
}
#ifdef CONFIG_X86_PAE
-static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
+static bool check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
{
if ((pmd_flags(gpmd) & ~_PAGE_TABLE) ||
- (pmd_pfn(gpmd) >= cpu->lg->pfn_limit))
+ (pmd_pfn(gpmd) >= cpu->lg->pfn_limit)) {
kill_guest(cpu, "bad page middle directory entry");
+ return false;
+ }
+ return true;
}
#endif
-/*H:330
- * (i) Looking up a page table entry when the Guest faults.
- *
- * We saw this call in run_guest(): when we see a page fault in the Guest, we
- * come here. That's because we only set up the shadow page tables lazily as
- * they're needed, so we get page faults all the time and quietly fix them up
- * and return to the Guest without it knowing.
+/*H:331
+ * This is the core routine to walk the shadow page tables and find the page
+ * table entry for a specific address.
*
- * If we fixed up the fault (ie. we mapped the address), this routine returns
- * true. Otherwise, it was a real fault and we need to tell the Guest.
+ * If allocate is set, then we allocate any missing levels, setting the flags
+ * on the new page directory and mid-level directories using the arguments
+ * (which are copied from the Guest's page table entries).
*/
-bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
+static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
+ int pgd_flags, int pmd_flags)
{
- pgd_t gpgd;
pgd_t *spgd;
- unsigned long gpte_ptr;
- pte_t gpte;
- pte_t *spte;
-
/* Mid level for PAE. */
#ifdef CONFIG_X86_PAE
pmd_t *spmd;
- pmd_t gpmd;
#endif
- /* First step: get the top-level Guest page table entry. */
- if (unlikely(cpu->linear_pages)) {
- /* Faking up a linear mapping. */
- gpgd = __pgd(CHECK_GPGD_MASK);
- } else {
- gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
- /* Toplevel not present? We can't map it in. */
- if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
- return false;
- }
-
- /* Now look at the matching shadow entry. */
+ /* Get top level entry. */
spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
- unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+ unsigned long ptepage;
+
+ /* If they didn't want us to allocate anything, stop. */
+ if (!allocate)
+ return NULL;
+
+ ptepage = get_zeroed_page(GFP_KERNEL);
/*
* This is not really the Guest's fault, but killing it is
* simple for this corner case.
*/
if (!ptepage) {
kill_guest(cpu, "out of memory allocating pte page");
- return false;
+ return NULL;
}
- /* We check that the Guest pgd is OK. */
- check_gpgd(cpu, gpgd);
/*
* And we copy the flags to the shadow PGD entry. The page
* number in the shadow PGD is the page we just allocated.
*/
- set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
+ set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags));
}
+ /*
+ * Intel's Physical Address Extension actually uses three levels of
+ * page tables, so we need to look in the mid-level.
+ */
#ifdef CONFIG_X86_PAE
- if (unlikely(cpu->linear_pages)) {
- /* Faking up a linear mapping. */
- gpmd = __pmd(_PAGE_TABLE);
- } else {
- gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
- /* Middle level not present? We can't map it in. */
- if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
- return false;
- }
-
- /* Now look at the matching shadow entry. */
+ /* Now look at the mid-level shadow entry. */
spmd = spmd_addr(cpu, *spgd, vaddr);
if (!(pmd_flags(*spmd) & _PAGE_PRESENT)) {
/* No shadow entry: allocate a new shadow PTE page. */
- unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
+ unsigned long ptepage;
+
+ /* If they didn't want us to allocate anything, stop. */
+ if (!allocate)
+ return NULL;
+
+ ptepage = get_zeroed_page(GFP_KERNEL);
/*
* This is not really the Guest's fault, but killing it is
* simple for this corner case.
*/
if (!ptepage) {
- kill_guest(cpu, "out of memory allocating pte page");
- return false;
+ kill_guest(cpu, "out of memory allocating pmd page");
+ return NULL;
}
- /* We check that the Guest pmd is OK. */
- check_gpmd(cpu, gpmd);
-
/*
* And we copy the flags to the shadow PMD entry. The page
* number in the shadow PMD is the page we just allocated.
*/
- set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
+ set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags));
+ }
+#endif
+
+ /* Get the pointer to the shadow PTE entry we're going to set. */
+ return spte_addr(cpu, *spgd, vaddr);
+}
+
+/*H:330
+ * (i) Looking up a page table entry when the Guest faults.
+ *
+ * We saw this call in run_guest(): when we see a page fault in the Guest, we
+ * come here. That's because we only set up the shadow page tables lazily as
+ * they're needed, so we get page faults all the time and quietly fix them up
+ * and return to the Guest without it knowing.
+ *
+ * If we fixed up the fault (ie. we mapped the address), this routine returns
+ * true. Otherwise, it was a real fault and we need to tell the Guest.
+ */
+bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
+{
+ unsigned long gpte_ptr;
+ pte_t gpte;
+ pte_t *spte;
+ pmd_t gpmd;
+ pgd_t gpgd;
+
+ /* We never demand page the Switcher, so trying is a mistake. */
+ if (vaddr >= switcher_addr)
+ return false;
+
+ /* First step: get the top-level Guest page table entry. */
+ if (unlikely(cpu->linear_pages)) {
+ /* Faking up a linear mapping. */
+ gpgd = __pgd(CHECK_GPGD_MASK);
+ } else {
+ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
+ /* Toplevel not present? We can't map it in. */
+ if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+ return false;
+
+ /*
+ * This kills the Guest if it has weird flags or tries to
+ * refer to a "physical" address outside the bounds.
+ */
+ if (!check_gpgd(cpu, gpgd))
+ return false;
+ }
+
+ /* This "mid-level" entry is only used for non-linear, PAE mode. */
+ gpmd = __pmd(_PAGE_TABLE);
+
+#ifdef CONFIG_X86_PAE
+ if (likely(!cpu->linear_pages)) {
+ gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
+ /* Middle level not present? We can't map it in. */
+ if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+ return false;
+
+ /*
+ * This kills the Guest if it has weird flags or tries to
+ * refer to a "physical" address outside the bounds.
+ */
+ if (!check_gpmd(cpu, gpmd))
+ return false;
}
/*
@@ -433,7 +463,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
* Check that the Guest PTE flags are OK, and the page number is below
* the pfn_limit (ie. not mapping the Launcher binary).
*/
- check_gpte(cpu, gpte);
+ if (!check_gpte(cpu, gpte))
+ return false;
/* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
gpte = pte_mkyoung(gpte);
@@ -441,7 +472,9 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
gpte = pte_mkdirty(gpte);
/* Get the pointer to the shadow PTE entry we're going to set. */
- spte = spte_addr(cpu, *spgd, vaddr);
+ spte = find_spte(cpu, vaddr, true, pgd_flags(gpgd), pmd_flags(gpmd));
+ if (!spte)
+ return false;
/*
* If there was a valid shadow PTE entry here before, we release it.
@@ -493,29 +526,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
*/
static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
{
- pgd_t *spgd;
+ pte_t *spte;
unsigned long flags;
-#ifdef CONFIG_X86_PAE
- pmd_t *spmd;
-#endif
- /* Look at the current top level entry: is it present? */
- spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
- if (!(pgd_flags(*spgd) & _PAGE_PRESENT))
+ /* You can't put your stack in the Switcher! */
+ if (vaddr >= switcher_addr)
return false;
-#ifdef CONFIG_X86_PAE
- spmd = spmd_addr(cpu, *spgd, vaddr);
- if (!(pmd_flags(*spmd) & _PAGE_PRESENT))
+ /* If there's no shadow PTE, it's not writable. */
+ spte = find_spte(cpu, vaddr, false, 0, 0);
+ if (!spte)
return false;
-#endif
/*
* Check the flags on the pte entry itself: it must be present and
* writable.
*/
- flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
-
+ flags = pte_flags(*spte);
return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
}
@@ -678,9 +705,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
int *blank_pgdir)
{
unsigned int next;
-#ifdef CONFIG_X86_PAE
- pmd_t *pmd_table;
-#endif
/*
* We pick one entry at random to throw out. Choosing the Least
@@ -695,29 +719,11 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
if (!cpu->lg->pgdirs[next].pgdir)
next = cpu->cpu_pgd;
else {
-#ifdef CONFIG_X86_PAE
/*
- * In PAE mode, allocate a pmd page and populate the
- * last pgd entry.
+ * This is a blank page, so there are no kernel
+ * mappings: caller must map the stack!
*/
- pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
- if (!pmd_table) {
- free_page((long)cpu->lg->pgdirs[next].pgdir);
- set_pgd(cpu->lg->pgdirs[next].pgdir, __pgd(0));
- next = cpu->cpu_pgd;
- } else {
- set_pgd(cpu->lg->pgdirs[next].pgdir +
- SWITCHER_PGD_INDEX,
- __pgd(__pa(pmd_table) | _PAGE_PRESENT));
- /*
- * This is a blank page, so there are no kernel
- * mappings: caller must map the stack!
- */
- *blank_pgdir = 1;
- }
-#else
*blank_pgdir = 1;
-#endif
}
}
/* Record which Guest toplevel this shadows. */
@@ -725,9 +731,50 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
/* Release all the non-kernel mappings. */
flush_user_mappings(cpu->lg, next);
+ /* This hasn't run on any CPU at all. */
+ cpu->lg->pgdirs[next].last_host_cpu = -1;
+
return next;
}
+/*H:501
+ * We do need the Switcher code mapped at all times, so we allocate that
+ * part of the Guest page table here. We map the Switcher code immediately,
+ * but defer mapping of the guest register page and IDT/LDT etc page until
+ * just before we run the guest in map_switcher_in_guest().
+ *
+ * We *could* do this setup in map_switcher_in_guest(), but at that point
+ * we've interrupts disabled, and allocating pages like that is fraught: we
+ * can't sleep if we need to free up some memory.
+ */
+static bool allocate_switcher_mapping(struct lg_cpu *cpu)
+{
+ int i;
+
+ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
+ pte_t *pte = find_spte(cpu, switcher_addr + i * PAGE_SIZE, true,
+ CHECK_GPGD_MASK, _PAGE_TABLE);
+ if (!pte)
+ return false;
+
+ /*
+ * Map the switcher page if not already there. It might
+ * already be there because we call allocate_switcher_mapping()
+ * in guest_set_pgd() just in case it did discard our Switcher
+ * mapping, but it probably didn't.
+ */
+ if (i == 0 && !(pte_flags(*pte) & _PAGE_PRESENT)) {
+ /* Get a reference to the Switcher page. */
+ get_page(lg_switcher_pages[0]);
+ /* Create a read-only, exectuable, kernel-style PTE */
+ set_pte(pte,
+ mk_pte(lg_switcher_pages[0], PAGE_KERNEL_RX));
+ }
+ }
+ cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped = true;
+ return true;
+}
+
/*H:470
* Finally, a routine which throws away everything: all PGD entries in all
* the shadow page tables, including the Guest's kernel mappings. This is used
@@ -738,28 +785,16 @@ static void release_all_pagetables(struct lguest *lg)
unsigned int i, j;
/* Every shadow pagetable this Guest has */
- for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
- if (lg->pgdirs[i].pgdir) {
-#ifdef CONFIG_X86_PAE
- pgd_t *spgd;
- pmd_t *pmdpage;
- unsigned int k;
-
- /* Get the last pmd page. */
- spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
- pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
-
- /*
- * And release the pmd entries of that pmd page,
- * except for the switcher pmd.
- */
- for (k = 0; k < SWITCHER_PMD_INDEX; k++)
- release_pmd(&pmdpage[k]);
-#endif
- /* Every PGD entry except the Switcher at the top */
- for (j = 0; j < SWITCHER_PGD_INDEX; j++)
- release_pgd(lg->pgdirs[i].pgdir + j);
- }
+ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++) {
+ if (!lg->pgdirs[i].pgdir)
+ continue;
+
+ /* Every PGD entry. */
+ for (j = 0; j < PTRS_PER_PGD; j++)
+ release_pgd(lg->pgdirs[i].pgdir + j);
+ lg->pgdirs[i].switcher_mapped = false;
+ lg->pgdirs[i].last_host_cpu = -1;
+ }
}
/*
@@ -773,6 +808,9 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
release_all_pagetables(cpu->lg);
/* We need the Guest kernel stack mapped again. */
pin_stack_pages(cpu);
+ /* And we need Switcher allocated. */
+ if (!allocate_switcher_mapping(cpu))
+ kill_guest(cpu, "Cannot populate switcher mapping");
}
/*H:430
@@ -808,9 +846,17 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
newpgdir = new_pgdir(cpu, pgtable, &repin);
/* Change the current pgd index to the new one. */
cpu->cpu_pgd = newpgdir;
- /* If it was completely blank, we map in the Guest kernel stack */
+ /*
+ * If it was completely blank, we map in the Guest kernel stack and
+ * the Switcher.
+ */
if (repin)
pin_stack_pages(cpu);
+
+ if (!cpu->lg->pgdirs[cpu->cpu_pgd].switcher_mapped) {
+ if (!allocate_switcher_mapping(cpu))
+ kill_guest(cpu, "Cannot populate switcher mapping");
+ }
}
/*:*/
@@ -865,7 +911,8 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
* micro-benchmark.
*/
if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
- check_gpte(cpu, gpte);
+ if (!check_gpte(cpu, gpte))
+ return;
set_pte(spte,
gpte_to_spte(cpu, gpte,
pte_flags(gpte) & _PAGE_DIRTY));
@@ -897,6 +944,12 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
void guest_set_pte(struct lg_cpu *cpu,
unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
{
+ /* We don't let you remap the Switcher; we need it to get back! */
+ if (vaddr >= switcher_addr) {
+ kill_guest(cpu, "attempt to set pte into Switcher pages");
+ return;
+ }
+
/*
* Kernel mappings must be changed on all top levels. Slow, but doesn't
* happen often.
@@ -933,14 +986,23 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
{
int pgdir;
- if (idx >= SWITCHER_PGD_INDEX)
+ if (idx > PTRS_PER_PGD) {
+ kill_guest(&lg->cpus[0], "Attempt to set pgd %u/%u",
+ idx, PTRS_PER_PGD);
return;
+ }
/* If they're talking about a page table we have a shadow for... */
pgdir = find_pgdir(lg, gpgdir);
- if (pgdir < ARRAY_SIZE(lg->pgdirs))
+ if (pgdir < ARRAY_SIZE(lg->pgdirs)) {
/* ... throw it away. */
release_pgd(lg->pgdirs[pgdir].pgdir + idx);
+ /* That might have been the Switcher mapping, remap it. */
+ if (!allocate_switcher_mapping(&lg->cpus[0])) {
+ kill_guest(&lg->cpus[0],
+ "Cannot populate switcher mapping");
+ }
+ }
}
#ifdef CONFIG_X86_PAE
@@ -958,6 +1020,9 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
* we will populate on future faults. The Guest doesn't have any actual
* pagetables yet, so we set linear_pages to tell demand_page() to fake it
* for the moment.
+ *
+ * We do need the Switcher to be mapped at all times, so we allocate that
+ * part of the Guest page table here.
*/
int init_guest_pagetable(struct lguest *lg)
{
@@ -971,21 +1036,34 @@ int init_guest_pagetable(struct lguest *lg)
/* We start with a linear mapping until the initialize. */
cpu->linear_pages = true;
+
+ /* Allocate the page tables for the Switcher. */
+ if (!allocate_switcher_mapping(cpu)) {
+ release_all_pagetables(lg);
+ return -ENOMEM;
+ }
+
return 0;
}
/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
void page_table_guest_data_init(struct lg_cpu *cpu)
{
+ /*
+ * We tell the Guest that it can't use the virtual addresses
+ * used by the Switcher. This trick is equivalent to 4GB -
+ * switcher_addr.
+ */
+ u32 top = ~switcher_addr + 1;
+
/* We get the kernel address: above this is all kernel memory. */
if (get_user(cpu->lg->kernel_address,
- &cpu->lg->lguest_data->kernel_address)
+ &cpu->lg->lguest_data->kernel_address)
/*
- * We tell the Guest that it can't use the top 2 or 4 MB
- * of virtual addresses used by the Switcher.
+ * We tell the Guest that it can't use the top virtual
+ * addresses (used by the Switcher).
*/
- || put_user(RESERVE_MEM * 1024 * 1024,
- &cpu->lg->lguest_data->reserve_mem)) {
+ || put_user(top, &cpu->lg->lguest_data->reserve_mem)) {
kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
return;
}
@@ -995,12 +1073,7 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
* "pgd_index(lg->kernel_address)". This assumes it won't hit the
* Switcher mappings, so check that now.
*/
-#ifdef CONFIG_X86_PAE
- if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
- pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
-#else
- if (pgd_index(cpu->lg->kernel_address) >= SWITCHER_PGD_INDEX)
-#endif
+ if (cpu->lg->kernel_address >= switcher_addr)
kill_guest(cpu, "bad kernel address %#lx",
cpu->lg->kernel_address);
}
@@ -1017,102 +1090,96 @@ void free_guest_pagetable(struct lguest *lg)
free_page((long)lg->pgdirs[i].pgdir);
}
-/*H:480
- * (vi) Mapping the Switcher when the Guest is about to run.
- *
- * The Switcher and the two pages for this CPU need to be visible in the
- * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
- * for each CPU already set up, we just need to hook them in now we know which
- * Guest is about to run on this CPU.
+/*H:481
+ * This clears the Switcher mappings for cpu #i.
*/
-void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
+static void remove_switcher_percpu_map(struct lg_cpu *cpu, unsigned int i)
{
- pte_t *switcher_pte_page = __this_cpu_read(switcher_pte_pages);
- pte_t regs_pte;
+ unsigned long base = switcher_addr + PAGE_SIZE + i * PAGE_SIZE*2;
+ pte_t *pte;
-#ifdef CONFIG_X86_PAE
- pmd_t switcher_pmd;
- pmd_t *pmd_table;
-
- switcher_pmd = pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT,
- PAGE_KERNEL_EXEC);
-
- /* Figure out where the pmd page is, by reading the PGD, and converting
- * it to a virtual address. */
- pmd_table = __va(pgd_pfn(cpu->lg->
- pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
- << PAGE_SHIFT);
- /* Now write it into the shadow page table. */
- set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
-#else
- pgd_t switcher_pgd;
+ /* Clear the mappings for both pages. */
+ pte = find_spte(cpu, base, false, 0, 0);
+ release_pte(*pte);
+ set_pte(pte, __pte(0));
- /*
- * Make the last PGD entry for this Guest point to the Switcher's PTE
- * page for this CPU (with appropriate flags).
- */
- switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
-
- cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
-
-#endif
- /*
- * We also change the Switcher PTE page. When we're running the Guest,
- * we want the Guest's "regs" page to appear where the first Switcher
- * page for this CPU is. This is an optimization: when the Switcher
- * saves the Guest registers, it saves them into the first page of this
- * CPU's "struct lguest_pages": if we make sure the Guest's register
- * page is already mapped there, we don't have to copy them out
- * again.
- */
- regs_pte = pfn_pte(__pa(cpu->regs_page) >> PAGE_SHIFT, PAGE_KERNEL);
- set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], regs_pte);
+ pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
+ release_pte(*pte);
+ set_pte(pte, __pte(0));
}
-/*:*/
-static void free_switcher_pte_pages(void)
-{
- unsigned int i;
-
- for_each_possible_cpu(i)
- free_page((long)switcher_pte_page(i));
-}
-
-/*H:520
- * Setting up the Switcher PTE page for given CPU is fairly easy, given
- * the CPU number and the "struct page"s for the Switcher code itself.
+/*H:480
+ * (vi) Mapping the Switcher when the Guest is about to run.
+ *
+ * The Switcher and the two pages for this CPU need to be visible in the Guest
+ * (and not the pages for other CPUs).
*
- * Currently the Switcher is less than a page long, so "pages" is always 1.
+ * The pages for the pagetables have all been allocated before: we just need
+ * to make sure the actual PTEs are up-to-date for the CPU we're about to run
+ * on.
*/
-static __init void populate_switcher_pte_page(unsigned int cpu,
- struct page *switcher_page[],
- unsigned int pages)
+void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
{
- unsigned int i;
- pte_t *pte = switcher_pte_page(cpu);
+ unsigned long base;
+ struct page *percpu_switcher_page, *regs_page;
+ pte_t *pte;
+ struct pgdir *pgdir = &cpu->lg->pgdirs[cpu->cpu_pgd];
+
+ /* Switcher page should always be mapped by now! */
+ BUG_ON(!pgdir->switcher_mapped);
+
+ /*
+ * Remember that we have two pages for each Host CPU, so we can run a
+ * Guest on each CPU without them interfering. We need to make sure
+ * those pages are mapped correctly in the Guest, but since we usually
+ * run on the same CPU, we cache that, and only update the mappings
+ * when we move.
+ */
+ if (pgdir->last_host_cpu == raw_smp_processor_id())
+ return;
- /* The first entries are easy: they map the Switcher code. */
- for (i = 0; i < pages; i++) {
- set_pte(&pte[i], mk_pte(switcher_page[i],
- __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
+ /* -1 means unknown so we remove everything. */
+ if (pgdir->last_host_cpu == -1) {
+ unsigned int i;
+ for_each_possible_cpu(i)
+ remove_switcher_percpu_map(cpu, i);
+ } else {
+ /* We know exactly what CPU mapping to remove. */
+ remove_switcher_percpu_map(cpu, pgdir->last_host_cpu);
}
- /* The only other thing we map is this CPU's pair of pages. */
- i = pages + cpu*2;
-
- /* First page (Guest registers) is writable from the Guest */
- set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
- __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
+ /*
+ * When we're running the Guest, we want the Guest's "regs" page to
+ * appear where the first Switcher page for this CPU is. This is an
+ * optimization: when the Switcher saves the Guest registers, it saves
+ * them into the first page of this CPU's "struct lguest_pages": if we
+ * make sure the Guest's register page is already mapped there, we
+ * don't have to copy them out again.
+ */
+ /* Find the shadow PTE for this regs page. */
+ base = switcher_addr + PAGE_SIZE
+ + raw_smp_processor_id() * sizeof(struct lguest_pages);
+ pte = find_spte(cpu, base, false, 0, 0);
+ regs_page = pfn_to_page(__pa(cpu->regs_page) >> PAGE_SHIFT);
+ get_page(regs_page);
+ set_pte(pte, mk_pte(regs_page, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL)));
/*
- * The second page contains the "struct lguest_ro_state", and is
- * read-only.
+ * We map the second page of the struct lguest_pages read-only in
+ * the Guest: the IDT, GDT and other things it's not supposed to
+ * change.
*/
- set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
- __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
+ pte = find_spte(cpu, base + PAGE_SIZE, false, 0, 0);
+ percpu_switcher_page
+ = lg_switcher_pages[1 + raw_smp_processor_id()*2 + 1];
+ get_page(percpu_switcher_page);
+ set_pte(pte, mk_pte(percpu_switcher_page,
+ __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)));
+
+ pgdir->last_host_cpu = raw_smp_processor_id();
}
-/*
+/*H:490
* We've made it through the page table code. Perhaps our tired brains are
* still processing the details, or perhaps we're simply glad it's over.
*
@@ -1124,29 +1191,3 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
*
* There is just one file remaining in the Host.
*/
-
-/*H:510
- * At boot or module load time, init_pagetables() allocates and populates
- * the Switcher PTE page for each CPU.
- */
-__init int init_pagetables(struct page **switcher_page, unsigned int pages)
-{
- unsigned int i;
-
- for_each_possible_cpu(i) {
- switcher_pte_page(i) = (pte_t *)get_zeroed_page(GFP_KERNEL);
- if (!switcher_pte_page(i)) {
- free_switcher_pte_pages();
- return -ENOMEM;
- }
- populate_switcher_pte_page(i, switcher_page, pages);
- }
- return 0;
-}
-/*:*/
-
-/* Cleaning up simply involves freeing the PTE page for each CPU. */
-void free_pagetables(void)
-{
- free_switcher_pte_pages();
-}
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 4af12e1..f0a3347 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -59,14 +59,13 @@ static struct {
/* Offset from where switcher.S was compiled to where we've copied it */
static unsigned long switcher_offset(void)
{
- return SWITCHER_ADDR - (unsigned long)start_switcher_text;
+ return switcher_addr - (unsigned long)start_switcher_text;
}
-/* This cpu's struct lguest_pages. */
+/* This cpu's struct lguest_pages (after the Switcher text page) */
static struct lguest_pages *lguest_pages(unsigned int cpu)
{
- return &(((struct lguest_pages *)
- (SWITCHER_ADDR + SHARED_SWITCHER_PAGES*PAGE_SIZE))[cpu]);
+ return &(((struct lguest_pages *)(switcher_addr + PAGE_SIZE))[cpu]);
}
static DEFINE_PER_CPU(struct lg_cpu *, lg_last_cpu);
diff --git a/drivers/media/pci/bt8xx/bttv-i2c.c b/drivers/media/pci/bt8xx/bttv-i2c.c
index b7c52dc..d43911d 100644
--- a/drivers/media/pci/bt8xx/bttv-i2c.c
+++ b/drivers/media/pci/bt8xx/bttv-i2c.c
@@ -397,8 +397,8 @@ int init_bttv_i2c(struct bttv *btv)
int fini_bttv_i2c(struct bttv *btv)
{
- if (0 != btv->i2c_rc)
- return 0;
+ if (btv->i2c_rc == 0)
+ i2c_del_adapter(&btv->c.i2c_adap);
- return i2c_del_adapter(&btv->c.i2c_adap);
+ return 0;
}
diff --git a/drivers/media/pci/mantis/mantis_i2c.c b/drivers/media/pci/mantis/mantis_i2c.c
index 937fb9d..895ddba 100644
--- a/drivers/media/pci/mantis/mantis_i2c.c
+++ b/drivers/media/pci/mantis/mantis_i2c.c
@@ -261,6 +261,8 @@ int mantis_i2c_exit(struct mantis_pci *mantis)
mmwrite((intmask & ~MANTIS_INT_I2CDONE), MANTIS_INT_MASK);
dprintk(MANTIS_DEBUG, 1, "Removing I2C adapter");
- return i2c_del_adapter(&mantis->adapter);
+ i2c_del_adapter(&mantis->adapter);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(mantis_i2c_exit);
diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig
index a966128..7ffc756 100644
--- a/drivers/net/caif/Kconfig
+++ b/drivers/net/caif/Kconfig
@@ -40,3 +40,17 @@ config CAIF_HSI
The caif low level driver for CAIF over HSI.
Be aware that if you enable this then you also need to
enable a low-level HSI driver.
+
+config CAIF_VIRTIO
+ tristate "CAIF virtio transport driver"
+ depends on CAIF
+ select VHOST_RING
+ select VIRTIO
+ select GENERIC_ALLOCATOR
+ default n
+ ---help---
+ The caif driver for CAIF over Virtio.
+
+if CAIF_VIRTIO
+source "drivers/vhost/Kconfig"
+endif
diff --git a/drivers/net/caif/Makefile b/drivers/net/caif/Makefile
index 15a9d2f..9bbd453 100644
--- a/drivers/net/caif/Makefile
+++ b/drivers/net/caif/Makefile
@@ -9,3 +9,6 @@ obj-$(CONFIG_CAIF_SPI_SLAVE) += cfspi_slave.o
# HSI interface
obj-$(CONFIG_CAIF_HSI) += caif_hsi.o
+
+# Virtio interface
+obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o
diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c
new file mode 100644
index 0000000..b9ed128
--- /dev/null
+++ b/drivers/net/caif/caif_virtio.c
@@ -0,0 +1,790 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2013
+ * Authors: Vicram Arv
+ * Dmitry Tarnyagin <dmitry.tarnyagin@lockless.no>
+ * Sjur Brendeland
+ * License terms: GNU General Public License (GPL) version 2
+ */
+#include <linux/module.h>
+#include <linux/if_arp.h>
+#include <linux/virtio.h>
+#include <linux/vringh.h>
+#include <linux/debugfs.h>
+#include <linux/spinlock.h>
+#include <linux/genalloc.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/virtio_ids.h>
+#include <linux/virtio_caif.h>
+#include <linux/virtio_ring.h>
+#include <linux/dma-mapping.h>
+#include <net/caif/caif_dev.h>
+#include <linux/virtio_config.h>
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Vicram Arv");
+MODULE_AUTHOR("Sjur Brendeland");
+MODULE_DESCRIPTION("Virtio CAIF Driver");
+
+/* NAPI schedule quota */
+#define CFV_DEFAULT_QUOTA 32
+
+/* Defaults used if virtio config space is unavailable */
+#define CFV_DEF_MTU_SIZE 4096
+#define CFV_DEF_HEADROOM 32
+#define CFV_DEF_TAILROOM 32
+
+/* Required IP header alignment */
+#define IP_HDR_ALIGN 4
+
+/* struct cfv_napi_contxt - NAPI context info
+ * @riov: IOV holding data read from the ring. Note that riov may
+ * still hold data when cfv_rx_poll() returns.
+ * @head: Last descriptor ID we received from vringh_getdesc_kern.
+ * We use this to put descriptor back on the used ring. USHRT_MAX is
+ * used to indicate invalid head-id.
+ */
+struct cfv_napi_context {
+ struct vringh_kiov riov;
+ unsigned short head;
+};
+
+/* struct cfv_stats - statistics for debugfs
+ * @rx_napi_complete: Number of NAPI completions (RX)
+ * @rx_napi_resched: Number of calls where the full quota was used (RX)
+ * @rx_nomem: Number of SKB alloc failures (RX)
+ * @rx_kicks: Number of RX kicks
+ * @tx_full_ring: Number times TX ring was full
+ * @tx_no_mem: Number of times TX went out of memory
+ * @tx_flow_on: Number of flow on (TX)
+ * @tx_kicks: Number of TX kicks
+ */
+struct cfv_stats {
+ u32 rx_napi_complete;
+ u32 rx_napi_resched;
+ u32 rx_nomem;
+ u32 rx_kicks;
+ u32 tx_full_ring;
+ u32 tx_no_mem;
+ u32 tx_flow_on;
+ u32 tx_kicks;
+};
+
+/* struct cfv_info - Caif Virtio control structure
+ * @cfdev: caif common header
+ * @vdev: Associated virtio device
+ * @vr_rx: rx/downlink host vring
+ * @vq_tx: tx/uplink virtqueue
+ * @ndev: CAIF link layer device
+ * @watermark_tx: indicates number of free descriptors we need
+ * to reopen the tx-queues after overload.
+ * @tx_lock: protects vq_tx from concurrent use
+ * @tx_release_tasklet: Tasklet for freeing consumed TX buffers
+ * @napi: Napi context used in cfv_rx_poll()
+ * @ctx: Context data used in cfv_rx_poll()
+ * @tx_hr: transmit headroom
+ * @rx_hr: receive headroom
+ * @tx_tr: transmit tail room
+ * @rx_tr: receive tail room
+ * @mtu: transmit max size
+ * @mru: receive max size
+ * @allocsz: size of dma memory reserved for TX buffers
+ * @alloc_addr: virtual address to dma memory for TX buffers
+ * @alloc_dma: dma address to dma memory for TX buffers
+ * @genpool: Gen Pool used for allocating TX buffers
+ * @reserved_mem: Pointer to memory reserve allocated from genpool
+ * @reserved_size: Size of memory reserve allocated from genpool
+ * @stats: Statistics exposed in sysfs
+ * @debugfs: Debugfs dentry for statistic counters
+ */
+struct cfv_info {
+ struct caif_dev_common cfdev;
+ struct virtio_device *vdev;
+ struct vringh *vr_rx;
+ struct virtqueue *vq_tx;
+ struct net_device *ndev;
+ unsigned int watermark_tx;
+ /* Protect access to vq_tx */
+ spinlock_t tx_lock;
+ struct tasklet_struct tx_release_tasklet;
+ struct napi_struct napi;
+ struct cfv_napi_context ctx;
+ u16 tx_hr;
+ u16 rx_hr;
+ u16 tx_tr;
+ u16 rx_tr;
+ u32 mtu;
+ u32 mru;
+ size_t allocsz;
+ void *alloc_addr;
+ dma_addr_t alloc_dma;
+ struct gen_pool *genpool;
+ unsigned long reserved_mem;
+ size_t reserved_size;
+ struct cfv_stats stats;
+ struct dentry *debugfs;
+};
+
+/* struct buf_info - maintains transmit buffer data handle
+ * @size: size of transmit buffer
+ * @dma_handle: handle to allocated dma device memory area
+ * @vaddr: virtual address mapping to allocated memory area
+ */
+struct buf_info {
+ size_t size;
+ u8 *vaddr;
+};
+
+/* Called from virtio device, in IRQ context */
+static void cfv_release_cb(struct virtqueue *vq_tx)
+{
+ struct cfv_info *cfv = vq_tx->vdev->priv;
+
+ ++cfv->stats.tx_kicks;
+ tasklet_schedule(&cfv->tx_release_tasklet);
+}
+
+static void free_buf_info(struct cfv_info *cfv, struct buf_info *buf_info)
+{
+ if (!buf_info)
+ return;
+ gen_pool_free(cfv->genpool, (unsigned long) buf_info->vaddr,
+ buf_info->size);
+ kfree(buf_info);
+}
+
+/* This is invoked whenever the remote processor completed processing
+ * a TX msg we just sent, and the buffer is put back to the used ring.
+ */
+static void cfv_release_used_buf(struct virtqueue *vq_tx)
+{
+ struct cfv_info *cfv = vq_tx->vdev->priv;
+ unsigned long flags;
+
+ BUG_ON(vq_tx != cfv->vq_tx);
+
+ for (;;) {
+ unsigned int len;
+ struct buf_info *buf_info;
+
+ /* Get used buffer from used ring to recycle used descriptors */
+ spin_lock_irqsave(&cfv->tx_lock, flags);
+ buf_info = virtqueue_get_buf(vq_tx, &len);
+ spin_unlock_irqrestore(&cfv->tx_lock, flags);
+
+ /* Stop looping if there are no more buffers to free */
+ if (!buf_info)
+ break;
+
+ free_buf_info(cfv, buf_info);
+
+ /* watermark_tx indicates if we previously stopped the tx
+ * queues. If we have enough free stots in the virtio ring,
+ * re-establish memory reserved and open up tx queues.
+ */
+ if (cfv->vq_tx->num_free <= cfv->watermark_tx)
+ continue;
+
+ /* Re-establish memory reserve */
+ if (cfv->reserved_mem == 0 && cfv->genpool)
+ cfv->reserved_mem =
+ gen_pool_alloc(cfv->genpool,
+ cfv->reserved_size);
+
+ /* Open up the tx queues */
+ if (cfv->reserved_mem) {
+ cfv->watermark_tx =
+ virtqueue_get_vring_size(cfv->vq_tx);
+ netif_tx_wake_all_queues(cfv->ndev);
+ /* Buffers are recycled in cfv_netdev_tx, so
+ * disable notifications when queues are opened.
+ */
+ virtqueue_disable_cb(cfv->vq_tx);
+ ++cfv->stats.tx_flow_on;
+ } else {
+ /* if no memory reserve, wait for more free slots */
+ WARN_ON(cfv->watermark_tx >
+ virtqueue_get_vring_size(cfv->vq_tx));
+ cfv->watermark_tx +=
+ virtqueue_get_vring_size(cfv->vq_tx) / 4;
+ }
+ }
+}
+
+/* Allocate a SKB and copy packet data to it */
+static struct sk_buff *cfv_alloc_and_copy_skb(int *err,
+ struct cfv_info *cfv,
+ u8 *frm, u32 frm_len)
+{
+ struct sk_buff *skb;
+ u32 cfpkt_len, pad_len;
+
+ *err = 0;
+ /* Verify that packet size with down-link header and mtu size */
+ if (frm_len > cfv->mru || frm_len <= cfv->rx_hr + cfv->rx_tr) {
+ netdev_err(cfv->ndev,
+ "Invalid frmlen:%u mtu:%u hr:%d tr:%d\n",
+ frm_len, cfv->mru, cfv->rx_hr,
+ cfv->rx_tr);
+ *err = -EPROTO;
+ return NULL;
+ }
+
+ cfpkt_len = frm_len - (cfv->rx_hr + cfv->rx_tr);
+ pad_len = (unsigned long)(frm + cfv->rx_hr) & (IP_HDR_ALIGN - 1);
+
+ skb = netdev_alloc_skb(cfv->ndev, frm_len + pad_len);
+ if (!skb) {
+ *err = -ENOMEM;
+ return NULL;
+ }
+
+ skb_reserve(skb, cfv->rx_hr + pad_len);
+
+ memcpy(skb_put(skb, cfpkt_len), frm + cfv->rx_hr, cfpkt_len);
+ return skb;
+}
+
+/* Get packets from the host vring */
+static int cfv_rx_poll(struct napi_struct *napi, int quota)
+{
+ struct cfv_info *cfv = container_of(napi, struct cfv_info, napi);
+ int rxcnt = 0;
+ int err = 0;
+ void *buf;
+ struct sk_buff *skb;
+ struct vringh_kiov *riov = &cfv->ctx.riov;
+ unsigned int skb_len;
+
+again:
+ do {
+ skb = NULL;
+
+ /* Put the previous iovec back on the used ring and
+ * fetch a new iovec if we have processed all elements.
+ */
+ if (riov->i == riov->used) {
+ if (cfv->ctx.head != USHRT_MAX) {
+ vringh_complete_kern(cfv->vr_rx,
+ cfv->ctx.head,
+ 0);
+ cfv->ctx.head = USHRT_MAX;
+ }
+
+ err = vringh_getdesc_kern(
+ cfv->vr_rx,
+ riov,
+ NULL,
+ &cfv->ctx.head,
+ GFP_ATOMIC);
+
+ if (err <= 0)
+ goto exit;
+ }
+
+ buf = phys_to_virt((unsigned long) riov->iov[riov->i].iov_base);
+ /* TODO: Add check on valid buffer address */
+
+ skb = cfv_alloc_and_copy_skb(&err, cfv, buf,
+ riov->iov[riov->i].iov_len);
+ if (unlikely(err))
+ goto exit;
+
+ /* Push received packet up the stack. */
+ skb_len = skb->len;
+ skb->protocol = htons(ETH_P_CAIF);
+ skb_reset_mac_header(skb);
+ skb->dev = cfv->ndev;
+ err = netif_receive_skb(skb);
+ if (unlikely(err)) {
+ ++cfv->ndev->stats.rx_dropped;
+ } else {
+ ++cfv->ndev->stats.rx_packets;
+ cfv->ndev->stats.rx_bytes += skb_len;
+ }
+
+ ++riov->i;
+ ++rxcnt;
+ } while (rxcnt < quota);
+
+ ++cfv->stats.rx_napi_resched;
+ goto out;
+
+exit:
+ switch (err) {
+ case 0:
+ ++cfv->stats.rx_napi_complete;
+
+ /* Really out of patckets? (stolen from virtio_net)*/
+ napi_complete(napi);
+ if (unlikely(!vringh_notify_enable_kern(cfv->vr_rx)) &&
+ napi_schedule_prep(napi)) {
+ vringh_notify_disable_kern(cfv->vr_rx);
+ __napi_schedule(napi);
+ goto again;
+ }
+ break;
+
+ case -ENOMEM:
+ ++cfv->stats.rx_nomem;
+ dev_kfree_skb(skb);
+ /* Stop NAPI poll on OOM, we hope to be polled later */
+ napi_complete(napi);
+ vringh_notify_enable_kern(cfv->vr_rx);
+ break;
+
+ default:
+ /* We're doomed, any modem fault is fatal */
+ netdev_warn(cfv->ndev, "Bad ring, disable device\n");
+ cfv->ndev->stats.rx_dropped = riov->used - riov->i;
+ napi_complete(napi);
+ vringh_notify_disable_kern(cfv->vr_rx);
+ netif_carrier_off(cfv->ndev);
+ break;
+ }
+out:
+ if (rxcnt && vringh_need_notify_kern(cfv->vr_rx) > 0)
+ vringh_notify(cfv->vr_rx);
+ return rxcnt;
+}
+
+static void cfv_recv(struct virtio_device *vdev, struct vringh *vr_rx)
+{
+ struct cfv_info *cfv = vdev->priv;
+
+ ++cfv->stats.rx_kicks;
+ vringh_notify_disable_kern(cfv->vr_rx);
+ napi_schedule(&cfv->napi);
+}
+
+static void cfv_destroy_genpool(struct cfv_info *cfv)
+{
+ if (cfv->alloc_addr)
+ dma_free_coherent(cfv->vdev->dev.parent->parent,
+ cfv->allocsz, cfv->alloc_addr,
+ cfv->alloc_dma);
+
+ if (!cfv->genpool)
+ return;
+ gen_pool_free(cfv->genpool, cfv->reserved_mem,
+ cfv->reserved_size);
+ gen_pool_destroy(cfv->genpool);
+ cfv->genpool = NULL;
+}
+
+static int cfv_create_genpool(struct cfv_info *cfv)
+{
+ int err;
+
+ /* dma_alloc can only allocate whole pages, and we need a more
+ * fine graned allocation so we use genpool. We ask for space needed
+ * by IP and a full ring. If the dma allcoation fails we retry with a
+ * smaller allocation size.
+ */
+ err = -ENOMEM;
+ cfv->allocsz = (virtqueue_get_vring_size(cfv->vq_tx) *
+ (ETH_DATA_LEN + cfv->tx_hr + cfv->tx_tr) * 11)/10;
+ if (cfv->allocsz <= (num_possible_cpus() + 1) * cfv->ndev->mtu)
+ return -EINVAL;
+
+ for (;;) {
+ if (cfv->allocsz <= num_possible_cpus() * cfv->ndev->mtu) {
+ netdev_info(cfv->ndev, "Not enough device memory\n");
+ return -ENOMEM;
+ }
+
+ cfv->alloc_addr = dma_alloc_coherent(
+ cfv->vdev->dev.parent->parent,
+ cfv->allocsz, &cfv->alloc_dma,
+ GFP_ATOMIC);
+ if (cfv->alloc_addr)
+ break;
+
+ cfv->allocsz = (cfv->allocsz * 3) >> 2;
+ }
+
+ netdev_dbg(cfv->ndev, "Allocated %zd bytes from dma-memory\n",
+ cfv->allocsz);
+
+ /* Allocate on 128 bytes boundaries (1 << 7)*/
+ cfv->genpool = gen_pool_create(7, -1);
+ if (!cfv->genpool)
+ goto err;
+
+ err = gen_pool_add_virt(cfv->genpool, (unsigned long)cfv->alloc_addr,
+ (phys_addr_t)virt_to_phys(cfv->alloc_addr),
+ cfv->allocsz, -1);
+ if (err)
+ goto err;
+
+ /* Reserve some memory for low memory situations. If we hit the roof
+ * in the memory pool, we stop TX flow and release the reserve.
+ */
+ cfv->reserved_size = num_possible_cpus() * cfv->ndev->mtu;
+ cfv->reserved_mem = gen_pool_alloc(cfv->genpool,
+ cfv->reserved_size);
+ if (!cfv->reserved_mem) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx);
+ return 0;
+err:
+ cfv_destroy_genpool(cfv);
+ return err;
+}
+
+/* Enable the CAIF interface and allocate the memory-pool */
+static int cfv_netdev_open(struct net_device *netdev)
+{
+ struct cfv_info *cfv = netdev_priv(netdev);
+
+ if (cfv_create_genpool(cfv))
+ return -ENOMEM;
+
+ netif_carrier_on(netdev);
+ napi_enable(&cfv->napi);
+
+ /* Schedule NAPI to read any pending packets */
+ napi_schedule(&cfv->napi);
+ return 0;
+}
+
+/* Disable the CAIF interface and free the memory-pool */
+static int cfv_netdev_close(struct net_device *netdev)
+{
+ struct cfv_info *cfv = netdev_priv(netdev);
+ unsigned long flags;
+ struct buf_info *buf_info;
+
+ /* Disable interrupts, queues and NAPI polling */
+ netif_carrier_off(netdev);
+ virtqueue_disable_cb(cfv->vq_tx);
+ vringh_notify_disable_kern(cfv->vr_rx);
+ napi_disable(&cfv->napi);
+
+ /* Release any TX buffers on both used and avilable rings */
+ cfv_release_used_buf(cfv->vq_tx);
+ spin_lock_irqsave(&cfv->tx_lock, flags);
+ while ((buf_info = virtqueue_detach_unused_buf(cfv->vq_tx)))
+ free_buf_info(cfv, buf_info);
+ spin_unlock_irqrestore(&cfv->tx_lock, flags);
+
+ /* Release all dma allocated memory and destroy the pool */
+ cfv_destroy_genpool(cfv);
+ return 0;
+}
+
+/* Allocate a buffer in dma-memory and copy skb to it */
+static struct buf_info *cfv_alloc_and_copy_to_shm(struct cfv_info *cfv,
+ struct sk_buff *skb,
+ struct scatterlist *sg)
+{
+ struct caif_payload_info *info = (void *)&skb->cb;
+ struct buf_info *buf_info = NULL;
+ u8 pad_len, hdr_ofs;
+
+ if (!cfv->genpool)
+ goto err;
+
+ if (unlikely(cfv->tx_hr + skb->len + cfv->tx_tr > cfv->mtu)) {
+ netdev_warn(cfv->ndev, "Invalid packet len (%d > %d)\n",
+ cfv->tx_hr + skb->len + cfv->tx_tr, cfv->mtu);
+ goto err;
+ }
+
+ buf_info = kmalloc(sizeof(struct buf_info), GFP_ATOMIC);
+ if (unlikely(!buf_info))
+ goto err;
+
+ /* Make the IP header aligned in tbe buffer */
+ hdr_ofs = cfv->tx_hr + info->hdr_len;
+ pad_len = hdr_ofs & (IP_HDR_ALIGN - 1);
+ buf_info->size = cfv->tx_hr + skb->len + cfv->tx_tr + pad_len;
+
+ /* allocate dma memory buffer */
+ buf_info->vaddr = (void *)gen_pool_alloc(cfv->genpool, buf_info->size);
+ if (unlikely(!buf_info->vaddr))
+ goto err;
+
+ /* copy skbuf contents to send buffer */
+ skb_copy_bits(skb, 0, buf_info->vaddr + cfv->tx_hr + pad_len, skb->len);
+ sg_init_one(sg, buf_info->vaddr + pad_len,
+ skb->len + cfv->tx_hr + cfv->rx_hr);
+
+ return buf_info;
+err:
+ kfree(buf_info);
+ return NULL;
+}
+
+/* Put the CAIF packet on the virtio ring and kick the receiver */
+static int cfv_netdev_tx(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct cfv_info *cfv = netdev_priv(netdev);
+ struct buf_info *buf_info;
+ struct scatterlist sg;
+ unsigned long flags;
+ bool flow_off = false;
+ int ret;
+
+ /* garbage collect released buffers */
+ cfv_release_used_buf(cfv->vq_tx);
+ spin_lock_irqsave(&cfv->tx_lock, flags);
+
+ /* Flow-off check takes into account number of cpus to make sure
+ * virtqueue will not be overfilled in any possible smp conditions.
+ *
+ * Flow-on is triggered when sufficient buffers are freed
+ */
+ if (unlikely(cfv->vq_tx->num_free <= num_present_cpus())) {
+ flow_off = true;
+ cfv->stats.tx_full_ring++;
+ }
+
+ /* If we run out of memory, we release the memory reserve and retry
+ * allocation.
+ */
+ buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg);
+ if (unlikely(!buf_info)) {
+ cfv->stats.tx_no_mem++;
+ flow_off = true;
+
+ if (cfv->reserved_mem && cfv->genpool) {
+ gen_pool_free(cfv->genpool, cfv->reserved_mem,
+ cfv->reserved_size);
+ cfv->reserved_mem = 0;
+ buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg);
+ }
+ }
+
+ if (unlikely(flow_off)) {
+ /* Turn flow on when a 1/4 of the descriptors are released */
+ cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx) / 4;
+ /* Enable notifications of recycled TX buffers */
+ virtqueue_enable_cb(cfv->vq_tx);
+ netif_tx_stop_all_queues(netdev);
+ }
+
+ if (unlikely(!buf_info)) {
+ /* If the memory reserve does it's job, this shouldn't happen */
+ netdev_warn(cfv->ndev, "Out of gen_pool memory\n");
+ goto err;
+ }
+
+ ret = virtqueue_add_outbuf(cfv->vq_tx, &sg, 1, buf_info, GFP_ATOMIC);
+ if (unlikely((ret < 0))) {
+ /* If flow control works, this shouldn't happen */
+ netdev_warn(cfv->ndev, "Failed adding buffer to TX vring:%d\n",
+ ret);
+ goto err;
+ }
+
+ /* update netdev statistics */
+ cfv->ndev->stats.tx_packets++;
+ cfv->ndev->stats.tx_bytes += skb->len;
+ spin_unlock_irqrestore(&cfv->tx_lock, flags);
+
+ /* tell the remote processor it has a pending message to read */
+ virtqueue_kick(cfv->vq_tx);
+
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+err:
+ spin_unlock_irqrestore(&cfv->tx_lock, flags);
+ cfv->ndev->stats.tx_dropped++;
+ free_buf_info(cfv, buf_info);
+ dev_kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static void cfv_tx_release_tasklet(unsigned long drv)
+{
+ struct cfv_info *cfv = (struct cfv_info *)drv;
+ cfv_release_used_buf(cfv->vq_tx);
+}
+
+static const struct net_device_ops cfv_netdev_ops = {
+ .ndo_open = cfv_netdev_open,
+ .ndo_stop = cfv_netdev_close,
+ .ndo_start_xmit = cfv_netdev_tx,
+};
+
+static void cfv_netdev_setup(struct net_device *netdev)
+{
+ netdev->netdev_ops = &cfv_netdev_ops;
+ netdev->type = ARPHRD_CAIF;
+ netdev->tx_queue_len = 100;
+ netdev->flags = IFF_POINTOPOINT | IFF_NOARP;
+ netdev->mtu = CFV_DEF_MTU_SIZE;
+ netdev->destructor = free_netdev;
+}
+
+/* Create debugfs counters for the device */
+static inline void debugfs_init(struct cfv_info *cfv)
+{
+ cfv->debugfs =
+ debugfs_create_dir(netdev_name(cfv->ndev), NULL);
+
+ if (IS_ERR(cfv->debugfs))
+ return;
+
+ debugfs_create_u32("rx-napi-complete", S_IRUSR, cfv->debugfs,
+ &cfv->stats.rx_napi_complete);
+ debugfs_create_u32("rx-napi-resched", S_IRUSR, cfv->debugfs,
+ &cfv->stats.rx_napi_resched);
+ debugfs_create_u32("rx-nomem", S_IRUSR, cfv->debugfs,
+ &cfv->stats.rx_nomem);
+ debugfs_create_u32("rx-kicks", S_IRUSR, cfv->debugfs,
+ &cfv->stats.rx_kicks);
+ debugfs_create_u32("tx-full-ring", S_IRUSR, cfv->debugfs,
+ &cfv->stats.tx_full_ring);
+ debugfs_create_u32("tx-no-mem", S_IRUSR, cfv->debugfs,
+ &cfv->stats.tx_no_mem);
+ debugfs_create_u32("tx-kicks", S_IRUSR, cfv->debugfs,
+ &cfv->stats.tx_kicks);
+ debugfs_create_u32("tx-flow-on", S_IRUSR, cfv->debugfs,
+ &cfv->stats.tx_flow_on);
+}
+
+/* Setup CAIF for the a virtio device */
+static int cfv_probe(struct virtio_device *vdev)
+{
+ vq_callback_t *vq_cbs = cfv_release_cb;
+ vrh_callback_t *vrh_cbs = cfv_recv;
+ const char *names = "output";
+ const char *cfv_netdev_name = "cfvrt";
+ struct net_device *netdev;
+ struct cfv_info *cfv;
+ int err = -EINVAL;
+
+ netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name,
+ cfv_netdev_setup);
+ if (!netdev)
+ return -ENOMEM;
+
+ cfv = netdev_priv(netdev);
+ cfv->vdev = vdev;
+ cfv->ndev = netdev;
+
+ spin_lock_init(&cfv->tx_lock);
+
+ /* Get the RX virtio ring. This is a "host side vring". */
+ err = -ENODEV;
+ if (!vdev->vringh_config || !vdev->vringh_config->find_vrhs)
+ goto err;
+
+ err = vdev->vringh_config->find_vrhs(vdev, 1, &cfv->vr_rx, &vrh_cbs);
+ if (err)
+ goto err;
+
+ /* Get the TX virtio ring. This is a "guest side vring". */
+ err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names);
+ if (err)
+ goto err;
+
+ /* Get the CAIF configuration from virtio config space, if available */
+#define GET_VIRTIO_CONFIG_OPS(_v, _var, _f) \
+ ((_v)->config->get(_v, offsetof(struct virtio_caif_transf_config, _f), \
+ &_var, \
+ FIELD_SIZEOF(struct virtio_caif_transf_config, _f)))
+
+ if (vdev->config->get) {
+ GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_hr, headroom);
+ GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_hr, headroom);
+ GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_tr, tailroom);
+ GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_tr, tailroom);
+ GET_VIRTIO_CONFIG_OPS(vdev, cfv->mtu, mtu);
+ GET_VIRTIO_CONFIG_OPS(vdev, cfv->mru, mtu);
+ } else {
+ cfv->tx_hr = CFV_DEF_HEADROOM;
+ cfv->rx_hr = CFV_DEF_HEADROOM;
+ cfv->tx_tr = CFV_DEF_TAILROOM;
+ cfv->rx_tr = CFV_DEF_TAILROOM;
+ cfv->mtu = CFV_DEF_MTU_SIZE;
+ cfv->mru = CFV_DEF_MTU_SIZE;
+ }
+
+ netdev->needed_headroom = cfv->tx_hr;
+ netdev->needed_tailroom = cfv->tx_tr;
+
+ /* Disable buffer release interrupts unless we have stopped TX queues */
+ virtqueue_disable_cb(cfv->vq_tx);
+
+ netdev->mtu = cfv->mtu - cfv->tx_tr;
+ vdev->priv = cfv;
+
+ /* Initialize NAPI poll context data */
+ vringh_kiov_init(&cfv->ctx.riov, NULL, 0);
+ cfv->ctx.head = USHRT_MAX;
+ netif_napi_add(netdev, &cfv->napi, cfv_rx_poll, CFV_DEFAULT_QUOTA);
+
+ tasklet_init(&cfv->tx_release_tasklet,
+ cfv_tx_release_tasklet,
+ (unsigned long)cfv);
+
+ /* Carrier is off until netdevice is opened */
+ netif_carrier_off(netdev);
+
+ /* register Netdev */
+ err = register_netdev(netdev);
+ if (err) {
+ dev_err(&vdev->dev, "Unable to register netdev (%d)\n", err);
+ goto err;
+ }
+
+ debugfs_init(cfv);
+
+ return 0;
+err:
+ netdev_warn(cfv->ndev, "CAIF Virtio probe failed:%d\n", err);
+
+ if (cfv->vr_rx)
+ vdev->vringh_config->del_vrhs(cfv->vdev);
+ if (cfv->vdev)
+ vdev->config->del_vqs(cfv->vdev);
+ free_netdev(netdev);
+ return err;
+}
+
+static void cfv_remove(struct virtio_device *vdev)
+{
+ struct cfv_info *cfv = vdev->priv;
+
+ rtnl_lock();
+ dev_close(cfv->ndev);
+ rtnl_unlock();
+
+ tasklet_kill(&cfv->tx_release_tasklet);
+ debugfs_remove_recursive(cfv->debugfs);
+
+ vringh_kiov_cleanup(&cfv->ctx.riov);
+ vdev->config->reset(vdev);
+ vdev->vringh_config->del_vrhs(cfv->vdev);
+ cfv->vr_rx = NULL;
+ vdev->config->del_vqs(cfv->vdev);
+ unregister_netdev(cfv->ndev);
+}
+
+static struct virtio_device_id id_table[] = {
+ { VIRTIO_ID_CAIF, VIRTIO_DEV_ANY_ID },
+ { 0 },
+};
+
+static unsigned int features[] = {
+};
+
+static struct virtio_driver caif_virtio_driver = {
+ .feature_table = features,
+ .feature_table_size = ARRAY_SIZE(features),
+ .driver.name = KBUILD_MODNAME,
+ .driver.owner = THIS_MODULE,
+ .id_table = id_table,
+ .probe = cfv_probe,
+ .remove = cfv_remove,
+};
+
+module_virtio_driver(caif_virtio_driver);
+MODULE_DEVICE_TABLE(virtio, id_table);
diff --git a/drivers/net/ethernet/sfc/falcon.c b/drivers/net/ethernet/sfc/falcon.c
index 4486102..71998e7 100644
--- a/drivers/net/ethernet/sfc/falcon.c
+++ b/drivers/net/ethernet/sfc/falcon.c
@@ -1528,7 +1528,7 @@ static int falcon_probe_nic(struct efx_nic *efx)
return 0;
fail6:
- BUG_ON(i2c_del_adapter(&board->i2c_adap));
+ i2c_del_adapter(&board->i2c_adap);
memset(&board->i2c_adap, 0, sizeof(board->i2c_adap));
fail5:
efx_nic_free_buffer(efx, &efx->irq_status);
@@ -1666,13 +1666,11 @@ static void falcon_remove_nic(struct efx_nic *efx)
{
struct falcon_nic_data *nic_data = efx->nic_data;
struct falcon_board *board = falcon_board(efx);
- int rc;
board->type->fini(efx);
/* Remove I2C adapter and clear it in preparation for a retry */
- rc = i2c_del_adapter(&board->i2c_adap);
- BUG_ON(rc);
+ i2c_del_adapter(&board->i2c_adap);
memset(&board->i2c_adap, 0, sizeof(board->i2c_adap));
efx_nic_free_buffer(efx, &efx->irq_status);
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 5007775..3c23fdc 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -39,7 +39,6 @@ module_param(gso, bool, 0444);
#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
#define GOOD_COPY_LEN 128
-#define VIRTNET_SEND_COMMAND_SG_MAX 2
#define VIRTNET_DRIVER_VERSION "1.0.0"
struct virtnet_stats {
@@ -444,7 +443,7 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
skb_to_sgvec(skb, rq->sg + 1, 0, skb->len);
- err = virtqueue_add_buf(rq->vq, rq->sg, 0, 2, skb, gfp);
+ err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
if (err < 0)
dev_kfree_skb(skb);
@@ -489,8 +488,8 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
/* chain first in list head */
first->private = (unsigned long)list;
- err = virtqueue_add_buf(rq->vq, rq->sg, 0, MAX_SKB_FRAGS + 2,
- first, gfp);
+ err = virtqueue_add_inbuf(rq->vq, rq->sg, MAX_SKB_FRAGS + 2,
+ first, gfp);
if (err < 0)
give_pages(rq, first);
@@ -508,7 +507,7 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
sg_init_one(rq->sg, page_address(page), PAGE_SIZE);
- err = virtqueue_add_buf(rq->vq, rq->sg, 0, 1, page, gfp);
+ err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, page, gfp);
if (err < 0)
give_pages(rq, page);
@@ -582,7 +581,7 @@ static void refill_work(struct work_struct *work)
bool still_empty;
int i;
- for (i = 0; i < vi->max_queue_pairs; i++) {
+ for (i = 0; i < vi->curr_queue_pairs; i++) {
struct receive_queue *rq = &vi->rq[i];
napi_disable(&rq->napi);
@@ -637,7 +636,7 @@ static int virtnet_open(struct net_device *dev)
struct virtnet_info *vi = netdev_priv(dev);
int i;
- for (i = 0; i < vi->max_queue_pairs; i++) {
+ for (i = 0; i < vi->curr_queue_pairs; i++) {
/* Make sure we have some buffers: if oom use wq. */
if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
@@ -711,8 +710,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
sg_set_buf(sq->sg, &hdr->hdr, sizeof hdr->hdr);
num_sg = skb_to_sgvec(skb, sq->sg + 1, 0, skb->len) + 1;
- return virtqueue_add_buf(sq->vq, sq->sg, num_sg,
- 0, skb, GFP_ATOMIC);
+ return virtqueue_add_outbuf(sq->vq, sq->sg, num_sg, skb, GFP_ATOMIC);
}
static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -767,32 +765,35 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
* never fail unless improperly formated.
*/
static bool virtnet_send_command(struct virtnet_info *vi, u8 class, u8 cmd,
- struct scatterlist *data, int out, int in)
+ struct scatterlist *out,
+ struct scatterlist *in)
{
- struct scatterlist *s, sg[VIRTNET_SEND_COMMAND_SG_MAX + 2];
+ struct scatterlist *sgs[4], hdr, stat;
struct virtio_net_ctrl_hdr ctrl;
virtio_net_ctrl_ack status = ~0;
- unsigned int tmp;
- int i;
+ unsigned out_num = 0, in_num = 0, tmp;
/* Caller should know better */
- BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ) ||
- (out + in > VIRTNET_SEND_COMMAND_SG_MAX));
-
- out++; /* Add header */
- in++; /* Add return status */
+ BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ));
ctrl.class = class;
ctrl.cmd = cmd;
+ /* Add header */
+ sg_init_one(&hdr, &ctrl, sizeof(ctrl));
+ sgs[out_num++] = &hdr;
- sg_init_table(sg, out + in);
+ if (out)
+ sgs[out_num++] = out;
+ if (in)
+ sgs[out_num + in_num++] = in;
- sg_set_buf(&sg[0], &ctrl, sizeof(ctrl));
- for_each_sg(data, s, out + in - 2, i)
- sg_set_buf(&sg[i + 1], sg_virt(s), s->length);
- sg_set_buf(&sg[out + in - 1], &status, sizeof(status));
+ /* Add return status. */
+ sg_init_one(&stat, &status, sizeof(status));
+ sgs[out_num + in_num++] = &stat;
- BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi, GFP_ATOMIC) < 0);
+ BUG_ON(out_num + in_num > ARRAY_SIZE(sgs));
+ BUG_ON(virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC)
+ < 0);
virtqueue_kick(vi->cvq);
@@ -821,7 +822,7 @@ static int virtnet_set_mac_address(struct net_device *dev, void *p)
sg_init_one(&sg, addr->sa_data, dev->addr_len);
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
VIRTIO_NET_CTRL_MAC_ADDR_SET,
- &sg, 1, 0)) {
+ &sg, NULL)) {
dev_warn(&vdev->dev,
"Failed to set mac address by vq command.\n");
return -EINVAL;
@@ -889,8 +890,7 @@ static void virtnet_ack_link_announce(struct virtnet_info *vi)
{
rtnl_lock();
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_ANNOUNCE,
- VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL,
- 0, 0))
+ VIRTIO_NET_CTRL_ANNOUNCE_ACK, NULL, NULL))
dev_warn(&vi->dev->dev, "Failed to ack link announce.\n");
rtnl_unlock();
}
@@ -900,6 +900,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
struct scatterlist sg;
struct virtio_net_ctrl_mq s;
struct net_device *dev = vi->dev;
+ int i;
if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
return 0;
@@ -908,12 +909,16 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
sg_init_one(&sg, &s, sizeof(s));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MQ,
- VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, 1, 0)){
+ VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, &sg, NULL)) {
dev_warn(&dev->dev, "Fail to set num of queue pairs to %d\n",
queue_pairs);
return -EINVAL;
- } else
+ } else {
+ for (i = vi->curr_queue_pairs; i < queue_pairs; i++)
+ if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
+ schedule_delayed_work(&vi->refill, 0);
vi->curr_queue_pairs = queue_pairs;
+ }
return 0;
}
@@ -955,7 +960,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
VIRTIO_NET_CTRL_RX_PROMISC,
- sg, 1, 0))
+ sg, NULL))
dev_warn(&dev->dev, "Failed to %sable promisc mode.\n",
promisc ? "en" : "dis");
@@ -963,7 +968,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_RX,
VIRTIO_NET_CTRL_RX_ALLMULTI,
- sg, 1, 0))
+ sg, NULL))
dev_warn(&dev->dev, "Failed to %sable allmulti mode.\n",
allmulti ? "en" : "dis");
@@ -1000,7 +1005,7 @@ static void virtnet_set_rx_mode(struct net_device *dev)
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_MAC,
VIRTIO_NET_CTRL_MAC_TABLE_SET,
- sg, 2, 0))
+ sg, NULL))
dev_warn(&dev->dev, "Failed to set MAC fitler table.\n");
kfree(buf);
@@ -1015,7 +1020,7 @@ static int virtnet_vlan_rx_add_vid(struct net_device *dev,
sg_init_one(&sg, &vid, sizeof(vid));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
- VIRTIO_NET_CTRL_VLAN_ADD, &sg, 1, 0))
+ VIRTIO_NET_CTRL_VLAN_ADD, &sg, NULL))
dev_warn(&dev->dev, "Failed to add VLAN ID %d.\n", vid);
return 0;
}
@@ -1029,7 +1034,7 @@ static int virtnet_vlan_rx_kill_vid(struct net_device *dev,
sg_init_one(&sg, &vid, sizeof(vid));
if (!virtnet_send_command(vi, VIRTIO_NET_CTRL_VLAN,
- VIRTIO_NET_CTRL_VLAN_DEL, &sg, 1, 0))
+ VIRTIO_NET_CTRL_VLAN_DEL, &sg, NULL))
dev_warn(&dev->dev, "Failed to kill VLAN ID %d.\n", vid);
return 0;
}
@@ -1570,7 +1575,7 @@ static int virtnet_probe(struct virtio_device *vdev)
}
/* Last of all, set up some receive buffers. */
- for (i = 0; i < vi->max_queue_pairs; i++) {
+ for (i = 0; i < vi->curr_queue_pairs; i++) {
try_fill_recv(&vi->rq[i], GFP_KERNEL);
/* If we didn't even get one input buffer, we're useless. */
@@ -1694,7 +1699,7 @@ static int virtnet_restore(struct virtio_device *vdev)
netif_device_attach(vi->dev);
- for (i = 0; i < vi->max_queue_pairs; i++)
+ for (i = 0; i < vi->curr_queue_pairs; i++)
if (!try_fill_recv(&vi->rq[i], GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
diff --git a/drivers/pinctrl/core.c b/drivers/pinctrl/core.c
index c3d222e..5327f35 100644
--- a/drivers/pinctrl/core.c
+++ b/drivers/pinctrl/core.c
@@ -129,7 +129,7 @@ struct pinctrl_dev *get_pinctrl_dev_from_of_node(struct device_node *np)
return pctldev;
}
- mutex_lock(&pinctrldev_list_mutex);
+ mutex_unlock(&pinctrldev_list_mutex);
return NULL;
}
diff --git a/drivers/pinctrl/spear/pinctrl-plgpio.c b/drivers/pinctrl/spear/pinctrl-plgpio.c
index a4908ec..3e5a887 100644
--- a/drivers/pinctrl/spear/pinctrl-plgpio.c
+++ b/drivers/pinctrl/spear/pinctrl-plgpio.c
@@ -75,7 +75,7 @@ struct plgpio {
int (*o2p)(int offset); /* offset_to_pin */
u32 p2o_regs;
struct plgpio_regs regs;
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
struct plgpio_regs *csave_regs;
#endif
};
@@ -554,7 +554,7 @@ static int plgpio_probe(struct platform_device *pdev)
if (IS_ERR(plgpio->clk))
dev_warn(&pdev->dev, "clk_get() failed, work without it\n");
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
plgpio->csave_regs = devm_kzalloc(&pdev->dev,
sizeof(*plgpio->csave_regs) *
DIV_ROUND_UP(plgpio->chip.ngpio, MAX_GPIO_PER_REG),
@@ -641,7 +641,7 @@ unprepare_clk:
return ret;
}
-#ifdef CONFIG_PM
+#ifdef CONFIG_PM_SLEEP
static int plgpio_suspend(struct device *dev)
{
struct plgpio *plgpio = dev_get_drvdata(dev);
diff --git a/drivers/rpmsg/virtio_rpmsg_bus.c b/drivers/rpmsg/virtio_rpmsg_bus.c
index 7861f11..56fceaf 100644
--- a/drivers/rpmsg/virtio_rpmsg_bus.c
+++ b/drivers/rpmsg/virtio_rpmsg_bus.c
@@ -757,14 +757,14 @@ int rpmsg_send_offchannel_raw(struct rpmsg_channel *rpdev, u32 src, u32 dst,
mutex_lock(&vrp->tx_lock);
/* add message to the remote processor's virtqueue */
- err = virtqueue_add_buf(vrp->svq, &sg, 1, 0, msg, GFP_KERNEL);
+ err = virtqueue_add_outbuf(vrp->svq, &sg, 1, msg, GFP_KERNEL);
if (err) {
/*
* need to reclaim the buffer here, otherwise it's lost
* (memory won't leak, but rpmsg won't use it again for TX).
* this will wait for a buffer management overhaul.
*/
- dev_err(dev, "virtqueue_add_buf failed: %d\n", err);
+ dev_err(dev, "virtqueue_add_outbuf failed: %d\n", err);
goto out;
}
@@ -839,7 +839,7 @@ static void rpmsg_recv_done(struct virtqueue *rvq)
sg_init_one(&sg, msg, RPMSG_BUF_SIZE);
/* add the buffer back to the remote processor's virtqueue */
- err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, msg, GFP_KERNEL);
+ err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, msg, GFP_KERNEL);
if (err < 0) {
dev_err(dev, "failed to add a virtqueue buffer: %d\n", err);
return;
@@ -972,7 +972,7 @@ static int rpmsg_probe(struct virtio_device *vdev)
sg_init_one(&sg, cpu_addr, RPMSG_BUF_SIZE);
- err = virtqueue_add_buf(vrp->rvq, &sg, 0, 1, cpu_addr,
+ err = virtqueue_add_inbuf(vrp->rvq, &sg, 1, cpu_addr,
GFP_KERNEL);
WARN_ON(err); /* sanity check; this can't really happen */
}
diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
index 3449a1f..2168258 100644
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -13,6 +13,8 @@
*
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mempool.h>
@@ -20,12 +22,14 @@
#include <linux/virtio_ids.h>
#include <linux/virtio_config.h>
#include <linux/virtio_scsi.h>
+#include <linux/cpu.h>
#include <scsi/scsi_host.h>
#include <scsi/scsi_device.h>
#include <scsi/scsi_cmnd.h>
#define VIRTIO_SCSI_MEMPOOL_SZ 64
#define VIRTIO_SCSI_EVENT_LEN 8
+#define VIRTIO_SCSI_VQ_BASE 2
/* Command queue element */
struct virtio_scsi_cmd {
@@ -57,27 +61,61 @@ struct virtio_scsi_vq {
struct virtqueue *vq;
};
-/* Per-target queue state */
+/*
+ * Per-target queue state.
+ *
+ * This struct holds the data needed by the queue steering policy. When a
+ * target is sent multiple requests, we need to drive them to the same queue so
+ * that FIFO processing order is kept. However, if a target was idle, we can
+ * choose a queue arbitrarily. In this case the queue is chosen according to
+ * the current VCPU, so the driver expects the number of request queues to be
+ * equal to the number of VCPUs. This makes it easy and fast to select the
+ * queue, and also lets the driver optimize the IRQ affinity for the virtqueues
+ * (each virtqueue's affinity is set to the CPU that "owns" the queue).
+ *
+ * An interesting effect of this policy is that only writes to req_vq need to
+ * take the tgt_lock. Read can be done outside the lock because:
+ *
+ * - writes of req_vq only occur when atomic_inc_return(&tgt->reqs) returns 1.
+ * In that case, no other CPU is reading req_vq: even if they were in
+ * virtscsi_queuecommand_multi, they would be spinning on tgt_lock.
+ *
+ * - reads of req_vq only occur when the target is not idle (reqs != 0).
+ * A CPU that enters virtscsi_queuecommand_multi will not modify req_vq.
+ *
+ * Similarly, decrements of reqs are never concurrent with writes of req_vq.
+ * Thus they can happen outside the tgt_lock, provided of course we make reqs
+ * an atomic_t.
+ */
struct virtio_scsi_target_state {
- /* Protects sg. Lock hierarchy is tgt_lock -> vq_lock. */
+ /* This spinlock never held at the same time as vq_lock. */
spinlock_t tgt_lock;
- /* For sglist construction when adding commands to the virtqueue. */
- struct scatterlist sg[];
+ /* Count of outstanding requests. */
+ atomic_t reqs;
+
+ /* Currently active virtqueue for requests sent to this target. */
+ struct virtio_scsi_vq *req_vq;
};
/* Driver instance state */
struct virtio_scsi {
struct virtio_device *vdev;
- struct virtio_scsi_vq ctrl_vq;
- struct virtio_scsi_vq event_vq;
- struct virtio_scsi_vq req_vq;
-
/* Get some buffers ready for event vq */
struct virtio_scsi_event_node event_list[VIRTIO_SCSI_EVENT_LEN];
- struct virtio_scsi_target_state *tgt[];
+ u32 num_queues;
+
+ /* If the affinity hint is set for virtqueues */
+ bool affinity_hint_set;
+
+ /* CPU hotplug notifier */
+ struct notifier_block nb;
+
+ struct virtio_scsi_vq ctrl_vq;
+ struct virtio_scsi_vq event_vq;
+ struct virtio_scsi_vq req_vqs[];
};
static struct kmem_cache *virtscsi_cmd_cache;
@@ -107,11 +145,13 @@ static void virtscsi_compute_resid(struct scsi_cmnd *sc, u32 resid)
*
* Called with vq_lock held.
*/
-static void virtscsi_complete_cmd(void *buf)
+static void virtscsi_complete_cmd(struct virtio_scsi *vscsi, void *buf)
{
struct virtio_scsi_cmd *cmd = buf;
struct scsi_cmnd *sc = cmd->sc;
struct virtio_scsi_cmd_resp *resp = &cmd->resp.cmd;
+ struct virtio_scsi_target_state *tgt =
+ scsi_target(sc->device)->hostdata;
dev_dbg(&sc->device->sdev_gendev,
"cmd %p response %u status %#02x sense_len %u\n",
@@ -166,32 +206,71 @@ static void virtscsi_complete_cmd(void *buf)
mempool_free(cmd, virtscsi_cmd_pool);
sc->scsi_done(sc);
+
+ atomic_dec(&tgt->reqs);
}
-static void virtscsi_vq_done(struct virtqueue *vq, void (*fn)(void *buf))
+static void virtscsi_vq_done(struct virtio_scsi *vscsi,
+ struct virtio_scsi_vq *virtscsi_vq,
+ void (*fn)(struct virtio_scsi *vscsi, void *buf))
{
void *buf;
unsigned int len;
+ unsigned long flags;
+ struct virtqueue *vq = virtscsi_vq->vq;
+ spin_lock_irqsave(&virtscsi_vq->vq_lock, flags);
do {
virtqueue_disable_cb(vq);
while ((buf = virtqueue_get_buf(vq, &len)) != NULL)
- fn(buf);
+ fn(vscsi, buf);
} while (!virtqueue_enable_cb(vq));
+ spin_unlock_irqrestore(&virtscsi_vq->vq_lock, flags);
}
static void virtscsi_req_done(struct virtqueue *vq)
{
struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
struct virtio_scsi *vscsi = shost_priv(sh);
- unsigned long flags;
+ int index = vq->index - VIRTIO_SCSI_VQ_BASE;
+ struct virtio_scsi_vq *req_vq = &vscsi->req_vqs[index];
- spin_lock_irqsave(&vscsi->req_vq.vq_lock, flags);
- virtscsi_vq_done(vq, virtscsi_complete_cmd);
- spin_unlock_irqrestore(&vscsi->req_vq.vq_lock, flags);
+ /*
+ * Read req_vq before decrementing the reqs field in
+ * virtscsi_complete_cmd.
+ *
+ * With barriers:
+ *
+ * CPU #0 virtscsi_queuecommand_multi (CPU #1)
+ * ------------------------------------------------------------
+ * lock vq_lock
+ * read req_vq
+ * read reqs (reqs = 1)
+ * write reqs (reqs = 0)
+ * increment reqs (reqs = 1)
+ * write req_vq
+ *
+ * Possible reordering without barriers:
+ *
+ * CPU #0 virtscsi_queuecommand_multi (CPU #1)
+ * ------------------------------------------------------------
+ * lock vq_lock
+ * read reqs (reqs = 1)
+ * write reqs (reqs = 0)
+ * increment reqs (reqs = 1)
+ * write req_vq
+ * read (wrong) req_vq
+ *
+ * We do not need a full smp_rmb, because req_vq is required to get
+ * to tgt->reqs: tgt is &vscsi->tgt[sc->device->id], where sc is stored
+ * in the virtqueue as the user token.
+ */
+ smp_read_barrier_depends();
+
+ virtscsi_vq_done(vscsi, req_vq, virtscsi_complete_cmd);
};
-static void virtscsi_complete_free(void *buf)
+static void virtscsi_complete_free(struct virtio_scsi *vscsi, void *buf)
{
struct virtio_scsi_cmd *cmd = buf;
@@ -205,11 +284,8 @@ static void virtscsi_ctrl_done(struct virtqueue *vq)
{
struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
struct virtio_scsi *vscsi = shost_priv(sh);
- unsigned long flags;
- spin_lock_irqsave(&vscsi->ctrl_vq.vq_lock, flags);
- virtscsi_vq_done(vq, virtscsi_complete_free);
- spin_unlock_irqrestore(&vscsi->ctrl_vq.vq_lock, flags);
+ virtscsi_vq_done(vscsi, &vscsi->ctrl_vq, virtscsi_complete_free);
};
static int virtscsi_kick_event(struct virtio_scsi *vscsi,
@@ -223,8 +299,8 @@ static int virtscsi_kick_event(struct virtio_scsi *vscsi,
spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
- err = virtqueue_add_buf(vscsi->event_vq.vq, &sg, 0, 1, event_node,
- GFP_ATOMIC);
+ err = virtqueue_add_inbuf(vscsi->event_vq.vq, &sg, 1, event_node,
+ GFP_ATOMIC);
if (!err)
virtqueue_kick(vscsi->event_vq.vq);
@@ -254,7 +330,7 @@ static void virtscsi_cancel_event_work(struct virtio_scsi *vscsi)
}
static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi,
- struct virtio_scsi_event *event)
+ struct virtio_scsi_event *event)
{
struct scsi_device *sdev;
struct Scsi_Host *shost = virtio_scsi_host(vscsi->vdev);
@@ -332,7 +408,7 @@ static void virtscsi_handle_event(struct work_struct *work)
virtscsi_kick_event(vscsi, event_node);
}
-static void virtscsi_complete_event(void *buf)
+static void virtscsi_complete_event(struct virtio_scsi *vscsi, void *buf)
{
struct virtio_scsi_event_node *event_node = buf;
@@ -344,82 +420,65 @@ static void virtscsi_event_done(struct virtqueue *vq)
{
struct Scsi_Host *sh = virtio_scsi_host(vq->vdev);
struct virtio_scsi *vscsi = shost_priv(sh);
- unsigned long flags;
- spin_lock_irqsave(&vscsi->event_vq.vq_lock, flags);
- virtscsi_vq_done(vq, virtscsi_complete_event);
- spin_unlock_irqrestore(&vscsi->event_vq.vq_lock, flags);
+ virtscsi_vq_done(vscsi, &vscsi->event_vq, virtscsi_complete_event);
};
-static void virtscsi_map_sgl(struct scatterlist *sg, unsigned int *p_idx,
- struct scsi_data_buffer *sdb)
-{
- struct sg_table *table = &sdb->table;
- struct scatterlist *sg_elem;
- unsigned int idx = *p_idx;
- int i;
-
- for_each_sg(table->sgl, sg_elem, table->nents, i)
- sg[idx++] = *sg_elem;
-
- *p_idx = idx;
-}
-
/**
- * virtscsi_map_cmd - map a scsi_cmd to a virtqueue scatterlist
- * @vscsi : virtio_scsi state
+ * virtscsi_add_cmd - add a virtio_scsi_cmd to a virtqueue
+ * @vq : the struct virtqueue we're talking about
* @cmd : command structure
- * @out_num : number of read-only elements
- * @in_num : number of write-only elements
* @req_size : size of the request buffer
* @resp_size : size of the response buffer
- *
- * Called with tgt_lock held.
+ * @gfp : flags to use for memory allocations
*/
-static void virtscsi_map_cmd(struct virtio_scsi_target_state *tgt,
- struct virtio_scsi_cmd *cmd,
- unsigned *out_num, unsigned *in_num,
- size_t req_size, size_t resp_size)
+static int virtscsi_add_cmd(struct virtqueue *vq,
+ struct virtio_scsi_cmd *cmd,
+ size_t req_size, size_t resp_size, gfp_t gfp)
{
struct scsi_cmnd *sc = cmd->sc;
- struct scatterlist *sg = tgt->sg;
- unsigned int idx = 0;
+ struct scatterlist *sgs[4], req, resp;
+ struct sg_table *out, *in;
+ unsigned out_num = 0, in_num = 0;
+
+ out = in = NULL;
+
+ if (sc && sc->sc_data_direction != DMA_NONE) {
+ if (sc->sc_data_direction != DMA_FROM_DEVICE)
+ out = &scsi_out(sc)->table;
+ if (sc->sc_data_direction != DMA_TO_DEVICE)
+ in = &scsi_in(sc)->table;
+ }
/* Request header. */
- sg_set_buf(&sg[idx++], &cmd->req, req_size);
+ sg_init_one(&req, &cmd->req, req_size);
+ sgs[out_num++] = &req;
/* Data-out buffer. */
- if (sc && sc->sc_data_direction != DMA_FROM_DEVICE)
- virtscsi_map_sgl(sg, &idx, scsi_out(sc));
-
- *out_num = idx;
+ if (out)
+ sgs[out_num++] = out->sgl;
/* Response header. */
- sg_set_buf(&sg[idx++], &cmd->resp, resp_size);
+ sg_init_one(&resp, &cmd->resp, resp_size);
+ sgs[out_num + in_num++] = &resp;
/* Data-in buffer */
- if (sc && sc->sc_data_direction != DMA_TO_DEVICE)
- virtscsi_map_sgl(sg, &idx, scsi_in(sc));
+ if (in)
+ sgs[out_num + in_num++] = in->sgl;
- *in_num = idx - *out_num;
+ return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, gfp);
}
-static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt,
- struct virtio_scsi_vq *vq,
+static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq,
struct virtio_scsi_cmd *cmd,
size_t req_size, size_t resp_size, gfp_t gfp)
{
- unsigned int out_num, in_num;
unsigned long flags;
int err;
bool needs_kick = false;
- spin_lock_irqsave(&tgt->tgt_lock, flags);
- virtscsi_map_cmd(tgt, cmd, &out_num, &in_num, req_size, resp_size);
-
- spin_lock(&vq->vq_lock);
- err = virtqueue_add_buf(vq->vq, tgt->sg, out_num, in_num, cmd, gfp);
- spin_unlock(&tgt->tgt_lock);
+ spin_lock_irqsave(&vq->vq_lock, flags);
+ err = virtscsi_add_cmd(vq->vq, cmd, req_size, resp_size, gfp);
if (!err)
needs_kick = virtqueue_kick_prepare(vq->vq);
@@ -430,10 +489,10 @@ static int virtscsi_kick_cmd(struct virtio_scsi_target_state *tgt,
return err;
}
-static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
+static int virtscsi_queuecommand(struct virtio_scsi *vscsi,
+ struct virtio_scsi_vq *req_vq,
+ struct scsi_cmnd *sc)
{
- struct virtio_scsi *vscsi = shost_priv(sh);
- struct virtio_scsi_target_state *tgt = vscsi->tgt[sc->device->id];
struct virtio_scsi_cmd *cmd;
int ret;
@@ -467,7 +526,7 @@ static int virtscsi_queuecommand(struct Scsi_Host *sh, struct scsi_cmnd *sc)
BUG_ON(sc->cmd_len > VIRTIO_SCSI_CDB_SIZE);
memcpy(cmd->req.cmd.cdb, sc->cmnd, sc->cmd_len);
- if (virtscsi_kick_cmd(tgt, &vscsi->req_vq, cmd,
+ if (virtscsi_kick_cmd(req_vq, cmd,
sizeof cmd->req.cmd, sizeof cmd->resp.cmd,
GFP_ATOMIC) == 0)
ret = 0;
@@ -478,14 +537,62 @@ out:
return ret;
}
+static int virtscsi_queuecommand_single(struct Scsi_Host *sh,
+ struct scsi_cmnd *sc)
+{
+ struct virtio_scsi *vscsi = shost_priv(sh);
+ struct virtio_scsi_target_state *tgt =
+ scsi_target(sc->device)->hostdata;
+
+ atomic_inc(&tgt->reqs);
+ return virtscsi_queuecommand(vscsi, &vscsi->req_vqs[0], sc);
+}
+
+static struct virtio_scsi_vq *virtscsi_pick_vq(struct virtio_scsi *vscsi,
+ struct virtio_scsi_target_state *tgt)
+{
+ struct virtio_scsi_vq *vq;
+ unsigned long flags;
+ u32 queue_num;
+
+ spin_lock_irqsave(&tgt->tgt_lock, flags);
+
+ /*
+ * The memory barrier after atomic_inc_return matches
+ * the smp_read_barrier_depends() in virtscsi_req_done.
+ */
+ if (atomic_inc_return(&tgt->reqs) > 1)
+ vq = ACCESS_ONCE(tgt->req_vq);
+ else {
+ queue_num = smp_processor_id();
+ while (unlikely(queue_num >= vscsi->num_queues))
+ queue_num -= vscsi->num_queues;
+
+ tgt->req_vq = vq = &vscsi->req_vqs[queue_num];
+ }
+
+ spin_unlock_irqrestore(&tgt->tgt_lock, flags);
+ return vq;
+}
+
+static int virtscsi_queuecommand_multi(struct Scsi_Host *sh,
+ struct scsi_cmnd *sc)
+{
+ struct virtio_scsi *vscsi = shost_priv(sh);
+ struct virtio_scsi_target_state *tgt =
+ scsi_target(sc->device)->hostdata;
+ struct virtio_scsi_vq *req_vq = virtscsi_pick_vq(vscsi, tgt);
+
+ return virtscsi_queuecommand(vscsi, req_vq, sc);
+}
+
static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
{
DECLARE_COMPLETION_ONSTACK(comp);
- struct virtio_scsi_target_state *tgt = vscsi->tgt[cmd->sc->device->id];
int ret = FAILED;
cmd->comp = &comp;
- if (virtscsi_kick_cmd(tgt, &vscsi->ctrl_vq, cmd,
+ if (virtscsi_kick_cmd(&vscsi->ctrl_vq, cmd,
sizeof cmd->req.tmf, sizeof cmd->resp.tmf,
GFP_NOIO) < 0)
goto out;
@@ -547,18 +654,57 @@ static int virtscsi_abort(struct scsi_cmnd *sc)
return virtscsi_tmf(vscsi, cmd);
}
-static struct scsi_host_template virtscsi_host_template = {
+static int virtscsi_target_alloc(struct scsi_target *starget)
+{
+ struct virtio_scsi_target_state *tgt =
+ kmalloc(sizeof(*tgt), GFP_KERNEL);
+ if (!tgt)
+ return -ENOMEM;
+
+ spin_lock_init(&tgt->tgt_lock);
+ atomic_set(&tgt->reqs, 0);
+ tgt->req_vq = NULL;
+
+ starget->hostdata = tgt;
+ return 0;
+}
+
+static void virtscsi_target_destroy(struct scsi_target *starget)
+{
+ struct virtio_scsi_target_state *tgt = starget->hostdata;
+ kfree(tgt);
+}
+
+static struct scsi_host_template virtscsi_host_template_single = {
+ .module = THIS_MODULE,
+ .name = "Virtio SCSI HBA",
+ .proc_name = "virtio_scsi",
+ .this_id = -1,
+ .queuecommand = virtscsi_queuecommand_single,
+ .eh_abort_handler = virtscsi_abort,
+ .eh_device_reset_handler = virtscsi_device_reset,
+
+ .can_queue = 1024,
+ .dma_boundary = UINT_MAX,
+ .use_clustering = ENABLE_CLUSTERING,
+ .target_alloc = virtscsi_target_alloc,
+ .target_destroy = virtscsi_target_destroy,
+};
+
+static struct scsi_host_template virtscsi_host_template_multi = {
.module = THIS_MODULE,
.name = "Virtio SCSI HBA",
.proc_name = "virtio_scsi",
- .queuecommand = virtscsi_queuecommand,
.this_id = -1,
+ .queuecommand = virtscsi_queuecommand_multi,
.eh_abort_handler = virtscsi_abort,
.eh_device_reset_handler = virtscsi_device_reset,
.can_queue = 1024,
.dma_boundary = UINT_MAX,
.use_clustering = ENABLE_CLUSTERING,
+ .target_alloc = virtscsi_target_alloc,
+ .target_destroy = virtscsi_target_destroy,
};
#define virtscsi_config_get(vdev, fld) \
@@ -578,29 +724,69 @@ static struct scsi_host_template virtscsi_host_template = {
&__val, sizeof(__val)); \
})
-static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
- struct virtqueue *vq)
+static void __virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
{
- spin_lock_init(&virtscsi_vq->vq_lock);
- virtscsi_vq->vq = vq;
+ int i;
+ int cpu;
+
+ /* In multiqueue mode, when the number of cpu is equal
+ * to the number of request queues, we let the qeueues
+ * to be private to one cpu by setting the affinity hint
+ * to eliminate the contention.
+ */
+ if ((vscsi->num_queues == 1 ||
+ vscsi->num_queues != num_online_cpus()) && affinity) {
+ if (vscsi->affinity_hint_set)
+ affinity = false;
+ else
+ return;
+ }
+
+ if (affinity) {
+ i = 0;
+ for_each_online_cpu(cpu) {
+ virtqueue_set_affinity(vscsi->req_vqs[i].vq, cpu);
+ i++;
+ }
+
+ vscsi->affinity_hint_set = true;
+ } else {
+ for (i = 0; i < vscsi->num_queues - VIRTIO_SCSI_VQ_BASE; i++)
+ virtqueue_set_affinity(vscsi->req_vqs[i].vq, -1);
+
+ vscsi->affinity_hint_set = false;
+ }
}
-static struct virtio_scsi_target_state *virtscsi_alloc_tgt(
- struct virtio_device *vdev, int sg_elems)
+static void virtscsi_set_affinity(struct virtio_scsi *vscsi, bool affinity)
{
- struct virtio_scsi_target_state *tgt;
- gfp_t gfp_mask = GFP_KERNEL;
-
- /* We need extra sg elements at head and tail. */
- tgt = kmalloc(sizeof(*tgt) + sizeof(tgt->sg[0]) * (sg_elems + 2),
- gfp_mask);
+ get_online_cpus();
+ __virtscsi_set_affinity(vscsi, affinity);
+ put_online_cpus();
+}
- if (!tgt)
- return NULL;
+static int virtscsi_cpu_callback(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ struct virtio_scsi *vscsi = container_of(nfb, struct virtio_scsi, nb);
+ switch(action) {
+ case CPU_ONLINE:
+ case CPU_ONLINE_FROZEN:
+ case CPU_DEAD:
+ case CPU_DEAD_FROZEN:
+ __virtscsi_set_affinity(vscsi, true);
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
- spin_lock_init(&tgt->tgt_lock);
- sg_init_table(tgt->sg, sg_elems + 2);
- return tgt;
+static void virtscsi_init_vq(struct virtio_scsi_vq *virtscsi_vq,
+ struct virtqueue *vq)
+{
+ spin_lock_init(&virtscsi_vq->vq_lock);
+ virtscsi_vq->vq = vq;
}
static void virtscsi_scan(struct virtio_device *vdev)
@@ -614,46 +800,56 @@ static void virtscsi_remove_vqs(struct virtio_device *vdev)
{
struct Scsi_Host *sh = virtio_scsi_host(vdev);
struct virtio_scsi *vscsi = shost_priv(sh);
- u32 i, num_targets;
+
+ virtscsi_set_affinity(vscsi, false);
/* Stop all the virtqueues. */
vdev->config->reset(vdev);
- num_targets = sh->max_id;
- for (i = 0; i < num_targets; i++) {
- kfree(vscsi->tgt[i]);
- vscsi->tgt[i] = NULL;
- }
-
vdev->config->del_vqs(vdev);
}
static int virtscsi_init(struct virtio_device *vdev,
- struct virtio_scsi *vscsi, int num_targets)
+ struct virtio_scsi *vscsi)
{
int err;
- struct virtqueue *vqs[3];
- u32 i, sg_elems;
+ u32 i;
+ u32 num_vqs;
+ vq_callback_t **callbacks;
+ const char **names;
+ struct virtqueue **vqs;
+
+ num_vqs = vscsi->num_queues + VIRTIO_SCSI_VQ_BASE;
+ vqs = kmalloc(num_vqs * sizeof(struct virtqueue *), GFP_KERNEL);
+ callbacks = kmalloc(num_vqs * sizeof(vq_callback_t *), GFP_KERNEL);
+ names = kmalloc(num_vqs * sizeof(char *), GFP_KERNEL);
+
+ if (!callbacks || !vqs || !names) {
+ err = -ENOMEM;
+ goto out;
+ }
- vq_callback_t *callbacks[] = {
- virtscsi_ctrl_done,
- virtscsi_event_done,
- virtscsi_req_done
- };
- const char *names[] = {
- "control",
- "event",
- "request"
- };
+ callbacks[0] = virtscsi_ctrl_done;
+ callbacks[1] = virtscsi_event_done;
+ names[0] = "control";
+ names[1] = "event";
+ for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++) {
+ callbacks[i] = virtscsi_req_done;
+ names[i] = "request";
+ }
/* Discover virtqueues and write information to configuration. */
- err = vdev->config->find_vqs(vdev, 3, vqs, callbacks, names);
+ err = vdev->config->find_vqs(vdev, num_vqs, vqs, callbacks, names);
if (err)
- return err;
+ goto out;
virtscsi_init_vq(&vscsi->ctrl_vq, vqs[0]);
virtscsi_init_vq(&vscsi->event_vq, vqs[1]);
- virtscsi_init_vq(&vscsi->req_vq, vqs[2]);
+ for (i = VIRTIO_SCSI_VQ_BASE; i < num_vqs; i++)
+ virtscsi_init_vq(&vscsi->req_vqs[i - VIRTIO_SCSI_VQ_BASE],
+ vqs[i]);
+
+ virtscsi_set_affinity(vscsi, true);
virtscsi_config_set(vdev, cdb_size, VIRTIO_SCSI_CDB_SIZE);
virtscsi_config_set(vdev, sense_size, VIRTIO_SCSI_SENSE_SIZE);
@@ -661,19 +857,12 @@ static int virtscsi_init(struct virtio_device *vdev,
if (virtio_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG))
virtscsi_kick_event_all(vscsi);
- /* We need to know how many segments before we allocate. */
- sg_elems = virtscsi_config_get(vdev, seg_max) ?: 1;
-
- for (i = 0; i < num_targets; i++) {
- vscsi->tgt[i] = virtscsi_alloc_tgt(vdev, sg_elems);
- if (!vscsi->tgt[i]) {
- err = -ENOMEM;
- goto out;
- }
- }
err = 0;
out:
+ kfree(names);
+ kfree(callbacks);
+ kfree(vqs);
if (err)
virtscsi_remove_vqs(vdev);
return err;
@@ -686,13 +875,21 @@ static int virtscsi_probe(struct virtio_device *vdev)
int err;
u32 sg_elems, num_targets;
u32 cmd_per_lun;
+ u32 num_queues;
+ struct scsi_host_template *hostt;
+
+ /* We need to know how many queues before we allocate. */
+ num_queues = virtscsi_config_get(vdev, num_queues) ? : 1;
- /* Allocate memory and link the structs together. */
num_targets = virtscsi_config_get(vdev, max_target) + 1;
- shost = scsi_host_alloc(&virtscsi_host_template,
- sizeof(*vscsi)
- + num_targets * sizeof(struct virtio_scsi_target_state));
+ if (num_queues == 1)
+ hostt = &virtscsi_host_template_single;
+ else
+ hostt = &virtscsi_host_template_multi;
+
+ shost = scsi_host_alloc(hostt,
+ sizeof(*vscsi) + sizeof(vscsi->req_vqs[0]) * num_queues);
if (!shost)
return -ENOMEM;
@@ -700,12 +897,20 @@ static int virtscsi_probe(struct virtio_device *vdev)
shost->sg_tablesize = sg_elems;
vscsi = shost_priv(shost);
vscsi->vdev = vdev;
+ vscsi->num_queues = num_queues;
vdev->priv = shost;
- err = virtscsi_init(vdev, vscsi, num_targets);
+ err = virtscsi_init(vdev, vscsi);
if (err)
goto virtscsi_init_failed;
+ vscsi->nb.notifier_call = &virtscsi_cpu_callback;
+ err = register_hotcpu_notifier(&vscsi->nb);
+ if (err) {
+ pr_err("registering cpu notifier failed\n");
+ goto scsi_add_host_failed;
+ }
+
cmd_per_lun = virtscsi_config_get(vdev, cmd_per_lun) ?: 1;
shost->cmd_per_lun = min_t(u32, cmd_per_lun, shost->can_queue);
shost->max_sectors = virtscsi_config_get(vdev, max_sectors) ?: 0xFFFF;
@@ -743,6 +948,8 @@ static void virtscsi_remove(struct virtio_device *vdev)
scsi_remove_host(shost);
+ unregister_hotcpu_notifier(&vscsi->nb);
+
virtscsi_remove_vqs(vdev);
scsi_host_put(shost);
}
@@ -759,7 +966,7 @@ static int virtscsi_restore(struct virtio_device *vdev)
struct Scsi_Host *sh = virtio_scsi_host(vdev);
struct virtio_scsi *vscsi = shost_priv(sh);
- return virtscsi_init(vdev, vscsi, sh->max_id);
+ return virtscsi_init(vdev, vscsi);
}
#endif
@@ -794,8 +1001,7 @@ static int __init init(void)
virtscsi_cmd_cache = KMEM_CACHE(virtio_scsi_cmd, 0);
if (!virtscsi_cmd_cache) {
- printk(KERN_ERR "kmem_cache_create() for "
- "virtscsi_cmd_cache failed\n");
+ pr_err("kmem_cache_create() for virtscsi_cmd_cache failed\n");
goto error;
}
@@ -804,8 +1010,7 @@ static int __init init(void)
mempool_create_slab_pool(VIRTIO_SCSI_MEMPOOL_SZ,
virtscsi_cmd_cache);
if (!virtscsi_cmd_pool) {
- printk(KERN_ERR "mempool_create() for"
- "virtscsi_cmd_pool failed\n");
+ pr_err("mempool_create() for virtscsi_cmd_pool failed\n");
goto error;
}
ret = register_virtio_driver(&virtio_scsi_driver);
diff --git a/drivers/staging/media/go7007/go7007-driver.c b/drivers/staging/media/go7007/go7007-driver.c
index a5ca99d..3640df0 100644
--- a/drivers/staging/media/go7007/go7007-driver.c
+++ b/drivers/staging/media/go7007/go7007-driver.c
@@ -242,11 +242,8 @@ static void go7007_remove(struct v4l2_device *v4l2_dev)
if (go->hpi_ops->release)
go->hpi_ops->release(go);
if (go->i2c_adapter_online) {
- if (i2c_del_adapter(&go->i2c_adapter) == 0)
- go->i2c_adapter_online = 0;
- else
- v4l2_err(&go->v4l2_dev,
- "error removing I2C adapter!\n");
+ i2c_del_adapter(&go->i2c_adapter);
+ go->i2c_adapter_online = 0;
}
kfree(go->boot_fw);
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 09d2e3f..ac37254 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -201,7 +201,9 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type)
return (flags & PCI_MSIX_FLAGS_QSIZE) + 1;
}
- }
+ } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX)
+ if (pci_is_pcie(vdev->pdev))
+ return 1;
return 0;
}
@@ -317,6 +319,17 @@ static long vfio_pci_ioctl(void *device_data,
if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
return -EINVAL;
+ switch (info.index) {
+ case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX:
+ break;
+ case VFIO_PCI_ERR_IRQ_INDEX:
+ if (pci_is_pcie(vdev->pdev))
+ break;
+ /* pass thru to return error */
+ default:
+ return -EINVAL;
+ }
+
info.flags = VFIO_IRQ_INFO_EVENTFD;
info.count = vfio_pci_get_irq_count(vdev, info.index);
@@ -552,11 +565,40 @@ static void vfio_pci_remove(struct pci_dev *pdev)
kfree(vdev);
}
+static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
+ pci_channel_state_t state)
+{
+ struct vfio_pci_device *vdev;
+ struct vfio_device *device;
+
+ device = vfio_device_get_from_dev(&pdev->dev);
+ if (device == NULL)
+ return PCI_ERS_RESULT_DISCONNECT;
+
+ vdev = vfio_device_data(device);
+ if (vdev == NULL) {
+ vfio_device_put(device);
+ return PCI_ERS_RESULT_DISCONNECT;
+ }
+
+ if (vdev->err_trigger)
+ eventfd_signal(vdev->err_trigger, 1);
+
+ vfio_device_put(device);
+
+ return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static struct pci_error_handlers vfio_err_handlers = {
+ .error_detected = vfio_pci_aer_err_detected,
+};
+
static struct pci_driver vfio_pci_driver = {
.name = "vfio-pci",
.id_table = NULL, /* only dynamic ids */
.probe = vfio_pci_probe,
.remove = vfio_pci_remove,
+ .err_handler = &vfio_err_handlers,
};
static void __exit vfio_pci_cleanup(void)
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index aeb00fc..affa347 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -274,9 +274,10 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos,
return count;
}
-static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos,
- int count, struct perm_bits *perm,
- int offset, __le32 val)
+/* Raw access skips any kind of virtualization */
+static int vfio_raw_config_write(struct vfio_pci_device *vdev, int pos,
+ int count, struct perm_bits *perm,
+ int offset, __le32 val)
{
int ret;
@@ -287,13 +288,36 @@ static int vfio_direct_config_write(struct vfio_pci_device *vdev, int pos,
return count;
}
-/* Default all regions to read-only, no-virtualization */
+static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos,
+ int count, struct perm_bits *perm,
+ int offset, __le32 *val)
+{
+ int ret;
+
+ ret = vfio_user_config_read(vdev->pdev, pos, val, count);
+ if (ret)
+ return pcibios_err_to_errno(ret);
+
+ return count;
+}
+
+/* Default capability regions to read-only, no-virtualization */
static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = {
[0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
};
static struct perm_bits ecap_perms[PCI_EXT_CAP_ID_MAX + 1] = {
[0 ... PCI_EXT_CAP_ID_MAX] = { .readfn = vfio_direct_config_read }
};
+/*
+ * Default unassigned regions to raw read-write access. Some devices
+ * require this to function as they hide registers between the gaps in
+ * config space (be2net). Like MMIO and I/O port registers, we have
+ * to trust the hardware isolation.
+ */
+static struct perm_bits unassigned_perms = {
+ .readfn = vfio_raw_config_read,
+ .writefn = vfio_raw_config_write
+};
static void free_perm_bits(struct perm_bits *perm)
{
@@ -779,16 +803,16 @@ int __init vfio_pci_init_perm_bits(void)
/* Capabilities */
ret |= init_pci_cap_pm_perm(&cap_perms[PCI_CAP_ID_PM]);
- cap_perms[PCI_CAP_ID_VPD].writefn = vfio_direct_config_write;
+ cap_perms[PCI_CAP_ID_VPD].writefn = vfio_raw_config_write;
ret |= init_pci_cap_pcix_perm(&cap_perms[PCI_CAP_ID_PCIX]);
- cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_direct_config_write;
+ cap_perms[PCI_CAP_ID_VNDR].writefn = vfio_raw_config_write;
ret |= init_pci_cap_exp_perm(&cap_perms[PCI_CAP_ID_EXP]);
ret |= init_pci_cap_af_perm(&cap_perms[PCI_CAP_ID_AF]);
/* Extended capabilities */
ret |= init_pci_ext_cap_err_perm(&ecap_perms[PCI_EXT_CAP_ID_ERR]);
ret |= init_pci_ext_cap_pwr_perm(&ecap_perms[PCI_EXT_CAP_ID_PWR]);
- ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_direct_config_write;
+ ecap_perms[PCI_EXT_CAP_ID_VNDR].writefn = vfio_raw_config_write;
if (ret)
vfio_pci_uninit_perm_bits();
@@ -801,9 +825,6 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
u8 cap;
int base = (pos >= PCI_CFG_SPACE_SIZE) ? PCI_CFG_SPACE_SIZE :
PCI_STD_HEADER_SIZEOF;
- base /= 4;
- pos /= 4;
-
cap = vdev->pci_config_map[pos];
if (cap == PCI_CAP_ID_BASIC)
@@ -813,7 +834,7 @@ static int vfio_find_cap_start(struct vfio_pci_device *vdev, int pos)
while (pos - 1 >= base && vdev->pci_config_map[pos - 1] == cap)
pos--;
- return pos * 4;
+ return pos;
}
static int vfio_msi_config_read(struct vfio_pci_device *vdev, int pos,
@@ -1017,13 +1038,9 @@ static int vfio_cap_len(struct vfio_pci_device *vdev, u8 cap, u8 pos)
return byte;
case PCI_CAP_ID_EXP:
/* length based on version */
- ret = pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &word);
- if (ret)
- return pcibios_err_to_errno(ret);
-
vdev->extended_caps = true;
- if ((word & PCI_EXP_FLAGS_VERS) == 1)
+ if ((pcie_caps_reg(pdev) & PCI_EXP_FLAGS_VERS) == 1)
return PCI_CAP_EXP_ENDPOINT_SIZEOF_V1;
else
return PCI_CAP_EXP_ENDPOINT_SIZEOF_V2;
@@ -1230,8 +1247,8 @@ static int vfio_cap_init(struct vfio_pci_device *vdev)
}
/* Sanity check, do we overlap other capabilities? */
- for (i = 0; i < len; i += 4) {
- if (likely(map[(pos + i) / 4] == PCI_CAP_ID_INVALID))
+ for (i = 0; i < len; i++) {
+ if (likely(map[pos + i] == PCI_CAP_ID_INVALID))
continue;
pr_warn("%s: %s pci config conflict @0x%x, was cap 0x%x now cap 0x%x\n",
@@ -1239,7 +1256,7 @@ static int vfio_cap_init(struct vfio_pci_device *vdev)
pos + i, map[pos + i], cap);
}
- memset(map + (pos / 4), cap, len / 4);
+ memset(map + pos, cap, len);
ret = vfio_fill_vconfig_bytes(vdev, pos, len);
if (ret)
return ret;
@@ -1314,8 +1331,8 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev)
hidden = true;
}
- for (i = 0; i < len; i += 4) {
- if (likely(map[(epos + i) / 4] == PCI_CAP_ID_INVALID))
+ for (i = 0; i < len; i++) {
+ if (likely(map[epos + i] == PCI_CAP_ID_INVALID))
continue;
pr_warn("%s: %s pci config conflict @0x%x, was ecap 0x%x now ecap 0x%x\n",
@@ -1330,7 +1347,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev)
*/
BUILD_BUG_ON(PCI_EXT_CAP_ID_MAX >= PCI_CAP_ID_INVALID);
- memset(map + (epos / 4), ecap, len / 4);
+ memset(map + epos, ecap, len);
ret = vfio_fill_vconfig_bytes(vdev, epos, len);
if (ret)
return ret;
@@ -1377,10 +1394,12 @@ int vfio_config_init(struct vfio_pci_device *vdev)
int ret;
/*
- * Config space, caps and ecaps are all dword aligned, so we can
- * use one byte per dword to record the type.
+ * Config space, caps and ecaps are all dword aligned, so we could
+ * use one byte per dword to record the type. However, there are
+ * no requiremenst on the length of a capability, so the gap between
+ * capabilities needs byte granularity.
*/
- map = kmalloc(pdev->cfg_size / 4, GFP_KERNEL);
+ map = kmalloc(pdev->cfg_size, GFP_KERNEL);
if (!map)
return -ENOMEM;
@@ -1393,9 +1412,9 @@ int vfio_config_init(struct vfio_pci_device *vdev)
vdev->pci_config_map = map;
vdev->vconfig = vconfig;
- memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF / 4);
- memset(map + (PCI_STD_HEADER_SIZEOF / 4), PCI_CAP_ID_INVALID,
- (pdev->cfg_size - PCI_STD_HEADER_SIZEOF) / 4);
+ memset(map, PCI_CAP_ID_BASIC, PCI_STD_HEADER_SIZEOF);
+ memset(map + PCI_STD_HEADER_SIZEOF, PCI_CAP_ID_INVALID,
+ pdev->cfg_size - PCI_STD_HEADER_SIZEOF);
ret = vfio_fill_vconfig_bytes(vdev, 0, PCI_STD_HEADER_SIZEOF);
if (ret)
@@ -1450,6 +1469,22 @@ void vfio_config_free(struct vfio_pci_device *vdev)
vdev->msi_perm = NULL;
}
+/*
+ * Find the remaining number of bytes in a dword that match the given
+ * position. Stop at either the end of the capability or the dword boundary.
+ */
+static size_t vfio_pci_cap_remaining_dword(struct vfio_pci_device *vdev,
+ loff_t pos)
+{
+ u8 cap = vdev->pci_config_map[pos];
+ size_t i;
+
+ for (i = 1; (pos + i) % 4 && vdev->pci_config_map[pos + i] == cap; i++)
+ /* nop */;
+
+ return i;
+}
+
static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite)
{
@@ -1458,55 +1493,48 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_device *vdev, char __user *buf,
__le32 val = 0;
int cap_start = 0, offset;
u8 cap_id;
- ssize_t ret = count;
+ ssize_t ret;
- if (*ppos < 0 || *ppos + count > pdev->cfg_size)
+ if (*ppos < 0 || *ppos >= pdev->cfg_size ||
+ *ppos + count > pdev->cfg_size)
return -EFAULT;
/*
- * gcc can't seem to figure out we're a static function, only called
- * with count of 1/2/4 and hits copy_from_user_overflow without this.
+ * Chop accesses into aligned chunks containing no more than a
+ * single capability. Caller increments to the next chunk.
*/
- if (count > sizeof(val))
- return -EINVAL;
-
- cap_id = vdev->pci_config_map[*ppos / 4];
-
- if (cap_id == PCI_CAP_ID_INVALID) {
- if (iswrite)
- return ret; /* drop */
-
- /*
- * Per PCI spec 3.0, section 6.1, reads from reserved and
- * unimplemented registers return 0
- */
- if (copy_to_user(buf, &val, count))
- return -EFAULT;
-
- return ret;
- }
+ count = min(count, vfio_pci_cap_remaining_dword(vdev, *ppos));
+ if (count >= 4 && !(*ppos % 4))
+ count = 4;
+ else if (count >= 2 && !(*ppos % 2))
+ count = 2;
+ else
+ count = 1;
- /*
- * All capabilities are minimum 4 bytes and aligned on dword
- * boundaries. Since we don't support unaligned accesses, we're
- * only ever accessing a single capability.
- */
- if (*ppos >= PCI_CFG_SPACE_SIZE) {
- WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
+ ret = count;
- perm = &ecap_perms[cap_id];
- cap_start = vfio_find_cap_start(vdev, *ppos);
+ cap_id = vdev->pci_config_map[*ppos];
+ if (cap_id == PCI_CAP_ID_INVALID) {
+ perm = &unassigned_perms;
+ cap_start = *ppos;
} else {
- WARN_ON(cap_id > PCI_CAP_ID_MAX);
+ if (*ppos >= PCI_CFG_SPACE_SIZE) {
+ WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX);
- perm = &cap_perms[cap_id];
+ perm = &ecap_perms[cap_id];
+ cap_start = vfio_find_cap_start(vdev, *ppos);
+ } else {
+ WARN_ON(cap_id > PCI_CAP_ID_MAX);
- if (cap_id == PCI_CAP_ID_MSI)
- perm = vdev->msi_perm;
+ perm = &cap_perms[cap_id];
- if (cap_id > PCI_CAP_ID_BASIC)
- cap_start = vfio_find_cap_start(vdev, *ppos);
+ if (cap_id == PCI_CAP_ID_MSI)
+ perm = vdev->msi_perm;
+
+ if (cap_id > PCI_CAP_ID_BASIC)
+ cap_start = vfio_find_cap_start(vdev, *ppos);
+ }
}
WARN_ON(!cap_start && cap_id != PCI_CAP_ID_BASIC);
@@ -1546,20 +1574,8 @@ ssize_t vfio_pci_config_rw(struct vfio_pci_device *vdev, char __user *buf,
pos &= VFIO_PCI_OFFSET_MASK;
- /*
- * We want to both keep the access size the caller users as well as
- * support reading large chunks of config space in a single call.
- * PCI doesn't support unaligned accesses, so we can safely break
- * those apart.
- */
while (count) {
- if (count >= 4 && !(pos % 4))
- ret = vfio_config_do_rw(vdev, buf, 4, &pos, iswrite);
- else if (count >= 2 && !(pos % 2))
- ret = vfio_config_do_rw(vdev, buf, 2, &pos, iswrite);
- else
- ret = vfio_config_do_rw(vdev, buf, 1, &pos, iswrite);
-
+ ret = vfio_config_do_rw(vdev, buf, count, &pos, iswrite);
if (ret < 0)
return ret;
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index a965091..4bc704e 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -287,7 +287,8 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
* a signal is necessary, which can then be handled via a work queue
* or directly depending on the caller.
*/
-int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev, void *unused)
+static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev,
+ void *unused)
{
struct pci_dev *pdev = vdev->pdev;
unsigned long flags;
@@ -746,6 +747,63 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev,
return 0;
}
+static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev,
+ unsigned index, unsigned start,
+ unsigned count, uint32_t flags, void *data)
+{
+ int32_t fd = *(int32_t *)data;
+ struct pci_dev *pdev = vdev->pdev;
+
+ if ((index != VFIO_PCI_ERR_IRQ_INDEX) ||
+ !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK))
+ return -EINVAL;
+
+ /*
+ * device_lock synchronizes setting and checking of
+ * err_trigger. The vfio_pci_aer_err_detected() is also
+ * called with device_lock held.
+ */
+
+ /* DATA_NONE/DATA_BOOL enables loopback testing */
+
+ if (flags & VFIO_IRQ_SET_DATA_NONE) {
+ device_lock(&pdev->dev);
+ if (vdev->err_trigger)
+ eventfd_signal(vdev->err_trigger, 1);
+ device_unlock(&pdev->dev);
+ return 0;
+ } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+ uint8_t trigger = *(uint8_t *)data;
+ device_lock(&pdev->dev);
+ if (trigger && vdev->err_trigger)
+ eventfd_signal(vdev->err_trigger, 1);
+ device_unlock(&pdev->dev);
+ return 0;
+ }
+
+ /* Handle SET_DATA_EVENTFD */
+
+ if (fd == -1) {
+ device_lock(&pdev->dev);
+ if (vdev->err_trigger)
+ eventfd_ctx_put(vdev->err_trigger);
+ vdev->err_trigger = NULL;
+ device_unlock(&pdev->dev);
+ return 0;
+ } else if (fd >= 0) {
+ struct eventfd_ctx *efdctx;
+ efdctx = eventfd_ctx_fdget(fd);
+ if (IS_ERR(efdctx))
+ return PTR_ERR(efdctx);
+ device_lock(&pdev->dev);
+ if (vdev->err_trigger)
+ eventfd_ctx_put(vdev->err_trigger);
+ vdev->err_trigger = efdctx;
+ device_unlock(&pdev->dev);
+ return 0;
+ } else
+ return -EINVAL;
+}
int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
unsigned index, unsigned start, unsigned count,
void *data)
@@ -780,6 +838,13 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags,
break;
}
break;
+ case VFIO_PCI_ERR_IRQ_INDEX:
+ switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+ case VFIO_IRQ_SET_ACTION_TRIGGER:
+ if (pci_is_pcie(vdev->pdev))
+ func = vfio_pci_set_err_trigger;
+ break;
+ }
}
if (!func)
diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h
index d7e55d0..9c6d5d0 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -56,6 +56,7 @@ struct vfio_pci_device {
bool has_vga;
struct pci_saved_state *pci_saved_state;
atomic_t refcnt;
+ struct eventfd_ctx *err_trigger;
};
#define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX)
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index fcc12f3..acb7121 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -24,8 +24,10 @@
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
@@ -57,7 +59,7 @@ struct vfio_iommu_driver {
struct vfio_container {
struct kref kref;
struct list_head group_list;
- struct mutex group_lock;
+ struct rw_semaphore group_lock;
struct vfio_iommu_driver *iommu_driver;
void *iommu_data;
};
@@ -392,12 +394,13 @@ static void vfio_device_release(struct kref *kref)
}
/* Device reference always implies a group reference */
-static void vfio_device_put(struct vfio_device *device)
+void vfio_device_put(struct vfio_device *device)
{
struct vfio_group *group = device->group;
kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock);
vfio_group_put(group);
}
+EXPORT_SYMBOL_GPL(vfio_device_put);
static void vfio_device_get(struct vfio_device *device)
{
@@ -627,6 +630,33 @@ int vfio_add_group_dev(struct device *dev,
}
EXPORT_SYMBOL_GPL(vfio_add_group_dev);
+/**
+ * Get a reference to the vfio_device for a device that is known to
+ * be bound to a vfio driver. The driver implicitly holds a
+ * vfio_device reference between vfio_add_group_dev and
+ * vfio_del_group_dev. We can therefore use drvdata to increment
+ * that reference from the struct device. This additional
+ * reference must be released by calling vfio_device_put.
+ */
+struct vfio_device *vfio_device_get_from_dev(struct device *dev)
+{
+ struct vfio_device *device = dev_get_drvdata(dev);
+
+ vfio_device_get(device);
+
+ return device;
+}
+EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
+
+/*
+ * Caller must hold a reference to the vfio_device
+ */
+void *vfio_device_data(struct vfio_device *device)
+{
+ return device->device_data;
+}
+EXPORT_SYMBOL_GPL(vfio_device_data);
+
/* Given a referenced group, check if it contains the device */
static bool vfio_dev_present(struct vfio_group *group, struct device *dev)
{
@@ -675,9 +705,13 @@ EXPORT_SYMBOL_GPL(vfio_del_group_dev);
static long vfio_ioctl_check_extension(struct vfio_container *container,
unsigned long arg)
{
- struct vfio_iommu_driver *driver = container->iommu_driver;
+ struct vfio_iommu_driver *driver;
long ret = 0;
+ down_read(&container->group_lock);
+
+ driver = container->iommu_driver;
+
switch (arg) {
/* No base extensions yet */
default:
@@ -707,10 +741,12 @@ static long vfio_ioctl_check_extension(struct vfio_container *container,
VFIO_CHECK_EXTENSION, arg);
}
+ up_read(&container->group_lock);
+
return ret;
}
-/* hold container->group_lock */
+/* hold write lock on container->group_lock */
static int __vfio_container_attach_groups(struct vfio_container *container,
struct vfio_iommu_driver *driver,
void *data)
@@ -741,7 +777,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
struct vfio_iommu_driver *driver;
long ret = -ENODEV;
- mutex_lock(&container->group_lock);
+ down_write(&container->group_lock);
/*
* The container is designed to be an unprivileged interface while
@@ -752,7 +788,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
* the container is deprivileged and returns to an unset state.
*/
if (list_empty(&container->group_list) || container->iommu_driver) {
- mutex_unlock(&container->group_lock);
+ up_write(&container->group_lock);
return -EINVAL;
}
@@ -799,7 +835,7 @@ static long vfio_ioctl_set_iommu(struct vfio_container *container,
mutex_unlock(&vfio.iommu_drivers_lock);
skip_drivers_unlock:
- mutex_unlock(&container->group_lock);
+ up_write(&container->group_lock);
return ret;
}
@@ -815,9 +851,6 @@ static long vfio_fops_unl_ioctl(struct file *filep,
if (!container)
return ret;
- driver = container->iommu_driver;
- data = container->iommu_data;
-
switch (cmd) {
case VFIO_GET_API_VERSION:
ret = VFIO_API_VERSION;
@@ -829,8 +862,15 @@ static long vfio_fops_unl_ioctl(struct file *filep,
ret = vfio_ioctl_set_iommu(container, arg);
break;
default:
+ down_read(&container->group_lock);
+
+ driver = container->iommu_driver;
+ data = container->iommu_data;
+
if (driver) /* passthrough all unrecognized ioctls */
ret = driver->ops->ioctl(data, cmd, arg);
+
+ up_read(&container->group_lock);
}
return ret;
@@ -854,7 +894,7 @@ static int vfio_fops_open(struct inode *inode, struct file *filep)
return -ENOMEM;
INIT_LIST_HEAD(&container->group_list);
- mutex_init(&container->group_lock);
+ init_rwsem(&container->group_lock);
kref_init(&container->kref);
filep->private_data = container;
@@ -881,35 +921,55 @@ static ssize_t vfio_fops_read(struct file *filep, char __user *buf,
size_t count, loff_t *ppos)
{
struct vfio_container *container = filep->private_data;
- struct vfio_iommu_driver *driver = container->iommu_driver;
+ struct vfio_iommu_driver *driver;
+ ssize_t ret = -EINVAL;
- if (unlikely(!driver || !driver->ops->read))
- return -EINVAL;
+ down_read(&container->group_lock);
- return driver->ops->read(container->iommu_data, buf, count, ppos);
+ driver = container->iommu_driver;
+ if (likely(driver && driver->ops->read))
+ ret = driver->ops->read(container->iommu_data,
+ buf, count, ppos);
+
+ up_read(&container->group_lock);
+
+ return ret;
}
static ssize_t vfio_fops_write(struct file *filep, const char __user *buf,
size_t count, loff_t *ppos)
{
struct vfio_container *container = filep->private_data;
- struct vfio_iommu_driver *driver = container->iommu_driver;
+ struct vfio_iommu_driver *driver;
+ ssize_t ret = -EINVAL;
- if (unlikely(!driver || !driver->ops->write))
- return -EINVAL;
+ down_read(&container->group_lock);
- return driver->ops->write(container->iommu_data, buf, count, ppos);
+ driver = container->iommu_driver;
+ if (likely(driver && driver->ops->write))
+ ret = driver->ops->write(container->iommu_data,
+ buf, count, ppos);
+
+ up_read(&container->group_lock);
+
+ return ret;
}
static int vfio_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
struct vfio_container *container = filep->private_data;
- struct vfio_iommu_driver *driver = container->iommu_driver;
+ struct vfio_iommu_driver *driver;
+ int ret = -EINVAL;
- if (unlikely(!driver || !driver->ops->mmap))
- return -EINVAL;
+ down_read(&container->group_lock);
- return driver->ops->mmap(container->iommu_data, vma);
+ driver = container->iommu_driver;
+ if (likely(driver && driver->ops->mmap))
+ ret = driver->ops->mmap(container->iommu_data, vma);
+
+ up_read(&container->group_lock);
+
+ return ret;
}
static const struct file_operations vfio_fops = {
@@ -933,7 +993,7 @@ static void __vfio_group_unset_container(struct vfio_group *group)
struct vfio_container *container = group->container;
struct vfio_iommu_driver *driver;
- mutex_lock(&container->group_lock);
+ down_write(&container->group_lock);
driver = container->iommu_driver;
if (driver)
@@ -951,7 +1011,7 @@ static void __vfio_group_unset_container(struct vfio_group *group)
container->iommu_data = NULL;
}
- mutex_unlock(&container->group_lock);
+ up_write(&container->group_lock);
vfio_container_put(container);
}
@@ -1011,7 +1071,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)
container = f.file->private_data;
WARN_ON(!container); /* fget ensures we don't race vfio_release */
- mutex_lock(&container->group_lock);
+ down_write(&container->group_lock);
driver = container->iommu_driver;
if (driver) {
@@ -1029,7 +1089,7 @@ static int vfio_group_set_container(struct vfio_group *group, int container_fd)
atomic_inc(&group->container_users);
unlock_out:
- mutex_unlock(&container->group_lock);
+ up_write(&container->group_lock);
fdput(f);
return ret;
}
@@ -1300,6 +1360,9 @@ static const struct file_operations vfio_device_fops = {
*/
static char *vfio_devnode(struct device *dev, umode_t *mode)
{
+ if (MINOR(dev->devt) == 0)
+ *mode = S_IRUGO | S_IWUGO;
+
return kasprintf(GFP_KERNEL, "vfio/%s", dev_name(dev));
}
diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig
index bf24317..8b9226d 100644
--- a/drivers/vhost/Kconfig
+++ b/drivers/vhost/Kconfig
@@ -1,6 +1,7 @@
config VHOST_NET
tristate "Host kernel accelerator for virtio net"
depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP)
+ select VHOST_RING
---help---
This kernel module can be loaded in host kernel to accelerate
guest networking with virtio_net. Not to be confused with virtio_net
@@ -9,6 +10,17 @@ config VHOST_NET
To compile this driver as a module, choose M here: the module will
be called vhost_net.
-if STAGING
-source "drivers/vhost/Kconfig.tcm"
-endif
+config VHOST_SCSI
+ tristate "VHOST_SCSI TCM fabric driver"
+ depends on TARGET_CORE && EVENTFD && m
+ select VHOST_RING
+ default n
+ ---help---
+ Say M here to enable the vhost_scsi TCM fabric module
+ for use with virtio-scsi guests
+
+config VHOST_RING
+ tristate
+ ---help---
+ This option is selected by any driver which needs to access
+ the host side of a virtio ring.
diff --git a/drivers/vhost/Kconfig.tcm b/drivers/vhost/Kconfig.tcm
deleted file mode 100644
index 7e3aa28..0000000
--- a/drivers/vhost/Kconfig.tcm
+++ /dev/null
@@ -1,6 +0,0 @@
-config TCM_VHOST
- tristate "TCM_VHOST fabric module"
- depends on TARGET_CORE && EVENTFD && m
- default n
- ---help---
- Say M here to enable the TCM_VHOST fabric module for use with virtio-scsi guests
diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile
index a27b053..654e9afb 100644
--- a/drivers/vhost/Makefile
+++ b/drivers/vhost/Makefile
@@ -1,4 +1,7 @@
obj-$(CONFIG_VHOST_NET) += vhost_net.o
vhost_net-y := vhost.o net.o
-obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o
+obj-$(CONFIG_VHOST_SCSI) += vhost_scsi.o
+vhost_scsi-y := scsi.o
+
+obj-$(CONFIG_VHOST_RING) += vringh.o
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 87c216c..a3645bd 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -64,9 +64,35 @@ enum {
VHOST_NET_VQ_MAX = 2,
};
+struct vhost_ubuf_ref {
+ struct kref kref;
+ wait_queue_head_t wait;
+ struct vhost_virtqueue *vq;
+};
+
+struct vhost_net_virtqueue {
+ struct vhost_virtqueue vq;
+ /* hdr is used to store the virtio header.
+ * Since each iovec has >= 1 byte length, we never need more than
+ * header length entries to store the header. */
+ struct iovec hdr[sizeof(struct virtio_net_hdr_mrg_rxbuf)];
+ size_t vhost_hlen;
+ size_t sock_hlen;
+ /* vhost zerocopy support fields below: */
+ /* last used idx for outstanding DMA zerocopy buffers */
+ int upend_idx;
+ /* first used idx for DMA done zerocopy buffers */
+ int done_idx;
+ /* an array of userspace buffers info */
+ struct ubuf_info *ubuf_info;
+ /* Reference counting for outstanding ubufs.
+ * Protected by vq mutex. Writers must also take device mutex. */
+ struct vhost_ubuf_ref *ubufs;
+};
+
struct vhost_net {
struct vhost_dev dev;
- struct vhost_virtqueue vqs[VHOST_NET_VQ_MAX];
+ struct vhost_net_virtqueue vqs[VHOST_NET_VQ_MAX];
struct vhost_poll poll[VHOST_NET_VQ_MAX];
/* Number of TX recently submitted.
* Protected by tx vq lock. */
@@ -78,6 +104,90 @@ struct vhost_net {
bool tx_flush;
};
+static unsigned vhost_zcopy_mask __read_mostly;
+
+void vhost_enable_zcopy(int vq)
+{
+ vhost_zcopy_mask |= 0x1 << vq;
+}
+
+static void vhost_zerocopy_done_signal(struct kref *kref)
+{
+ struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref,
+ kref);
+ wake_up(&ubufs->wait);
+}
+
+struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq,
+ bool zcopy)
+{
+ struct vhost_ubuf_ref *ubufs;
+ /* No zero copy backend? Nothing to count. */
+ if (!zcopy)
+ return NULL;
+ ubufs = kmalloc(sizeof(*ubufs), GFP_KERNEL);
+ if (!ubufs)
+ return ERR_PTR(-ENOMEM);
+ kref_init(&ubufs->kref);
+ init_waitqueue_head(&ubufs->wait);
+ ubufs->vq = vq;
+ return ubufs;
+}
+
+void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs)
+{
+ kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+}
+
+void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
+{
+ kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
+ wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
+ kfree(ubufs);
+}
+
+int vhost_net_set_ubuf_info(struct vhost_net *n)
+{
+ bool zcopy;
+ int i;
+
+ for (i = 0; i < n->dev.nvqs; ++i) {
+ zcopy = vhost_zcopy_mask & (0x1 << i);
+ if (!zcopy)
+ continue;
+ n->vqs[i].ubuf_info = kmalloc(sizeof(*n->vqs[i].ubuf_info) *
+ UIO_MAXIOV, GFP_KERNEL);
+ if (!n->vqs[i].ubuf_info)
+ goto err;
+ }
+ return 0;
+
+err:
+ while (i--) {
+ zcopy = vhost_zcopy_mask & (0x1 << i);
+ if (!zcopy)
+ continue;
+ kfree(n->vqs[i].ubuf_info);
+ }
+ return -ENOMEM;
+}
+
+void vhost_net_vq_reset(struct vhost_net *n)
+{
+ int i;
+
+ for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
+ n->vqs[i].done_idx = 0;
+ n->vqs[i].upend_idx = 0;
+ n->vqs[i].ubufs = NULL;
+ kfree(n->vqs[i].ubuf_info);
+ n->vqs[i].ubuf_info = NULL;
+ n->vqs[i].vhost_hlen = 0;
+ n->vqs[i].sock_hlen = 0;
+ }
+
+}
+
static void vhost_net_tx_packet(struct vhost_net *net)
{
++net->tx_packets;
@@ -153,10 +263,12 @@ static void copy_iovec_hdr(const struct iovec *from, struct iovec *to,
static int vhost_zerocopy_signal_used(struct vhost_net *net,
struct vhost_virtqueue *vq)
{
+ struct vhost_net_virtqueue *nvq =
+ container_of(vq, struct vhost_net_virtqueue, vq);
int i;
int j = 0;
- for (i = vq->done_idx; i != vq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
+ for (i = nvq->done_idx; i != nvq->upend_idx; i = (i + 1) % UIO_MAXIOV) {
if (vq->heads[i].len == VHOST_DMA_FAILED_LEN)
vhost_net_tx_err(net);
if (VHOST_DMA_IS_DONE(vq->heads[i].len)) {
@@ -168,7 +280,7 @@ static int vhost_zerocopy_signal_used(struct vhost_net *net,
break;
}
if (j)
- vq->done_idx = i;
+ nvq->done_idx = i;
return j;
}
@@ -198,7 +310,8 @@ static void vhost_zerocopy_callback(struct ubuf_info *ubuf, bool success)
* read-size critical section for our kind of RCU. */
static void handle_tx(struct vhost_net *net)
{
- struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_TX];
+ struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+ struct vhost_virtqueue *vq = &nvq->vq;
unsigned out, in, s;
int head;
struct msghdr msg = {
@@ -224,8 +337,8 @@ static void handle_tx(struct vhost_net *net)
mutex_lock(&vq->mutex);
vhost_disable_notify(&net->dev, vq);
- hdr_size = vq->vhost_hlen;
- zcopy = vq->ubufs;
+ hdr_size = nvq->vhost_hlen;
+ zcopy = nvq->ubufs;
for (;;) {
/* Release DMAs done buffers first */
@@ -246,9 +359,10 @@ static void handle_tx(struct vhost_net *net)
/* If more outstanding DMAs, queue the work.
* Handle upend_idx wrap around
*/
- num_pends = likely(vq->upend_idx >= vq->done_idx) ?
- (vq->upend_idx - vq->done_idx) :
- (vq->upend_idx + UIO_MAXIOV - vq->done_idx);
+ num_pends = likely(nvq->upend_idx >= nvq->done_idx) ?
+ (nvq->upend_idx - nvq->done_idx) :
+ (nvq->upend_idx + UIO_MAXIOV -
+ nvq->done_idx);
if (unlikely(num_pends > VHOST_MAX_PEND))
break;
if (unlikely(vhost_enable_notify(&net->dev, vq))) {
@@ -263,45 +377,45 @@ static void handle_tx(struct vhost_net *net)
break;
}
/* Skip header. TODO: support TSO. */
- s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
+ s = move_iovec_hdr(vq->iov, nvq->hdr, hdr_size, out);
msg.msg_iovlen = out;
len = iov_length(vq->iov, out);
/* Sanity check */
if (!len) {
vq_err(vq, "Unexpected header len for TX: "
"%zd expected %zd\n",
- iov_length(vq->hdr, s), hdr_size);
+ iov_length(nvq->hdr, s), hdr_size);
break;
}
zcopy_used = zcopy && (len >= VHOST_GOODCOPY_LEN ||
- vq->upend_idx != vq->done_idx);
+ nvq->upend_idx != nvq->done_idx);
/* use msg_control to pass vhost zerocopy ubuf info to skb */
if (zcopy_used) {
- vq->heads[vq->upend_idx].id = head;
+ vq->heads[nvq->upend_idx].id = head;
if (!vhost_net_tx_select_zcopy(net) ||
len < VHOST_GOODCOPY_LEN) {
/* copy don't need to wait for DMA done */
- vq->heads[vq->upend_idx].len =
+ vq->heads[nvq->upend_idx].len =
VHOST_DMA_DONE_LEN;
msg.msg_control = NULL;
msg.msg_controllen = 0;
ubufs = NULL;
} else {
struct ubuf_info *ubuf;
- ubuf = vq->ubuf_info + vq->upend_idx;
+ ubuf = nvq->ubuf_info + nvq->upend_idx;
- vq->heads[vq->upend_idx].len =
+ vq->heads[nvq->upend_idx].len =
VHOST_DMA_IN_PROGRESS;
ubuf->callback = vhost_zerocopy_callback;
- ubuf->ctx = vq->ubufs;
- ubuf->desc = vq->upend_idx;
+ ubuf->ctx = nvq->ubufs;
+ ubuf->desc = nvq->upend_idx;
msg.msg_control = ubuf;
msg.msg_controllen = sizeof(ubuf);
- ubufs = vq->ubufs;
+ ubufs = nvq->ubufs;
kref_get(&ubufs->kref);
}
- vq->upend_idx = (vq->upend_idx + 1) % UIO_MAXIOV;
+ nvq->upend_idx = (nvq->upend_idx + 1) % UIO_MAXIOV;
}
/* TODO: Check specific error and bomb out unless ENOBUFS? */
err = sock->ops->sendmsg(NULL, sock, &msg, len);
@@ -309,8 +423,8 @@ static void handle_tx(struct vhost_net *net)
if (zcopy_used) {
if (ubufs)
vhost_ubuf_put(ubufs);
- vq->upend_idx = ((unsigned)vq->upend_idx - 1) %
- UIO_MAXIOV;
+ nvq->upend_idx = ((unsigned)nvq->upend_idx - 1)
+ % UIO_MAXIOV;
}
vhost_discard_vq_desc(vq, 1);
break;
@@ -417,7 +531,8 @@ err:
* read-size critical section for our kind of RCU. */
static void handle_rx(struct vhost_net *net)
{
- struct vhost_virtqueue *vq = &net->dev.vqs[VHOST_NET_VQ_RX];
+ struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_RX];
+ struct vhost_virtqueue *vq = &nvq->vq;
unsigned uninitialized_var(in), log;
struct vhost_log *vq_log;
struct msghdr msg = {
@@ -445,8 +560,8 @@ static void handle_rx(struct vhost_net *net)
mutex_lock(&vq->mutex);
vhost_disable_notify(&net->dev, vq);
- vhost_hlen = vq->vhost_hlen;
- sock_hlen = vq->sock_hlen;
+ vhost_hlen = nvq->vhost_hlen;
+ sock_hlen = nvq->sock_hlen;
vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
vq->log : NULL;
@@ -476,11 +591,11 @@ static void handle_rx(struct vhost_net *net)
/* We don't need to be notified again. */
if (unlikely((vhost_hlen)))
/* Skip header. TODO: support TSO. */
- move_iovec_hdr(vq->iov, vq->hdr, vhost_hlen, in);
+ move_iovec_hdr(vq->iov, nvq->hdr, vhost_hlen, in);
else
/* Copy the header for use in VIRTIO_NET_F_MRG_RXBUF:
* needed because recvmsg can modify msg_iov. */
- copy_iovec_hdr(vq->iov, vq->hdr, sock_hlen, in);
+ copy_iovec_hdr(vq->iov, nvq->hdr, sock_hlen, in);
msg.msg_iovlen = in;
err = sock->ops->recvmsg(NULL, sock, &msg,
sock_len, MSG_DONTWAIT | MSG_TRUNC);
@@ -494,7 +609,7 @@ static void handle_rx(struct vhost_net *net)
continue;
}
if (unlikely(vhost_hlen) &&
- memcpy_toiovecend(vq->hdr, (unsigned char *)&hdr, 0,
+ memcpy_toiovecend(nvq->hdr, (unsigned char *)&hdr, 0,
vhost_hlen)) {
vq_err(vq, "Unable to write vnet_hdr at addr %p\n",
vq->iov->iov_base);
@@ -502,7 +617,7 @@ static void handle_rx(struct vhost_net *net)
}
/* TODO: Should check and handle checksum. */
if (likely(mergeable) &&
- memcpy_toiovecend(vq->hdr, (unsigned char *)&headcount,
+ memcpy_toiovecend(nvq->hdr, (unsigned char *)&headcount,
offsetof(typeof(hdr), num_buffers),
sizeof hdr.num_buffers)) {
vq_err(vq, "Failed num_buffers write");
@@ -559,17 +674,34 @@ static int vhost_net_open(struct inode *inode, struct file *f)
{
struct vhost_net *n = kmalloc(sizeof *n, GFP_KERNEL);
struct vhost_dev *dev;
- int r;
+ struct vhost_virtqueue **vqs;
+ int r, i;
if (!n)
return -ENOMEM;
+ vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL);
+ if (!vqs) {
+ kfree(n);
+ return -ENOMEM;
+ }
dev = &n->dev;
- n->vqs[VHOST_NET_VQ_TX].handle_kick = handle_tx_kick;
- n->vqs[VHOST_NET_VQ_RX].handle_kick = handle_rx_kick;
- r = vhost_dev_init(dev, n->vqs, VHOST_NET_VQ_MAX);
+ vqs[VHOST_NET_VQ_TX] = &n->vqs[VHOST_NET_VQ_TX].vq;
+ vqs[VHOST_NET_VQ_RX] = &n->vqs[VHOST_NET_VQ_RX].vq;
+ n->vqs[VHOST_NET_VQ_TX].vq.handle_kick = handle_tx_kick;
+ n->vqs[VHOST_NET_VQ_RX].vq.handle_kick = handle_rx_kick;
+ for (i = 0; i < VHOST_NET_VQ_MAX; i++) {
+ n->vqs[i].ubufs = NULL;
+ n->vqs[i].ubuf_info = NULL;
+ n->vqs[i].upend_idx = 0;
+ n->vqs[i].done_idx = 0;
+ n->vqs[i].vhost_hlen = 0;
+ n->vqs[i].sock_hlen = 0;
+ }
+ r = vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
if (r < 0) {
kfree(n);
+ kfree(vqs);
return r;
}
@@ -584,7 +716,9 @@ static int vhost_net_open(struct inode *inode, struct file *f)
static void vhost_net_disable_vq(struct vhost_net *n,
struct vhost_virtqueue *vq)
{
- struct vhost_poll *poll = n->poll + (vq - n->vqs);
+ struct vhost_net_virtqueue *nvq =
+ container_of(vq, struct vhost_net_virtqueue, vq);
+ struct vhost_poll *poll = n->poll + (nvq - n->vqs);
if (!vq->private_data)
return;
vhost_poll_stop(poll);
@@ -593,7 +727,9 @@ static void vhost_net_disable_vq(struct vhost_net *n,
static int vhost_net_enable_vq(struct vhost_net *n,
struct vhost_virtqueue *vq)
{
- struct vhost_poll *poll = n->poll + (vq - n->vqs);
+ struct vhost_net_virtqueue *nvq =
+ container_of(vq, struct vhost_net_virtqueue, vq);
+ struct vhost_poll *poll = n->poll + (nvq - n->vqs);
struct socket *sock;
sock = rcu_dereference_protected(vq->private_data,
@@ -621,30 +757,30 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n,
static void vhost_net_stop(struct vhost_net *n, struct socket **tx_sock,
struct socket **rx_sock)
{
- *tx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_TX);
- *rx_sock = vhost_net_stop_vq(n, n->vqs + VHOST_NET_VQ_RX);
+ *tx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_TX].vq);
+ *rx_sock = vhost_net_stop_vq(n, &n->vqs[VHOST_NET_VQ_RX].vq);
}
static void vhost_net_flush_vq(struct vhost_net *n, int index)
{
vhost_poll_flush(n->poll + index);
- vhost_poll_flush(&n->dev.vqs[index].poll);
+ vhost_poll_flush(&n->vqs[index].vq.poll);
}
static void vhost_net_flush(struct vhost_net *n)
{
vhost_net_flush_vq(n, VHOST_NET_VQ_TX);
vhost_net_flush_vq(n, VHOST_NET_VQ_RX);
- if (n->dev.vqs[VHOST_NET_VQ_TX].ubufs) {
- mutex_lock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
+ if (n->vqs[VHOST_NET_VQ_TX].ubufs) {
+ mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
n->tx_flush = true;
- mutex_unlock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
+ mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
/* Wait for all lower device DMAs done. */
- vhost_ubuf_put_and_wait(n->dev.vqs[VHOST_NET_VQ_TX].ubufs);
- mutex_lock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
+ vhost_ubuf_put_and_wait(n->vqs[VHOST_NET_VQ_TX].ubufs);
+ mutex_lock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
n->tx_flush = false;
- kref_init(&n->dev.vqs[VHOST_NET_VQ_TX].ubufs->kref);
- mutex_unlock(&n->dev.vqs[VHOST_NET_VQ_TX].mutex);
+ kref_init(&n->vqs[VHOST_NET_VQ_TX].ubufs->kref);
+ mutex_unlock(&n->vqs[VHOST_NET_VQ_TX].vq.mutex);
}
}
@@ -658,6 +794,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
vhost_net_flush(n);
vhost_dev_stop(&n->dev);
vhost_dev_cleanup(&n->dev, false);
+ vhost_net_vq_reset(n);
if (tx_sock)
fput(tx_sock->file);
if (rx_sock)
@@ -665,6 +802,7 @@ static int vhost_net_release(struct inode *inode, struct file *f)
/* We do an extra flush before freeing memory,
* since jobs can re-queue themselves. */
vhost_net_flush(n);
+ kfree(n->dev.vqs);
kfree(n);
return 0;
}
@@ -738,6 +876,7 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
{
struct socket *sock, *oldsock;
struct vhost_virtqueue *vq;
+ struct vhost_net_virtqueue *nvq;
struct vhost_ubuf_ref *ubufs, *oldubufs = NULL;
int r;
@@ -750,7 +889,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
r = -ENOBUFS;
goto err;
}
- vq = n->vqs + index;
+ vq = &n->vqs[index].vq;
+ nvq = &n->vqs[index];
mutex_lock(&vq->mutex);
/* Verify that ring has been setup correctly. */
@@ -783,8 +923,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
if (r)
goto err_used;
- oldubufs = vq->ubufs;
- vq->ubufs = ubufs;
+ oldubufs = nvq->ubufs;
+ nvq->ubufs = ubufs;
n->tx_packets = 0;
n->tx_zcopy_err = 0;
@@ -827,14 +967,21 @@ static long vhost_net_reset_owner(struct vhost_net *n)
struct socket *tx_sock = NULL;
struct socket *rx_sock = NULL;
long err;
+ struct vhost_memory *memory;
mutex_lock(&n->dev.mutex);
err = vhost_dev_check_owner(&n->dev);
if (err)
goto done;
+ memory = vhost_dev_reset_owner_prepare();
+ if (!memory) {
+ err = -ENOMEM;
+ goto done;
+ }
vhost_net_stop(n, &tx_sock, &rx_sock);
vhost_net_flush(n);
- err = vhost_dev_reset_owner(&n->dev);
+ vhost_dev_reset_owner(&n->dev, memory);
+ vhost_net_vq_reset(n);
done:
mutex_unlock(&n->dev.mutex);
if (tx_sock)
@@ -870,10 +1017,10 @@ static int vhost_net_set_features(struct vhost_net *n, u64 features)
n->dev.acked_features = features;
smp_wmb();
for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
- mutex_lock(&n->vqs[i].mutex);
+ mutex_lock(&n->vqs[i].vq.mutex);
n->vqs[i].vhost_hlen = vhost_hlen;
n->vqs[i].sock_hlen = sock_hlen;
- mutex_unlock(&n->vqs[i].mutex);
+ mutex_unlock(&n->vqs[i].vq.mutex);
}
vhost_net_flush(n);
mutex_unlock(&n->dev.mutex);
@@ -910,11 +1057,17 @@ static long vhost_net_ioctl(struct file *f, unsigned int ioctl,
return vhost_net_reset_owner(n);
default:
mutex_lock(&n->dev.mutex);
+ if (ioctl == VHOST_SET_OWNER) {
+ r = vhost_net_set_ubuf_info(n);
+ if (r)
+ goto out;
+ }
r = vhost_dev_ioctl(&n->dev, ioctl, argp);
if (r == -ENOIOCTLCMD)
r = vhost_vring_ioctl(&n->dev, ioctl, argp);
else
vhost_net_flush(n);
+out:
mutex_unlock(&n->dev.mutex);
return r;
}
diff --git a/drivers/vhost/tcm_vhost.c b/drivers/vhost/scsi.c
index 1677238..5179f7a 100644
--- a/drivers/vhost/tcm_vhost.c
+++ b/drivers/vhost/scsi.c
@@ -45,14 +45,116 @@
#include <target/target_core_configfs.h>
#include <target/configfs_macros.h>
#include <linux/vhost.h>
-#include <linux/virtio_net.h> /* TODO vhost.h currently depends on this */
#include <linux/virtio_scsi.h>
#include <linux/llist.h>
#include <linux/bitmap.h>
#include "vhost.c"
#include "vhost.h"
-#include "tcm_vhost.h"
+
+#define TCM_VHOST_VERSION "v0.1"
+#define TCM_VHOST_NAMELEN 256
+#define TCM_VHOST_MAX_CDB_SIZE 32
+
+struct vhost_scsi_inflight {
+ /* Wait for the flush operation to finish */
+ struct completion comp;
+ /* Refcount for the inflight reqs */
+ struct kref kref;
+};
+
+struct tcm_vhost_cmd {
+ /* Descriptor from vhost_get_vq_desc() for virt_queue segment */
+ int tvc_vq_desc;
+ /* virtio-scsi initiator task attribute */
+ int tvc_task_attr;
+ /* virtio-scsi initiator data direction */
+ enum dma_data_direction tvc_data_direction;
+ /* Expected data transfer length from virtio-scsi header */
+ u32 tvc_exp_data_len;
+ /* The Tag from include/linux/virtio_scsi.h:struct virtio_scsi_cmd_req */
+ u64 tvc_tag;
+ /* The number of scatterlists associated with this cmd */
+ u32 tvc_sgl_count;
+ /* Saved unpacked SCSI LUN for tcm_vhost_submission_work() */
+ u32 tvc_lun;
+ /* Pointer to the SGL formatted memory from virtio-scsi */
+ struct scatterlist *tvc_sgl;
+ /* Pointer to response */
+ struct virtio_scsi_cmd_resp __user *tvc_resp;
+ /* Pointer to vhost_scsi for our device */
+ struct vhost_scsi *tvc_vhost;
+ /* Pointer to vhost_virtqueue for the cmd */
+ struct vhost_virtqueue *tvc_vq;
+ /* Pointer to vhost nexus memory */
+ struct tcm_vhost_nexus *tvc_nexus;
+ /* The TCM I/O descriptor that is accessed via container_of() */
+ struct se_cmd tvc_se_cmd;
+ /* work item used for cmwq dispatch to tcm_vhost_submission_work() */
+ struct work_struct work;
+ /* Copy of the incoming SCSI command descriptor block (CDB) */
+ unsigned char tvc_cdb[TCM_VHOST_MAX_CDB_SIZE];
+ /* Sense buffer that will be mapped into outgoing status */
+ unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
+ /* Completed commands list, serviced from vhost worker thread */
+ struct llist_node tvc_completion_list;
+ /* Used to track inflight cmd */
+ struct vhost_scsi_inflight *inflight;
+};
+
+struct tcm_vhost_nexus {
+ /* Pointer to TCM session for I_T Nexus */
+ struct se_session *tvn_se_sess;
+};
+
+struct tcm_vhost_nacl {
+ /* Binary World Wide unique Port Name for Vhost Initiator port */
+ u64 iport_wwpn;
+ /* ASCII formatted WWPN for Sas Initiator port */
+ char iport_name[TCM_VHOST_NAMELEN];
+ /* Returned by tcm_vhost_make_nodeacl() */
+ struct se_node_acl se_node_acl;
+};
+
+struct vhost_scsi;
+struct tcm_vhost_tpg {
+ /* Vhost port target portal group tag for TCM */
+ u16 tport_tpgt;
+ /* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */
+ int tv_tpg_port_count;
+ /* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */
+ int tv_tpg_vhost_count;
+ /* list for tcm_vhost_list */
+ struct list_head tv_tpg_list;
+ /* Used to protect access for tpg_nexus */
+ struct mutex tv_tpg_mutex;
+ /* Pointer to the TCM VHost I_T Nexus for this TPG endpoint */
+ struct tcm_vhost_nexus *tpg_nexus;
+ /* Pointer back to tcm_vhost_tport */
+ struct tcm_vhost_tport *tport;
+ /* Returned by tcm_vhost_make_tpg() */
+ struct se_portal_group se_tpg;
+ /* Pointer back to vhost_scsi, protected by tv_tpg_mutex */
+ struct vhost_scsi *vhost_scsi;
+};
+
+struct tcm_vhost_tport {
+ /* SCSI protocol the tport is providing */
+ u8 tport_proto_id;
+ /* Binary World Wide unique Port Name for Vhost Target port */
+ u64 tport_wwpn;
+ /* ASCII formatted WWPN for Vhost Target port */
+ char tport_name[TCM_VHOST_NAMELEN];
+ /* Returned by tcm_vhost_make_tport() */
+ struct se_wwn tport_wwn;
+};
+
+struct tcm_vhost_evt {
+ /* event to be sent to guest */
+ struct virtio_scsi_event event;
+ /* event list, serviced from vhost worker thread */
+ struct llist_node list;
+};
enum {
VHOST_SCSI_VQ_CTL = 0,
@@ -74,13 +176,28 @@ enum {
#define VHOST_SCSI_MAX_VQ 128
#define VHOST_SCSI_MAX_EVENT 128
+struct vhost_scsi_virtqueue {
+ struct vhost_virtqueue vq;
+ /*
+ * Reference counting for inflight reqs, used for flush operation. At
+ * each time, one reference tracks new commands submitted, while we
+ * wait for another one to reach 0.
+ */
+ struct vhost_scsi_inflight inflights[2];
+ /*
+ * Indicate current inflight in use, protected by vq->mutex.
+ * Writers must also take dev mutex and flush under it.
+ */
+ int inflight_idx;
+};
+
struct vhost_scsi {
/* Protected by vhost_scsi->dev.mutex */
struct tcm_vhost_tpg **vs_tpg;
char vs_vhost_wwpn[TRANSPORT_IQN_LEN];
struct vhost_dev dev;
- struct vhost_virtqueue vqs[VHOST_SCSI_MAX_VQ];
+ struct vhost_scsi_virtqueue vqs[VHOST_SCSI_MAX_VQ];
struct vhost_work vs_completion_work; /* cmd completion work item */
struct llist_head vs_completion_list; /* cmd completion queue */
@@ -107,6 +224,59 @@ static int iov_num_pages(struct iovec *iov)
((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT;
}
+void tcm_vhost_done_inflight(struct kref *kref)
+{
+ struct vhost_scsi_inflight *inflight;
+
+ inflight = container_of(kref, struct vhost_scsi_inflight, kref);
+ complete(&inflight->comp);
+}
+
+static void tcm_vhost_init_inflight(struct vhost_scsi *vs,
+ struct vhost_scsi_inflight *old_inflight[])
+{
+ struct vhost_scsi_inflight *new_inflight;
+ struct vhost_virtqueue *vq;
+ int idx, i;
+
+ for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) {
+ vq = &vs->vqs[i].vq;
+
+ mutex_lock(&vq->mutex);
+
+ /* store old infight */
+ idx = vs->vqs[i].inflight_idx;
+ if (old_inflight)
+ old_inflight[i] = &vs->vqs[i].inflights[idx];
+
+ /* setup new infight */
+ vs->vqs[i].inflight_idx = idx ^ 1;
+ new_inflight = &vs->vqs[i].inflights[idx ^ 1];
+ kref_init(&new_inflight->kref);
+ init_completion(&new_inflight->comp);
+
+ mutex_unlock(&vq->mutex);
+ }
+}
+
+static struct vhost_scsi_inflight *
+tcm_vhost_get_inflight(struct vhost_virtqueue *vq)
+{
+ struct vhost_scsi_inflight *inflight;
+ struct vhost_scsi_virtqueue *svq;
+
+ svq = container_of(vq, struct vhost_scsi_virtqueue, vq);
+ inflight = &svq->inflights[svq->inflight_idx];
+ kref_get(&inflight->kref);
+
+ return inflight;
+}
+
+static void tcm_vhost_put_inflight(struct vhost_scsi_inflight *inflight)
+{
+ kref_put(&inflight->kref, tcm_vhost_done_inflight);
+}
+
static int tcm_vhost_check_true(struct se_portal_group *se_tpg)
{
return 1;
@@ -366,7 +536,7 @@ static void tcm_vhost_free_evt(struct vhost_scsi *vs, struct tcm_vhost_evt *evt)
static struct tcm_vhost_evt *tcm_vhost_allocate_evt(struct vhost_scsi *vs,
u32 event, u32 reason)
{
- struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+ struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
struct tcm_vhost_evt *evt;
if (vs->vs_events_nr > VHOST_SCSI_MAX_EVENT) {
@@ -403,13 +573,15 @@ static void vhost_scsi_free_cmd(struct tcm_vhost_cmd *tv_cmd)
kfree(tv_cmd->tvc_sgl);
}
+ tcm_vhost_put_inflight(tv_cmd->inflight);
+
kfree(tv_cmd);
}
static void tcm_vhost_do_evt_work(struct vhost_scsi *vs,
struct tcm_vhost_evt *evt)
{
- struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+ struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
struct virtio_scsi_event *event = &evt->event;
struct virtio_scsi_event __user *eventp;
unsigned out, in;
@@ -460,7 +632,7 @@ static void tcm_vhost_evt_work(struct vhost_work *work)
{
struct vhost_scsi *vs = container_of(work, struct vhost_scsi,
vs_event_work);
- struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+ struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
struct tcm_vhost_evt *evt;
struct llist_node *llnode;
@@ -511,8 +683,10 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
v_rsp.sense_len);
ret = copy_to_user(tv_cmd->tvc_resp, &v_rsp, sizeof(v_rsp));
if (likely(ret == 0)) {
+ struct vhost_scsi_virtqueue *q;
vhost_add_used(tv_cmd->tvc_vq, tv_cmd->tvc_vq_desc, 0);
- vq = tv_cmd->tvc_vq - vs->vqs;
+ q = container_of(tv_cmd->tvc_vq, struct vhost_scsi_virtqueue, vq);
+ vq = q - vs->vqs;
__set_bit(vq, signal);
} else
pr_err("Faulted on virtio_scsi_cmd_resp\n");
@@ -523,10 +697,11 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
vq = -1;
while ((vq = find_next_bit(signal, VHOST_SCSI_MAX_VQ, vq + 1))
< VHOST_SCSI_MAX_VQ)
- vhost_signal(&vs->dev, &vs->vqs[vq]);
+ vhost_signal(&vs->dev, &vs->vqs[vq].vq);
}
static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
+ struct vhost_virtqueue *vq,
struct tcm_vhost_tpg *tv_tpg,
struct virtio_scsi_cmd_req *v_req,
u32 exp_data_len,
@@ -551,6 +726,7 @@ static struct tcm_vhost_cmd *vhost_scsi_allocate_cmd(
tv_cmd->tvc_exp_data_len = exp_data_len;
tv_cmd->tvc_data_direction = data_direction;
tv_cmd->tvc_nexus = tv_nexus;
+ tv_cmd->inflight = tcm_vhost_get_inflight(vq);
return tv_cmd;
}
@@ -806,7 +982,7 @@ static void vhost_scsi_handle_vq(struct vhost_scsi *vs,
for (i = 0; i < data_num; i++)
exp_data_len += vq->iov[data_first + i].iov_len;
- tv_cmd = vhost_scsi_allocate_cmd(tv_tpg, &v_req,
+ tv_cmd = vhost_scsi_allocate_cmd(vq, tv_tpg, &v_req,
exp_data_len, data_direction);
if (IS_ERR(tv_cmd)) {
vq_err(vq, "vhost_scsi_allocate_cmd failed %ld\n",
@@ -938,17 +1114,35 @@ static void vhost_scsi_handle_kick(struct vhost_work *work)
static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
{
- vhost_poll_flush(&vs->dev.vqs[index].poll);
+ vhost_poll_flush(&vs->vqs[index].vq.poll);
}
+/* Callers must hold dev mutex */
static void vhost_scsi_flush(struct vhost_scsi *vs)
{
+ struct vhost_scsi_inflight *old_inflight[VHOST_SCSI_MAX_VQ];
int i;
+ /* Init new inflight and remember the old inflight */
+ tcm_vhost_init_inflight(vs, old_inflight);
+
+ /*
+ * The inflight->kref was initialized to 1. We decrement it here to
+ * indicate the start of the flush operation so that it will reach 0
+ * when all the reqs are finished.
+ */
+ for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
+ kref_put(&old_inflight[i]->kref, tcm_vhost_done_inflight);
+
+ /* Flush both the vhost poll and vhost work */
for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
vhost_scsi_flush_vq(vs, i);
vhost_work_flush(&vs->dev, &vs->vs_completion_work);
vhost_work_flush(&vs->dev, &vs->vs_event_work);
+
+ /* Wait for all reqs issued before the flush to be finished */
+ for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
+ wait_for_completion(&old_inflight[i]->comp);
}
/*
@@ -975,7 +1169,7 @@ static int vhost_scsi_set_endpoint(
/* Verify that ring has been setup correctly. */
for (index = 0; index < vs->dev.nvqs; ++index) {
/* Verify that ring has been setup correctly. */
- if (!vhost_vq_access_ok(&vs->vqs[index])) {
+ if (!vhost_vq_access_ok(&vs->vqs[index].vq)) {
ret = -EFAULT;
goto out;
}
@@ -1022,7 +1216,7 @@ static int vhost_scsi_set_endpoint(
memcpy(vs->vs_vhost_wwpn, t->vhost_wwpn,
sizeof(vs->vs_vhost_wwpn));
for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) {
- vq = &vs->vqs[i];
+ vq = &vs->vqs[i].vq;
/* Flushing the vhost_work acts as synchronize_rcu */
mutex_lock(&vq->mutex);
rcu_assign_pointer(vq->private_data, vs_tpg);
@@ -1063,7 +1257,7 @@ static int vhost_scsi_clear_endpoint(
mutex_lock(&vs->dev.mutex);
/* Verify that ring has been setup correctly. */
for (index = 0; index < vs->dev.nvqs; ++index) {
- if (!vhost_vq_access_ok(&vs->vqs[index])) {
+ if (!vhost_vq_access_ok(&vs->vqs[index].vq)) {
ret = -EFAULT;
goto err_dev;
}
@@ -1103,7 +1297,7 @@ static int vhost_scsi_clear_endpoint(
}
if (match) {
for (i = 0; i < VHOST_SCSI_MAX_VQ; i++) {
- vq = &vs->vqs[i];
+ vq = &vs->vqs[i].vq;
/* Flushing the vhost_work acts as synchronize_rcu */
mutex_lock(&vq->mutex);
rcu_assign_pointer(vq->private_data, NULL);
@@ -1151,24 +1345,39 @@ static int vhost_scsi_set_features(struct vhost_scsi *vs, u64 features)
static int vhost_scsi_open(struct inode *inode, struct file *f)
{
struct vhost_scsi *s;
+ struct vhost_virtqueue **vqs;
int r, i;
s = kzalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
+ vqs = kmalloc(VHOST_SCSI_MAX_VQ * sizeof(*vqs), GFP_KERNEL);
+ if (!vqs) {
+ kfree(s);
+ return -ENOMEM;
+ }
+
vhost_work_init(&s->vs_completion_work, vhost_scsi_complete_cmd_work);
vhost_work_init(&s->vs_event_work, tcm_vhost_evt_work);
s->vs_events_nr = 0;
s->vs_events_missed = false;
- s->vqs[VHOST_SCSI_VQ_CTL].handle_kick = vhost_scsi_ctl_handle_kick;
- s->vqs[VHOST_SCSI_VQ_EVT].handle_kick = vhost_scsi_evt_handle_kick;
- for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++)
- s->vqs[i].handle_kick = vhost_scsi_handle_kick;
- r = vhost_dev_init(&s->dev, s->vqs, VHOST_SCSI_MAX_VQ);
+ vqs[VHOST_SCSI_VQ_CTL] = &s->vqs[VHOST_SCSI_VQ_CTL].vq;
+ vqs[VHOST_SCSI_VQ_EVT] = &s->vqs[VHOST_SCSI_VQ_EVT].vq;
+ s->vqs[VHOST_SCSI_VQ_CTL].vq.handle_kick = vhost_scsi_ctl_handle_kick;
+ s->vqs[VHOST_SCSI_VQ_EVT].vq.handle_kick = vhost_scsi_evt_handle_kick;
+ for (i = VHOST_SCSI_VQ_IO; i < VHOST_SCSI_MAX_VQ; i++) {
+ vqs[i] = &s->vqs[i].vq;
+ s->vqs[i].vq.handle_kick = vhost_scsi_handle_kick;
+ }
+ r = vhost_dev_init(&s->dev, vqs, VHOST_SCSI_MAX_VQ);
+
+ tcm_vhost_init_inflight(s, NULL);
+
if (r < 0) {
+ kfree(vqs);
kfree(s);
return r;
}
@@ -1190,6 +1399,7 @@ static int vhost_scsi_release(struct inode *inode, struct file *f)
vhost_dev_cleanup(&s->dev, false);
/* Jobs can re-queue themselves in evt kick handler. Do extra flush. */
vhost_scsi_flush(s);
+ kfree(s->dev.vqs);
kfree(s);
return 0;
}
@@ -1205,7 +1415,7 @@ static long vhost_scsi_ioctl(struct file *f, unsigned int ioctl,
u32 events_missed;
u64 features;
int r, abi_version = VHOST_SCSI_ABI_VERSION;
- struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+ struct vhost_virtqueue *vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
switch (ioctl) {
case VHOST_SCSI_SET_ENDPOINT:
@@ -1333,7 +1543,7 @@ static void tcm_vhost_do_plug(struct tcm_vhost_tpg *tpg,
else
reason = VIRTIO_SCSI_EVT_RESET_REMOVED;
- vq = &vs->vqs[VHOST_SCSI_VQ_EVT];
+ vq = &vs->vqs[VHOST_SCSI_VQ_EVT].vq;
mutex_lock(&vq->mutex);
tcm_vhost_send_evt(vs, tpg, lun,
VIRTIO_SCSI_T_TRANSPORT_RESET, reason);
@@ -1926,7 +2136,8 @@ static void tcm_vhost_exit(void)
destroy_workqueue(tcm_vhost_workqueue);
};
-MODULE_DESCRIPTION("TCM_VHOST series fabric driver");
+MODULE_DESCRIPTION("VHOST_SCSI series fabric driver");
+MODULE_ALIAS("tcm_vhost");
MODULE_LICENSE("GPL");
module_init(tcm_vhost_init);
module_exit(tcm_vhost_exit);
diff --git a/drivers/vhost/tcm_vhost.h b/drivers/vhost/tcm_vhost.h
deleted file mode 100644
index 514b9fd..0000000
--- a/drivers/vhost/tcm_vhost.h
+++ /dev/null
@@ -1,128 +0,0 @@
-#define TCM_VHOST_VERSION "v0.1"
-#define TCM_VHOST_NAMELEN 256
-#define TCM_VHOST_MAX_CDB_SIZE 32
-
-struct tcm_vhost_cmd {
- /* Descriptor from vhost_get_vq_desc() for virt_queue segment */
- int tvc_vq_desc;
- /* virtio-scsi initiator task attribute */
- int tvc_task_attr;
- /* virtio-scsi initiator data direction */
- enum dma_data_direction tvc_data_direction;
- /* Expected data transfer length from virtio-scsi header */
- u32 tvc_exp_data_len;
- /* The Tag from include/linux/virtio_scsi.h:struct virtio_scsi_cmd_req */
- u64 tvc_tag;
- /* The number of scatterlists associated with this cmd */
- u32 tvc_sgl_count;
- /* Saved unpacked SCSI LUN for tcm_vhost_submission_work() */
- u32 tvc_lun;
- /* Pointer to the SGL formatted memory from virtio-scsi */
- struct scatterlist *tvc_sgl;
- /* Pointer to response */
- struct virtio_scsi_cmd_resp __user *tvc_resp;
- /* Pointer to vhost_scsi for our device */
- struct vhost_scsi *tvc_vhost;
- /* Pointer to vhost_virtqueue for the cmd */
- struct vhost_virtqueue *tvc_vq;
- /* Pointer to vhost nexus memory */
- struct tcm_vhost_nexus *tvc_nexus;
- /* The TCM I/O descriptor that is accessed via container_of() */
- struct se_cmd tvc_se_cmd;
- /* work item used for cmwq dispatch to tcm_vhost_submission_work() */
- struct work_struct work;
- /* Copy of the incoming SCSI command descriptor block (CDB) */
- unsigned char tvc_cdb[TCM_VHOST_MAX_CDB_SIZE];
- /* Sense buffer that will be mapped into outgoing status */
- unsigned char tvc_sense_buf[TRANSPORT_SENSE_BUFFER];
- /* Completed commands list, serviced from vhost worker thread */
- struct llist_node tvc_completion_list;
-};
-
-struct tcm_vhost_nexus {
- /* Pointer to TCM session for I_T Nexus */
- struct se_session *tvn_se_sess;
-};
-
-struct tcm_vhost_nacl {
- /* Binary World Wide unique Port Name for Vhost Initiator port */
- u64 iport_wwpn;
- /* ASCII formatted WWPN for Sas Initiator port */
- char iport_name[TCM_VHOST_NAMELEN];
- /* Returned by tcm_vhost_make_nodeacl() */
- struct se_node_acl se_node_acl;
-};
-
-struct vhost_scsi;
-struct tcm_vhost_tpg {
- /* Vhost port target portal group tag for TCM */
- u16 tport_tpgt;
- /* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */
- int tv_tpg_port_count;
- /* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */
- int tv_tpg_vhost_count;
- /* list for tcm_vhost_list */
- struct list_head tv_tpg_list;
- /* Used to protect access for tpg_nexus */
- struct mutex tv_tpg_mutex;
- /* Pointer to the TCM VHost I_T Nexus for this TPG endpoint */
- struct tcm_vhost_nexus *tpg_nexus;
- /* Pointer back to tcm_vhost_tport */
- struct tcm_vhost_tport *tport;
- /* Returned by tcm_vhost_make_tpg() */
- struct se_portal_group se_tpg;
- /* Pointer back to vhost_scsi, protected by tv_tpg_mutex */
- struct vhost_scsi *vhost_scsi;
-};
-
-struct tcm_vhost_tport {
- /* SCSI protocol the tport is providing */
- u8 tport_proto_id;
- /* Binary World Wide unique Port Name for Vhost Target port */
- u64 tport_wwpn;
- /* ASCII formatted WWPN for Vhost Target port */
- char tport_name[TCM_VHOST_NAMELEN];
- /* Returned by tcm_vhost_make_tport() */
- struct se_wwn tport_wwn;
-};
-
-struct tcm_vhost_evt {
- /* event to be sent to guest */
- struct virtio_scsi_event event;
- /* event list, serviced from vhost worker thread */
- struct llist_node list;
-};
-
-/*
- * As per request from MST, keep TCM_VHOST related ioctl defines out of
- * linux/vhost.h (user-space) for now..
- */
-
-#include <linux/vhost.h>
-
-/*
- * Used by QEMU userspace to ensure a consistent vhost-scsi ABI.
- *
- * ABI Rev 0: July 2012 version starting point for v3.6-rc merge candidate +
- * RFC-v2 vhost-scsi userspace. Add GET_ABI_VERSION ioctl usage
- * ABI Rev 1: January 2013. Ignore vhost_tpgt filed in struct vhost_scsi_target.
- * All the targets under vhost_wwpn can be seen and used by guset.
- */
-
-#define VHOST_SCSI_ABI_VERSION 1
-
-struct vhost_scsi_target {
- int abi_version;
- char vhost_wwpn[TRANSPORT_IQN_LEN];
- unsigned short vhost_tpgt;
- unsigned short reserved;
-};
-
-/* VHOST_SCSI specific defines */
-#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target)
-#define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target)
-/* Changing this breaks userspace. */
-#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, int)
-/* Set and get the events missed flag */
-#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32)
-#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32)
diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c
index 91d6f06..1ee45bc 100644
--- a/drivers/vhost/test.c
+++ b/drivers/vhost/test.c
@@ -219,13 +219,20 @@ static long vhost_test_reset_owner(struct vhost_test *n)
{
void *priv = NULL;
long err;
+ struct vhost_memory *memory;
+
mutex_lock(&n->dev.mutex);
err = vhost_dev_check_owner(&n->dev);
if (err)
goto done;
+ memory = vhost_dev_reset_owner_prepare();
+ if (!memory) {
+ err = -ENOMEM;
+ goto done;
+ }
vhost_test_stop(n, &priv);
vhost_test_flush(n);
- err = vhost_dev_reset_owner(&n->dev);
+ vhost_dev_reset_owner(&n->dev, memory);
done:
mutex_unlock(&n->dev.mutex);
return err;
@@ -275,7 +282,9 @@ static long vhost_test_ioctl(struct file *f, unsigned int ioctl,
return vhost_test_reset_owner(n);
default:
mutex_lock(&n->dev.mutex);
- r = vhost_dev_ioctl(&n->dev, ioctl, arg);
+ r = vhost_dev_ioctl(&n->dev, ioctl, argp);
+ if (r == -ENOIOCTLCMD)
+ r = vhost_vring_ioctl(&n->dev, ioctl, argp);
vhost_test_flush(n);
mutex_unlock(&n->dev.mutex);
return r;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 4eecdb8..749b5ab 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -33,8 +33,6 @@ enum {
VHOST_MEMORY_F_LOG = 0x1,
};
-static unsigned vhost_zcopy_mask __read_mostly;
-
#define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num])
#define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num])
@@ -181,8 +179,6 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->used_flags = 0;
vq->log_used = false;
vq->log_addr = -1ull;
- vq->vhost_hlen = 0;
- vq->sock_hlen = 0;
vq->private_data = NULL;
vq->log_base = NULL;
vq->error_ctx = NULL;
@@ -191,9 +187,6 @@ static void vhost_vq_reset(struct vhost_dev *dev,
vq->call_ctx = NULL;
vq->call = NULL;
vq->log_ctx = NULL;
- vq->upend_idx = 0;
- vq->done_idx = 0;
- vq->ubufs = NULL;
}
static int vhost_worker(void *data)
@@ -253,43 +246,29 @@ static void vhost_vq_free_iovecs(struct vhost_virtqueue *vq)
vq->log = NULL;
kfree(vq->heads);
vq->heads = NULL;
- kfree(vq->ubuf_info);
- vq->ubuf_info = NULL;
-}
-
-void vhost_enable_zcopy(int vq)
-{
- vhost_zcopy_mask |= 0x1 << vq;
}
/* Helper to allocate iovec buffers for all vqs. */
static long vhost_dev_alloc_iovecs(struct vhost_dev *dev)
{
int i;
- bool zcopy;
for (i = 0; i < dev->nvqs; ++i) {
- dev->vqs[i].indirect = kmalloc(sizeof *dev->vqs[i].indirect *
+ dev->vqs[i]->indirect = kmalloc(sizeof *dev->vqs[i]->indirect *
UIO_MAXIOV, GFP_KERNEL);
- dev->vqs[i].log = kmalloc(sizeof *dev->vqs[i].log * UIO_MAXIOV,
+ dev->vqs[i]->log = kmalloc(sizeof *dev->vqs[i]->log * UIO_MAXIOV,
GFP_KERNEL);
- dev->vqs[i].heads = kmalloc(sizeof *dev->vqs[i].heads *
+ dev->vqs[i]->heads = kmalloc(sizeof *dev->vqs[i]->heads *
UIO_MAXIOV, GFP_KERNEL);
- zcopy = vhost_zcopy_mask & (0x1 << i);
- if (zcopy)
- dev->vqs[i].ubuf_info =
- kmalloc(sizeof *dev->vqs[i].ubuf_info *
- UIO_MAXIOV, GFP_KERNEL);
- if (!dev->vqs[i].indirect || !dev->vqs[i].log ||
- !dev->vqs[i].heads ||
- (zcopy && !dev->vqs[i].ubuf_info))
+ if (!dev->vqs[i]->indirect || !dev->vqs[i]->log ||
+ !dev->vqs[i]->heads)
goto err_nomem;
}
return 0;
err_nomem:
for (; i >= 0; --i)
- vhost_vq_free_iovecs(&dev->vqs[i]);
+ vhost_vq_free_iovecs(dev->vqs[i]);
return -ENOMEM;
}
@@ -298,11 +277,11 @@ static void vhost_dev_free_iovecs(struct vhost_dev *dev)
int i;
for (i = 0; i < dev->nvqs; ++i)
- vhost_vq_free_iovecs(&dev->vqs[i]);
+ vhost_vq_free_iovecs(dev->vqs[i]);
}
long vhost_dev_init(struct vhost_dev *dev,
- struct vhost_virtqueue *vqs, int nvqs)
+ struct vhost_virtqueue **vqs, int nvqs)
{
int i;
@@ -318,16 +297,15 @@ long vhost_dev_init(struct vhost_dev *dev,
dev->worker = NULL;
for (i = 0; i < dev->nvqs; ++i) {
- dev->vqs[i].log = NULL;
- dev->vqs[i].indirect = NULL;
- dev->vqs[i].heads = NULL;
- dev->vqs[i].ubuf_info = NULL;
- dev->vqs[i].dev = dev;
- mutex_init(&dev->vqs[i].mutex);
- vhost_vq_reset(dev, dev->vqs + i);
- if (dev->vqs[i].handle_kick)
- vhost_poll_init(&dev->vqs[i].poll,
- dev->vqs[i].handle_kick, POLLIN, dev);
+ dev->vqs[i]->log = NULL;
+ dev->vqs[i]->indirect = NULL;
+ dev->vqs[i]->heads = NULL;
+ dev->vqs[i]->dev = dev;
+ mutex_init(&dev->vqs[i]->mutex);
+ vhost_vq_reset(dev, dev->vqs[i]);
+ if (dev->vqs[i]->handle_kick)
+ vhost_poll_init(&dev->vqs[i]->poll,
+ dev->vqs[i]->handle_kick, POLLIN, dev);
}
return 0;
@@ -408,21 +386,19 @@ err_mm:
return err;
}
-/* Caller should have device mutex */
-long vhost_dev_reset_owner(struct vhost_dev *dev)
+struct vhost_memory *vhost_dev_reset_owner_prepare(void)
{
- struct vhost_memory *memory;
-
- /* Restore memory to default empty mapping. */
- memory = kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL);
- if (!memory)
- return -ENOMEM;
+ return kmalloc(offsetof(struct vhost_memory, regions), GFP_KERNEL);
+}
+/* Caller should have device mutex */
+void vhost_dev_reset_owner(struct vhost_dev *dev, struct vhost_memory *memory)
+{
vhost_dev_cleanup(dev, true);
+ /* Restore memory to default empty mapping. */
memory->nregions = 0;
RCU_INIT_POINTER(dev->memory, memory);
- return 0;
}
void vhost_dev_stop(struct vhost_dev *dev)
@@ -430,9 +406,9 @@ void vhost_dev_stop(struct vhost_dev *dev)
int i;
for (i = 0; i < dev->nvqs; ++i) {
- if (dev->vqs[i].kick && dev->vqs[i].handle_kick) {
- vhost_poll_stop(&dev->vqs[i].poll);
- vhost_poll_flush(&dev->vqs[i].poll);
+ if (dev->vqs[i]->kick && dev->vqs[i]->handle_kick) {
+ vhost_poll_stop(&dev->vqs[i]->poll);
+ vhost_poll_flush(&dev->vqs[i]->poll);
}
}
}
@@ -443,17 +419,17 @@ void vhost_dev_cleanup(struct vhost_dev *dev, bool locked)
int i;
for (i = 0; i < dev->nvqs; ++i) {
- if (dev->vqs[i].error_ctx)
- eventfd_ctx_put(dev->vqs[i].error_ctx);
- if (dev->vqs[i].error)
- fput(dev->vqs[i].error);
- if (dev->vqs[i].kick)
- fput(dev->vqs[i].kick);
- if (dev->vqs[i].call_ctx)
- eventfd_ctx_put(dev->vqs[i].call_ctx);
- if (dev->vqs[i].call)
- fput(dev->vqs[i].call);
- vhost_vq_reset(dev, dev->vqs + i);
+ if (dev->vqs[i]->error_ctx)
+ eventfd_ctx_put(dev->vqs[i]->error_ctx);
+ if (dev->vqs[i]->error)
+ fput(dev->vqs[i]->error);
+ if (dev->vqs[i]->kick)
+ fput(dev->vqs[i]->kick);
+ if (dev->vqs[i]->call_ctx)
+ eventfd_ctx_put(dev->vqs[i]->call_ctx);
+ if (dev->vqs[i]->call)
+ fput(dev->vqs[i]->call);
+ vhost_vq_reset(dev, dev->vqs[i]);
}
vhost_dev_free_iovecs(dev);
if (dev->log_ctx)
@@ -524,14 +500,14 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem,
for (i = 0; i < d->nvqs; ++i) {
int ok;
- mutex_lock(&d->vqs[i].mutex);
+ mutex_lock(&d->vqs[i]->mutex);
/* If ring is inactive, will check when it's enabled. */
- if (d->vqs[i].private_data)
- ok = vq_memory_access_ok(d->vqs[i].log_base, mem,
+ if (d->vqs[i]->private_data)
+ ok = vq_memory_access_ok(d->vqs[i]->log_base, mem,
log_all);
else
ok = 1;
- mutex_unlock(&d->vqs[i].mutex);
+ mutex_unlock(&d->vqs[i]->mutex);
if (!ok)
return 0;
}
@@ -641,7 +617,7 @@ long vhost_vring_ioctl(struct vhost_dev *d, int ioctl, void __user *argp)
if (idx >= d->nvqs)
return -ENOBUFS;
- vq = d->vqs + idx;
+ vq = d->vqs[idx];
mutex_lock(&vq->mutex);
@@ -852,7 +828,7 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
for (i = 0; i < d->nvqs; ++i) {
struct vhost_virtqueue *vq;
void __user *base = (void __user *)(unsigned long)p;
- vq = d->vqs + i;
+ vq = d->vqs[i];
mutex_lock(&vq->mutex);
/* If ring is inactive, will check when it's enabled. */
if (vq->private_data && !vq_log_access_ok(d, vq, base))
@@ -879,9 +855,9 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user *argp)
} else
filep = eventfp;
for (i = 0; i < d->nvqs; ++i) {
- mutex_lock(&d->vqs[i].mutex);
- d->vqs[i].log_ctx = d->log_ctx;
- mutex_unlock(&d->vqs[i].mutex);
+ mutex_lock(&d->vqs[i]->mutex);
+ d->vqs[i]->log_ctx = d->log_ctx;
+ mutex_unlock(&d->vqs[i]->mutex);
}
if (ctx)
eventfd_ctx_put(ctx);
@@ -1551,38 +1527,3 @@ void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq)
&vq->used->flags, r);
}
}
-
-static void vhost_zerocopy_done_signal(struct kref *kref)
-{
- struct vhost_ubuf_ref *ubufs = container_of(kref, struct vhost_ubuf_ref,
- kref);
- wake_up(&ubufs->wait);
-}
-
-struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *vq,
- bool zcopy)
-{
- struct vhost_ubuf_ref *ubufs;
- /* No zero copy backend? Nothing to count. */
- if (!zcopy)
- return NULL;
- ubufs = kmalloc(sizeof *ubufs, GFP_KERNEL);
- if (!ubufs)
- return ERR_PTR(-ENOMEM);
- kref_init(&ubufs->kref);
- init_waitqueue_head(&ubufs->wait);
- ubufs->vq = vq;
- return ubufs;
-}
-
-void vhost_ubuf_put(struct vhost_ubuf_ref *ubufs)
-{
- kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
-}
-
-void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *ubufs)
-{
- kref_put(&ubufs->kref, vhost_zerocopy_done_signal);
- wait_event(ubufs->wait, !atomic_read(&ubufs->kref.refcount));
- kfree(ubufs);
-}
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 17261e2..b58f4ae 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -54,18 +54,6 @@ struct vhost_log {
struct vhost_virtqueue;
-struct vhost_ubuf_ref {
- struct kref kref;
- wait_queue_head_t wait;
- struct vhost_virtqueue *vq;
-};
-
-struct vhost_ubuf_ref *vhost_ubuf_alloc(struct vhost_virtqueue *, bool zcopy);
-void vhost_ubuf_put(struct vhost_ubuf_ref *);
-void vhost_ubuf_put_and_wait(struct vhost_ubuf_ref *);
-
-struct ubuf_info;
-
/* The virtqueue structure describes a queue attached to a device. */
struct vhost_virtqueue {
struct vhost_dev *dev;
@@ -114,10 +102,7 @@ struct vhost_virtqueue {
/* hdr is used to store the virtio header.
* Since each iovec has >= 1 byte length, we never need more than
* header length entries to store the header. */
- struct iovec hdr[sizeof(struct virtio_net_hdr_mrg_rxbuf)];
struct iovec *indirect;
- size_t vhost_hlen;
- size_t sock_hlen;
struct vring_used_elem *heads;
/* We use a kind of RCU to access private pointer.
* All readers access it from worker, which makes it possible to
@@ -130,16 +115,6 @@ struct vhost_virtqueue {
/* Log write descriptors */
void __user *log_base;
struct vhost_log *log;
- /* vhost zerocopy support fields below: */
- /* last used idx for outstanding DMA zerocopy buffers */
- int upend_idx;
- /* first used idx for DMA done zerocopy buffers */
- int done_idx;
- /* an array of userspace buffers info */
- struct ubuf_info *ubuf_info;
- /* Reference counting for outstanding ubufs.
- * Protected by vq mutex. Writers must also take device mutex. */
- struct vhost_ubuf_ref *ubufs;
};
struct vhost_dev {
@@ -150,7 +125,7 @@ struct vhost_dev {
struct mm_struct *mm;
struct mutex mutex;
unsigned acked_features;
- struct vhost_virtqueue *vqs;
+ struct vhost_virtqueue **vqs;
int nvqs;
struct file *log_file;
struct eventfd_ctx *log_ctx;
@@ -159,9 +134,10 @@ struct vhost_dev {
struct task_struct *worker;
};
-long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue *vqs, int nvqs);
+long vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs, int nvqs);
long vhost_dev_check_owner(struct vhost_dev *);
-long vhost_dev_reset_owner(struct vhost_dev *);
+struct vhost_memory *vhost_dev_reset_owner_prepare(void);
+void vhost_dev_reset_owner(struct vhost_dev *, struct vhost_memory *);
void vhost_dev_cleanup(struct vhost_dev *, bool locked);
void vhost_dev_stop(struct vhost_dev *);
long vhost_dev_ioctl(struct vhost_dev *, unsigned int ioctl, void __user *argp);
diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c
new file mode 100644
index 0000000..bff0775
--- /dev/null
+++ b/drivers/vhost/vringh.c
@@ -0,0 +1,1007 @@
+/*
+ * Helpers for the host side of a virtio ring.
+ *
+ * Since these may be in userspace, we use (inline) accessors.
+ */
+#include <linux/vringh.h>
+#include <linux/virtio_ring.h>
+#include <linux/kernel.h>
+#include <linux/ratelimit.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+
+static __printf(1,2) __cold void vringh_bad(const char *fmt, ...)
+{
+ static DEFINE_RATELIMIT_STATE(vringh_rs,
+ DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+ if (__ratelimit(&vringh_rs)) {
+ va_list ap;
+ va_start(ap, fmt);
+ printk(KERN_NOTICE "vringh:");
+ vprintk(fmt, ap);
+ va_end(ap);
+ }
+}
+
+/* Returns vring->num if empty, -ve on error. */
+static inline int __vringh_get_head(const struct vringh *vrh,
+ int (*getu16)(u16 *val, const u16 *p),
+ u16 *last_avail_idx)
+{
+ u16 avail_idx, i, head;
+ int err;
+
+ err = getu16(&avail_idx, &vrh->vring.avail->idx);
+ if (err) {
+ vringh_bad("Failed to access avail idx at %p",
+ &vrh->vring.avail->idx);
+ return err;
+ }
+
+ if (*last_avail_idx == avail_idx)
+ return vrh->vring.num;
+
+ /* Only get avail ring entries after they have been exposed by guest. */
+ virtio_rmb(vrh->weak_barriers);
+
+ i = *last_avail_idx & (vrh->vring.num - 1);
+
+ err = getu16(&head, &vrh->vring.avail->ring[i]);
+ if (err) {
+ vringh_bad("Failed to read head: idx %d address %p",
+ *last_avail_idx, &vrh->vring.avail->ring[i]);
+ return err;
+ }
+
+ if (head >= vrh->vring.num) {
+ vringh_bad("Guest says index %u > %u is available",
+ head, vrh->vring.num);
+ return -EINVAL;
+ }
+
+ (*last_avail_idx)++;
+ return head;
+}
+
+/* Copy some bytes to/from the iovec. Returns num copied. */
+static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov,
+ void *ptr, size_t len,
+ int (*xfer)(void *addr, void *ptr,
+ size_t len))
+{
+ int err, done = 0;
+
+ while (len && iov->i < iov->used) {
+ size_t partlen;
+
+ partlen = min(iov->iov[iov->i].iov_len, len);
+ err = xfer(iov->iov[iov->i].iov_base, ptr, partlen);
+ if (err)
+ return err;
+ done += partlen;
+ len -= partlen;
+ ptr += partlen;
+ iov->consumed += partlen;
+ iov->iov[iov->i].iov_len -= partlen;
+ iov->iov[iov->i].iov_base += partlen;
+
+ if (!iov->iov[iov->i].iov_len) {
+ /* Fix up old iov element then increment. */
+ iov->iov[iov->i].iov_len = iov->consumed;
+ iov->iov[iov->i].iov_base -= iov->consumed;
+
+ iov->consumed = 0;
+ iov->i++;
+ }
+ }
+ return done;
+}
+
+/* May reduce *len if range is shorter. */
+static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len,
+ struct vringh_range *range,
+ bool (*getrange)(struct vringh *,
+ u64, struct vringh_range *))
+{
+ if (addr < range->start || addr > range->end_incl) {
+ if (!getrange(vrh, addr, range))
+ return false;
+ }
+ BUG_ON(addr < range->start || addr > range->end_incl);
+
+ /* To end of memory? */
+ if (unlikely(addr + *len == 0)) {
+ if (range->end_incl == -1ULL)
+ return true;
+ goto truncate;
+ }
+
+ /* Otherwise, don't wrap. */
+ if (addr + *len < addr) {
+ vringh_bad("Wrapping descriptor %zu@0x%llx",
+ *len, (unsigned long long)addr);
+ return false;
+ }
+
+ if (unlikely(addr + *len - 1 > range->end_incl))
+ goto truncate;
+ return true;
+
+truncate:
+ *len = range->end_incl + 1 - addr;
+ return true;
+}
+
+static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len,
+ struct vringh_range *range,
+ bool (*getrange)(struct vringh *,
+ u64, struct vringh_range *))
+{
+ return true;
+}
+
+/* No reason for this code to be inline. */
+static int move_to_indirect(int *up_next, u16 *i, void *addr,
+ const struct vring_desc *desc,
+ struct vring_desc **descs, int *desc_max)
+{
+ /* Indirect tables can't have indirect. */
+ if (*up_next != -1) {
+ vringh_bad("Multilevel indirect %u->%u", *up_next, *i);
+ return -EINVAL;
+ }
+
+ if (unlikely(desc->len % sizeof(struct vring_desc))) {
+ vringh_bad("Strange indirect len %u", desc->len);
+ return -EINVAL;
+ }
+
+ /* We will check this when we follow it! */
+ if (desc->flags & VRING_DESC_F_NEXT)
+ *up_next = desc->next;
+ else
+ *up_next = -2;
+ *descs = addr;
+ *desc_max = desc->len / sizeof(struct vring_desc);
+
+ /* Now, start at the first indirect. */
+ *i = 0;
+ return 0;
+}
+
+static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp)
+{
+ struct kvec *new;
+ unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2;
+
+ if (new_num < 8)
+ new_num = 8;
+
+ flag = (iov->max_num & VRINGH_IOV_ALLOCATED);
+ if (flag)
+ new = krealloc(iov->iov, new_num * sizeof(struct iovec), gfp);
+ else {
+ new = kmalloc(new_num * sizeof(struct iovec), gfp);
+ if (new) {
+ memcpy(new, iov->iov,
+ iov->max_num * sizeof(struct iovec));
+ flag = VRINGH_IOV_ALLOCATED;
+ }
+ }
+ if (!new)
+ return -ENOMEM;
+ iov->iov = new;
+ iov->max_num = (new_num | flag);
+ return 0;
+}
+
+static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next,
+ struct vring_desc **descs, int *desc_max)
+{
+ u16 i = *up_next;
+
+ *up_next = -1;
+ *descs = vrh->vring.desc;
+ *desc_max = vrh->vring.num;
+ return i;
+}
+
+static int slow_copy(struct vringh *vrh, void *dst, const void *src,
+ bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
+ struct vringh_range *range,
+ bool (*getrange)(struct vringh *vrh,
+ u64,
+ struct vringh_range *)),
+ bool (*getrange)(struct vringh *vrh,
+ u64 addr,
+ struct vringh_range *r),
+ struct vringh_range *range,
+ int (*copy)(void *dst, const void *src, size_t len))
+{
+ size_t part, len = sizeof(struct vring_desc);
+
+ do {
+ u64 addr;
+ int err;
+
+ part = len;
+ addr = (u64)(unsigned long)src - range->offset;
+
+ if (!rcheck(vrh, addr, &part, range, getrange))
+ return -EINVAL;
+
+ err = copy(dst, src, part);
+ if (err)
+ return err;
+
+ dst += part;
+ src += part;
+ len -= part;
+ } while (len);
+ return 0;
+}
+
+static inline int
+__vringh_iov(struct vringh *vrh, u16 i,
+ struct vringh_kiov *riov,
+ struct vringh_kiov *wiov,
+ bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len,
+ struct vringh_range *range,
+ bool (*getrange)(struct vringh *, u64,
+ struct vringh_range *)),
+ bool (*getrange)(struct vringh *, u64, struct vringh_range *),
+ gfp_t gfp,
+ int (*copy)(void *dst, const void *src, size_t len))
+{
+ int err, count = 0, up_next, desc_max;
+ struct vring_desc desc, *descs;
+ struct vringh_range range = { -1ULL, 0 }, slowrange;
+ bool slow = false;
+
+ /* We start traversing vring's descriptor table. */
+ descs = vrh->vring.desc;
+ desc_max = vrh->vring.num;
+ up_next = -1;
+
+ if (riov)
+ riov->i = riov->used = 0;
+ else if (wiov)
+ wiov->i = wiov->used = 0;
+ else
+ /* You must want something! */
+ BUG();
+
+ for (;;) {
+ void *addr;
+ struct vringh_kiov *iov;
+ size_t len;
+
+ if (unlikely(slow))
+ err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange,
+ &slowrange, copy);
+ else
+ err = copy(&desc, &descs[i], sizeof(desc));
+ if (unlikely(err))
+ goto fail;
+
+ if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) {
+ /* Make sure it's OK, and get offset. */
+ len = desc.len;
+ if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
+ err = -EINVAL;
+ goto fail;
+ }
+
+ if (unlikely(len != desc.len)) {
+ slow = true;
+ /* We need to save this range to use offset */
+ slowrange = range;
+ }
+
+ addr = (void *)(long)(desc.addr + range.offset);
+ err = move_to_indirect(&up_next, &i, addr, &desc,
+ &descs, &desc_max);
+ if (err)
+ goto fail;
+ continue;
+ }
+
+ if (count++ == vrh->vring.num) {
+ vringh_bad("Descriptor loop in %p", descs);
+ err = -ELOOP;
+ goto fail;
+ }
+
+ if (desc.flags & VRING_DESC_F_WRITE)
+ iov = wiov;
+ else {
+ iov = riov;
+ if (unlikely(wiov && wiov->i)) {
+ vringh_bad("Readable desc %p after writable",
+ &descs[i]);
+ err = -EINVAL;
+ goto fail;
+ }
+ }
+
+ if (!iov) {
+ vringh_bad("Unexpected %s desc",
+ !wiov ? "writable" : "readable");
+ err = -EPROTO;
+ goto fail;
+ }
+
+ again:
+ /* Make sure it's OK, and get offset. */
+ len = desc.len;
+ if (!rcheck(vrh, desc.addr, &len, &range, getrange)) {
+ err = -EINVAL;
+ goto fail;
+ }
+ addr = (void *)(unsigned long)(desc.addr + range.offset);
+
+ if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) {
+ err = resize_iovec(iov, gfp);
+ if (err)
+ goto fail;
+ }
+
+ iov->iov[iov->used].iov_base = addr;
+ iov->iov[iov->used].iov_len = len;
+ iov->used++;
+
+ if (unlikely(len != desc.len)) {
+ desc.len -= len;
+ desc.addr += len;
+ goto again;
+ }
+
+ if (desc.flags & VRING_DESC_F_NEXT) {
+ i = desc.next;
+ } else {
+ /* Just in case we need to finish traversing above. */
+ if (unlikely(up_next > 0)) {
+ i = return_from_indirect(vrh, &up_next,
+ &descs, &desc_max);
+ slow = false;
+ } else
+ break;
+ }
+
+ if (i >= desc_max) {
+ vringh_bad("Chained index %u > %u", i, desc_max);
+ err = -EINVAL;
+ goto fail;
+ }
+ }
+
+ return 0;
+
+fail:
+ return err;
+}
+
+static inline int __vringh_complete(struct vringh *vrh,
+ const struct vring_used_elem *used,
+ unsigned int num_used,
+ int (*putu16)(u16 *p, u16 val),
+ int (*putused)(struct vring_used_elem *dst,
+ const struct vring_used_elem
+ *src, unsigned num))
+{
+ struct vring_used *used_ring;
+ int err;
+ u16 used_idx, off;
+
+ used_ring = vrh->vring.used;
+ used_idx = vrh->last_used_idx + vrh->completed;
+
+ off = used_idx % vrh->vring.num;
+
+ /* Compiler knows num_used == 1 sometimes, hence extra check */
+ if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) {
+ u16 part = vrh->vring.num - off;
+ err = putused(&used_ring->ring[off], used, part);
+ if (!err)
+ err = putused(&used_ring->ring[0], used + part,
+ num_used - part);
+ } else
+ err = putused(&used_ring->ring[off], used, num_used);
+
+ if (err) {
+ vringh_bad("Failed to write %u used entries %u at %p",
+ num_used, off, &used_ring->ring[off]);
+ return err;
+ }
+
+ /* Make sure buffer is written before we update index. */
+ virtio_wmb(vrh->weak_barriers);
+
+ err = putu16(&vrh->vring.used->idx, used_idx + num_used);
+ if (err) {
+ vringh_bad("Failed to update used index at %p",
+ &vrh->vring.used->idx);
+ return err;
+ }
+
+ vrh->completed += num_used;
+ return 0;
+}
+
+
+static inline int __vringh_need_notify(struct vringh *vrh,
+ int (*getu16)(u16 *val, const u16 *p))
+{
+ bool notify;
+ u16 used_event;
+ int err;
+
+ /* Flush out used index update. This is paired with the
+ * barrier that the Guest executes when enabling
+ * interrupts. */
+ virtio_mb(vrh->weak_barriers);
+
+ /* Old-style, without event indices. */
+ if (!vrh->event_indices) {
+ u16 flags;
+ err = getu16(&flags, &vrh->vring.avail->flags);
+ if (err) {
+ vringh_bad("Failed to get flags at %p",
+ &vrh->vring.avail->flags);
+ return err;
+ }
+ return (!(flags & VRING_AVAIL_F_NO_INTERRUPT));
+ }
+
+ /* Modern: we know when other side wants to know. */
+ err = getu16(&used_event, &vring_used_event(&vrh->vring));
+ if (err) {
+ vringh_bad("Failed to get used event idx at %p",
+ &vring_used_event(&vrh->vring));
+ return err;
+ }
+
+ /* Just in case we added so many that we wrap. */
+ if (unlikely(vrh->completed > 0xffff))
+ notify = true;
+ else
+ notify = vring_need_event(used_event,
+ vrh->last_used_idx + vrh->completed,
+ vrh->last_used_idx);
+
+ vrh->last_used_idx += vrh->completed;
+ vrh->completed = 0;
+ return notify;
+}
+
+static inline bool __vringh_notify_enable(struct vringh *vrh,
+ int (*getu16)(u16 *val, const u16 *p),
+ int (*putu16)(u16 *p, u16 val))
+{
+ u16 avail;
+
+ if (!vrh->event_indices) {
+ /* Old-school; update flags. */
+ if (putu16(&vrh->vring.used->flags, 0) != 0) {
+ vringh_bad("Clearing used flags %p",
+ &vrh->vring.used->flags);
+ return true;
+ }
+ } else {
+ if (putu16(&vring_avail_event(&vrh->vring),
+ vrh->last_avail_idx) != 0) {
+ vringh_bad("Updating avail event index %p",
+ &vring_avail_event(&vrh->vring));
+ return true;
+ }
+ }
+
+ /* They could have slipped one in as we were doing that: make
+ * sure it's written, then check again. */
+ virtio_mb(vrh->weak_barriers);
+
+ if (getu16(&avail, &vrh->vring.avail->idx) != 0) {
+ vringh_bad("Failed to check avail idx at %p",
+ &vrh->vring.avail->idx);
+ return true;
+ }
+
+ /* This is unlikely, so we just leave notifications enabled
+ * (if we're using event_indices, we'll only get one
+ * notification anyway). */
+ return avail == vrh->last_avail_idx;
+}
+
+static inline void __vringh_notify_disable(struct vringh *vrh,
+ int (*putu16)(u16 *p, u16 val))
+{
+ if (!vrh->event_indices) {
+ /* Old-school; update flags. */
+ if (putu16(&vrh->vring.used->flags, VRING_USED_F_NO_NOTIFY)) {
+ vringh_bad("Setting used flags %p",
+ &vrh->vring.used->flags);
+ }
+ }
+}
+
+/* Userspace access helpers: in this case, addresses are really userspace. */
+static inline int getu16_user(u16 *val, const u16 *p)
+{
+ return get_user(*val, (__force u16 __user *)p);
+}
+
+static inline int putu16_user(u16 *p, u16 val)
+{
+ return put_user(val, (__force u16 __user *)p);
+}
+
+static inline int copydesc_user(void *dst, const void *src, size_t len)
+{
+ return copy_from_user(dst, (__force void __user *)src, len) ?
+ -EFAULT : 0;
+}
+
+static inline int putused_user(struct vring_used_elem *dst,
+ const struct vring_used_elem *src,
+ unsigned int num)
+{
+ return copy_to_user((__force void __user *)dst, src,
+ sizeof(*dst) * num) ? -EFAULT : 0;
+}
+
+static inline int xfer_from_user(void *src, void *dst, size_t len)
+{
+ return copy_from_user(dst, (__force void __user *)src, len) ?
+ -EFAULT : 0;
+}
+
+static inline int xfer_to_user(void *dst, void *src, size_t len)
+{
+ return copy_to_user((__force void __user *)dst, src, len) ?
+ -EFAULT : 0;
+}
+
+/**
+ * vringh_init_user - initialize a vringh for a userspace vring.
+ * @vrh: the vringh to initialize.
+ * @features: the feature bits for this ring.
+ * @num: the number of elements.
+ * @weak_barriers: true if we only need memory barriers, not I/O.
+ * @desc: the userpace descriptor pointer.
+ * @avail: the userpace avail pointer.
+ * @used: the userpace used pointer.
+ *
+ * Returns an error if num is invalid: you should check pointers
+ * yourself!
+ */
+int vringh_init_user(struct vringh *vrh, u32 features,
+ unsigned int num, bool weak_barriers,
+ struct vring_desc __user *desc,
+ struct vring_avail __user *avail,
+ struct vring_used __user *used)
+{
+ /* Sane power of 2 please! */
+ if (!num || num > 0xffff || (num & (num - 1))) {
+ vringh_bad("Bad ring size %u", num);
+ return -EINVAL;
+ }
+
+ vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
+ vrh->weak_barriers = weak_barriers;
+ vrh->completed = 0;
+ vrh->last_avail_idx = 0;
+ vrh->last_used_idx = 0;
+ vrh->vring.num = num;
+ /* vring expects kernel addresses, but only used via accessors. */
+ vrh->vring.desc = (__force struct vring_desc *)desc;
+ vrh->vring.avail = (__force struct vring_avail *)avail;
+ vrh->vring.used = (__force struct vring_used *)used;
+ return 0;
+}
+EXPORT_SYMBOL(vringh_init_user);
+
+/**
+ * vringh_getdesc_user - get next available descriptor from userspace ring.
+ * @vrh: the userspace vring.
+ * @riov: where to put the readable descriptors (or NULL)
+ * @wiov: where to put the writable descriptors (or NULL)
+ * @getrange: function to call to check ranges.
+ * @head: head index we received, for passing to vringh_complete_user().
+ *
+ * Returns 0 if there was no descriptor, 1 if there was, or -errno.
+ *
+ * Note that on error return, you can tell the difference between an
+ * invalid ring and a single invalid descriptor: in the former case,
+ * *head will be vrh->vring.num. You may be able to ignore an invalid
+ * descriptor, but there's not much you can do with an invalid ring.
+ *
+ * Note that you may need to clean up riov and wiov, even on error!
+ */
+int vringh_getdesc_user(struct vringh *vrh,
+ struct vringh_iov *riov,
+ struct vringh_iov *wiov,
+ bool (*getrange)(struct vringh *vrh,
+ u64 addr, struct vringh_range *r),
+ u16 *head)
+{
+ int err;
+
+ *head = vrh->vring.num;
+ err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx);
+ if (err < 0)
+ return err;
+
+ /* Empty... */
+ if (err == vrh->vring.num)
+ return 0;
+
+ /* We need the layouts to be the identical for this to work */
+ BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov));
+ BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) !=
+ offsetof(struct vringh_iov, iov));
+ BUILD_BUG_ON(offsetof(struct vringh_kiov, i) !=
+ offsetof(struct vringh_iov, i));
+ BUILD_BUG_ON(offsetof(struct vringh_kiov, used) !=
+ offsetof(struct vringh_iov, used));
+ BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) !=
+ offsetof(struct vringh_iov, max_num));
+ BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
+ BUILD_BUG_ON(offsetof(struct iovec, iov_base) !=
+ offsetof(struct kvec, iov_base));
+ BUILD_BUG_ON(offsetof(struct iovec, iov_len) !=
+ offsetof(struct kvec, iov_len));
+ BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base)
+ != sizeof(((struct kvec *)NULL)->iov_base));
+ BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len)
+ != sizeof(((struct kvec *)NULL)->iov_len));
+
+ *head = err;
+ err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov,
+ (struct vringh_kiov *)wiov,
+ range_check, getrange, GFP_KERNEL, copydesc_user);
+ if (err)
+ return err;
+
+ return 1;
+}
+EXPORT_SYMBOL(vringh_getdesc_user);
+
+/**
+ * vringh_iov_pull_user - copy bytes from vring_iov.
+ * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume)
+ * @dst: the place to copy.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len)
+{
+ return vringh_iov_xfer((struct vringh_kiov *)riov,
+ dst, len, xfer_from_user);
+}
+EXPORT_SYMBOL(vringh_iov_pull_user);
+
+/**
+ * vringh_iov_push_user - copy bytes into vring_iov.
+ * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume)
+ * @dst: the place to copy.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
+ const void *src, size_t len)
+{
+ return vringh_iov_xfer((struct vringh_kiov *)wiov,
+ (void *)src, len, xfer_to_user);
+}
+EXPORT_SYMBOL(vringh_iov_push_user);
+
+/**
+ * vringh_abandon_user - we've decided not to handle the descriptor(s).
+ * @vrh: the vring.
+ * @num: the number of descriptors to put back (ie. num
+ * vringh_get_user() to undo).
+ *
+ * The next vringh_get_user() will return the old descriptor(s) again.
+ */
+void vringh_abandon_user(struct vringh *vrh, unsigned int num)
+{
+ /* We only update vring_avail_event(vr) when we want to be notified,
+ * so we haven't changed that yet. */
+ vrh->last_avail_idx -= num;
+}
+EXPORT_SYMBOL(vringh_abandon_user);
+
+/**
+ * vringh_complete_user - we've finished with descriptor, publish it.
+ * @vrh: the vring.
+ * @head: the head as filled in by vringh_getdesc_user.
+ * @len: the length of data we have written.
+ *
+ * You should check vringh_need_notify_user() after one or more calls
+ * to this function.
+ */
+int vringh_complete_user(struct vringh *vrh, u16 head, u32 len)
+{
+ struct vring_used_elem used;
+
+ used.id = head;
+ used.len = len;
+ return __vringh_complete(vrh, &used, 1, putu16_user, putused_user);
+}
+EXPORT_SYMBOL(vringh_complete_user);
+
+/**
+ * vringh_complete_multi_user - we've finished with many descriptors.
+ * @vrh: the vring.
+ * @used: the head, length pairs.
+ * @num_used: the number of used elements.
+ *
+ * You should check vringh_need_notify_user() after one or more calls
+ * to this function.
+ */
+int vringh_complete_multi_user(struct vringh *vrh,
+ const struct vring_used_elem used[],
+ unsigned num_used)
+{
+ return __vringh_complete(vrh, used, num_used,
+ putu16_user, putused_user);
+}
+EXPORT_SYMBOL(vringh_complete_multi_user);
+
+/**
+ * vringh_notify_enable_user - we want to know if something changes.
+ * @vrh: the vring.
+ *
+ * This always enables notifications, but returns false if there are
+ * now more buffers available in the vring.
+ */
+bool vringh_notify_enable_user(struct vringh *vrh)
+{
+ return __vringh_notify_enable(vrh, getu16_user, putu16_user);
+}
+EXPORT_SYMBOL(vringh_notify_enable_user);
+
+/**
+ * vringh_notify_disable_user - don't tell us if something changes.
+ * @vrh: the vring.
+ *
+ * This is our normal running state: we disable and then only enable when
+ * we're going to sleep.
+ */
+void vringh_notify_disable_user(struct vringh *vrh)
+{
+ __vringh_notify_disable(vrh, putu16_user);
+}
+EXPORT_SYMBOL(vringh_notify_disable_user);
+
+/**
+ * vringh_need_notify_user - must we tell the other side about used buffers?
+ * @vrh: the vring we've called vringh_complete_user() on.
+ *
+ * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
+ */
+int vringh_need_notify_user(struct vringh *vrh)
+{
+ return __vringh_need_notify(vrh, getu16_user);
+}
+EXPORT_SYMBOL(vringh_need_notify_user);
+
+/* Kernelspace access helpers. */
+static inline int getu16_kern(u16 *val, const u16 *p)
+{
+ *val = ACCESS_ONCE(*p);
+ return 0;
+}
+
+static inline int putu16_kern(u16 *p, u16 val)
+{
+ ACCESS_ONCE(*p) = val;
+ return 0;
+}
+
+static inline int copydesc_kern(void *dst, const void *src, size_t len)
+{
+ memcpy(dst, src, len);
+ return 0;
+}
+
+static inline int putused_kern(struct vring_used_elem *dst,
+ const struct vring_used_elem *src,
+ unsigned int num)
+{
+ memcpy(dst, src, num * sizeof(*dst));
+ return 0;
+}
+
+static inline int xfer_kern(void *src, void *dst, size_t len)
+{
+ memcpy(dst, src, len);
+ return 0;
+}
+
+/**
+ * vringh_init_kern - initialize a vringh for a kernelspace vring.
+ * @vrh: the vringh to initialize.
+ * @features: the feature bits for this ring.
+ * @num: the number of elements.
+ * @weak_barriers: true if we only need memory barriers, not I/O.
+ * @desc: the userpace descriptor pointer.
+ * @avail: the userpace avail pointer.
+ * @used: the userpace used pointer.
+ *
+ * Returns an error if num is invalid.
+ */
+int vringh_init_kern(struct vringh *vrh, u32 features,
+ unsigned int num, bool weak_barriers,
+ struct vring_desc *desc,
+ struct vring_avail *avail,
+ struct vring_used *used)
+{
+ /* Sane power of 2 please! */
+ if (!num || num > 0xffff || (num & (num - 1))) {
+ vringh_bad("Bad ring size %u", num);
+ return -EINVAL;
+ }
+
+ vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX));
+ vrh->weak_barriers = weak_barriers;
+ vrh->completed = 0;
+ vrh->last_avail_idx = 0;
+ vrh->last_used_idx = 0;
+ vrh->vring.num = num;
+ vrh->vring.desc = desc;
+ vrh->vring.avail = avail;
+ vrh->vring.used = used;
+ return 0;
+}
+EXPORT_SYMBOL(vringh_init_kern);
+
+/**
+ * vringh_getdesc_kern - get next available descriptor from kernelspace ring.
+ * @vrh: the kernelspace vring.
+ * @riov: where to put the readable descriptors (or NULL)
+ * @wiov: where to put the writable descriptors (or NULL)
+ * @head: head index we received, for passing to vringh_complete_kern().
+ * @gfp: flags for allocating larger riov/wiov.
+ *
+ * Returns 0 if there was no descriptor, 1 if there was, or -errno.
+ *
+ * Note that on error return, you can tell the difference between an
+ * invalid ring and a single invalid descriptor: in the former case,
+ * *head will be vrh->vring.num. You may be able to ignore an invalid
+ * descriptor, but there's not much you can do with an invalid ring.
+ *
+ * Note that you may need to clean up riov and wiov, even on error!
+ */
+int vringh_getdesc_kern(struct vringh *vrh,
+ struct vringh_kiov *riov,
+ struct vringh_kiov *wiov,
+ u16 *head,
+ gfp_t gfp)
+{
+ int err;
+
+ err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx);
+ if (err < 0)
+ return err;
+
+ /* Empty... */
+ if (err == vrh->vring.num)
+ return 0;
+
+ *head = err;
+ err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL,
+ gfp, copydesc_kern);
+ if (err)
+ return err;
+
+ return 1;
+}
+EXPORT_SYMBOL(vringh_getdesc_kern);
+
+/**
+ * vringh_iov_pull_kern - copy bytes from vring_iov.
+ * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume)
+ * @dst: the place to copy.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len)
+{
+ return vringh_iov_xfer(riov, dst, len, xfer_kern);
+}
+EXPORT_SYMBOL(vringh_iov_pull_kern);
+
+/**
+ * vringh_iov_push_kern - copy bytes into vring_iov.
+ * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume)
+ * @dst: the place to copy.
+ * @len: the maximum length to copy.
+ *
+ * Returns the bytes copied <= len or a negative errno.
+ */
+ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
+ const void *src, size_t len)
+{
+ return vringh_iov_xfer(wiov, (void *)src, len, xfer_kern);
+}
+EXPORT_SYMBOL(vringh_iov_push_kern);
+
+/**
+ * vringh_abandon_kern - we've decided not to handle the descriptor(s).
+ * @vrh: the vring.
+ * @num: the number of descriptors to put back (ie. num
+ * vringh_get_kern() to undo).
+ *
+ * The next vringh_get_kern() will return the old descriptor(s) again.
+ */
+void vringh_abandon_kern(struct vringh *vrh, unsigned int num)
+{
+ /* We only update vring_avail_event(vr) when we want to be notified,
+ * so we haven't changed that yet. */
+ vrh->last_avail_idx -= num;
+}
+EXPORT_SYMBOL(vringh_abandon_kern);
+
+/**
+ * vringh_complete_kern - we've finished with descriptor, publish it.
+ * @vrh: the vring.
+ * @head: the head as filled in by vringh_getdesc_kern.
+ * @len: the length of data we have written.
+ *
+ * You should check vringh_need_notify_kern() after one or more calls
+ * to this function.
+ */
+int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len)
+{
+ struct vring_used_elem used;
+
+ used.id = head;
+ used.len = len;
+
+ return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern);
+}
+EXPORT_SYMBOL(vringh_complete_kern);
+
+/**
+ * vringh_notify_enable_kern - we want to know if something changes.
+ * @vrh: the vring.
+ *
+ * This always enables notifications, but returns false if there are
+ * now more buffers available in the vring.
+ */
+bool vringh_notify_enable_kern(struct vringh *vrh)
+{
+ return __vringh_notify_enable(vrh, getu16_kern, putu16_kern);
+}
+EXPORT_SYMBOL(vringh_notify_enable_kern);
+
+/**
+ * vringh_notify_disable_kern - don't tell us if something changes.
+ * @vrh: the vring.
+ *
+ * This is our normal running state: we disable and then only enable when
+ * we're going to sleep.
+ */
+void vringh_notify_disable_kern(struct vringh *vrh)
+{
+ __vringh_notify_disable(vrh, putu16_kern);
+}
+EXPORT_SYMBOL(vringh_notify_disable_kern);
+
+/**
+ * vringh_need_notify_kern - must we tell the other side about used buffers?
+ * @vrh: the vring we've called vringh_complete_kern() on.
+ *
+ * Returns -errno or 0 if we don't need to tell the other side, 1 if we do.
+ */
+int vringh_need_notify_kern(struct vringh *vrh)
+{
+ return __vringh_need_notify(vrh, getu16_kern);
+}
+EXPORT_SYMBOL(vringh_need_notify_kern);
diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
index 8dab163..bd3ae32 100644
--- a/drivers/virtio/virtio_balloon.c
+++ b/drivers/virtio/virtio_balloon.c
@@ -108,7 +108,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq)
sg_init_one(&sg, vb->pfns, sizeof(vb->pfns[0]) * vb->num_pfns);
/* We should always be able to add one buffer to an empty queue. */
- if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0)
+ if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0)
BUG();
virtqueue_kick(vq);
@@ -256,7 +256,7 @@ static void stats_handle_request(struct virtio_balloon *vb)
if (!virtqueue_get_buf(vq, &len))
return;
sg_init_one(&sg, vb->stats, sizeof(vb->stats));
- if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0)
+ if (virtqueue_add_outbuf(vq, &sg, 1, vb, GFP_KERNEL) < 0)
BUG();
virtqueue_kick(vq);
}
@@ -341,7 +341,7 @@ static int init_vqs(struct virtio_balloon *vb)
* use it to signal us later.
*/
sg_init_one(&sg, vb->stats, sizeof vb->stats);
- if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb, GFP_KERNEL)
+ if (virtqueue_add_outbuf(vb->stats_vq, &sg, 1, vb, GFP_KERNEL)
< 0)
BUG();
virtqueue_kick(vb->stats_vq);
diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index ffd7e7d..5217baf 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -24,27 +24,6 @@
#include <linux/module.h>
#include <linux/hrtimer.h>
-/* virtio guest is communicating with a virtual "device" that actually runs on
- * a host processor. Memory barriers are used to control SMP effects. */
-#ifdef CONFIG_SMP
-/* Where possible, use SMP barriers which are more lightweight than mandatory
- * barriers, because mandatory barriers control MMIO effects on accesses
- * through relaxed memory I/O windows (which virtio-pci does not use). */
-#define virtio_mb(vq) \
- do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0)
-#define virtio_rmb(vq) \
- do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0)
-#define virtio_wmb(vq) \
- do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0)
-#else
-/* We must force memory ordering even if guest is UP since host could be
- * running on another CPU, but SMP barriers are defined to barrier() in that
- * configuration. So fall back to mandatory barriers instead. */
-#define virtio_mb(vq) mb()
-#define virtio_rmb(vq) rmb()
-#define virtio_wmb(vq) wmb()
-#endif
-
#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
#define BAD_RING(_vq, fmt, args...) \
@@ -119,16 +98,36 @@ struct vring_virtqueue
#define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
+static inline struct scatterlist *sg_next_chained(struct scatterlist *sg,
+ unsigned int *count)
+{
+ return sg_next(sg);
+}
+
+static inline struct scatterlist *sg_next_arr(struct scatterlist *sg,
+ unsigned int *count)
+{
+ if (--(*count) == 0)
+ return NULL;
+ return sg + 1;
+}
+
/* Set up an indirect table of descriptors and add it to the queue. */
-static int vring_add_indirect(struct vring_virtqueue *vq,
- struct scatterlist sg[],
- unsigned int out,
- unsigned int in,
- gfp_t gfp)
+static inline int vring_add_indirect(struct vring_virtqueue *vq,
+ struct scatterlist *sgs[],
+ struct scatterlist *(*next)
+ (struct scatterlist *, unsigned int *),
+ unsigned int total_sg,
+ unsigned int total_out,
+ unsigned int total_in,
+ unsigned int out_sgs,
+ unsigned int in_sgs,
+ gfp_t gfp)
{
struct vring_desc *desc;
unsigned head;
- int i;
+ struct scatterlist *sg;
+ int i, n;
/*
* We require lowmem mappings for the descriptors because
@@ -137,25 +136,31 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
*/
gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH);
- desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp);
+ desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
if (!desc)
return -ENOMEM;
- /* Transfer entries from the sg list into the indirect page */
- for (i = 0; i < out; i++) {
- desc[i].flags = VRING_DESC_F_NEXT;
- desc[i].addr = sg_phys(sg);
- desc[i].len = sg->length;
- desc[i].next = i+1;
- sg++;
+ /* Transfer entries from the sg lists into the indirect page */
+ i = 0;
+ for (n = 0; n < out_sgs; n++) {
+ for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
+ desc[i].flags = VRING_DESC_F_NEXT;
+ desc[i].addr = sg_phys(sg);
+ desc[i].len = sg->length;
+ desc[i].next = i+1;
+ i++;
+ }
}
- for (; i < (out + in); i++) {
- desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
- desc[i].addr = sg_phys(sg);
- desc[i].len = sg->length;
- desc[i].next = i+1;
- sg++;
+ for (; n < (out_sgs + in_sgs); n++) {
+ for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
+ desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
+ desc[i].addr = sg_phys(sg);
+ desc[i].len = sg->length;
+ desc[i].next = i+1;
+ i++;
+ }
}
+ BUG_ON(i != total_sg);
/* Last one doesn't continue. */
desc[i-1].flags &= ~VRING_DESC_F_NEXT;
@@ -176,29 +181,20 @@ static int vring_add_indirect(struct vring_virtqueue *vq,
return head;
}
-/**
- * virtqueue_add_buf - expose buffer to other end
- * @vq: the struct virtqueue we're talking about.
- * @sg: the description of the buffer(s).
- * @out_num: the number of sg readable by other side
- * @in_num: the number of sg which are writable (after readable ones)
- * @data: the token identifying the buffer.
- * @gfp: how to do memory allocations (if necessary).
- *
- * Caller must ensure we don't call this with other virtqueue operations
- * at the same time (except where noted).
- *
- * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
- */
-int virtqueue_add_buf(struct virtqueue *_vq,
- struct scatterlist sg[],
- unsigned int out,
- unsigned int in,
- void *data,
- gfp_t gfp)
+static inline int virtqueue_add(struct virtqueue *_vq,
+ struct scatterlist *sgs[],
+ struct scatterlist *(*next)
+ (struct scatterlist *, unsigned int *),
+ unsigned int total_out,
+ unsigned int total_in,
+ unsigned int out_sgs,
+ unsigned int in_sgs,
+ void *data,
+ gfp_t gfp)
{
struct vring_virtqueue *vq = to_vvq(_vq);
- unsigned int i, avail, uninitialized_var(prev);
+ struct scatterlist *sg;
+ unsigned int i, n, avail, uninitialized_var(prev), total_sg;
int head;
START_USE(vq);
@@ -218,46 +214,54 @@ int virtqueue_add_buf(struct virtqueue *_vq,
}
#endif
+ total_sg = total_in + total_out;
+
/* If the host supports indirect descriptor tables, and we have multiple
* buffers, then go indirect. FIXME: tune this threshold */
- if (vq->indirect && (out + in) > 1 && vq->vq.num_free) {
- head = vring_add_indirect(vq, sg, out, in, gfp);
+ if (vq->indirect && total_sg > 1 && vq->vq.num_free) {
+ head = vring_add_indirect(vq, sgs, next, total_sg, total_out,
+ total_in,
+ out_sgs, in_sgs, gfp);
if (likely(head >= 0))
goto add_head;
}
- BUG_ON(out + in > vq->vring.num);
- BUG_ON(out + in == 0);
+ BUG_ON(total_sg > vq->vring.num);
+ BUG_ON(total_sg == 0);
- if (vq->vq.num_free < out + in) {
+ if (vq->vq.num_free < total_sg) {
pr_debug("Can't add buf len %i - avail = %i\n",
- out + in, vq->vq.num_free);
+ total_sg, vq->vq.num_free);
/* FIXME: for historical reasons, we force a notify here if
* there are outgoing parts to the buffer. Presumably the
* host should service the ring ASAP. */
- if (out)
+ if (out_sgs)
vq->notify(&vq->vq);
END_USE(vq);
return -ENOSPC;
}
/* We're about to use some buffers from the free list. */
- vq->vq.num_free -= out + in;
-
- head = vq->free_head;
- for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) {
- vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
- vq->vring.desc[i].addr = sg_phys(sg);
- vq->vring.desc[i].len = sg->length;
- prev = i;
- sg++;
+ vq->vq.num_free -= total_sg;
+
+ head = i = vq->free_head;
+ for (n = 0; n < out_sgs; n++) {
+ for (sg = sgs[n]; sg; sg = next(sg, &total_out)) {
+ vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
+ vq->vring.desc[i].addr = sg_phys(sg);
+ vq->vring.desc[i].len = sg->length;
+ prev = i;
+ i = vq->vring.desc[i].next;
+ }
}
- for (; in; i = vq->vring.desc[i].next, in--) {
- vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
- vq->vring.desc[i].addr = sg_phys(sg);
- vq->vring.desc[i].len = sg->length;
- prev = i;
- sg++;
+ for (; n < (out_sgs + in_sgs); n++) {
+ for (sg = sgs[n]; sg; sg = next(sg, &total_in)) {
+ vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE;
+ vq->vring.desc[i].addr = sg_phys(sg);
+ vq->vring.desc[i].len = sg->length;
+ prev = i;
+ i = vq->vring.desc[i].next;
+ }
}
/* Last one doesn't continue. */
vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;
@@ -276,7 +280,7 @@ add_head:
/* Descriptors and available array need to be set before we expose the
* new available array entries. */
- virtio_wmb(vq);
+ virtio_wmb(vq->weak_barriers);
vq->vring.avail->idx++;
vq->num_added++;
@@ -290,9 +294,122 @@ add_head:
return 0;
}
+
+/**
+ * virtqueue_add_buf - expose buffer to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sg: the description of the buffer(s).
+ * @out_num: the number of sg readable by other side
+ * @in_num: the number of sg which are writable (after readable ones)
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
+ */
+int virtqueue_add_buf(struct virtqueue *_vq,
+ struct scatterlist sg[],
+ unsigned int out,
+ unsigned int in,
+ void *data,
+ gfp_t gfp)
+{
+ struct scatterlist *sgs[2];
+
+ sgs[0] = sg;
+ sgs[1] = sg + out;
+
+ return virtqueue_add(_vq, sgs, sg_next_arr,
+ out, in, out ? 1 : 0, in ? 1 : 0, data, gfp);
+}
EXPORT_SYMBOL_GPL(virtqueue_add_buf);
/**
+ * virtqueue_add_sgs - expose buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sgs: array of terminated scatterlists.
+ * @out_num: the number of scatterlists readable by other side
+ * @in_num: the number of scatterlists which are writable (after readable ones)
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
+ */
+int virtqueue_add_sgs(struct virtqueue *_vq,
+ struct scatterlist *sgs[],
+ unsigned int out_sgs,
+ unsigned int in_sgs,
+ void *data,
+ gfp_t gfp)
+{
+ unsigned int i, total_out, total_in;
+
+ /* Count them first. */
+ for (i = total_out = total_in = 0; i < out_sgs; i++) {
+ struct scatterlist *sg;
+ for (sg = sgs[i]; sg; sg = sg_next(sg))
+ total_out++;
+ }
+ for (; i < out_sgs + in_sgs; i++) {
+ struct scatterlist *sg;
+ for (sg = sgs[i]; sg; sg = sg_next(sg))
+ total_in++;
+ }
+ return virtqueue_add(_vq, sgs, sg_next_chained,
+ total_out, total_in, out_sgs, in_sgs, data, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
+
+/**
+ * virtqueue_add_outbuf - expose output buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sgs: array of scatterlists (need not be terminated!)
+ * @num: the number of scatterlists readable by other side
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
+ */
+int virtqueue_add_outbuf(struct virtqueue *vq,
+ struct scatterlist sg[], unsigned int num,
+ void *data,
+ gfp_t gfp)
+{
+ return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
+
+/**
+ * virtqueue_add_inbuf - expose input buffers to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sgs: array of scatterlists (need not be terminated!)
+ * @num: the number of scatterlists writable by other side
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns zero or a negative error (ie. ENOSPC, ENOMEM).
+ */
+int virtqueue_add_inbuf(struct virtqueue *vq,
+ struct scatterlist sg[], unsigned int num,
+ void *data,
+ gfp_t gfp)
+{
+ return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp);
+}
+EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
+
+/**
* virtqueue_kick_prepare - first half of split virtqueue_kick call.
* @vq: the struct virtqueue
*
@@ -312,7 +429,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq)
START_USE(vq);
/* We need to expose available array entries before checking avail
* event. */
- virtio_mb(vq);
+ virtio_mb(vq->weak_barriers);
old = vq->vring.avail->idx - vq->num_added;
new = vq->vring.avail->idx;
@@ -436,7 +553,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
}
/* Only get used array entries after they have been exposed by host. */
- virtio_rmb(vq);
+ virtio_rmb(vq->weak_barriers);
last_used = (vq->last_used_idx & (vq->vring.num - 1));
i = vq->vring.used->ring[last_used].id;
@@ -460,7 +577,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
* the read in the next get_buf call. */
if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
vring_used_event(&vq->vring) = vq->last_used_idx;
- virtio_mb(vq);
+ virtio_mb(vq->weak_barriers);
}
#ifdef DEBUG
@@ -513,7 +630,7 @@ bool virtqueue_enable_cb(struct virtqueue *_vq)
* entry. Always do both to keep code simple. */
vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
vring_used_event(&vq->vring) = vq->last_used_idx;
- virtio_mb(vq);
+ virtio_mb(vq->weak_barriers);
if (unlikely(more_used(vq))) {
END_USE(vq);
return false;
@@ -553,7 +670,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
/* TODO: tune this threshold */
bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4;
vring_used_event(&vq->vring) = vq->last_used_idx + bufs;
- virtio_mb(vq);
+ virtio_mb(vq->weak_barriers);
if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) {
END_USE(vq);
return false;
diff --git a/include/linux/i2c-mux.h b/include/linux/i2c-mux.h
index 40cb05a..b5f9a00 100644
--- a/include/linux/i2c-mux.h
+++ b/include/linux/i2c-mux.h
@@ -42,7 +42,7 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent,
int (*deselect) (struct i2c_adapter *,
void *mux_dev, u32 chan_id));
-int i2c_del_mux_adapter(struct i2c_adapter *adap);
+void i2c_del_mux_adapter(struct i2c_adapter *adap);
#endif /* __KERNEL__ */
diff --git a/include/linux/i2c-tegra.h b/include/linux/i2c-tegra.h
deleted file mode 100644
index 9c85da4..0000000
--- a/include/linux/i2c-tegra.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * drivers/i2c/busses/i2c-tegra.c
- *
- * Copyright (C) 2010 Google, Inc.
- * Author: Colin Cross <ccross@android.com>
- *
- * This software is licensed under the terms of the GNU General Public
- * License version 2, as published by the Free Software Foundation, and
- * may be copied, distributed, and modified under those terms.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- */
-
-#ifndef _LINUX_I2C_TEGRA_H
-#define _LINUX_I2C_TEGRA_H
-
-struct tegra_i2c_platform_data {
- unsigned long bus_clk_rate;
-};
-
-#endif /* _LINUX_I2C_TEGRA_H */
diff --git a/include/linux/i2c.h b/include/linux/i2c.h
index d0c4db7..e988fa9 100644
--- a/include/linux/i2c.h
+++ b/include/linux/i2c.h
@@ -125,7 +125,6 @@ extern s32 i2c_smbus_write_i2c_block_data(const struct i2c_client *client,
* struct i2c_driver - represent an I2C device driver
* @class: What kind of i2c device we instantiate (for detect)
* @attach_adapter: Callback for bus addition (deprecated)
- * @detach_adapter: Callback for bus removal (deprecated)
* @probe: Callback for device binding
* @remove: Callback for device unbinding
* @shutdown: Callback for device shutdown
@@ -162,12 +161,10 @@ extern s32 i2c_smbus_write_i2c_block_data(const struct i2c_client *client,
struct i2c_driver {
unsigned int class;
- /* Notifies the driver that a new bus has appeared or is about to be
- * removed. You should avoid using this, it will be removed in a
- * near future.
+ /* Notifies the driver that a new bus has appeared. You should avoid
+ * using this, it will be removed in a near future.
*/
int (*attach_adapter)(struct i2c_adapter *) __deprecated;
- int (*detach_adapter)(struct i2c_adapter *) __deprecated;
/* Standard driver model interfaces */
int (*probe)(struct i2c_client *, const struct i2c_device_id *);
@@ -370,6 +367,45 @@ struct i2c_algorithm {
u32 (*functionality) (struct i2c_adapter *);
};
+/**
+ * struct i2c_bus_recovery_info - I2C bus recovery information
+ * @recover_bus: Recover routine. Either pass driver's recover_bus() routine, or
+ * i2c_generic_scl_recovery() or i2c_generic_gpio_recovery().
+ * @get_scl: This gets current value of SCL line. Mandatory for generic SCL
+ * recovery. Used internally for generic GPIO recovery.
+ * @set_scl: This sets/clears SCL line. Mandatory for generic SCL recovery. Used
+ * internally for generic GPIO recovery.
+ * @get_sda: This gets current value of SDA line. Optional for generic SCL
+ * recovery. Used internally, if sda_gpio is a valid GPIO, for generic GPIO
+ * recovery.
+ * @prepare_recovery: This will be called before starting recovery. Platform may
+ * configure padmux here for SDA/SCL line or something else they want.
+ * @unprepare_recovery: This will be called after completing recovery. Platform
+ * may configure padmux here for SDA/SCL line or something else they want.
+ * @scl_gpio: gpio number of the SCL line. Only required for GPIO recovery.
+ * @sda_gpio: gpio number of the SDA line. Only required for GPIO recovery.
+ */
+struct i2c_bus_recovery_info {
+ int (*recover_bus)(struct i2c_adapter *);
+
+ int (*get_scl)(struct i2c_adapter *);
+ void (*set_scl)(struct i2c_adapter *, int val);
+ int (*get_sda)(struct i2c_adapter *);
+
+ void (*prepare_recovery)(struct i2c_bus_recovery_info *bri);
+ void (*unprepare_recovery)(struct i2c_bus_recovery_info *bri);
+
+ /* gpio recovery */
+ int scl_gpio;
+ int sda_gpio;
+};
+
+int i2c_recover_bus(struct i2c_adapter *adap);
+
+/* Generic recovery routines */
+int i2c_generic_gpio_recovery(struct i2c_adapter *adap);
+int i2c_generic_scl_recovery(struct i2c_adapter *adap);
+
/*
* i2c_adapter is the structure used to identify a physical i2c bus along
* with the access algorithms necessary to access it.
@@ -393,6 +429,8 @@ struct i2c_adapter {
struct mutex userspace_clients_lock;
struct list_head userspace_clients;
+
+ struct i2c_bus_recovery_info *bus_recovery_info;
};
#define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev)
@@ -450,7 +488,7 @@ void i2c_unlock_adapter(struct i2c_adapter *);
*/
#if defined(CONFIG_I2C) || defined(CONFIG_I2C_MODULE)
extern int i2c_add_adapter(struct i2c_adapter *);
-extern int i2c_del_adapter(struct i2c_adapter *);
+extern void i2c_del_adapter(struct i2c_adapter *);
extern int i2c_add_numbered_adapter(struct i2c_adapter *);
extern int i2c_register_driver(struct module *, struct i2c_driver *);
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 2d8bdae..bfc47e0 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -172,6 +172,22 @@ static inline void sg_mark_end(struct scatterlist *sg)
}
/**
+ * sg_unmark_end - Undo setting the end of the scatterlist
+ * @sg: SG entryScatterlist
+ *
+ * Description:
+ * Removes the termination marker from the given entry of the scatterlist.
+ *
+ **/
+static inline void sg_unmark_end(struct scatterlist *sg)
+{
+#ifdef CONFIG_DEBUG_SG
+ BUG_ON(sg->sg_magic != SG_MAGIC);
+#endif
+ sg->page_link &= ~0x02;
+}
+
+/**
* sg_phys - Return physical address of an sg entry
* @sg: SG entry
*
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index ab9e862..ac8d488 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -45,6 +45,9 @@ extern int vfio_add_group_dev(struct device *dev,
void *device_data);
extern void *vfio_del_group_dev(struct device *dev);
+extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
+extern void vfio_device_put(struct vfio_device *device);
+extern void *vfio_device_data(struct vfio_device *device);
/**
* struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index 2d7a5e0..9ff8645 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -8,6 +8,7 @@
#include <linux/device.h>
#include <linux/mod_devicetable.h>
#include <linux/gfp.h>
+#include <linux/vringh.h>
/**
* virtqueue - a queue to register buffers for sending or receiving.
@@ -40,6 +41,23 @@ int virtqueue_add_buf(struct virtqueue *vq,
void *data,
gfp_t gfp);
+int virtqueue_add_outbuf(struct virtqueue *vq,
+ struct scatterlist sg[], unsigned int num,
+ void *data,
+ gfp_t gfp);
+
+int virtqueue_add_inbuf(struct virtqueue *vq,
+ struct scatterlist sg[], unsigned int num,
+ void *data,
+ gfp_t gfp);
+
+int virtqueue_add_sgs(struct virtqueue *vq,
+ struct scatterlist *sgs[],
+ unsigned int out_sgs,
+ unsigned int in_sgs,
+ void *data,
+ gfp_t gfp);
+
void virtqueue_kick(struct virtqueue *vq);
bool virtqueue_kick_prepare(struct virtqueue *vq);
@@ -64,6 +82,7 @@ unsigned int virtqueue_get_vring_size(struct virtqueue *vq);
* @dev: underlying device.
* @id: the device type identification (used to match it with a driver).
* @config: the configuration ops for this device.
+ * @vringh_config: configuration ops for host vrings.
* @vqs: the list of virtqueues for this device.
* @features: the features supported by both driver and device.
* @priv: private pointer for the driver's use.
@@ -73,6 +92,7 @@ struct virtio_device {
struct device dev;
struct virtio_device_id id;
const struct virtio_config_ops *config;
+ const struct vringh_config_ops *vringh_config;
struct list_head vqs;
/* Note that this is a Linux set_bit-style bitmap. */
unsigned long features[1];
diff --git a/include/linux/virtio_caif.h b/include/linux/virtio_caif.h
new file mode 100644
index 0000000..5d2d312
--- /dev/null
+++ b/include/linux/virtio_caif.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2012
+ * Author: Sjur Brændeland <sjur.brandeland@stericsson.com>
+ *
+ * This header is BSD licensed so
+ * anyone can use the definitions to implement compatible remote processors
+ */
+
+#ifndef VIRTIO_CAIF_H
+#define VIRTIO_CAIF_H
+
+#include <linux/types.h>
+struct virtio_caif_transf_config {
+ u16 headroom;
+ u16 tailroom;
+ u32 mtu;
+ u8 reserved[4];
+};
+
+struct virtio_caif_config {
+ struct virtio_caif_transf_config uplink, downlink;
+ u8 reserved[8];
+};
+#endif
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index 63c6ea1..ca3ad41 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -4,6 +4,63 @@
#include <linux/irqreturn.h>
#include <uapi/linux/virtio_ring.h>
+/*
+ * Barriers in virtio are tricky. Non-SMP virtio guests can't assume
+ * they're not on an SMP host system, so they need to assume real
+ * barriers. Non-SMP virtio hosts could skip the barriers, but does
+ * anyone care?
+ *
+ * For virtio_pci on SMP, we don't need to order with respect to MMIO
+ * accesses through relaxed memory I/O windows, so smp_mb() et al are
+ * sufficient.
+ *
+ * For using virtio to talk to real devices (eg. other heterogeneous
+ * CPUs) we do need real barriers. In theory, we could be using both
+ * kinds of virtio, so it's a runtime decision, and the branch is
+ * actually quite cheap.
+ */
+
+#ifdef CONFIG_SMP
+static inline void virtio_mb(bool weak_barriers)
+{
+ if (weak_barriers)
+ smp_mb();
+ else
+ mb();
+}
+
+static inline void virtio_rmb(bool weak_barriers)
+{
+ if (weak_barriers)
+ smp_rmb();
+ else
+ rmb();
+}
+
+static inline void virtio_wmb(bool weak_barriers)
+{
+ if (weak_barriers)
+ smp_wmb();
+ else
+ wmb();
+}
+#else
+static inline void virtio_mb(bool weak_barriers)
+{
+ mb();
+}
+
+static inline void virtio_rmb(bool weak_barriers)
+{
+ rmb();
+}
+
+static inline void virtio_wmb(bool weak_barriers)
+{
+ wmb();
+}
+#endif
+
struct virtio_device;
struct virtqueue;
diff --git a/include/linux/vringh.h b/include/linux/vringh.h
new file mode 100644
index 0000000..749cde2
--- /dev/null
+++ b/include/linux/vringh.h
@@ -0,0 +1,225 @@
+/*
+ * Linux host-side vring helpers; for when the kernel needs to access
+ * someone else's vring.
+ *
+ * Copyright IBM Corporation, 2013.
+ * Parts taken from drivers/vhost/vhost.c Copyright 2009 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Written by: Rusty Russell <rusty@rustcorp.com.au>
+ */
+#ifndef _LINUX_VRINGH_H
+#define _LINUX_VRINGH_H
+#include <uapi/linux/virtio_ring.h>
+#include <linux/uio.h>
+#include <linux/slab.h>
+#include <asm/barrier.h>
+
+/* virtio_ring with information needed for host access. */
+struct vringh {
+ /* Guest publishes used event idx (note: we always do). */
+ bool event_indices;
+
+ /* Can we get away with weak barriers? */
+ bool weak_barriers;
+
+ /* Last available index we saw (ie. where we're up to). */
+ u16 last_avail_idx;
+
+ /* Last index we used. */
+ u16 last_used_idx;
+
+ /* How many descriptors we've completed since last need_notify(). */
+ u32 completed;
+
+ /* The vring (note: it may contain user pointers!) */
+ struct vring vring;
+
+ /* The function to call to notify the guest about added buffers */
+ void (*notify)(struct vringh *);
+};
+
+/**
+ * struct vringh_config_ops - ops for creating a host vring from a virtio driver
+ * @find_vrhs: find the host vrings and instantiate them
+ * vdev: the virtio_device
+ * nhvrs: the number of host vrings to find
+ * hvrs: on success, includes new host vrings
+ * callbacks: array of driver callbacks, for each host vring
+ * include a NULL entry for vqs that do not need a callback
+ * Returns 0 on success or error status
+ * @del_vrhs: free the host vrings found by find_vrhs().
+ */
+struct virtio_device;
+typedef void vrh_callback_t(struct virtio_device *, struct vringh *);
+struct vringh_config_ops {
+ int (*find_vrhs)(struct virtio_device *vdev, unsigned nhvrs,
+ struct vringh *vrhs[], vrh_callback_t *callbacks[]);
+ void (*del_vrhs)(struct virtio_device *vdev);
+};
+
+/* The memory the vring can access, and what offset to apply. */
+struct vringh_range {
+ u64 start, end_incl;
+ u64 offset;
+};
+
+/**
+ * struct vringh_iov - iovec mangler.
+ *
+ * Mangles iovec in place, and restores it.
+ * Remaining data is iov + i, of used - i elements.
+ */
+struct vringh_iov {
+ struct iovec *iov;
+ size_t consumed; /* Within iov[i] */
+ unsigned i, used, max_num;
+};
+
+/**
+ * struct vringh_iov - kvec mangler.
+ *
+ * Mangles kvec in place, and restores it.
+ * Remaining data is iov + i, of used - i elements.
+ */
+struct vringh_kiov {
+ struct kvec *iov;
+ size_t consumed; /* Within iov[i] */
+ unsigned i, used, max_num;
+};
+
+/* Flag on max_num to indicate we're kmalloced. */
+#define VRINGH_IOV_ALLOCATED 0x8000000
+
+/* Helpers for userspace vrings. */
+int vringh_init_user(struct vringh *vrh, u32 features,
+ unsigned int num, bool weak_barriers,
+ struct vring_desc __user *desc,
+ struct vring_avail __user *avail,
+ struct vring_used __user *used);
+
+static inline void vringh_iov_init(struct vringh_iov *iov,
+ struct iovec *iovec, unsigned num)
+{
+ iov->used = iov->i = 0;
+ iov->consumed = 0;
+ iov->max_num = num;
+ iov->iov = iovec;
+}
+
+static inline void vringh_iov_reset(struct vringh_iov *iov)
+{
+ iov->iov[iov->i].iov_len += iov->consumed;
+ iov->iov[iov->i].iov_base -= iov->consumed;
+ iov->consumed = 0;
+ iov->i = 0;
+}
+
+static inline void vringh_iov_cleanup(struct vringh_iov *iov)
+{
+ if (iov->max_num & VRINGH_IOV_ALLOCATED)
+ kfree(iov->iov);
+ iov->max_num = iov->used = iov->i = iov->consumed = 0;
+ iov->iov = NULL;
+}
+
+/* Convert a descriptor into iovecs. */
+int vringh_getdesc_user(struct vringh *vrh,
+ struct vringh_iov *riov,
+ struct vringh_iov *wiov,
+ bool (*getrange)(struct vringh *vrh,
+ u64 addr, struct vringh_range *r),
+ u16 *head);
+
+/* Copy bytes from readable vsg, consuming it (and incrementing wiov->i). */
+ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len);
+
+/* Copy bytes into writable vsg, consuming it (and incrementing wiov->i). */
+ssize_t vringh_iov_push_user(struct vringh_iov *wiov,
+ const void *src, size_t len);
+
+/* Mark a descriptor as used. */
+int vringh_complete_user(struct vringh *vrh, u16 head, u32 len);
+int vringh_complete_multi_user(struct vringh *vrh,
+ const struct vring_used_elem used[],
+ unsigned num_used);
+
+/* Pretend we've never seen descriptor (for easy error handling). */
+void vringh_abandon_user(struct vringh *vrh, unsigned int num);
+
+/* Do we need to fire the eventfd to notify the other side? */
+int vringh_need_notify_user(struct vringh *vrh);
+
+bool vringh_notify_enable_user(struct vringh *vrh);
+void vringh_notify_disable_user(struct vringh *vrh);
+
+/* Helpers for kernelspace vrings. */
+int vringh_init_kern(struct vringh *vrh, u32 features,
+ unsigned int num, bool weak_barriers,
+ struct vring_desc *desc,
+ struct vring_avail *avail,
+ struct vring_used *used);
+
+static inline void vringh_kiov_init(struct vringh_kiov *kiov,
+ struct kvec *kvec, unsigned num)
+{
+ kiov->used = kiov->i = 0;
+ kiov->consumed = 0;
+ kiov->max_num = num;
+ kiov->iov = kvec;
+}
+
+static inline void vringh_kiov_reset(struct vringh_kiov *kiov)
+{
+ kiov->iov[kiov->i].iov_len += kiov->consumed;
+ kiov->iov[kiov->i].iov_base -= kiov->consumed;
+ kiov->consumed = 0;
+ kiov->i = 0;
+}
+
+static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov)
+{
+ if (kiov->max_num & VRINGH_IOV_ALLOCATED)
+ kfree(kiov->iov);
+ kiov->max_num = kiov->used = kiov->i = kiov->consumed = 0;
+ kiov->iov = NULL;
+}
+
+int vringh_getdesc_kern(struct vringh *vrh,
+ struct vringh_kiov *riov,
+ struct vringh_kiov *wiov,
+ u16 *head,
+ gfp_t gfp);
+
+ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len);
+ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov,
+ const void *src, size_t len);
+void vringh_abandon_kern(struct vringh *vrh, unsigned int num);
+int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len);
+
+bool vringh_notify_enable_kern(struct vringh *vrh);
+void vringh_notify_disable_kern(struct vringh *vrh);
+
+int vringh_need_notify_kern(struct vringh *vrh);
+
+/* Notify the guest about buffers added to the used ring */
+static inline void vringh_notify(struct vringh *vrh)
+{
+ if (vrh->notify)
+ vrh->notify(vrh);
+}
+
+#endif /* _LINUX_VRINGH_H */
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 4f41f30..284ff24 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -319,6 +319,7 @@ enum {
VFIO_PCI_INTX_IRQ_INDEX,
VFIO_PCI_MSI_IRQ_INDEX,
VFIO_PCI_MSIX_IRQ_INDEX,
+ VFIO_PCI_ERR_IRQ_INDEX,
VFIO_PCI_NUM_IRQS
};
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index e847f1e..bb6a5b4 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -127,4 +127,32 @@ struct vhost_memory {
/* vhost-net should add virtio_net_hdr for RX, and strip for TX packets. */
#define VHOST_NET_F_VIRTIO_NET_HDR 27
+/* VHOST_SCSI specific definitions */
+
+/*
+ * Used by QEMU userspace to ensure a consistent vhost-scsi ABI.
+ *
+ * ABI Rev 0: July 2012 version starting point for v3.6-rc merge candidate +
+ * RFC-v2 vhost-scsi userspace. Add GET_ABI_VERSION ioctl usage
+ * ABI Rev 1: January 2013. Ignore vhost_tpgt filed in struct vhost_scsi_target.
+ * All the targets under vhost_wwpn can be seen and used by guset.
+ */
+
+#define VHOST_SCSI_ABI_VERSION 1
+
+struct vhost_scsi_target {
+ int abi_version;
+ char vhost_wwpn[224]; /* TRANSPORT_IQN_LEN */
+ unsigned short vhost_tpgt;
+ unsigned short reserved;
+};
+
+#define VHOST_SCSI_SET_ENDPOINT _IOW(VHOST_VIRTIO, 0x40, struct vhost_scsi_target)
+#define VHOST_SCSI_CLEAR_ENDPOINT _IOW(VHOST_VIRTIO, 0x41, struct vhost_scsi_target)
+/* Changing this breaks userspace. */
+#define VHOST_SCSI_GET_ABI_VERSION _IOW(VHOST_VIRTIO, 0x42, int)
+/* Set and get the events missed flag */
+#define VHOST_SCSI_SET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x43, __u32)
+#define VHOST_SCSI_GET_EVENTS_MISSED _IOW(VHOST_VIRTIO, 0x44, __u32)
+
#endif
diff --git a/include/uapi/linux/virtio_balloon.h b/include/uapi/linux/virtio_balloon.h
index 652dc8b..5e26f61 100644
--- a/include/uapi/linux/virtio_balloon.h
+++ b/include/uapi/linux/virtio_balloon.h
@@ -52,8 +52,8 @@ struct virtio_balloon_config
#define VIRTIO_BALLOON_S_NR 6
struct virtio_balloon_stat {
- u16 tag;
- u64 val;
+ __u16 tag;
+ __u64 val;
} __attribute__((packed));
#endif /* _LINUX_VIRTIO_BALLOON_H */
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index a7630d0..284fc3a 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -38,5 +38,6 @@
#define VIRTIO_ID_SCSI 8 /* virtio scsi */
#define VIRTIO_ID_9P 9 /* 9p virtio console */
#define VIRTIO_ID_RPROC_SERIAL 11 /* virtio remoteproc serial link */
+#define VIRTIO_ID_CAIF 12 /* Virtio caif */
#endif /* _LINUX_VIRTIO_IDS_H */
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index de2e950..e1c26b1 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -194,11 +194,14 @@ static int pack_sg_list(struct scatterlist *sg, int start,
if (s > count)
s = count;
BUG_ON(index > limit);
+ /* Make sure we don't terminate early. */
+ sg_unmark_end(&sg[index]);
sg_set_buf(&sg[index++], data, s);
count -= s;
data += s;
}
-
+ if (index-start)
+ sg_mark_end(&sg[index - 1]);
return index-start;
}
@@ -236,12 +239,17 @@ pack_sg_list_p(struct scatterlist *sg, int start, int limit,
s = rest_of_page(data);
if (s > count)
s = count;
+ /* Make sure we don't terminate early. */
+ sg_unmark_end(&sg[index]);
sg_set_page(&sg[index++], pdata[i++], s, data_off);
data_off = 0;
data += s;
count -= s;
nr_pages--;
}
+
+ if (index-start)
+ sg_mark_end(&sg[index - 1]);
return index - start;
}
@@ -256,9 +264,10 @@ static int
p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
{
int err;
- int in, out;
+ int in, out, out_sgs, in_sgs;
unsigned long flags;
struct virtio_chan *chan = client->trans;
+ struct scatterlist *sgs[2];
p9_debug(P9_DEBUG_TRANS, "9p debug: virtio request\n");
@@ -266,14 +275,19 @@ p9_virtio_request(struct p9_client *client, struct p9_req_t *req)
req_retry:
spin_lock_irqsave(&chan->lock, flags);
+ out_sgs = in_sgs = 0;
/* Handle out VirtIO ring buffers */
out = pack_sg_list(chan->sg, 0,
VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
+ if (out)
+ sgs[out_sgs++] = chan->sg;
in = pack_sg_list(chan->sg, out,
VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity);
+ if (in)
+ sgs[out_sgs + in_sgs++] = chan->sg + out;
- err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc,
+ err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc,
GFP_ATOMIC);
if (err < 0) {
if (err == -ENOSPC) {
@@ -289,7 +303,7 @@ req_retry:
} else {
spin_unlock_irqrestore(&chan->lock, flags);
p9_debug(P9_DEBUG_TRANS,
- "virtio rpc add_buf returned failure\n");
+ "virtio rpc add_sgs returned failure\n");
return -EIO;
}
}
@@ -351,11 +365,12 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
char *uidata, char *uodata, int inlen,
int outlen, int in_hdr_len, int kern_buf)
{
- int in, out, err;
+ int in, out, err, out_sgs, in_sgs;
unsigned long flags;
int in_nr_pages = 0, out_nr_pages = 0;
struct page **in_pages = NULL, **out_pages = NULL;
struct virtio_chan *chan = client->trans;
+ struct scatterlist *sgs[4];
p9_debug(P9_DEBUG_TRANS, "virtio request\n");
@@ -396,13 +411,22 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req,
req->status = REQ_STATUS_SENT;
req_retry_pinned:
spin_lock_irqsave(&chan->lock, flags);
+
+ out_sgs = in_sgs = 0;
+
/* out data */
out = pack_sg_list(chan->sg, 0,
VIRTQUEUE_NUM, req->tc->sdata, req->tc->size);
- if (out_pages)
+ if (out)
+ sgs[out_sgs++] = chan->sg;
+
+ if (out_pages) {
+ sgs[out_sgs++] = chan->sg + out;
out += pack_sg_list_p(chan->sg, out, VIRTQUEUE_NUM,
out_pages, out_nr_pages, uodata, outlen);
+ }
+
/*
* Take care of in data
* For example TREAD have 11.
@@ -412,11 +436,17 @@ req_retry_pinned:
*/
in = pack_sg_list(chan->sg, out,
VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len);
- if (in_pages)
+ if (in)
+ sgs[out_sgs + in_sgs++] = chan->sg + out;
+
+ if (in_pages) {
+ sgs[out_sgs + in_sgs++] = chan->sg + out + in;
in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM,
in_pages, in_nr_pages, uidata, inlen);
+ }
- err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc,
+ BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs));
+ err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc,
GFP_ATOMIC);
if (err < 0) {
if (err == -ENOSPC) {
@@ -432,7 +462,7 @@ req_retry_pinned:
} else {
spin_unlock_irqrestore(&chan->lock, flags);
p9_debug(P9_DEBUG_TRANS,
- "virtio rpc add_buf returned failure\n");
+ "virtio rpc add_sgs returned failure\n");
err = -EIO;
goto err_out;
}
diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt
index 7203ace..06e1f46 100644
--- a/tools/lguest/lguest.txt
+++ b/tools/lguest/lguest.txt
@@ -70,7 +70,7 @@ Running Lguest:
- Run an lguest as root:
- Documentation/virtual/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
+ tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
--block=rootfile root=/dev/vda
Explanation:
diff --git a/tools/virtio/Makefile b/tools/virtio/Makefile
index d1d442e..3187c62 100644
--- a/tools/virtio/Makefile
+++ b/tools/virtio/Makefile
@@ -1,12 +1,14 @@
all: test mod
-test: virtio_test
+test: virtio_test vringh_test
virtio_test: virtio_ring.o virtio_test.o
-CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -MMD
-vpath %.c ../../drivers/virtio
+vringh_test: vringh_test.o vringh.o virtio_ring.o
+
+CFLAGS += -g -O2 -Wall -I. -I ../../usr/include/ -Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD -U_FORTIFY_SOURCE
+vpath %.c ../../drivers/virtio ../../drivers/vhost
mod:
${MAKE} -C `pwd`/../.. M=`pwd`/vhost_test
.PHONY: all test mod clean
clean:
- ${RM} *.o vhost_test/*.o vhost_test/.*.cmd \
+ ${RM} *.o vringh_test virtio_test vhost_test/*.o vhost_test/.*.cmd \
vhost_test/Module.symvers vhost_test/modules.order *.d
-include *.d
diff --git a/tools/virtio/asm/barrier.h b/tools/virtio/asm/barrier.h
new file mode 100644
index 0000000..aff61e1
--- /dev/null
+++ b/tools/virtio/asm/barrier.h
@@ -0,0 +1,14 @@
+#if defined(__i386__) || defined(__x86_64__)
+#define barrier() asm volatile("" ::: "memory")
+#define mb() __sync_synchronize()
+
+#define smp_mb() mb()
+# define smp_rmb() barrier()
+# define smp_wmb() barrier()
+/* Weak barriers should be used. If not - it's a bug */
+# define rmb() abort()
+# define wmb() abort()
+#else
+#error Please fill in barrier macros
+#endif
+
diff --git a/tools/virtio/linux/bug.h b/tools/virtio/linux/bug.h
new file mode 100644
index 0000000..fb94f07
--- /dev/null
+++ b/tools/virtio/linux/bug.h
@@ -0,0 +1,10 @@
+#ifndef BUG_H
+#define BUG_H
+
+#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond))
+
+#define BUILD_BUG_ON(x)
+
+#define BUG() abort()
+
+#endif /* BUG_H */
diff --git a/tools/virtio/linux/err.h b/tools/virtio/linux/err.h
new file mode 100644
index 0000000..e32eff8
--- /dev/null
+++ b/tools/virtio/linux/err.h
@@ -0,0 +1,26 @@
+#ifndef ERR_H
+#define ERR_H
+#define MAX_ERRNO 4095
+
+#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
+
+static inline void * __must_check ERR_PTR(long error)
+{
+ return (void *) error;
+}
+
+static inline long __must_check PTR_ERR(const void *ptr)
+{
+ return (long) ptr;
+}
+
+static inline long __must_check IS_ERR(const void *ptr)
+{
+ return IS_ERR_VALUE((unsigned long)ptr);
+}
+
+static inline long __must_check IS_ERR_OR_NULL(const void *ptr)
+{
+ return !ptr || IS_ERR_VALUE((unsigned long)ptr);
+}
+#endif /* ERR_H */
diff --git a/tools/virtio/linux/export.h b/tools/virtio/linux/export.h
new file mode 100644
index 0000000..7311d32
--- /dev/null
+++ b/tools/virtio/linux/export.h
@@ -0,0 +1,5 @@
+#define EXPORT_SYMBOL(sym)
+#define EXPORT_SYMBOL_GPL(sym)
+#define EXPORT_SYMBOL_GPL_FUTURE(sym)
+#define EXPORT_UNUSED_SYMBOL(sym)
+#define EXPORT_UNUSED_SYMBOL_GPL(sym)
diff --git a/tools/virtio/linux/irqreturn.h b/tools/virtio/linux/irqreturn.h
new file mode 100644
index 0000000..a3c4e7b
--- /dev/null
+++ b/tools/virtio/linux/irqreturn.h
@@ -0,0 +1 @@
+#include "../../../include/linux/irqreturn.h"
diff --git a/tools/virtio/linux/kernel.h b/tools/virtio/linux/kernel.h
new file mode 100644
index 0000000..fba7059
--- /dev/null
+++ b/tools/virtio/linux/kernel.h
@@ -0,0 +1,112 @@
+#ifndef KERNEL_H
+#define KERNEL_H
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include <stdarg.h>
+
+#include <linux/types.h>
+#include <linux/printk.h>
+#include <linux/bug.h>
+#include <errno.h>
+#include <unistd.h>
+#include <asm/barrier.h>
+
+#define CONFIG_SMP
+
+#define PAGE_SIZE getpagesize()
+#define PAGE_MASK (~(PAGE_SIZE-1))
+
+typedef unsigned long long dma_addr_t;
+typedef size_t __kernel_size_t;
+
+struct page {
+ unsigned long long dummy;
+};
+
+/* Physical == Virtual */
+#define virt_to_phys(p) ((unsigned long)p)
+#define phys_to_virt(a) ((void *)(unsigned long)(a))
+/* Page address: Virtual / 4K */
+#define page_to_phys(p) ((dma_addr_t)(unsigned long)(p))
+#define virt_to_page(p) ((struct page *)((unsigned long)p & PAGE_MASK))
+
+#define offset_in_page(p) (((unsigned long)p) % PAGE_SIZE)
+
+#define __printf(a,b) __attribute__((format(printf,a,b)))
+
+typedef enum {
+ GFP_KERNEL,
+ GFP_ATOMIC,
+ __GFP_HIGHMEM,
+ __GFP_HIGH
+} gfp_t;
+
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
+
+extern void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+static inline void *kmalloc(size_t s, gfp_t gfp)
+{
+ if (__kmalloc_fake)
+ return __kmalloc_fake;
+ return malloc(s);
+}
+
+static inline void kfree(void *p)
+{
+ if (p >= __kfree_ignore_start && p < __kfree_ignore_end)
+ return;
+ free(p);
+}
+
+static inline void *krealloc(void *p, size_t s, gfp_t gfp)
+{
+ return realloc(p, s);
+}
+
+
+static inline unsigned long __get_free_page(gfp_t gfp)
+{
+ void *p;
+
+ posix_memalign(&p, PAGE_SIZE, PAGE_SIZE);
+ return (unsigned long)p;
+}
+
+static inline void free_page(unsigned long addr)
+{
+ free((void *)addr);
+}
+
+#define container_of(ptr, type, member) ({ \
+ const typeof( ((type *)0)->member ) *__mptr = (ptr); \
+ (type *)( (char *)__mptr - offsetof(type,member) );})
+
+#define uninitialized_var(x) x = x
+
+# ifndef likely
+# define likely(x) (__builtin_expect(!!(x), 1))
+# endif
+# ifndef unlikely
+# define unlikely(x) (__builtin_expect(!!(x), 0))
+# endif
+
+#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
+#ifdef DEBUG
+#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
+#else
+#define pr_debug(format, ...) do {} while (0)
+#endif
+#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
+#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
+
+#define min(x, y) ({ \
+ typeof(x) _min1 = (x); \
+ typeof(y) _min2 = (y); \
+ (void) (&_min1 == &_min2); \
+ _min1 < _min2 ? _min1 : _min2; })
+
+#endif /* KERNEL_H */
diff --git a/tools/virtio/linux/module.h b/tools/virtio/linux/module.h
index e69de29..3039a7e 100644
--- a/tools/virtio/linux/module.h
+++ b/tools/virtio/linux/module.h
@@ -0,0 +1 @@
+#include <linux/export.h>
diff --git a/tools/virtio/linux/printk.h b/tools/virtio/linux/printk.h
new file mode 100644
index 0000000..9f2423b
--- /dev/null
+++ b/tools/virtio/linux/printk.h
@@ -0,0 +1,4 @@
+#include "../../../include/linux/kern_levels.h"
+
+#define printk printf
+#define vprintk vprintf
diff --git a/tools/virtio/linux/ratelimit.h b/tools/virtio/linux/ratelimit.h
new file mode 100644
index 0000000..dcce172
--- /dev/null
+++ b/tools/virtio/linux/ratelimit.h
@@ -0,0 +1,4 @@
+#define DEFINE_RATELIMIT_STATE(name, interval_init, burst_init) int name = 0
+
+#define __ratelimit(x) (*(x))
+
diff --git a/tools/virtio/linux/scatterlist.h b/tools/virtio/linux/scatterlist.h
new file mode 100644
index 0000000..68c9e2a
--- /dev/null
+++ b/tools/virtio/linux/scatterlist.h
@@ -0,0 +1,189 @@
+#ifndef SCATTERLIST_H
+#define SCATTERLIST_H
+#include <linux/kernel.h>
+
+struct scatterlist {
+ unsigned long page_link;
+ unsigned int offset;
+ unsigned int length;
+ dma_addr_t dma_address;
+};
+
+/* Scatterlist helpers, stolen from linux/scatterlist.h */
+#define sg_is_chain(sg) ((sg)->page_link & 0x01)
+#define sg_is_last(sg) ((sg)->page_link & 0x02)
+#define sg_chain_ptr(sg) \
+ ((struct scatterlist *) ((sg)->page_link & ~0x03))
+
+/**
+ * sg_assign_page - Assign a given page to an SG entry
+ * @sg: SG entry
+ * @page: The page
+ *
+ * Description:
+ * Assign page to sg entry. Also see sg_set_page(), the most commonly used
+ * variant.
+ *
+ **/
+static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
+{
+ unsigned long page_link = sg->page_link & 0x3;
+
+ /*
+ * In order for the low bit stealing approach to work, pages
+ * must be aligned at a 32-bit boundary as a minimum.
+ */
+ BUG_ON((unsigned long) page & 0x03);
+#ifdef CONFIG_DEBUG_SG
+ BUG_ON(sg->sg_magic != SG_MAGIC);
+ BUG_ON(sg_is_chain(sg));
+#endif
+ sg->page_link = page_link | (unsigned long) page;
+}
+
+/**
+ * sg_set_page - Set sg entry to point at given page
+ * @sg: SG entry
+ * @page: The page
+ * @len: Length of data
+ * @offset: Offset into page
+ *
+ * Description:
+ * Use this function to set an sg entry pointing at a page, never assign
+ * the page directly. We encode sg table information in the lower bits
+ * of the page pointer. See sg_page() for looking up the page belonging
+ * to an sg entry.
+ *
+ **/
+static inline void sg_set_page(struct scatterlist *sg, struct page *page,
+ unsigned int len, unsigned int offset)
+{
+ sg_assign_page(sg, page);
+ sg->offset = offset;
+ sg->length = len;
+}
+
+static inline struct page *sg_page(struct scatterlist *sg)
+{
+#ifdef CONFIG_DEBUG_SG
+ BUG_ON(sg->sg_magic != SG_MAGIC);
+ BUG_ON(sg_is_chain(sg));
+#endif
+ return (struct page *)((sg)->page_link & ~0x3);
+}
+
+/*
+ * Loop over each sg element, following the pointer to a new list if necessary
+ */
+#define for_each_sg(sglist, sg, nr, __i) \
+ for (__i = 0, sg = (sglist); __i < (nr); __i++, sg = sg_next(sg))
+
+/**
+ * sg_chain - Chain two sglists together
+ * @prv: First scatterlist
+ * @prv_nents: Number of entries in prv
+ * @sgl: Second scatterlist
+ *
+ * Description:
+ * Links @prv@ and @sgl@ together, to form a longer scatterlist.
+ *
+ **/
+static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents,
+ struct scatterlist *sgl)
+{
+ /*
+ * offset and length are unused for chain entry. Clear them.
+ */
+ prv[prv_nents - 1].offset = 0;
+ prv[prv_nents - 1].length = 0;
+
+ /*
+ * Set lowest bit to indicate a link pointer, and make sure to clear
+ * the termination bit if it happens to be set.
+ */
+ prv[prv_nents - 1].page_link = ((unsigned long) sgl | 0x01) & ~0x02;
+}
+
+/**
+ * sg_mark_end - Mark the end of the scatterlist
+ * @sg: SG entryScatterlist
+ *
+ * Description:
+ * Marks the passed in sg entry as the termination point for the sg
+ * table. A call to sg_next() on this entry will return NULL.
+ *
+ **/
+static inline void sg_mark_end(struct scatterlist *sg)
+{
+#ifdef CONFIG_DEBUG_SG
+ BUG_ON(sg->sg_magic != SG_MAGIC);
+#endif
+ /*
+ * Set termination bit, clear potential chain bit
+ */
+ sg->page_link |= 0x02;
+ sg->page_link &= ~0x01;
+}
+
+/**
+ * sg_unmark_end - Undo setting the end of the scatterlist
+ * @sg: SG entryScatterlist
+ *
+ * Description:
+ * Removes the termination marker from the given entry of the scatterlist.
+ *
+ **/
+static inline void sg_unmark_end(struct scatterlist *sg)
+{
+#ifdef CONFIG_DEBUG_SG
+ BUG_ON(sg->sg_magic != SG_MAGIC);
+#endif
+ sg->page_link &= ~0x02;
+}
+
+static inline struct scatterlist *sg_next(struct scatterlist *sg)
+{
+#ifdef CONFIG_DEBUG_SG
+ BUG_ON(sg->sg_magic != SG_MAGIC);
+#endif
+ if (sg_is_last(sg))
+ return NULL;
+
+ sg++;
+ if (unlikely(sg_is_chain(sg)))
+ sg = sg_chain_ptr(sg);
+
+ return sg;
+}
+
+static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents)
+{
+ memset(sgl, 0, sizeof(*sgl) * nents);
+#ifdef CONFIG_DEBUG_SG
+ {
+ unsigned int i;
+ for (i = 0; i < nents; i++)
+ sgl[i].sg_magic = SG_MAGIC;
+ }
+#endif
+ sg_mark_end(&sgl[nents - 1]);
+}
+
+static inline dma_addr_t sg_phys(struct scatterlist *sg)
+{
+ return page_to_phys(sg_page(sg)) + sg->offset;
+}
+
+static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
+ unsigned int buflen)
+{
+ sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
+}
+
+static inline void sg_init_one(struct scatterlist *sg,
+ const void *buf, unsigned int buflen)
+{
+ sg_init_table(sg, 1);
+ sg_set_buf(sg, buf, buflen);
+}
+#endif /* SCATTERLIST_H */
diff --git a/tools/virtio/linux/types.h b/tools/virtio/linux/types.h
new file mode 100644
index 0000000..f8ebb9a
--- /dev/null
+++ b/tools/virtio/linux/types.h
@@ -0,0 +1,28 @@
+#ifndef TYPES_H
+#define TYPES_H
+#include <stdint.h>
+
+#define __force
+#define __user
+#define __must_check
+#define __cold
+
+typedef uint64_t u64;
+typedef int64_t s64;
+typedef uint32_t u32;
+typedef int32_t s32;
+typedef uint16_t u16;
+typedef int16_t s16;
+typedef uint8_t u8;
+typedef int8_t s8;
+
+typedef uint64_t __u64;
+typedef int64_t __s64;
+typedef uint32_t __u32;
+typedef int32_t __s32;
+typedef uint16_t __u16;
+typedef int16_t __s16;
+typedef uint8_t __u8;
+typedef int8_t __s8;
+
+#endif /* TYPES_H */
diff --git a/tools/virtio/linux/uaccess.h b/tools/virtio/linux/uaccess.h
new file mode 100644
index 0000000..0a578fe
--- /dev/null
+++ b/tools/virtio/linux/uaccess.h
@@ -0,0 +1,50 @@
+#ifndef UACCESS_H
+#define UACCESS_H
+extern void *__user_addr_min, *__user_addr_max;
+
+#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
+
+static inline void __chk_user_ptr(const volatile void *p, size_t size)
+{
+ assert(p >= __user_addr_min && p + size <= __user_addr_max);
+}
+
+#define put_user(x, ptr) \
+({ \
+ typeof(ptr) __pu_ptr = (ptr); \
+ __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \
+ ACCESS_ONCE(*(__pu_ptr)) = x; \
+ 0; \
+})
+
+#define get_user(x, ptr) \
+({ \
+ typeof(ptr) __pu_ptr = (ptr); \
+ __chk_user_ptr(__pu_ptr, sizeof(*__pu_ptr)); \
+ x = ACCESS_ONCE(*(__pu_ptr)); \
+ 0; \
+})
+
+static void volatile_memcpy(volatile char *to, const volatile char *from,
+ unsigned long n)
+{
+ while (n--)
+ *(to++) = *(from++);
+}
+
+static inline int copy_from_user(void *to, const void __user volatile *from,
+ unsigned long n)
+{
+ __chk_user_ptr(from, n);
+ volatile_memcpy(to, from, n);
+ return 0;
+}
+
+static inline int copy_to_user(void __user volatile *to, const void *from,
+ unsigned long n)
+{
+ __chk_user_ptr(to, n);
+ volatile_memcpy(to, from, n);
+ return 0;
+}
+#endif /* UACCESS_H */
diff --git a/tools/virtio/linux/uio.h b/tools/virtio/linux/uio.h
new file mode 100644
index 0000000..cd20f0b
--- /dev/null
+++ b/tools/virtio/linux/uio.h
@@ -0,0 +1,3 @@
+#include <linux/kernel.h>
+
+#include "../../../include/linux/uio.h"
diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h
index 81847dd..cd80183 100644
--- a/tools/virtio/linux/virtio.h
+++ b/tools/virtio/linux/virtio.h
@@ -1,127 +1,7 @@
#ifndef LINUX_VIRTIO_H
#define LINUX_VIRTIO_H
-
-#include <stdbool.h>
-#include <stdlib.h>
-#include <stddef.h>
-#include <stdio.h>
-#include <string.h>
-#include <assert.h>
-
-#include <linux/types.h>
-#include <errno.h>
-
-typedef unsigned long long dma_addr_t;
-
-struct scatterlist {
- unsigned long page_link;
- unsigned int offset;
- unsigned int length;
- dma_addr_t dma_address;
-};
-
-struct page {
- unsigned long long dummy;
-};
-
-#define BUG_ON(__BUG_ON_cond) assert(!(__BUG_ON_cond))
-
-/* Physical == Virtual */
-#define virt_to_phys(p) ((unsigned long)p)
-#define phys_to_virt(a) ((void *)(unsigned long)(a))
-/* Page address: Virtual / 4K */
-#define virt_to_page(p) ((struct page*)((virt_to_phys(p) / 4096) * \
- sizeof(struct page)))
-#define offset_in_page(p) (((unsigned long)p) % 4096)
-#define sg_phys(sg) ((sg->page_link & ~0x3) / sizeof(struct page) * 4096 + \
- sg->offset)
-static inline void sg_mark_end(struct scatterlist *sg)
-{
- /*
- * Set termination bit, clear potential chain bit
- */
- sg->page_link |= 0x02;
- sg->page_link &= ~0x01;
-}
-static inline void sg_init_table(struct scatterlist *sgl, unsigned int nents)
-{
- memset(sgl, 0, sizeof(*sgl) * nents);
- sg_mark_end(&sgl[nents - 1]);
-}
-static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
-{
- unsigned long page_link = sg->page_link & 0x3;
-
- /*
- * In order for the low bit stealing approach to work, pages
- * must be aligned at a 32-bit boundary as a minimum.
- */
- BUG_ON((unsigned long) page & 0x03);
- sg->page_link = page_link | (unsigned long) page;
-}
-
-static inline void sg_set_page(struct scatterlist *sg, struct page *page,
- unsigned int len, unsigned int offset)
-{
- sg_assign_page(sg, page);
- sg->offset = offset;
- sg->length = len;
-}
-
-static inline void sg_set_buf(struct scatterlist *sg, const void *buf,
- unsigned int buflen)
-{
- sg_set_page(sg, virt_to_page(buf), buflen, offset_in_page(buf));
-}
-
-static inline void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen)
-{
- sg_init_table(sg, 1);
- sg_set_buf(sg, buf, buflen);
-}
-
-typedef __u16 u16;
-
-typedef enum {
- GFP_KERNEL,
- GFP_ATOMIC,
-} gfp_t;
-typedef enum {
- IRQ_NONE,
- IRQ_HANDLED
-} irqreturn_t;
-
-static inline void *kmalloc(size_t s, gfp_t gfp)
-{
- return malloc(s);
-}
-
-static inline void kfree(void *p)
-{
- free(p);
-}
-
-#define container_of(ptr, type, member) ({ \
- const typeof( ((type *)0)->member ) *__mptr = (ptr); \
- (type *)( (char *)__mptr - offsetof(type,member) );})
-
-#define uninitialized_var(x) x = x
-
-# ifndef likely
-# define likely(x) (__builtin_expect(!!(x), 1))
-# endif
-# ifndef unlikely
-# define unlikely(x) (__builtin_expect(!!(x), 0))
-# endif
-
-#define pr_err(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
-#ifdef DEBUG
-#define pr_debug(format, ...) fprintf (stderr, format, ## __VA_ARGS__)
-#else
-#define pr_debug(format, ...) do {} while (0)
-#endif
-#define dev_err(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
-#define dev_warn(dev, format, ...) fprintf (stderr, format, ## __VA_ARGS__)
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
/* TODO: empty stubs for now. Broken but enough for virtio_ring.c */
#define list_add_tail(a, b) do {} while (0)
@@ -131,6 +11,7 @@ static inline void kfree(void *p)
#define BITS_PER_BYTE 8
#define BITS_PER_LONG (sizeof(long) * BITS_PER_BYTE)
#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
+
/* TODO: Not atomic as it should be:
* we don't use this for anything important. */
static inline void clear_bit(int nr, volatile unsigned long *addr)
@@ -145,10 +26,6 @@ static inline int test_bit(int nr, const volatile unsigned long *addr)
{
return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
}
-
-/* The only feature we care to support */
-#define virtio_has_feature(dev, feature) \
- test_bit((feature), (dev)->features)
/* end of stubs */
struct virtio_device {
@@ -163,39 +40,32 @@ struct virtqueue {
void (*callback)(struct virtqueue *vq);
const char *name;
struct virtio_device *vdev;
+ unsigned int index;
+ unsigned int num_free;
void *priv;
};
-#define EXPORT_SYMBOL_GPL(__EXPORT_SYMBOL_GPL_name) \
- void __EXPORT_SYMBOL_GPL##__EXPORT_SYMBOL_GPL_name() { \
-}
#define MODULE_LICENSE(__MODULE_LICENSE_value) \
const char *__MODULE_LICENSE_name = __MODULE_LICENSE_value
-#define CONFIG_SMP
-
-#if defined(__i386__) || defined(__x86_64__)
-#define barrier() asm volatile("" ::: "memory")
-#define mb() __sync_synchronize()
-
-#define smp_mb() mb()
-# define smp_rmb() barrier()
-# define smp_wmb() barrier()
-/* Weak barriers should be used. If not - it's a bug */
-# define rmb() abort()
-# define wmb() abort()
-#else
-#error Please fill in barrier macros
-#endif
-
/* Interfaces exported by virtio_ring. */
-int virtqueue_add_buf(struct virtqueue *vq,
- struct scatterlist sg[],
- unsigned int out_num,
- unsigned int in_num,
+int virtqueue_add_sgs(struct virtqueue *vq,
+ struct scatterlist *sgs[],
+ unsigned int out_sgs,
+ unsigned int in_sgs,
void *data,
gfp_t gfp);
+int virtqueue_add_outbuf(struct virtqueue *vq,
+ struct scatterlist sg[], unsigned int num,
+ void *data,
+ gfp_t gfp);
+
+int virtqueue_add_inbuf(struct virtqueue *vq,
+ struct scatterlist sg[], unsigned int num,
+ void *data,
+ gfp_t gfp);
+
void virtqueue_kick(struct virtqueue *vq);
void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len);
@@ -206,7 +76,8 @@ bool virtqueue_enable_cb(struct virtqueue *vq);
bool virtqueue_enable_cb_delayed(struct virtqueue *vq);
void *virtqueue_detach_unused_buf(struct virtqueue *vq);
-struct virtqueue *vring_new_virtqueue(unsigned int num,
+struct virtqueue *vring_new_virtqueue(unsigned int index,
+ unsigned int num,
unsigned int vring_align,
struct virtio_device *vdev,
bool weak_barriers,
diff --git a/tools/virtio/linux/virtio_config.h b/tools/virtio/linux/virtio_config.h
new file mode 100644
index 0000000..5049967
--- /dev/null
+++ b/tools/virtio/linux/virtio_config.h
@@ -0,0 +1,6 @@
+#define VIRTIO_TRANSPORT_F_START 28
+#define VIRTIO_TRANSPORT_F_END 32
+
+#define virtio_has_feature(dev, feature) \
+ test_bit((feature), (dev)->features)
+
diff --git a/tools/virtio/linux/virtio_ring.h b/tools/virtio/linux/virtio_ring.h
new file mode 100644
index 0000000..8949c4e
--- /dev/null
+++ b/tools/virtio/linux/virtio_ring.h
@@ -0,0 +1 @@
+#include "../../../include/linux/virtio_ring.h"
diff --git a/tools/virtio/linux/vringh.h b/tools/virtio/linux/vringh.h
new file mode 100644
index 0000000..9348957
--- /dev/null
+++ b/tools/virtio/linux/vringh.h
@@ -0,0 +1 @@
+#include "../../../include/linux/vringh.h"
diff --git a/tools/virtio/uapi/linux/uio.h b/tools/virtio/uapi/linux/uio.h
new file mode 100644
index 0000000..7230e90
--- /dev/null
+++ b/tools/virtio/uapi/linux/uio.h
@@ -0,0 +1 @@
+#include <sys/uio.h>
diff --git a/tools/virtio/uapi/linux/virtio_config.h b/tools/virtio/uapi/linux/virtio_config.h
new file mode 100644
index 0000000..4c86675
--- /dev/null
+++ b/tools/virtio/uapi/linux/virtio_config.h
@@ -0,0 +1 @@
+#include "../../../../include/uapi/linux/virtio_config.h"
diff --git a/tools/virtio/uapi/linux/virtio_ring.h b/tools/virtio/uapi/linux/virtio_ring.h
new file mode 100644
index 0000000..4d99c78
--- /dev/null
+++ b/tools/virtio/uapi/linux/virtio_ring.h
@@ -0,0 +1,4 @@
+#ifndef VIRTIO_RING_H
+#define VIRTIO_RING_H
+#include "../../../../include/uapi/linux/virtio_ring.h"
+#endif /* VIRTIO_RING_H */
diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c
index fcc9aa2..da7a195 100644
--- a/tools/virtio/virtio_test.c
+++ b/tools/virtio/virtio_test.c
@@ -10,11 +10,15 @@
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>
+#include <stdbool.h>
#include <linux/vhost.h>
#include <linux/virtio.h>
#include <linux/virtio_ring.h>
#include "../../drivers/vhost/test.h"
+/* Unused */
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+
struct vq_info {
int kick;
int call;
@@ -92,7 +96,8 @@ static void vq_info_add(struct vdev_info *dev, int num)
assert(r >= 0);
memset(info->ring, 0, vring_size(num, 4096));
vring_init(&info->vring, num, info->ring, 4096);
- info->vq = vring_new_virtqueue(info->vring.num, 4096, &dev->vdev,
+ info->vq = vring_new_virtqueue(info->idx,
+ info->vring.num, 4096, &dev->vdev,
true, info->ring,
vq_notify, vq_callback, "test");
assert(info->vq);
@@ -161,9 +166,9 @@ static void run_test(struct vdev_info *dev, struct vq_info *vq,
do {
if (started < bufs) {
sg_init_one(&sl, dev->buf, dev->buf_size);
- r = virtqueue_add_buf(vq->vq, &sl, 1, 0,
- dev->buf + started,
- GFP_ATOMIC);
+ r = virtqueue_add_outbuf(vq->vq, &sl, 1,
+ dev->buf + started,
+ GFP_ATOMIC);
if (likely(r == 0)) {
++started;
virtqueue_kick(vq->vq);
diff --git a/tools/virtio/vringh_test.c b/tools/virtio/vringh_test.c
new file mode 100644
index 0000000..d053ea4
--- /dev/null
+++ b/tools/virtio/vringh_test.c
@@ -0,0 +1,741 @@
+/* Simple test of virtio code, entirely in userpsace. */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <err.h>
+#include <linux/kernel.h>
+#include <linux/err.h>
+#include <linux/virtio.h>
+#include <linux/vringh.h>
+#include <linux/virtio_ring.h>
+#include <linux/uaccess.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <fcntl.h>
+
+#define USER_MEM (1024*1024)
+void *__user_addr_min, *__user_addr_max;
+void *__kmalloc_fake, *__kfree_ignore_start, *__kfree_ignore_end;
+static u64 user_addr_offset;
+
+#define RINGSIZE 256
+#define ALIGN 4096
+
+static void never_notify_host(struct virtqueue *vq)
+{
+ abort();
+}
+
+static void never_callback_guest(struct virtqueue *vq)
+{
+ abort();
+}
+
+static bool getrange_iov(struct vringh *vrh, u64 addr, struct vringh_range *r)
+{
+ if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset)
+ return false;
+ if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset)
+ return false;
+
+ r->start = (u64)(unsigned long)__user_addr_min - user_addr_offset;
+ r->end_incl = (u64)(unsigned long)__user_addr_max - 1 - user_addr_offset;
+ r->offset = user_addr_offset;
+ return true;
+}
+
+/* We return single byte ranges. */
+static bool getrange_slow(struct vringh *vrh, u64 addr, struct vringh_range *r)
+{
+ if (addr < (u64)(unsigned long)__user_addr_min - user_addr_offset)
+ return false;
+ if (addr >= (u64)(unsigned long)__user_addr_max - user_addr_offset)
+ return false;
+
+ r->start = addr;
+ r->end_incl = r->start;
+ r->offset = user_addr_offset;
+ return true;
+}
+
+struct guest_virtio_device {
+ struct virtio_device vdev;
+ int to_host_fd;
+ unsigned long notifies;
+};
+
+static void parallel_notify_host(struct virtqueue *vq)
+{
+ struct guest_virtio_device *gvdev;
+
+ gvdev = container_of(vq->vdev, struct guest_virtio_device, vdev);
+ write(gvdev->to_host_fd, "", 1);
+ gvdev->notifies++;
+}
+
+static void no_notify_host(struct virtqueue *vq)
+{
+}
+
+#define NUM_XFERS (10000000)
+
+/* We aim for two "distant" cpus. */
+static void find_cpus(unsigned int *first, unsigned int *last)
+{
+ unsigned int i;
+
+ *first = -1U;
+ *last = 0;
+ for (i = 0; i < 4096; i++) {
+ cpu_set_t set;
+ CPU_ZERO(&set);
+ CPU_SET(i, &set);
+ if (sched_setaffinity(getpid(), sizeof(set), &set) == 0) {
+ if (i < *first)
+ *first = i;
+ if (i > *last)
+ *last = i;
+ }
+ }
+}
+
+/* Opencoded version for fast mode */
+static inline int vringh_get_head(struct vringh *vrh, u16 *head)
+{
+ u16 avail_idx, i;
+ int err;
+
+ err = get_user(avail_idx, &vrh->vring.avail->idx);
+ if (err)
+ return err;
+
+ if (vrh->last_avail_idx == avail_idx)
+ return 0;
+
+ /* Only get avail ring entries after they have been exposed by guest. */
+ virtio_rmb(vrh->weak_barriers);
+
+ i = vrh->last_avail_idx & (vrh->vring.num - 1);
+
+ err = get_user(*head, &vrh->vring.avail->ring[i]);
+ if (err)
+ return err;
+
+ vrh->last_avail_idx++;
+ return 1;
+}
+
+static int parallel_test(unsigned long features,
+ bool (*getrange)(struct vringh *vrh,
+ u64 addr, struct vringh_range *r),
+ bool fast_vringh)
+{
+ void *host_map, *guest_map;
+ int fd, mapsize, to_guest[2], to_host[2];
+ unsigned long xfers = 0, notifies = 0, receives = 0;
+ unsigned int first_cpu, last_cpu;
+ cpu_set_t cpu_set;
+ char buf[128];
+
+ /* Create real file to mmap. */
+ fd = open("/tmp/vringh_test-file", O_RDWR|O_CREAT|O_TRUNC, 0600);
+ if (fd < 0)
+ err(1, "Opening /tmp/vringh_test-file");
+
+ /* Extra room at the end for some data, and indirects */
+ mapsize = vring_size(RINGSIZE, ALIGN)
+ + RINGSIZE * 2 * sizeof(int)
+ + RINGSIZE * 6 * sizeof(struct vring_desc);
+ mapsize = (mapsize + getpagesize() - 1) & ~(getpagesize() - 1);
+ ftruncate(fd, mapsize);
+
+ /* Parent and child use separate addresses, to check our mapping logic! */
+ host_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+ guest_map = mmap(NULL, mapsize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+
+ pipe(to_guest);
+ pipe(to_host);
+
+ CPU_ZERO(&cpu_set);
+ find_cpus(&first_cpu, &last_cpu);
+ printf("Using CPUS %u and %u\n", first_cpu, last_cpu);
+ fflush(stdout);
+
+ if (fork() != 0) {
+ struct vringh vrh;
+ int status, err, rlen = 0;
+ char rbuf[5];
+
+ /* We are the host: never access guest addresses! */
+ munmap(guest_map, mapsize);
+
+ __user_addr_min = host_map;
+ __user_addr_max = __user_addr_min + mapsize;
+ user_addr_offset = host_map - guest_map;
+ assert(user_addr_offset);
+
+ close(to_guest[0]);
+ close(to_host[1]);
+
+ vring_init(&vrh.vring, RINGSIZE, host_map, ALIGN);
+ vringh_init_user(&vrh, features, RINGSIZE, true,
+ vrh.vring.desc, vrh.vring.avail, vrh.vring.used);
+ CPU_SET(first_cpu, &cpu_set);
+ if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set))
+ errx(1, "Could not set affinity to cpu %u", first_cpu);
+
+ while (xfers < NUM_XFERS) {
+ struct iovec host_riov[2], host_wiov[2];
+ struct vringh_iov riov, wiov;
+ u16 head, written;
+
+ if (fast_vringh) {
+ for (;;) {
+ err = vringh_get_head(&vrh, &head);
+ if (err != 0)
+ break;
+ err = vringh_need_notify_user(&vrh);
+ if (err < 0)
+ errx(1, "vringh_need_notify_user: %i",
+ err);
+ if (err) {
+ write(to_guest[1], "", 1);
+ notifies++;
+ }
+ }
+ if (err != 1)
+ errx(1, "vringh_get_head");
+ written = 0;
+ goto complete;
+ } else {
+ vringh_iov_init(&riov,
+ host_riov,
+ ARRAY_SIZE(host_riov));
+ vringh_iov_init(&wiov,
+ host_wiov,
+ ARRAY_SIZE(host_wiov));
+
+ err = vringh_getdesc_user(&vrh, &riov, &wiov,
+ getrange, &head);
+ }
+ if (err == 0) {
+ err = vringh_need_notify_user(&vrh);
+ if (err < 0)
+ errx(1, "vringh_need_notify_user: %i",
+ err);
+ if (err) {
+ write(to_guest[1], "", 1);
+ notifies++;
+ }
+
+ if (!vringh_notify_enable_user(&vrh))
+ continue;
+
+ /* Swallow all notifies at once. */
+ if (read(to_host[0], buf, sizeof(buf)) < 1)
+ break;
+
+ vringh_notify_disable_user(&vrh);
+ receives++;
+ continue;
+ }
+ if (err != 1)
+ errx(1, "vringh_getdesc_user: %i", err);
+
+ /* We simply copy bytes. */
+ if (riov.used) {
+ rlen = vringh_iov_pull_user(&riov, rbuf,
+ sizeof(rbuf));
+ if (rlen != 4)
+ errx(1, "vringh_iov_pull_user: %i",
+ rlen);
+ assert(riov.i == riov.used);
+ written = 0;
+ } else {
+ err = vringh_iov_push_user(&wiov, rbuf, rlen);
+ if (err != rlen)
+ errx(1, "vringh_iov_push_user: %i",
+ err);
+ assert(wiov.i == wiov.used);
+ written = err;
+ }
+ complete:
+ xfers++;
+
+ err = vringh_complete_user(&vrh, head, written);
+ if (err != 0)
+ errx(1, "vringh_complete_user: %i", err);
+ }
+
+ err = vringh_need_notify_user(&vrh);
+ if (err < 0)
+ errx(1, "vringh_need_notify_user: %i", err);
+ if (err) {
+ write(to_guest[1], "", 1);
+ notifies++;
+ }
+ wait(&status);
+ if (!WIFEXITED(status))
+ errx(1, "Child died with signal %i?", WTERMSIG(status));
+ if (WEXITSTATUS(status) != 0)
+ errx(1, "Child exited %i?", WEXITSTATUS(status));
+ printf("Host: notified %lu, pinged %lu\n", notifies, receives);
+ return 0;
+ } else {
+ struct guest_virtio_device gvdev;
+ struct virtqueue *vq;
+ unsigned int *data;
+ struct vring_desc *indirects;
+ unsigned int finished = 0;
+
+ /* We pass sg[]s pointing into here, but we need RINGSIZE+1 */
+ data = guest_map + vring_size(RINGSIZE, ALIGN);
+ indirects = (void *)data + (RINGSIZE + 1) * 2 * sizeof(int);
+
+ /* We are the guest. */
+ munmap(host_map, mapsize);
+
+ close(to_guest[1]);
+ close(to_host[0]);
+
+ gvdev.vdev.features[0] = features;
+ gvdev.to_host_fd = to_host[1];
+ gvdev.notifies = 0;
+
+ CPU_SET(first_cpu, &cpu_set);
+ if (sched_setaffinity(getpid(), sizeof(cpu_set), &cpu_set))
+ err(1, "Could not set affinity to cpu %u", first_cpu);
+
+ vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &gvdev.vdev, true,
+ guest_map, fast_vringh ? no_notify_host
+ : parallel_notify_host,
+ never_callback_guest, "guest vq");
+
+ /* Don't kfree indirects. */
+ __kfree_ignore_start = indirects;
+ __kfree_ignore_end = indirects + RINGSIZE * 6;
+
+ while (xfers < NUM_XFERS) {
+ struct scatterlist sg[4];
+ unsigned int num_sg, len;
+ int *dbuf, err;
+ bool output = !(xfers % 2);
+
+ /* Consume bufs. */
+ while ((dbuf = virtqueue_get_buf(vq, &len)) != NULL) {
+ if (len == 4)
+ assert(*dbuf == finished - 1);
+ else if (!fast_vringh)
+ assert(*dbuf == finished);
+ finished++;
+ }
+
+ /* Produce a buffer. */
+ dbuf = data + (xfers % (RINGSIZE + 1));
+
+ if (output)
+ *dbuf = xfers;
+ else
+ *dbuf = -1;
+
+ switch ((xfers / sizeof(*dbuf)) % 4) {
+ case 0:
+ /* Nasty three-element sg list. */
+ sg_init_table(sg, num_sg = 3);
+ sg_set_buf(&sg[0], (void *)dbuf, 1);
+ sg_set_buf(&sg[1], (void *)dbuf + 1, 2);
+ sg_set_buf(&sg[2], (void *)dbuf + 3, 1);
+ break;
+ case 1:
+ sg_init_table(sg, num_sg = 2);
+ sg_set_buf(&sg[0], (void *)dbuf, 1);
+ sg_set_buf(&sg[1], (void *)dbuf + 1, 3);
+ break;
+ case 2:
+ sg_init_table(sg, num_sg = 1);
+ sg_set_buf(&sg[0], (void *)dbuf, 4);
+ break;
+ case 3:
+ sg_init_table(sg, num_sg = 4);
+ sg_set_buf(&sg[0], (void *)dbuf, 1);
+ sg_set_buf(&sg[1], (void *)dbuf + 1, 1);
+ sg_set_buf(&sg[2], (void *)dbuf + 2, 1);
+ sg_set_buf(&sg[3], (void *)dbuf + 3, 1);
+ break;
+ }
+
+ /* May allocate an indirect, so force it to allocate
+ * user addr */
+ __kmalloc_fake = indirects + (xfers % RINGSIZE) * 4;
+ if (output)
+ err = virtqueue_add_outbuf(vq, sg, num_sg, dbuf,
+ GFP_KERNEL);
+ else
+ err = virtqueue_add_inbuf(vq, sg, num_sg,
+ dbuf, GFP_KERNEL);
+
+ if (err == -ENOSPC) {
+ if (!virtqueue_enable_cb_delayed(vq))
+ continue;
+ /* Swallow all notifies at once. */
+ if (read(to_guest[0], buf, sizeof(buf)) < 1)
+ break;
+
+ receives++;
+ virtqueue_disable_cb(vq);
+ continue;
+ }
+
+ if (err)
+ errx(1, "virtqueue_add_in/outbuf: %i", err);
+
+ xfers++;
+ virtqueue_kick(vq);
+ }
+
+ /* Any extra? */
+ while (finished != xfers) {
+ int *dbuf;
+ unsigned int len;
+
+ /* Consume bufs. */
+ dbuf = virtqueue_get_buf(vq, &len);
+ if (dbuf) {
+ if (len == 4)
+ assert(*dbuf == finished - 1);
+ else
+ assert(len == 0);
+ finished++;
+ continue;
+ }
+
+ if (!virtqueue_enable_cb_delayed(vq))
+ continue;
+ if (read(to_guest[0], buf, sizeof(buf)) < 1)
+ break;
+
+ receives++;
+ virtqueue_disable_cb(vq);
+ }
+
+ printf("Guest: notified %lu, pinged %lu\n",
+ gvdev.notifies, receives);
+ vring_del_virtqueue(vq);
+ return 0;
+ }
+}
+
+int main(int argc, char *argv[])
+{
+ struct virtio_device vdev;
+ struct virtqueue *vq;
+ struct vringh vrh;
+ struct scatterlist guest_sg[RINGSIZE], *sgs[2];
+ struct iovec host_riov[2], host_wiov[2];
+ struct vringh_iov riov, wiov;
+ struct vring_used_elem used[RINGSIZE];
+ char buf[28];
+ u16 head;
+ int err;
+ unsigned i;
+ void *ret;
+ bool (*getrange)(struct vringh *vrh, u64 addr, struct vringh_range *r);
+ bool fast_vringh = false, parallel = false;
+
+ getrange = getrange_iov;
+ vdev.features[0] = 0;
+
+ while (argv[1]) {
+ if (strcmp(argv[1], "--indirect") == 0)
+ vdev.features[0] |= (1 << VIRTIO_RING_F_INDIRECT_DESC);
+ else if (strcmp(argv[1], "--eventidx") == 0)
+ vdev.features[0] |= (1 << VIRTIO_RING_F_EVENT_IDX);
+ else if (strcmp(argv[1], "--slow-range") == 0)
+ getrange = getrange_slow;
+ else if (strcmp(argv[1], "--fast-vringh") == 0)
+ fast_vringh = true;
+ else if (strcmp(argv[1], "--parallel") == 0)
+ parallel = true;
+ else
+ errx(1, "Unknown arg %s", argv[1]);
+ argv++;
+ }
+
+ if (parallel)
+ return parallel_test(vdev.features[0], getrange, fast_vringh);
+
+ if (posix_memalign(&__user_addr_min, PAGE_SIZE, USER_MEM) != 0)
+ abort();
+ __user_addr_max = __user_addr_min + USER_MEM;
+ memset(__user_addr_min, 0, vring_size(RINGSIZE, ALIGN));
+
+ /* Set up guest side. */
+ vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true,
+ __user_addr_min,
+ never_notify_host, never_callback_guest,
+ "guest vq");
+
+ /* Set up host side. */
+ vring_init(&vrh.vring, RINGSIZE, __user_addr_min, ALIGN);
+ vringh_init_user(&vrh, vdev.features[0], RINGSIZE, true,
+ vrh.vring.desc, vrh.vring.avail, vrh.vring.used);
+
+ /* No descriptor to get yet... */
+ err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
+ if (err != 0)
+ errx(1, "vringh_getdesc_user: %i", err);
+
+ /* Guest puts in a descriptor. */
+ memcpy(__user_addr_max - 1, "a", 1);
+ sg_init_table(guest_sg, 1);
+ sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1);
+ sg_init_table(guest_sg+1, 1);
+ sg_set_buf(&guest_sg[1], __user_addr_max - 3, 2);
+ sgs[0] = &guest_sg[0];
+ sgs[1] = &guest_sg[1];
+
+ /* May allocate an indirect, so force it to allocate user addr */
+ __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN);
+ err = virtqueue_add_sgs(vq, sgs, 1, 1, &err, GFP_KERNEL);
+ if (err)
+ errx(1, "virtqueue_add_sgs: %i", err);
+ __kmalloc_fake = NULL;
+
+ /* Host retreives it. */
+ vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
+ vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
+
+ err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
+ if (err != 1)
+ errx(1, "vringh_getdesc_user: %i", err);
+
+ assert(riov.used == 1);
+ assert(riov.iov[0].iov_base == __user_addr_max - 1);
+ assert(riov.iov[0].iov_len == 1);
+ if (getrange != getrange_slow) {
+ assert(wiov.used == 1);
+ assert(wiov.iov[0].iov_base == __user_addr_max - 3);
+ assert(wiov.iov[0].iov_len == 2);
+ } else {
+ assert(wiov.used == 2);
+ assert(wiov.iov[0].iov_base == __user_addr_max - 3);
+ assert(wiov.iov[0].iov_len == 1);
+ assert(wiov.iov[1].iov_base == __user_addr_max - 2);
+ assert(wiov.iov[1].iov_len == 1);
+ }
+
+ err = vringh_iov_pull_user(&riov, buf, 5);
+ if (err != 1)
+ errx(1, "vringh_iov_pull_user: %i", err);
+ assert(buf[0] == 'a');
+ assert(riov.i == 1);
+ assert(vringh_iov_pull_user(&riov, buf, 5) == 0);
+
+ memcpy(buf, "bcdef", 5);
+ err = vringh_iov_push_user(&wiov, buf, 5);
+ if (err != 2)
+ errx(1, "vringh_iov_push_user: %i", err);
+ assert(memcmp(__user_addr_max - 3, "bc", 2) == 0);
+ assert(wiov.i == wiov.used);
+ assert(vringh_iov_push_user(&wiov, buf, 5) == 0);
+
+ /* Host is done. */
+ err = vringh_complete_user(&vrh, head, err);
+ if (err != 0)
+ errx(1, "vringh_complete_user: %i", err);
+
+ /* Guest should see used token now. */
+ __kfree_ignore_start = __user_addr_min + vring_size(RINGSIZE, ALIGN);
+ __kfree_ignore_end = __kfree_ignore_start + 1;
+ ret = virtqueue_get_buf(vq, &i);
+ if (ret != &err)
+ errx(1, "virtqueue_get_buf: %p", ret);
+ assert(i == 2);
+
+ /* Guest puts in a huge descriptor. */
+ sg_init_table(guest_sg, RINGSIZE);
+ for (i = 0; i < RINGSIZE; i++) {
+ sg_set_buf(&guest_sg[i],
+ __user_addr_max - USER_MEM/4, USER_MEM/4);
+ }
+
+ /* Fill contents with recognisable garbage. */
+ for (i = 0; i < USER_MEM/4; i++)
+ ((char *)__user_addr_max - USER_MEM/4)[i] = i;
+
+ /* This will allocate an indirect, so force it to allocate user addr */
+ __kmalloc_fake = __user_addr_min + vring_size(RINGSIZE, ALIGN);
+ err = virtqueue_add_outbuf(vq, guest_sg, RINGSIZE, &err, GFP_KERNEL);
+ if (err)
+ errx(1, "virtqueue_add_outbuf (large): %i", err);
+ __kmalloc_fake = NULL;
+
+ /* Host picks it up (allocates new iov). */
+ vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
+ vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
+
+ err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
+ if (err != 1)
+ errx(1, "vringh_getdesc_user: %i", err);
+
+ assert(riov.max_num & VRINGH_IOV_ALLOCATED);
+ assert(riov.iov != host_riov);
+ if (getrange != getrange_slow)
+ assert(riov.used == RINGSIZE);
+ else
+ assert(riov.used == RINGSIZE * USER_MEM/4);
+
+ assert(!(wiov.max_num & VRINGH_IOV_ALLOCATED));
+ assert(wiov.used == 0);
+
+ /* Pull data back out (in odd chunks), should be as expected. */
+ for (i = 0; i < RINGSIZE * USER_MEM/4; i += 3) {
+ err = vringh_iov_pull_user(&riov, buf, 3);
+ if (err != 3 && i + err != RINGSIZE * USER_MEM/4)
+ errx(1, "vringh_iov_pull_user large: %i", err);
+ assert(buf[0] == (char)i);
+ assert(err < 2 || buf[1] == (char)(i + 1));
+ assert(err < 3 || buf[2] == (char)(i + 2));
+ }
+ assert(riov.i == riov.used);
+ vringh_iov_cleanup(&riov);
+ vringh_iov_cleanup(&wiov);
+
+ /* Complete using multi interface, just because we can. */
+ used[0].id = head;
+ used[0].len = 0;
+ err = vringh_complete_multi_user(&vrh, used, 1);
+ if (err)
+ errx(1, "vringh_complete_multi_user(1): %i", err);
+
+ /* Free up those descriptors. */
+ ret = virtqueue_get_buf(vq, &i);
+ if (ret != &err)
+ errx(1, "virtqueue_get_buf: %p", ret);
+
+ /* Add lots of descriptors. */
+ sg_init_table(guest_sg, 1);
+ sg_set_buf(&guest_sg[0], __user_addr_max - 1, 1);
+ for (i = 0; i < RINGSIZE; i++) {
+ err = virtqueue_add_outbuf(vq, guest_sg, 1, &err, GFP_KERNEL);
+ if (err)
+ errx(1, "virtqueue_add_outbuf (multiple): %i", err);
+ }
+
+ /* Now get many, and consume them all at once. */
+ vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
+ vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
+
+ for (i = 0; i < RINGSIZE; i++) {
+ err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
+ if (err != 1)
+ errx(1, "vringh_getdesc_user: %i", err);
+ used[i].id = head;
+ used[i].len = 0;
+ }
+ /* Make sure it wraps around ring, to test! */
+ assert(vrh.vring.used->idx % RINGSIZE != 0);
+ err = vringh_complete_multi_user(&vrh, used, RINGSIZE);
+ if (err)
+ errx(1, "vringh_complete_multi_user: %i", err);
+
+ /* Free those buffers. */
+ for (i = 0; i < RINGSIZE; i++) {
+ unsigned len;
+ assert(virtqueue_get_buf(vq, &len) != NULL);
+ }
+
+ /* Test weird (but legal!) indirect. */
+ if (vdev.features[0] & (1 << VIRTIO_RING_F_INDIRECT_DESC)) {
+ char *data = __user_addr_max - USER_MEM/4;
+ struct vring_desc *d = __user_addr_max - USER_MEM/2;
+ struct vring vring;
+
+ /* Force creation of direct, which we modify. */
+ vdev.features[0] &= ~(1 << VIRTIO_RING_F_INDIRECT_DESC);
+ vq = vring_new_virtqueue(0, RINGSIZE, ALIGN, &vdev, true,
+ __user_addr_min,
+ never_notify_host,
+ never_callback_guest,
+ "guest vq");
+
+ sg_init_table(guest_sg, 4);
+ sg_set_buf(&guest_sg[0], d, sizeof(*d)*2);
+ sg_set_buf(&guest_sg[1], d + 2, sizeof(*d)*1);
+ sg_set_buf(&guest_sg[2], data + 6, 4);
+ sg_set_buf(&guest_sg[3], d + 3, sizeof(*d)*3);
+
+ err = virtqueue_add_outbuf(vq, guest_sg, 4, &err, GFP_KERNEL);
+ if (err)
+ errx(1, "virtqueue_add_outbuf (indirect): %i", err);
+
+ vring_init(&vring, RINGSIZE, __user_addr_min, ALIGN);
+
+ /* They're used in order, but double-check... */
+ assert(vring.desc[0].addr == (unsigned long)d);
+ assert(vring.desc[1].addr == (unsigned long)(d+2));
+ assert(vring.desc[2].addr == (unsigned long)data + 6);
+ assert(vring.desc[3].addr == (unsigned long)(d+3));
+ vring.desc[0].flags |= VRING_DESC_F_INDIRECT;
+ vring.desc[1].flags |= VRING_DESC_F_INDIRECT;
+ vring.desc[3].flags |= VRING_DESC_F_INDIRECT;
+
+ /* First indirect */
+ d[0].addr = (unsigned long)data;
+ d[0].len = 1;
+ d[0].flags = VRING_DESC_F_NEXT;
+ d[0].next = 1;
+ d[1].addr = (unsigned long)data + 1;
+ d[1].len = 2;
+ d[1].flags = 0;
+
+ /* Second indirect */
+ d[2].addr = (unsigned long)data + 3;
+ d[2].len = 3;
+ d[2].flags = 0;
+
+ /* Third indirect */
+ d[3].addr = (unsigned long)data + 10;
+ d[3].len = 5;
+ d[3].flags = VRING_DESC_F_NEXT;
+ d[3].next = 1;
+ d[4].addr = (unsigned long)data + 15;
+ d[4].len = 6;
+ d[4].flags = VRING_DESC_F_NEXT;
+ d[4].next = 2;
+ d[5].addr = (unsigned long)data + 21;
+ d[5].len = 7;
+ d[5].flags = 0;
+
+ /* Host picks it up (allocates new iov). */
+ vringh_iov_init(&riov, host_riov, ARRAY_SIZE(host_riov));
+ vringh_iov_init(&wiov, host_wiov, ARRAY_SIZE(host_wiov));
+
+ err = vringh_getdesc_user(&vrh, &riov, &wiov, getrange, &head);
+ if (err != 1)
+ errx(1, "vringh_getdesc_user: %i", err);
+
+ if (head != 0)
+ errx(1, "vringh_getdesc_user: head %i not 0", head);
+
+ assert(riov.max_num & VRINGH_IOV_ALLOCATED);
+ if (getrange != getrange_slow)
+ assert(riov.used == 7);
+ else
+ assert(riov.used == 28);
+ err = vringh_iov_pull_user(&riov, buf, 29);
+ assert(err == 28);
+
+ /* Data should be linear. */
+ for (i = 0; i < err; i++)
+ assert(buf[i] == i);
+ vringh_iov_cleanup(&riov);
+ }
+
+ /* Don't leak memory... */
+ vring_del_virtqueue(vq);
+ free(__user_addr_min);
+
+ return 0;
+}