diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-12 00:29:01 (GMT) |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-10-12 00:29:01 (GMT) |
commit | 81ae31d78239318610d7c2acb3e2610d622a5aa4 (patch) | |
tree | 1e31b300f1574fceaff065a9bd92460b7c466f7c /include | |
parent | ef4a48c513211d842c55e84f7a1c31884b91dcf7 (diff) | |
parent | 95afae481414cbdb0567bf82d5e5077c3ac9da20 (diff) | |
download | linux-81ae31d78239318610d7c2acb3e2610d622a5aa4.tar.xz |
Merge tag 'stable/for-linus-3.18-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
Pull Xen updates from David Vrabel:
"Features and fixes:
- Add pvscsi frontend and backend drivers.
- Remove _PAGE_IOMAP PTE flag, freeing it for alternate uses.
- Try and keep memory contiguous during PV memory setup (reduces
SWIOTLB usage).
- Allow front/back drivers to use threaded irqs.
- Support large initrds in PV guests.
- Fix PVH guests in preparation for Xen 4.5"
* tag 'stable/for-linus-3.18-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (22 commits)
xen: remove DEFINE_XENBUS_DRIVER() macro
xen/xenbus: Remove BUG_ON() when error string trucated
xen/xenbus: Correct the comments for xenbus_grant_ring()
x86/xen: Set EFER.NX and EFER.SCE in PVH guests
xen: eliminate scalability issues from initrd handling
xen: sync some headers with xen tree
xen: make pvscsi frontend dependant on xenbus frontend
arm{,64}/xen: Remove "EXPERIMENTAL" in the description of the Xen options
xen-scsifront: don't deadlock if the ring becomes full
x86: remove the Xen-specific _PAGE_IOMAP PTE flag
x86/xen: do not use _PAGE_IOMAP PTE flag for I/O mappings
x86: skip check for spurious faults for non-present faults
xen/efi: Directly include needed headers
xen-scsiback: clean up a type issue in scsiback_make_tpg()
xen-scsifront: use GFP_ATOMIC under spin_lock
MAINTAINERS: Add xen pvscsi maintainer
xen-scsiback: Add Xen PV SCSI backend driver
xen-scsifront: Add Xen PV SCSI frontend driver
xen: Add Xen pvSCSI protocol description
xen/events: support threaded irqs for interdomain event channels
...
Diffstat (limited to 'include')
-rw-r--r-- | include/xen/events.h | 2 | ||||
-rw-r--r-- | include/xen/interface/elfnote.h | 48 | ||||
-rw-r--r-- | include/xen/interface/io/vscsiif.h | 229 | ||||
-rw-r--r-- | include/xen/interface/xen.h | 272 | ||||
-rw-r--r-- | include/xen/xenbus.h | 21 |
5 files changed, 537 insertions, 35 deletions
diff --git a/include/xen/events.h b/include/xen/events.h index 8bee7a7..5321cd9 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -28,6 +28,8 @@ int bind_ipi_to_irqhandler(enum ipi_vector ipi, unsigned long irqflags, const char *devname, void *dev_id); +int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, + unsigned int remote_port); int bind_interdomain_evtchn_to_irqhandler(unsigned int remote_domain, unsigned int remote_port, irq_handler_t handler, diff --git a/include/xen/interface/elfnote.h b/include/xen/interface/elfnote.h index 6f4eae3..f90b034 100644 --- a/include/xen/interface/elfnote.h +++ b/include/xen/interface/elfnote.h @@ -3,6 +3,24 @@ * * Definitions used for the Xen ELF notes. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2006, Ian Campbell, XenSource Ltd. */ @@ -18,12 +36,13 @@ * * LEGACY indicated the fields in the legacy __xen_guest string which * this a note type replaces. + * + * String values (for non-legacy) are NULL terminated ASCII, also known + * as ASCIZ type. */ /* * NAME=VALUE pair (string). - * - * LEGACY: FEATURES and PAE */ #define XEN_ELFNOTE_INFO 0 @@ -137,10 +156,30 @@ /* * Whether or not the guest supports cooperative suspend cancellation. + * This is a numeric value. + * + * Default is 0 */ #define XEN_ELFNOTE_SUSPEND_CANCEL 14 /* + * The (non-default) location the initial phys-to-machine map should be + * placed at by the hypervisor (Dom0) or the tools (DomU). + * The kernel must be prepared for this mapping to be established using + * large pages, despite such otherwise not being available to guests. + * The kernel must also be able to handle the page table pages used for + * this mapping not being accessible through the initial mapping. + * (Only x86-64 supports this at present.) + */ +#define XEN_ELFNOTE_INIT_P2M 15 + +/* + * Whether or not the guest can deal with being passed an initrd not + * mapped through its initial page tables. + */ +#define XEN_ELFNOTE_MOD_START_PFN 16 + +/* * The features supported by this kernel (numeric). * * Other than XEN_ELFNOTE_FEATURES on pre-4.2 Xen, this note allows a @@ -153,6 +192,11 @@ */ #define XEN_ELFNOTE_SUPPORTED_FEATURES 17 +/* + * The number of the highest elfnote defined. + */ +#define XEN_ELFNOTE_MAX XEN_ELFNOTE_SUPPORTED_FEATURES + #endif /* __XEN_PUBLIC_ELFNOTE_H__ */ /* diff --git a/include/xen/interface/io/vscsiif.h b/include/xen/interface/io/vscsiif.h new file mode 100644 index 0000000..d07d7ac --- /dev/null +++ b/include/xen/interface/io/vscsiif.h @@ -0,0 +1,229 @@ +/****************************************************************************** + * vscsiif.h + * + * Based on the blkif.h code. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright(c) FUJITSU Limited 2008. + */ + +#ifndef __XEN__PUBLIC_IO_SCSI_H__ +#define __XEN__PUBLIC_IO_SCSI_H__ + +#include "ring.h" +#include "../grant_table.h" + +/* + * Feature and Parameter Negotiation + * ================================= + * The two halves of a Xen pvSCSI driver utilize nodes within the XenStore to + * communicate capabilities and to negotiate operating parameters. This + * section enumerates these nodes which reside in the respective front and + * backend portions of the XenStore, following the XenBus convention. + * + * Any specified default value is in effect if the corresponding XenBus node + * is not present in the XenStore. + * + * XenStore nodes in sections marked "PRIVATE" are solely for use by the + * driver side whose XenBus tree contains them. + * + ***************************************************************************** + * Backend XenBus Nodes + ***************************************************************************** + * + *------------------ Backend Device Identification (PRIVATE) ------------------ + * + * p-devname + * Values: string + * + * A free string used to identify the physical device (e.g. a disk name). + * + * p-dev + * Values: string + * + * A string specifying the backend device: either a 4-tuple "h:c:t:l" + * (host, controller, target, lun, all integers), or a WWN (e.g. + * "naa.60014054ac780582"). + * + * v-dev + * Values: string + * + * A string specifying the frontend device in form of a 4-tuple "h:c:t:l" + * (host, controller, target, lun, all integers). + * + *--------------------------------- Features --------------------------------- + * + * feature-sg-grant + * Values: unsigned [VSCSIIF_SG_TABLESIZE...65535] + * Default Value: 0 + * + * Specifies the maximum number of scatter/gather elements in grant pages + * supported. If not set, the backend supports up to VSCSIIF_SG_TABLESIZE + * SG elements specified directly in the request. + * + ***************************************************************************** + * Frontend XenBus Nodes + ***************************************************************************** + * + *----------------------- Request Transport Parameters ----------------------- + * + * event-channel + * Values: unsigned + * + * The identifier of the Xen event channel used to signal activity + * in the ring buffer. + * + * ring-ref + * Values: unsigned + * + * The Xen grant reference granting permission for the backend to map + * the sole page in a single page sized ring buffer. + * + * protocol + * Values: string (XEN_IO_PROTO_ABI_*) + * Default Value: XEN_IO_PROTO_ABI_NATIVE + * + * The machine ABI rules governing the format of all ring request and + * response structures. + */ + +/* Requests from the frontend to the backend */ + +/* + * Request a SCSI operation specified via a CDB in vscsiif_request.cmnd. + * The target is specified via channel, id and lun. + * + * The operation to be performed is specified via a CDB in cmnd[], the length + * of the CDB is in cmd_len. sc_data_direction specifies the direction of data + * (to the device, from the device, or none at all). + * + * If data is to be transferred to or from the device the buffer(s) in the + * guest memory is/are specified via one or multiple scsiif_request_segment + * descriptors each specifying a memory page via a grant_ref_t, a offset into + * the page and the length of the area in that page. All scsiif_request_segment + * areas concatenated form the resulting data buffer used by the operation. + * If the number of scsiif_request_segment areas is not too large (less than + * or equal VSCSIIF_SG_TABLESIZE) the areas can be specified directly in the + * seg[] array and the number of valid scsiif_request_segment elements is to be + * set in nr_segments. + * + * If "feature-sg-grant" in the Xenstore is set it is possible to specify more + * than VSCSIIF_SG_TABLESIZE scsiif_request_segment elements via indirection. + * The maximum number of allowed scsiif_request_segment elements is the value + * of the "feature-sg-grant" entry from Xenstore. When using indirection the + * seg[] array doesn't contain specifications of the data buffers, but + * references to scsiif_request_segment arrays, which in turn reference the + * data buffers. While nr_segments holds the number of populated seg[] entries + * (plus the set VSCSIIF_SG_GRANT bit), the number of scsiif_request_segment + * elements referencing the target data buffers is calculated from the lengths + * of the seg[] elements (the sum of all valid seg[].length divided by the + * size of one scsiif_request_segment structure). + */ +#define VSCSIIF_ACT_SCSI_CDB 1 + +/* + * Request abort of a running operation for the specified target given by + * channel, id, lun and the operation's rqid in ref_rqid. + */ +#define VSCSIIF_ACT_SCSI_ABORT 2 + +/* + * Request a device reset of the specified target (channel and id). + */ +#define VSCSIIF_ACT_SCSI_RESET 3 + +/* + * Preset scatter/gather elements for a following request. Deprecated. + * Keeping the define only to avoid usage of the value "4" for other actions. + */ +#define VSCSIIF_ACT_SCSI_SG_PRESET 4 + +/* + * Maximum scatter/gather segments per request. + * + * Considering balance between allocating at least 16 "vscsiif_request" + * structures on one page (4096 bytes) and the number of scatter/gather + * elements needed, we decided to use 26 as a magic number. + * + * If "feature-sg-grant" is set, more scatter/gather elements can be specified + * by placing them in one or more (up to VSCSIIF_SG_TABLESIZE) granted pages. + * In this case the vscsiif_request seg elements don't contain references to + * the user data, but to the SG elements referencing the user data. + */ +#define VSCSIIF_SG_TABLESIZE 26 + +/* + * based on Linux kernel 2.6.18, still valid + * Changing these values requires support of multiple protocols via the rings + * as "old clients" will blindly use these values and the resulting structure + * sizes. + */ +#define VSCSIIF_MAX_COMMAND_SIZE 16 +#define VSCSIIF_SENSE_BUFFERSIZE 96 + +struct scsiif_request_segment { + grant_ref_t gref; + uint16_t offset; + uint16_t length; +}; + +#define VSCSIIF_SG_PER_PAGE (PAGE_SIZE / sizeof(struct scsiif_request_segment)) + +/* Size of one request is 252 bytes */ +struct vscsiif_request { + uint16_t rqid; /* private guest value, echoed in resp */ + uint8_t act; /* command between backend and frontend */ + uint8_t cmd_len; /* valid CDB bytes */ + + uint8_t cmnd[VSCSIIF_MAX_COMMAND_SIZE]; /* the CDB */ + uint16_t timeout_per_command; /* deprecated */ + uint16_t channel, id, lun; /* (virtual) device specification */ + uint16_t ref_rqid; /* command abort reference */ + uint8_t sc_data_direction; /* for DMA_TO_DEVICE(1) + DMA_FROM_DEVICE(2) + DMA_NONE(3) requests */ + uint8_t nr_segments; /* Number of pieces of scatter-gather */ +/* + * flag in nr_segments: SG elements via grant page + * + * If VSCSIIF_SG_GRANT is set, the low 7 bits of nr_segments specify the number + * of grant pages containing SG elements. Usable if "feature-sg-grant" set. + */ +#define VSCSIIF_SG_GRANT 0x80 + + struct scsiif_request_segment seg[VSCSIIF_SG_TABLESIZE]; + uint32_t reserved[3]; +}; + +/* Size of one response is 252 bytes */ +struct vscsiif_response { + uint16_t rqid; /* identifies request */ + uint8_t padding; + uint8_t sense_len; + uint8_t sense_buffer[VSCSIIF_SENSE_BUFFERSIZE]; + int32_t rslt; + uint32_t residual_len; /* request bufflen - + return the value from physical device */ + uint32_t reserved[36]; +}; + +DEFINE_RING_TYPES(vscsiif, struct vscsiif_request, struct vscsiif_response); + +#endif /*__XEN__PUBLIC_IO_SCSI_H__*/ diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h index de08213..f68719f 100644 --- a/include/xen/interface/xen.h +++ b/include/xen/interface/xen.h @@ -3,6 +3,24 @@ * * Guest OS interface to Xen. * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * * Copyright (c) 2004, K A Fraser */ @@ -73,13 +91,23 @@ * VIRTUAL INTERRUPTS * * Virtual interrupts that a guest OS may receive from Xen. + * In the side comments, 'V.' denotes a per-VCPU VIRQ while 'G.' denotes a + * global VIRQ. The former can be bound once per VCPU and cannot be re-bound. + * The latter can be allocated only once per guest: they must initially be + * allocated to VCPU0 but can subsequently be re-bound. */ -#define VIRQ_TIMER 0 /* Timebase update, and/or requested timeout. */ -#define VIRQ_DEBUG 1 /* Request guest to dump debug info. */ -#define VIRQ_CONSOLE 2 /* (DOM0) Bytes received on emergency console. */ -#define VIRQ_DOM_EXC 3 /* (DOM0) Exceptional event for some domain. */ -#define VIRQ_DEBUGGER 6 /* (DOM0) A domain has paused for debugging. */ -#define VIRQ_PCPU_STATE 9 /* (DOM0) PCPU state changed */ +#define VIRQ_TIMER 0 /* V. Timebase update, and/or requested timeout. */ +#define VIRQ_DEBUG 1 /* V. Request guest to dump debug info. */ +#define VIRQ_CONSOLE 2 /* G. (DOM0) Bytes received on emergency console. */ +#define VIRQ_DOM_EXC 3 /* G. (DOM0) Exceptional event for some domain. */ +#define VIRQ_TBUF 4 /* G. (DOM0) Trace buffer has records available. */ +#define VIRQ_DEBUGGER 6 /* G. (DOM0) A domain has paused for debugging. */ +#define VIRQ_XENOPROF 7 /* V. XenOprofile interrupt: new sample available */ +#define VIRQ_CON_RING 8 /* G. (DOM0) Bytes received on console */ +#define VIRQ_PCPU_STATE 9 /* G. (DOM0) PCPU state changed */ +#define VIRQ_MEM_EVENT 10 /* G. (DOM0) A memory event has occured */ +#define VIRQ_XC_RESERVED 11 /* G. Reserved for XenClient */ +#define VIRQ_ENOMEM 12 /* G. (DOM0) Low on heap memory */ /* Architecture-specific VIRQ definitions. */ #define VIRQ_ARCH_0 16 @@ -92,24 +120,68 @@ #define VIRQ_ARCH_7 23 #define NR_VIRQS 24 + /* - * MMU-UPDATE REQUESTS - * - * HYPERVISOR_mmu_update() accepts a list of (ptr, val) pairs. - * A foreigndom (FD) can be specified (or DOMID_SELF for none). - * Where the FD has some effect, it is described below. - * ptr[1:0] specifies the appropriate MMU_* command. + * enum neg_errnoval HYPERVISOR_mmu_update(const struct mmu_update reqs[], + * unsigned count, unsigned *done_out, + * unsigned foreigndom) + * @reqs is an array of mmu_update_t structures ((ptr, val) pairs). + * @count is the length of the above array. + * @pdone is an output parameter indicating number of completed operations + * @foreigndom[15:0]: FD, the expected owner of data pages referenced in this + * hypercall invocation. Can be DOMID_SELF. + * @foreigndom[31:16]: PFD, the expected owner of pagetable pages referenced + * in this hypercall invocation. The value of this field + * (x) encodes the PFD as follows: + * x == 0 => PFD == DOMID_SELF + * x != 0 => PFD == x - 1 * + * Sub-commands: ptr[1:0] specifies the appropriate MMU_* command. + * ------------- * ptr[1:0] == MMU_NORMAL_PT_UPDATE: - * Updates an entry in a page table. If updating an L1 table, and the new - * table entry is valid/present, the mapped frame must belong to the FD, if - * an FD has been specified. If attempting to map an I/O page then the - * caller assumes the privilege of the FD. + * Updates an entry in a page table belonging to PFD. If updating an L1 table, + * and the new table entry is valid/present, the mapped frame must belong to + * FD. If attempting to map an I/O page then the caller assumes the privilege + * of the FD. * FD == DOMID_IO: Permit /only/ I/O mappings, at the priv level of the caller. * FD == DOMID_XEN: Map restricted areas of Xen's heap space. * ptr[:2] -- Machine address of the page-table entry to modify. * val -- Value to write. * + * There also certain implicit requirements when using this hypercall. The + * pages that make up a pagetable must be mapped read-only in the guest. + * This prevents uncontrolled guest updates to the pagetable. Xen strictly + * enforces this, and will disallow any pagetable update which will end up + * mapping pagetable page RW, and will disallow using any writable page as a + * pagetable. In practice it means that when constructing a page table for a + * process, thread, etc, we MUST be very dilligient in following these rules: + * 1). Start with top-level page (PGD or in Xen language: L4). Fill out + * the entries. + * 2). Keep on going, filling out the upper (PUD or L3), and middle (PMD + * or L2). + * 3). Start filling out the PTE table (L1) with the PTE entries. Once + * done, make sure to set each of those entries to RO (so writeable bit + * is unset). Once that has been completed, set the PMD (L2) for this + * PTE table as RO. + * 4). When completed with all of the PMD (L2) entries, and all of them have + * been set to RO, make sure to set RO the PUD (L3). Do the same + * operation on PGD (L4) pagetable entries that have a PUD (L3) entry. + * 5). Now before you can use those pages (so setting the cr3), you MUST also + * pin them so that the hypervisor can verify the entries. This is done + * via the HYPERVISOR_mmuext_op(MMUEXT_PIN_L4_TABLE, guest physical frame + * number of the PGD (L4)). And this point the HYPERVISOR_mmuext_op( + * MMUEXT_NEW_BASEPTR, guest physical frame number of the PGD (L4)) can be + * issued. + * For 32-bit guests, the L4 is not used (as there is less pagetables), so + * instead use L3. + * At this point the pagetables can be modified using the MMU_NORMAL_PT_UPDATE + * hypercall. Also if so desired the OS can also try to write to the PTE + * and be trapped by the hypervisor (as the PTE entry is RO). + * + * To deallocate the pages, the operations are the reverse of the steps + * mentioned above. The argument is MMUEXT_UNPIN_TABLE for all levels and the + * pagetable MUST not be in use (meaning that the cr3 is not set to it). + * * ptr[1:0] == MMU_MACHPHYS_UPDATE: * Updates an entry in the machine->pseudo-physical mapping table. * ptr[:2] -- Machine address within the frame whose mapping to modify. @@ -119,6 +191,72 @@ * ptr[1:0] == MMU_PT_UPDATE_PRESERVE_AD: * As MMU_NORMAL_PT_UPDATE above, but A/D bits currently in the PTE are ORed * with those in @val. + * + * @val is usually the machine frame number along with some attributes. + * The attributes by default follow the architecture defined bits. Meaning that + * if this is a X86_64 machine and four page table layout is used, the layout + * of val is: + * - 63 if set means No execute (NX) + * - 46-13 the machine frame number + * - 12 available for guest + * - 11 available for guest + * - 10 available for guest + * - 9 available for guest + * - 8 global + * - 7 PAT (PSE is disabled, must use hypercall to make 4MB or 2MB pages) + * - 6 dirty + * - 5 accessed + * - 4 page cached disabled + * - 3 page write through + * - 2 userspace accessible + * - 1 writeable + * - 0 present + * + * The one bits that does not fit with the default layout is the PAGE_PSE + * also called PAGE_PAT). The MMUEXT_[UN]MARK_SUPER arguments to the + * HYPERVISOR_mmuext_op serve as mechanism to set a pagetable to be 4MB + * (or 2MB) instead of using the PAGE_PSE bit. + * + * The reason that the PAGE_PSE (bit 7) is not being utilized is due to Xen + * using it as the Page Attribute Table (PAT) bit - for details on it please + * refer to Intel SDM 10.12. The PAT allows to set the caching attributes of + * pages instead of using MTRRs. + * + * The PAT MSR is as follows (it is a 64-bit value, each entry is 8 bits): + * PAT4 PAT0 + * +-----+-----+----+----+----+-----+----+----+ + * | UC | UC- | WC | WB | UC | UC- | WC | WB | <= Linux + * +-----+-----+----+----+----+-----+----+----+ + * | UC | UC- | WT | WB | UC | UC- | WT | WB | <= BIOS (default when machine boots) + * +-----+-----+----+----+----+-----+----+----+ + * | rsv | rsv | WP | WC | UC | UC- | WT | WB | <= Xen + * +-----+-----+----+----+----+-----+----+----+ + * + * The lookup of this index table translates to looking up + * Bit 7, Bit 4, and Bit 3 of val entry: + * + * PAT/PSE (bit 7) ... PCD (bit 4) .. PWT (bit 3). + * + * If all bits are off, then we are using PAT0. If bit 3 turned on, + * then we are using PAT1, if bit 3 and bit 4, then PAT2.. + * + * As you can see, the Linux PAT1 translates to PAT4 under Xen. Which means + * that if a guest that follows Linux's PAT setup and would like to set Write + * Combined on pages it MUST use PAT4 entry. Meaning that Bit 7 (PAGE_PAT) is + * set. For example, under Linux it only uses PAT0, PAT1, and PAT2 for the + * caching as: + * + * WB = none (so PAT0) + * WC = PWT (bit 3 on) + * UC = PWT | PCD (bit 3 and 4 are on). + * + * To make it work with Xen, it needs to translate the WC bit as so: + * + * PWT (so bit 3 on) --> PAT (so bit 7 is on) and clear bit 3 + * + * And to translate back it would: + * + * PAT (bit 7 on) --> PWT (bit 3 on) and clear bit 7. */ #define MMU_NORMAL_PT_UPDATE 0 /* checked '*ptr = val'. ptr is MA. */ #define MMU_MACHPHYS_UPDATE 1 /* ptr = MA of frame to modify entry for */ @@ -127,7 +265,12 @@ /* * MMU EXTENDED OPERATIONS * - * HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures. + * enum neg_errnoval HYPERVISOR_mmuext_op(mmuext_op_t uops[], + * unsigned int count, + * unsigned int *pdone, + * unsigned int foreigndom) + */ +/* HYPERVISOR_mmuext_op() accepts a list of mmuext_op structures. * A foreigndom (FD) can be specified (or DOMID_SELF for none). * Where the FD has some effect, it is described below. * @@ -164,9 +307,23 @@ * cmd: MMUEXT_FLUSH_CACHE * No additional arguments. Writes back and flushes cache contents. * + * cmd: MMUEXT_FLUSH_CACHE_GLOBAL + * No additional arguments. Writes back and flushes cache contents + * on all CPUs in the system. + * * cmd: MMUEXT_SET_LDT * linear_addr: Linear address of LDT base (NB. must be page-aligned). * nr_ents: Number of entries in LDT. + * + * cmd: MMUEXT_CLEAR_PAGE + * mfn: Machine frame number to be cleared. + * + * cmd: MMUEXT_COPY_PAGE + * mfn: Machine frame number of the destination page. + * src_mfn: Machine frame number of the source page. + * + * cmd: MMUEXT_[UN]MARK_SUPER + * mfn: Machine frame number of head of superpage to be [un]marked. */ #define MMUEXT_PIN_L1_TABLE 0 #define MMUEXT_PIN_L2_TABLE 1 @@ -183,12 +340,18 @@ #define MMUEXT_FLUSH_CACHE 12 #define MMUEXT_SET_LDT 13 #define MMUEXT_NEW_USER_BASEPTR 15 +#define MMUEXT_CLEAR_PAGE 16 +#define MMUEXT_COPY_PAGE 17 +#define MMUEXT_FLUSH_CACHE_GLOBAL 18 +#define MMUEXT_MARK_SUPER 19 +#define MMUEXT_UNMARK_SUPER 20 #ifndef __ASSEMBLY__ struct mmuext_op { unsigned int cmd; union { - /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR */ + /* [UN]PIN_TABLE, NEW_BASEPTR, NEW_USER_BASEPTR + * CLEAR_PAGE, COPY_PAGE, [UN]MARK_SUPER */ xen_pfn_t mfn; /* INVLPG_LOCAL, INVLPG_ALL, SET_LDT */ unsigned long linear_addr; @@ -198,6 +361,8 @@ struct mmuext_op { unsigned int nr_ents; /* TLB_FLUSH_MULTI, INVLPG_MULTI */ void *vcpumask; + /* COPY_PAGE */ + xen_pfn_t src_mfn; } arg2; }; DEFINE_GUEST_HANDLE_STRUCT(mmuext_op); @@ -225,10 +390,23 @@ DEFINE_GUEST_HANDLE_STRUCT(mmuext_op); */ #define VMASST_CMD_enable 0 #define VMASST_CMD_disable 1 + +/* x86/32 guests: simulate full 4GB segment limits. */ #define VMASST_TYPE_4gb_segments 0 + +/* x86/32 guests: trap (vector 15) whenever above vmassist is used. */ #define VMASST_TYPE_4gb_segments_notify 1 + +/* + * x86 guests: support writes to bottom-level PTEs. + * NB1. Page-directory entries cannot be written. + * NB2. Guest must continue to remove all writable mappings of PTEs. + */ #define VMASST_TYPE_writable_pagetables 2 + +/* x86/PAE guests: support PDPTs above 4GB. */ #define VMASST_TYPE_pae_extended_cr3 3 + #define MAX_VMASST_TYPE 3 #ifndef __ASSEMBLY__ @@ -260,6 +438,15 @@ typedef uint16_t domid_t; */ #define DOMID_XEN (0x7FF2U) +/* DOMID_COW is used as the owner of sharable pages */ +#define DOMID_COW (0x7FF3U) + +/* DOMID_INVALID is used to identify pages with unknown owner. */ +#define DOMID_INVALID (0x7FF4U) + +/* Idle domain. */ +#define DOMID_IDLE (0x7FFFU) + /* * Send an array of these to HYPERVISOR_mmu_update(). * NB. The fields are natural pointer/address size for this architecture. @@ -272,7 +459,9 @@ DEFINE_GUEST_HANDLE_STRUCT(mmu_update); /* * Send an array of these to HYPERVISOR_multicall(). - * NB. The fields are natural register size for this architecture. + * NB. The fields are logically the natural register size for this + * architecture. In cases where xen_ulong_t is larger than this then + * any unused bits in the upper portion must be zero. */ struct multicall_entry { xen_ulong_t op; @@ -442,8 +631,48 @@ struct start_info { unsigned long mod_start; /* VIRTUAL address of pre-loaded module. */ unsigned long mod_len; /* Size (bytes) of pre-loaded module. */ int8_t cmd_line[MAX_GUEST_CMDLINE]; + /* The pfn range here covers both page table and p->m table frames. */ + unsigned long first_p2m_pfn;/* 1st pfn forming initial P->M table. */ + unsigned long nr_p2m_frames;/* # of pfns forming initial P->M table. */ }; +/* These flags are passed in the 'flags' field of start_info_t. */ +#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ +#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ +#define SIF_MULTIBOOT_MOD (1<<2) /* Is mod_start a multiboot module? */ +#define SIF_MOD_START_PFN (1<<3) /* Is mod_start a PFN? */ +#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ + +/* + * A multiboot module is a package containing modules very similar to a + * multiboot module array. The only differences are: + * - the array of module descriptors is by convention simply at the beginning + * of the multiboot module, + * - addresses in the module descriptors are based on the beginning of the + * multiboot module, + * - the number of modules is determined by a termination descriptor that has + * mod_start == 0. + * + * This permits to both build it statically and reference it in a configuration + * file, and let the PV guest easily rebase the addresses to virtual addresses + * and at the same time count the number of modules. + */ +struct xen_multiboot_mod_list { + /* Address of first byte of the module */ + uint32_t mod_start; + /* Address of last byte of the module (inclusive) */ + uint32_t mod_end; + /* Address of zero-terminated command line */ + uint32_t cmdline; + /* Unused, must be zero */ + uint32_t pad; +}; +/* + * The console structure in start_info.console.dom0 + * + * This structure includes a variety of information required to + * have a working VGA/VESA console. + */ struct dom0_vga_console_info { uint8_t video_type; #define XEN_VGATYPE_TEXT_MODE_3 0x03 @@ -484,11 +713,6 @@ struct dom0_vga_console_info { } u; }; -/* These flags are passed in the 'flags' field of start_info_t. */ -#define SIF_PRIVILEGED (1<<0) /* Is the domain privileged? */ -#define SIF_INITDOMAIN (1<<1) /* Is this the initial control domain? */ -#define SIF_PM_MASK (0xFF<<8) /* reserve 1 byte for xen-pm options */ - typedef uint64_t cpumap_t; typedef uint8_t xen_domain_handle_t[16]; diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 0324c6d..b78f21c 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -86,6 +86,7 @@ struct xenbus_device_id /* A xenbus driver. */ struct xenbus_driver { + const char *name; /* defaults to ids[0].devicetype */ const struct xenbus_device_id *ids; int (*probe)(struct xenbus_device *dev, const struct xenbus_device_id *id); @@ -100,20 +101,22 @@ struct xenbus_driver { int (*is_ready)(struct xenbus_device *dev); }; -#define DEFINE_XENBUS_DRIVER(var, drvname, methods...) \ -struct xenbus_driver var ## _driver = { \ - .driver.name = drvname + 0 ?: var ## _ids->devicetype, \ - .driver.owner = THIS_MODULE, \ - .ids = var ## _ids, ## methods \ -} - static inline struct xenbus_driver *to_xenbus_driver(struct device_driver *drv) { return container_of(drv, struct xenbus_driver, driver); } -int __must_check xenbus_register_frontend(struct xenbus_driver *); -int __must_check xenbus_register_backend(struct xenbus_driver *); +int __must_check __xenbus_register_frontend(struct xenbus_driver *drv, + struct module *owner, + const char *mod_name); +int __must_check __xenbus_register_backend(struct xenbus_driver *drv, + struct module *owner, + const char *mod_name); + +#define xenbus_register_frontend(drv) \ + __xenbus_register_frontend(drv, THIS_MODULE, KBUILD_MODNAME); +#define xenbus_register_backend(drv) \ + __xenbus_register_backend(drv, THIS_MODULE, KBUILD_MODNAME); void xenbus_unregister_driver(struct xenbus_driver *drv); |