diff options
Diffstat (limited to 'drivers/lguest')
-rw-r--r-- | drivers/lguest/Makefile | 3 | ||||
-rw-r--r-- | drivers/lguest/core.c | 29 | ||||
-rw-r--r-- | drivers/lguest/hypercalls.c | 7 | ||||
-rw-r--r-- | drivers/lguest/lg.h | 26 | ||||
-rw-r--r-- | drivers/lguest/lguest_device.c | 540 | ||||
-rw-r--r-- | drivers/lguest/lguest_user.c | 221 | ||||
-rw-r--r-- | drivers/lguest/page_tables.c | 75 | ||||
-rw-r--r-- | drivers/lguest/x86/core.c | 198 |
8 files changed, 252 insertions, 847 deletions
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile index c419750..16f52ee 100644 --- a/drivers/lguest/Makefile +++ b/drivers/lguest/Makefile @@ -1,6 +1,3 @@ -# Guest requires the device configuration and probing code. -obj-$(CONFIG_LGUEST_GUEST) += lguest_device.o - # Host requires the other files, which can be a module. obj-$(CONFIG_LGUEST) += lg.o lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \ diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 6590558..7dc93aa 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -208,6 +208,14 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, */ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) { + /* If the launcher asked for a register with LHREQ_GETREG */ + if (cpu->reg_read) { + if (put_user(*cpu->reg_read, user)) + return -EFAULT; + cpu->reg_read = NULL; + return sizeof(*cpu->reg_read); + } + /* We stop running once the Guest is dead. */ while (!cpu->lg->dead) { unsigned int irq; @@ -217,21 +225,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (cpu->hcall) do_hypercalls(cpu); - /* - * It's possible the Guest did a NOTIFY hypercall to the - * Launcher. - */ - if (cpu->pending_notify) { - /* - * Does it just needs to write to a registered - * eventfd (ie. the appropriate virtqueue thread)? - */ - if (!send_notify_to_eventfd(cpu)) { - /* OK, we tell the main Launcher. */ - if (put_user(cpu->pending_notify, user)) - return -EFAULT; - return sizeof(cpu->pending_notify); - } + /* Do we have to tell the Launcher about a trap? */ + if (cpu->pending.trap) { + if (copy_to_user(user, &cpu->pending, + sizeof(cpu->pending))) + return -EFAULT; + return sizeof(cpu->pending); } /* diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 83511eb..1219af4 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -117,9 +117,6 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) /* Similarly, this sets the halted flag for run_guest(). */ cpu->halted = 1; break; - case LHCALL_NOTIFY: - cpu->pending_notify = args->arg1; - break; default: /* It should be an architecture-specific hypercall. */ if (lguest_arch_do_hcall(cpu, args)) @@ -189,7 +186,7 @@ static void do_async_hcalls(struct lg_cpu *cpu) * Stop doing hypercalls if they want to notify the Launcher: * it needs to service this first. */ - if (cpu->pending_notify) + if (cpu->pending.trap) break; } } @@ -280,7 +277,7 @@ void do_hypercalls(struct lg_cpu *cpu) * NOTIFY to the Launcher, we want to return now. Otherwise we do * the hypercall. */ - if (!cpu->pending_notify) { + if (!cpu->pending.trap) { do_hcall(cpu, cpu->hcall); /* * Tricky point: we reset the hcall pointer to mark the diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 2eef40b..307e8b3 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -50,7 +50,10 @@ struct lg_cpu { /* Bitmap of what has changed: see CHANGED_* above. */ int changed; - unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ + /* Pending operation. */ + struct lguest_pending pending; + + unsigned long *reg_read; /* register from LHREQ_GETREG */ /* At end of a page shared mapped over lguest_pages in guest. */ unsigned long regs_page; @@ -78,24 +81,18 @@ struct lg_cpu { struct lg_cpu_arch arch; }; -struct lg_eventfd { - unsigned long addr; - struct eventfd_ctx *event; -}; - -struct lg_eventfd_map { - unsigned int num; - struct lg_eventfd map[]; -}; - /* The private info the thread maintains about the guest. */ struct lguest { struct lguest_data __user *lguest_data; struct lg_cpu cpus[NR_CPUS]; unsigned int nr_cpus; + /* Valid guest memory pages must be < this. */ u32 pfn_limit; + /* Device memory is >= pfn_limit and < device_limit. */ + u32 device_limit; + /* * This provides the offset to the base of guest-physical memory in the * Launcher. @@ -110,8 +107,6 @@ struct lguest { unsigned int stack_pages; u32 tsc_khz; - struct lg_eventfd_map *eventfds; - /* Dead? */ const char *dead; }; @@ -197,8 +192,10 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu); void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t val); void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages); -bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode); +bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode, + unsigned long *iomem); void pin_page(struct lg_cpu *cpu, unsigned long vaddr); +bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr); unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr); void page_table_guest_data_init(struct lg_cpu *cpu); @@ -210,6 +207,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu); int lguest_arch_init_hypercalls(struct lg_cpu *cpu); int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args); void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start); +unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any); /* <arch>/switcher.S: */ extern char start_switcher_text[], end_switcher_text[], switch_to_guest[]; diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c deleted file mode 100644 index 89088d6..0000000 --- a/drivers/lguest/lguest_device.c +++ /dev/null @@ -1,540 +0,0 @@ -/*P:050 - * Lguest guests use a very simple method to describe devices. It's a - * series of device descriptors contained just above the top of normal Guest - * memory. - * - * We use the standard "virtio" device infrastructure, which provides us with a - * console, a network and a block driver. Each one expects some configuration - * information and a "virtqueue" or two to send and receive data. -:*/ -#include <linux/init.h> -#include <linux/bootmem.h> -#include <linux/lguest_launcher.h> -#include <linux/virtio.h> -#include <linux/virtio_config.h> -#include <linux/interrupt.h> -#include <linux/virtio_ring.h> -#include <linux/err.h> -#include <linux/export.h> -#include <linux/slab.h> -#include <asm/io.h> -#include <asm/paravirt.h> -#include <asm/lguest_hcall.h> - -/* The pointer to our (page) of device descriptions. */ -static void *lguest_devices; - -/* - * For Guests, device memory can be used as normal memory, so we cast away the - * __iomem to quieten sparse. - */ -static inline void *lguest_map(unsigned long phys_addr, unsigned long pages) -{ - return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages); -} - -static inline void lguest_unmap(void *addr) -{ - iounmap((__force void __iomem *)addr); -} - -/*D:100 - * Each lguest device is just a virtio device plus a pointer to its entry - * in the lguest_devices page. - */ -struct lguest_device { - struct virtio_device vdev; - - /* The entry in the lguest_devices page for this device. */ - struct lguest_device_desc *desc; -}; - -/* - * Since the virtio infrastructure hands us a pointer to the virtio_device all - * the time, it helps to have a curt macro to get a pointer to the struct - * lguest_device it's enclosed in. - */ -#define to_lgdev(vd) container_of(vd, struct lguest_device, vdev) - -/*D:130 - * Device configurations - * - * The configuration information for a device consists of one or more - * virtqueues, a feature bitmap, and some configuration bytes. The - * configuration bytes don't really matter to us: the Launcher sets them up, and - * the driver will look at them during setup. - * - * A convenient routine to return the device's virtqueue config array: - * immediately after the descriptor. - */ -static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc) -{ - return (void *)(desc + 1); -} - -/* The features come immediately after the virtqueues. */ -static u8 *lg_features(const struct lguest_device_desc *desc) -{ - return (void *)(lg_vq(desc) + desc->num_vq); -} - -/* The config space comes after the two feature bitmasks. */ -static u8 *lg_config(const struct lguest_device_desc *desc) -{ - return lg_features(desc) + desc->feature_len * 2; -} - -/* The total size of the config page used by this device (incl. desc) */ -static unsigned desc_size(const struct lguest_device_desc *desc) -{ - return sizeof(*desc) - + desc->num_vq * sizeof(struct lguest_vqconfig) - + desc->feature_len * 2 - + desc->config_len; -} - -/* This gets the device's feature bits. */ -static u64 lg_get_features(struct virtio_device *vdev) -{ - unsigned int i; - u32 features = 0; - struct lguest_device_desc *desc = to_lgdev(vdev)->desc; - u8 *in_features = lg_features(desc); - - /* We do this the slow but generic way. */ - for (i = 0; i < min(desc->feature_len * 8, 32); i++) - if (in_features[i / 8] & (1 << (i % 8))) - features |= (1 << i); - - return features; -} - -/* - * To notify on reset or feature finalization, we (ab)use the NOTIFY - * hypercall, with the descriptor address of the device. - */ -static void status_notify(struct virtio_device *vdev) -{ - unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; - - hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0); -} - -/* - * The virtio core takes the features the Host offers, and copies the ones - * supported by the driver into the vdev->features array. Once that's all - * sorted out, this routine is called so we can tell the Host which features we - * understand and accept. - */ -static int lg_finalize_features(struct virtio_device *vdev) -{ - unsigned int i, bits; - struct lguest_device_desc *desc = to_lgdev(vdev)->desc; - /* Second half of bitmap is features we accept. */ - u8 *out_features = lg_features(desc) + desc->feature_len; - - /* Give virtio_ring a chance to accept features. */ - vring_transport_features(vdev); - - /* Make sure we don't have any features > 32 bits! */ - BUG_ON((u32)vdev->features != vdev->features); - - /* - * Since lguest is currently x86-only, we're little-endian. That - * means we could just memcpy. But it's not time critical, and in - * case someone copies this code, we do it the slow, obvious way. - */ - memset(out_features, 0, desc->feature_len); - bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8; - for (i = 0; i < bits; i++) { - if (__virtio_test_bit(vdev, i)) - out_features[i / 8] |= (1 << (i % 8)); - } - - /* Tell Host we've finished with this device's feature negotiation */ - status_notify(vdev); - - return 0; -} - -/* Once they've found a field, getting a copy of it is easy. */ -static void lg_get(struct virtio_device *vdev, unsigned int offset, - void *buf, unsigned len) -{ - struct lguest_device_desc *desc = to_lgdev(vdev)->desc; - - /* Check they didn't ask for more than the length of the config! */ - BUG_ON(offset + len > desc->config_len); - memcpy(buf, lg_config(desc) + offset, len); -} - -/* Setting the contents is also trivial. */ -static void lg_set(struct virtio_device *vdev, unsigned int offset, - const void *buf, unsigned len) -{ - struct lguest_device_desc *desc = to_lgdev(vdev)->desc; - - /* Check they didn't ask for more than the length of the config! */ - BUG_ON(offset + len > desc->config_len); - memcpy(lg_config(desc) + offset, buf, len); -} - -/* - * The operations to get and set the status word just access the status field - * of the device descriptor. - */ -static u8 lg_get_status(struct virtio_device *vdev) -{ - return to_lgdev(vdev)->desc->status; -} - -static void lg_set_status(struct virtio_device *vdev, u8 status) -{ - BUG_ON(!status); - to_lgdev(vdev)->desc->status = status; - - /* Tell Host immediately if we failed. */ - if (status & VIRTIO_CONFIG_S_FAILED) - status_notify(vdev); -} - -static void lg_reset(struct virtio_device *vdev) -{ - /* 0 status means "reset" */ - to_lgdev(vdev)->desc->status = 0; - status_notify(vdev); -} - -/* - * Virtqueues - * - * The other piece of infrastructure virtio needs is a "virtqueue": a way of - * the Guest device registering buffers for the other side to read from or - * write into (ie. send and receive buffers). Each device can have multiple - * virtqueues: for example the console driver uses one queue for sending and - * another for receiving. - * - * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue - * already exists in virtio_ring.c. We just need to connect it up. - * - * We start with the information we need to keep about each virtqueue. - */ - -/*D:140 This is the information we remember about each virtqueue. */ -struct lguest_vq_info { - /* A copy of the information contained in the device config. */ - struct lguest_vqconfig config; - - /* The address where we mapped the virtio ring, so we can unmap it. */ - void *pages; -}; - -/* - * When the virtio_ring code wants to prod the Host, it calls us here and we - * make a hypercall. We hand the physical address of the virtqueue so the Host - * knows which virtqueue we're talking about. - */ -static bool lg_notify(struct virtqueue *vq) -{ - /* - * We store our virtqueue information in the "priv" pointer of the - * virtqueue structure. - */ - struct lguest_vq_info *lvq = vq->priv; - - hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0, 0); - return true; -} - -/* An extern declaration inside a C file is bad form. Don't do it. */ -extern int lguest_setup_irq(unsigned int irq); - -/* - * This routine finds the Nth virtqueue described in the configuration of - * this device and sets it up. - * - * This is kind of an ugly duckling. It'd be nicer to have a standard - * representation of a virtqueue in the configuration space, but it seems that - * everyone wants to do it differently. The KVM coders want the Guest to - * allocate its own pages and tell the Host where they are, but for lguest it's - * simpler for the Host to simply tell us where the pages are. - */ -static struct virtqueue *lg_find_vq(struct virtio_device *vdev, - unsigned index, - void (*callback)(struct virtqueue *vq), - const char *name) -{ - struct lguest_device *ldev = to_lgdev(vdev); - struct lguest_vq_info *lvq; - struct virtqueue *vq; - int err; - - if (!name) - return NULL; - - /* We must have this many virtqueues. */ - if (index >= ldev->desc->num_vq) - return ERR_PTR(-ENOENT); - - lvq = kmalloc(sizeof(*lvq), GFP_KERNEL); - if (!lvq) - return ERR_PTR(-ENOMEM); - - /* - * Make a copy of the "struct lguest_vqconfig" entry, which sits after - * the descriptor. We need a copy because the config space might not - * be aligned correctly. - */ - memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config)); - - printk("Mapping virtqueue %i addr %lx\n", index, - (unsigned long)lvq->config.pfn << PAGE_SHIFT); - /* Figure out how many pages the ring will take, and map that memory */ - lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT, - DIV_ROUND_UP(vring_size(lvq->config.num, - LGUEST_VRING_ALIGN), - PAGE_SIZE)); - if (!lvq->pages) { - err = -ENOMEM; - goto free_lvq; - } - - /* - * OK, tell virtio_ring.c to set up a virtqueue now we know its size - * and we've got a pointer to its pages. Note that we set weak_barriers - * to 'true': the host just a(nother) SMP CPU, so we only need inter-cpu - * barriers. - */ - vq = vring_new_virtqueue(index, lvq->config.num, LGUEST_VRING_ALIGN, vdev, - true, lvq->pages, lg_notify, callback, name); - if (!vq) { - err = -ENOMEM; - goto unmap; - } - - /* Make sure the interrupt is allocated. */ - err = lguest_setup_irq(lvq->config.irq); - if (err) - goto destroy_vring; - - /* - * Tell the interrupt for this virtqueue to go to the virtio_ring - * interrupt handler. - * - * FIXME: We used to have a flag for the Host to tell us we could use - * the interrupt as a source of randomness: it'd be nice to have that - * back. - */ - err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, - dev_name(&vdev->dev), vq); - if (err) - goto free_desc; - - /* - * Last of all we hook up our 'struct lguest_vq_info" to the - * virtqueue's priv pointer. - */ - vq->priv = lvq; - return vq; - -free_desc: - irq_free_desc(lvq->config.irq); -destroy_vring: - vring_del_virtqueue(vq); -unmap: - lguest_unmap(lvq->pages); -free_lvq: - kfree(lvq); - return ERR_PTR(err); -} -/*:*/ - -/* Cleaning up a virtqueue is easy */ -static void lg_del_vq(struct virtqueue *vq) -{ - struct lguest_vq_info *lvq = vq->priv; - - /* Release the interrupt */ - free_irq(lvq->config.irq, vq); - /* Tell virtio_ring.c to free the virtqueue. */ - vring_del_virtqueue(vq); - /* Unmap the pages containing the ring. */ - lguest_unmap(lvq->pages); - /* Free our own queue information. */ - kfree(lvq); -} - -static void lg_del_vqs(struct virtio_device *vdev) -{ - struct virtqueue *vq, *n; - - list_for_each_entry_safe(vq, n, &vdev->vqs, list) - lg_del_vq(vq); -} - -static int lg_find_vqs(struct virtio_device *vdev, unsigned nvqs, - struct virtqueue *vqs[], - vq_callback_t *callbacks[], - const char *names[]) -{ - struct lguest_device *ldev = to_lgdev(vdev); - int i; - - /* We must have this many virtqueues. */ - if (nvqs > ldev->desc->num_vq) - return -ENOENT; - - for (i = 0; i < nvqs; ++i) { - vqs[i] = lg_find_vq(vdev, i, callbacks[i], names[i]); - if (IS_ERR(vqs[i])) - goto error; - } - return 0; - -error: - lg_del_vqs(vdev); - return PTR_ERR(vqs[i]); -} - -static const char *lg_bus_name(struct virtio_device *vdev) -{ - return ""; -} - -/* The ops structure which hooks everything together. */ -static const struct virtio_config_ops lguest_config_ops = { - .get_features = lg_get_features, - .finalize_features = lg_finalize_features, - .get = lg_get, - .set = lg_set, - .get_status = lg_get_status, - .set_status = lg_set_status, - .reset = lg_reset, - .find_vqs = lg_find_vqs, - .del_vqs = lg_del_vqs, - .bus_name = lg_bus_name, -}; - -/* - * The root device for the lguest virtio devices. This makes them appear as - * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. - */ -static struct device *lguest_root; - -/*D:120 - * This is the core of the lguest bus: actually adding a new device. - * It's a separate function because it's neater that way, and because an - * earlier version of the code supported hotplug and unplug. They were removed - * early on because they were never used. - * - * As Andrew Tridgell says, "Untested code is buggy code". - * - * It's worth reading this carefully: we start with a pointer to the new device - * descriptor in the "lguest_devices" page, and the offset into the device - * descriptor page so we can uniquely identify it if things go badly wrong. - */ -static void add_lguest_device(struct lguest_device_desc *d, - unsigned int offset) -{ - struct lguest_device *ldev; - - /* Start with zeroed memory; Linux's device layer counts on it. */ - ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); - if (!ldev) { - printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n", - offset, d->type); - return; - } - - /* This devices' parent is the lguest/ dir. */ - ldev->vdev.dev.parent = lguest_root; - /* - * The device type comes straight from the descriptor. There's also a - * device vendor field in the virtio_device struct, which we leave as - * 0. - */ - ldev->vdev.id.device = d->type; - /* - * We have a simple set of routines for querying the device's - * configuration information and setting its status. - */ - ldev->vdev.config = &lguest_config_ops; - /* And we remember the device's descriptor for lguest_config_ops. */ - ldev->desc = d; - - /* - * register_virtio_device() sets up the generic fields for the struct - * virtio_device and calls device_register(). This makes the bus - * infrastructure look for a matching driver. - */ - if (register_virtio_device(&ldev->vdev) != 0) { - printk(KERN_ERR "Failed to register lguest dev %u type %u\n", - offset, d->type); - kfree(ldev); - } -} - -/*D:110 - * scan_devices() simply iterates through the device page. The type 0 is - * reserved to mean "end of devices". - */ -static void scan_devices(void) -{ - unsigned int i; - struct lguest_device_desc *d; - - /* We start at the page beginning, and skip over each entry. */ - for (i = 0; i < PAGE_SIZE; i += desc_size(d)) { - d = lguest_devices + i; - - /* Once we hit a zero, stop. */ - if (d->type == 0) - break; - - printk("Device at %i has size %u\n", i, desc_size(d)); - add_lguest_device(d, i); - } -} - -/*D:105 - * Fairly early in boot, lguest_devices_init() is called to set up the - * lguest device infrastructure. We check that we are a Guest by checking - * pv_info.name: there are other ways of checking, but this seems most - * obvious to me. - * - * So we can access the "struct lguest_device_desc"s easily, we map that memory - * and store the pointer in the global "lguest_devices". Then we register a - * root device from which all our devices will hang (this seems to be the - * correct sysfs incantation). - * - * Finally we call scan_devices() which adds all the devices found in the - * lguest_devices page. - */ -static int __init lguest_devices_init(void) -{ - if (strcmp(pv_info.name, "lguest") != 0) - return 0; - - lguest_root = root_device_register("lguest"); - if (IS_ERR(lguest_root)) - panic("Could not register lguest root"); - - /* Devices are in a single page above top of "normal" mem */ - lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1); - - scan_devices(); - return 0; -} -/* We do this after core stuff, but before the drivers. */ -postcore_initcall(lguest_devices_init); - -/*D:150 - * At this point in the journey we used to now wade through the lguest - * devices themselves: net, block and console. Since they're all now virtio - * devices rather than lguest-specific, I've decided to ignore them. Mostly, - * they're kind of boring. But this does mean you'll never experience the - * thrill of reading the forbidden love scene buried deep in the block driver. - * - * "make Launcher" beckons, where we answer questions like "Where do Guests - * come from?", and "What do you do when someone asks for optimization?". - */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 4263f4c..c4c6113 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -2,175 +2,62 @@ * launcher controls and communicates with the Guest. For example, * the first write will tell us the Guest's memory layout and entry * point. A read will run the Guest until something happens, such as - * a signal or the Guest doing a NOTIFY out to the Launcher. There is - * also a way for the Launcher to attach eventfds to particular NOTIFY - * values instead of returning from the read() call. + * a signal or the Guest accessing a device. :*/ #include <linux/uaccess.h> #include <linux/miscdevice.h> #include <linux/fs.h> #include <linux/sched.h> -#include <linux/eventfd.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/export.h> #include "lg.h" -/*L:056 - * Before we move on, let's jump ahead and look at what the kernel does when - * it needs to look up the eventfds. That will complete our picture of how we - * use RCU. - * - * The notification value is in cpu->pending_notify: we return true if it went - * to an eventfd. - */ -bool send_notify_to_eventfd(struct lg_cpu *cpu) -{ - unsigned int i; - struct lg_eventfd_map *map; - - /* - * This "rcu_read_lock()" helps track when someone is still looking at - * the (RCU-using) eventfds array. It's not actually a lock at all; - * indeed it's a noop in many configurations. (You didn't expect me to - * explain all the RCU secrets here, did you?) - */ - rcu_read_lock(); - /* - * rcu_dereference is the counter-side of rcu_assign_pointer(); it - * makes sure we don't access the memory pointed to by - * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, - * but Alpha allows this! Paul McKenney points out that a really - * aggressive compiler could have the same effect: - * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html - * - * So play safe, use rcu_dereference to get the rcu-protected pointer: - */ - map = rcu_dereference(cpu->lg->eventfds); - /* - * Simple array search: even if they add an eventfd while we do this, - * we'll continue to use the old array and just won't see the new one. - */ - for (i = 0; i < map->num; i++) { - if (map->map[i].addr == cpu->pending_notify) { - eventfd_signal(map->map[i].event, 1); - cpu->pending_notify = 0; - break; - } - } - /* We're done with the rcu-protected variable cpu->lg->eventfds. */ - rcu_read_unlock(); - - /* If we cleared the notification, it's because we found a match. */ - return cpu->pending_notify == 0; -} - -/*L:055 - * One of the more tricksy tricks in the Linux Kernel is a technique called - * Read Copy Update. Since one point of lguest is to teach lguest journeyers - * about kernel coding, I use it here. (In case you're curious, other purposes - * include learning about virtualization and instilling a deep appreciation for - * simplicity and puppies). - * - * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we - * add new eventfds without ever blocking readers from accessing the array. - * The current Launcher only does this during boot, so that never happens. But - * Read Copy Update is cool, and adding a lock risks damaging even more puppies - * than this code does. - * - * We allocate a brand new one-larger array, copy the old one and add our new - * element. Then we make the lg eventfd pointer point to the new array. - * That's the easy part: now we need to free the old one, but we need to make - * sure no slow CPU somewhere is still looking at it. That's what - * synchronize_rcu does for us: waits until every CPU has indicated that it has - * moved on to know it's no longer using the old one. - * - * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. - */ -static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) +/*L:052 + The Launcher can get the registers, and also set some of them. +*/ +static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input) { - struct lg_eventfd_map *new, *old = lg->eventfds; - - /* - * We don't allow notifications on value 0 anyway (pending_notify of - * 0 means "nothing pending"). - */ - if (!addr) - return -EINVAL; - - /* - * Replace the old array with the new one, carefully: others can - * be accessing it at the same time. - */ - new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), - GFP_KERNEL); - if (!new) - return -ENOMEM; + unsigned long which; - /* First make identical copy. */ - memcpy(new->map, old->map, sizeof(old->map[0]) * old->num); - new->num = old->num; - - /* Now append new entry. */ - new->map[new->num].addr = addr; - new->map[new->num].event = eventfd_ctx_fdget(fd); - if (IS_ERR(new->map[new->num].event)) { - int err = PTR_ERR(new->map[new->num].event); - kfree(new); - return err; - } - new->num++; + /* We re-use the ptrace structure to specify which register to read. */ + if (get_user(which, input) != 0) + return -EFAULT; /* - * Now put new one in place: rcu_assign_pointer() is a fancy way of - * doing "lg->eventfds = new", but it uses memory barriers to make - * absolutely sure that the contents of "new" written above is nailed - * down before we actually do the assignment. + * We set up the cpu register pointer, and their next read will + * actually get the value (instead of running the guest). * - * We have to think about these kinds of things when we're operating on - * live data without locks. + * The last argument 'true' says we can access any register. */ - rcu_assign_pointer(lg->eventfds, new); + cpu->reg_read = lguest_arch_regptr(cpu, which, true); + if (!cpu->reg_read) + return -ENOENT; - /* - * We're not in a big hurry. Wait until no one's looking at old - * version, then free it. - */ - synchronize_rcu(); - kfree(old); - - return 0; + /* And because this is a write() call, we return the length used. */ + return sizeof(unsigned long) * 2; } -/*L:052 - * Receiving notifications from the Guest is usually done by attaching a - * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will - * become readable when the Guest does an LHCALL_NOTIFY with that value. - * - * This is really convenient for processing each virtqueue in a separate - * thread. - */ -static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) +static int setreg(struct lg_cpu *cpu, const unsigned long __user *input) { - unsigned long addr, fd; - int err; + unsigned long which, value, *reg; - if (get_user(addr, input) != 0) + /* We re-use the ptrace structure to specify which register to read. */ + if (get_user(which, input) != 0) return -EFAULT; input++; - if (get_user(fd, input) != 0) + if (get_user(value, input) != 0) return -EFAULT; - /* - * Just make sure two callers don't add eventfds at once. We really - * only need to lock against callers adding to the same Guest, so using - * the Big Lguest Lock is overkill. But this is setup, not a fast path. - */ - mutex_lock(&lguest_lock); - err = add_eventfd(lg, addr, fd); - mutex_unlock(&lguest_lock); + /* The last argument 'false' means we can't access all registers. */ + reg = lguest_arch_regptr(cpu, which, false); + if (!reg) + return -ENOENT; - return err; + *reg = value; + + /* And because this is a write() call, we return the length used. */ + return sizeof(unsigned long) * 3; } /*L:050 @@ -194,6 +81,23 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) return 0; } +/*L:053 + * Deliver a trap: this is used by the Launcher if it can't emulate + * an instruction. + */ +static int trap(struct lg_cpu *cpu, const unsigned long __user *input) +{ + unsigned long trapnum; + + if (get_user(trapnum, input) != 0) + return -EFAULT; + + if (!deliver_trap(cpu, trapnum)) + return -EINVAL; + + return 0; +} + /*L:040 * Once our Guest is initialized, the Launcher makes it run by reading * from /dev/lguest. @@ -237,8 +141,8 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) * If we returned from read() last time because the Guest sent I/O, * clear the flag. */ - if (cpu->pending_notify) - cpu->pending_notify = 0; + if (cpu->pending.trap) + cpu->pending.trap = 0; /* Run the Guest until something interesting happens. */ return run_guest(cpu, (unsigned long __user *)user); @@ -319,7 +223,7 @@ static int initialize(struct file *file, const unsigned long __user *input) /* "struct lguest" contains all we (the Host) know about a Guest. */ struct lguest *lg; int err; - unsigned long args[3]; + unsigned long args[4]; /* * We grab the Big Lguest lock, which protects against multiple @@ -343,21 +247,15 @@ static int initialize(struct file *file, const unsigned long __user *input) goto unlock; } - lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL); - if (!lg->eventfds) { - err = -ENOMEM; - goto free_lg; - } - lg->eventfds->num = 0; - /* Populate the easy fields of our "struct lguest" */ lg->mem_base = (void __user *)args[0]; lg->pfn_limit = args[1]; + lg->device_limit = args[3]; /* This is the first cpu (cpu 0) and it will start booting at args[2] */ err = lg_cpu_start(&lg->cpus[0], 0, args[2]); if (err) - goto free_eventfds; + goto free_lg; /* * Initialize the Guest's shadow page tables. This allocates @@ -378,8 +276,6 @@ static int initialize(struct file *file, const unsigned long __user *input) free_regs: /* FIXME: This should be in free_vcpu */ free_page(lg->cpus[0].regs_page); -free_eventfds: - kfree(lg->eventfds); free_lg: kfree(lg); unlock: @@ -432,8 +328,12 @@ static ssize_t write(struct file *file, const char __user *in, return initialize(file, input); case LHREQ_IRQ: return user_send_irq(cpu, input); - case LHREQ_EVENTFD: - return attach_eventfd(lg, input); + case LHREQ_GETREG: + return getreg_setup(cpu, input); + case LHREQ_SETREG: + return setreg(cpu, input); + case LHREQ_TRAP: + return trap(cpu, input); default: return -EINVAL; } @@ -478,11 +378,6 @@ static int close(struct inode *inode, struct file *file) mmput(lg->cpus[i].mm); } - /* Release any eventfds they registered. */ - for (i = 0; i < lg->eventfds->num; i++) - eventfd_ctx_put(lg->eventfds->map[i].event); - kfree(lg->eventfds); - /* * If lg->dead doesn't contain an error code it will be NULL or a * kmalloc()ed string, either of which is ok to hand to kfree(). diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index e8b55c3..e3abebc9 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -250,6 +250,16 @@ static void release_pte(pte_t pte) } /*:*/ +static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte) +{ + /* We don't handle large pages. */ + if (pte_flags(gpte) & _PAGE_PSE) + return false; + + return (pte_pfn(gpte) >= cpu->lg->pfn_limit + && pte_pfn(gpte) < cpu->lg->device_limit); +} + static bool check_gpte(struct lg_cpu *cpu, pte_t gpte) { if ((pte_flags(gpte) & _PAGE_PSE) || @@ -374,8 +384,14 @@ static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate, * * If we fixed up the fault (ie. we mapped the address), this routine returns * true. Otherwise, it was a real fault and we need to tell the Guest. + * + * There's a corner case: they're trying to access memory between + * pfn_limit and device_limit, which is I/O memory. In this case, we + * return false and set @iomem to the physical address, so the the + * Launcher can handle the instruction manually. */ -bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) +bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode, + unsigned long *iomem) { unsigned long gpte_ptr; pte_t gpte; @@ -383,6 +399,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pmd_t gpmd; pgd_t gpgd; + *iomem = 0; + /* We never demand page the Switcher, so trying is a mistake. */ if (vaddr >= switcher_addr) return false; @@ -459,6 +477,12 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) return false; + /* If they're accessing io memory, we expect a fault. */ + if (gpte_in_iomem(cpu, gpte)) { + *iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); + return false; + } + /* * Check that the Guest PTE flags are OK, and the page number is below * the pfn_limit (ie. not mapping the Launcher binary). @@ -553,7 +577,9 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) */ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) { - if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) + unsigned long iomem; + + if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem)) kill_guest(cpu, "bad stack page %#lx", vaddr); } /*:*/ @@ -647,7 +673,7 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu) /*:*/ /* We walk down the guest page tables to get a guest-physical address */ -unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) +bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr) { pgd_t gpgd; pte_t gpte; @@ -656,31 +682,47 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) #endif /* Still not set up? Just map 1:1. */ - if (unlikely(cpu->linear_pages)) - return vaddr; + if (unlikely(cpu->linear_pages)) { + *paddr = vaddr; + return true; + } /* First step: get the top-level Guest page table entry. */ gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t); /* Toplevel not present? We can't map it in. */ - if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) { - kill_guest(cpu, "Bad address %#lx", vaddr); - return -1UL; - } + if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) + goto fail; #ifdef CONFIG_X86_PAE gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) { - kill_guest(cpu, "Bad address %#lx", vaddr); - return -1UL; - } + if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) + goto fail; gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t); #else gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t); #endif if (!(pte_flags(gpte) & _PAGE_PRESENT)) - kill_guest(cpu, "Bad address %#lx", vaddr); + goto fail; + + *paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); + return true; + +fail: + *paddr = -1UL; + return false; +} - return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); +/* + * This is the version we normally use: kills the Guest if it uses a + * bad address + */ +unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) +{ + unsigned long paddr; + + if (!__guest_pa(cpu, vaddr, &paddr)) + kill_guest(cpu, "Bad address %#lx", vaddr); + return paddr; } /* @@ -912,7 +954,8 @@ static void __guest_set_pte(struct lg_cpu *cpu, int idx, * now. This shaves 10% off a copy-on-write * micro-benchmark. */ - if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { + if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) + && !gpte_in_iomem(cpu, gpte)) { if (!check_gpte(cpu, gpte)) return; set_pte(spte, diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 6adfd7b..30f2aef 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -182,6 +182,52 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) } /*:*/ +unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any) +{ + switch (reg_off) { + case offsetof(struct pt_regs, bx): + return &cpu->regs->ebx; + case offsetof(struct pt_regs, cx): + return &cpu->regs->ecx; + case offsetof(struct pt_regs, dx): + return &cpu->regs->edx; + case offsetof(struct pt_regs, si): + return &cpu->regs->esi; + case offsetof(struct pt_regs, di): + return &cpu->regs->edi; + case offsetof(struct pt_regs, bp): + return &cpu->regs->ebp; + case offsetof(struct pt_regs, ax): + return &cpu->regs->eax; + case offsetof(struct pt_regs, ip): + return &cpu->regs->eip; + case offsetof(struct pt_regs, sp): + return &cpu->regs->esp; + } + + /* Launcher can read these, but we don't allow any setting. */ + if (any) { + switch (reg_off) { + case offsetof(struct pt_regs, ds): + return &cpu->regs->ds; + case offsetof(struct pt_regs, es): + return &cpu->regs->es; + case offsetof(struct pt_regs, fs): + return &cpu->regs->fs; + case offsetof(struct pt_regs, gs): + return &cpu->regs->gs; + case offsetof(struct pt_regs, cs): + return &cpu->regs->cs; + case offsetof(struct pt_regs, flags): + return &cpu->regs->eflags; + case offsetof(struct pt_regs, ss): + return &cpu->regs->ss; + } + } + + return NULL; +} + /*M:002 * There are hooks in the scheduler which we can register to tell when we * get kicked off the CPU (preempt_notifier_register()). This would allow us @@ -269,110 +315,73 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * usually attached to a PC. * * When the Guest uses one of these instructions, we get a trap (General - * Protection Fault) and come here. We see if it's one of those troublesome - * instructions and skip over it. We return true if we did. + * Protection Fault) and come here. We queue this to be sent out to the + * Launcher to handle. */ -static int emulate_insn(struct lg_cpu *cpu) -{ - u8 insn; - unsigned int insnlen = 0, in = 0, small_operand = 0; - /* - * The eip contains the *virtual* address of the Guest's instruction: - * walk the Guest's page tables to find the "physical" address. - */ - unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); - - /* - * This must be the Guest kernel trying to do something, not userspace! - * The bottom two bits of the CS segment register are the privilege - * level. - */ - if ((cpu->regs->cs & 3) != GUEST_PL) - return 0; - - /* Decoding x86 instructions is icky. */ - insn = lgread(cpu, physaddr, u8); - /* - * Around 2.6.33, the kernel started using an emulation for the - * cmpxchg8b instruction in early boot on many configurations. This - * code isn't paravirtualized, and it tries to disable interrupts. - * Ignore it, which will Mostly Work. - */ - if (insn == 0xfa) { - /* "cli", or Clear Interrupt Enable instruction. Skip it. */ - cpu->regs->eip++; - return 1; +/* + * The eip contains the *virtual* address of the Guest's instruction: + * we copy the instruction here so the Launcher doesn't have to walk + * the page tables to decode it. We handle the case (eg. in a kernel + * module) where the instruction is over two pages, and the pages are + * virtually but not physically contiguous. + * + * The longest possible x86 instruction is 15 bytes, but we don't handle + * anything that strange. + */ +static void copy_from_guest(struct lg_cpu *cpu, + void *dst, unsigned long vaddr, size_t len) +{ + size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE); + unsigned long paddr; + + BUG_ON(len > PAGE_SIZE); + + /* If it goes over a page, copy in two parts. */ + if (len > to_page_end) { + /* But make sure the next page is mapped! */ + if (__guest_pa(cpu, vaddr + to_page_end, &paddr)) + copy_from_guest(cpu, dst + to_page_end, + vaddr + to_page_end, + len - to_page_end); + else + /* Otherwise fill with zeroes. */ + memset(dst + to_page_end, 0, len - to_page_end); + len = to_page_end; } - /* - * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out. - */ - if (insn == 0x66) { - small_operand = 1; - /* The instruction is 1 byte so far, read the next byte. */ - insnlen = 1; - insn = lgread(cpu, physaddr + insnlen, u8); - } + /* This will kill the guest if it isn't mapped, but that + * shouldn't happen. */ + __lgread(cpu, dst, guest_pa(cpu, vaddr), len); +} - /* - * We can ignore the lower bit for the moment and decode the 4 opcodes - * we need to emulate. - */ - switch (insn & 0xFE) { - case 0xE4: /* in <next byte>,%al */ - insnlen += 2; - in = 1; - break; - case 0xEC: /* in (%dx),%al */ - insnlen += 1; - in = 1; - break; - case 0xE6: /* out %al,<next byte> */ - insnlen += 2; - break; - case 0xEE: /* out %al,(%dx) */ - insnlen += 1; - break; - default: - /* OK, we don't know what this is, can't emulate. */ - return 0; - } - /* - * If it was an "IN" instruction, they expect the result to be read - * into %eax, so we change %eax. We always return all-ones, which - * traditionally means "there's nothing there". - */ - if (in) { - /* Lower bit tells means it's a 32/16 bit access */ - if (insn & 0x1) { - if (small_operand) - cpu->regs->eax |= 0xFFFF; - else - cpu->regs->eax = 0xFFFFFFFF; - } else - cpu->regs->eax |= 0xFF; - } - /* Finally, we've "done" the instruction, so move past it. */ - cpu->regs->eip += insnlen; - /* Success! */ - return 1; +static void setup_emulate_insn(struct lg_cpu *cpu) +{ + cpu->pending.trap = 13; + copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip, + sizeof(cpu->pending.insn)); +} + +static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr) +{ + cpu->pending.trap = 14; + cpu->pending.addr = iomem_addr; + copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip, + sizeof(cpu->pending.insn)); } /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */ void lguest_arch_handle_trap(struct lg_cpu *cpu) { + unsigned long iomem_addr; + switch (cpu->regs->trapnum) { case 13: /* We've intercepted a General Protection Fault. */ - /* - * Check if this was one of those annoying IN or OUT - * instructions which we need to emulate. If so, we just go - * back into the Guest after we've done it. - */ + /* Hand to Launcher to emulate those pesky IN and OUT insns */ if (cpu->regs->errcode == 0) { - if (emulate_insn(cpu)) - return; + setup_emulate_insn(cpu); + return; } break; case 14: /* We've intercepted a Page Fault. */ @@ -387,9 +396,16 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) * whether kernel or userspace code. */ if (demand_page(cpu, cpu->arch.last_pagefault, - cpu->regs->errcode)) + cpu->regs->errcode, &iomem_addr)) return; + /* Was this an access to memory mapped IO? */ + if (iomem_addr) { + /* Tell Launcher, let it handle it. */ + setup_iomem_insn(cpu, iomem_addr); + return; + } + /* * OK, it's really not there (or not OK): the Guest needs to * know. We write out the cr2 value so it knows where the |