Commit 659a0e66 authored by Rusty Russell's avatar Rusty Russell

lguest: have example Launcher service all devices in separate threads

Currently lguest has three threads: the main Launcher thread, a Waker
thread, and a thread for the block device (because synchronous block
was simply too painful to bear).

The Waker selects() on all the input file descriptors (eg. stdin, net
devices, pipe to the block thread) and when one becomes readable it calls
into the kernel to kick the Launcher thread out into userspace, which
repeats the poll, services the device(s), and then tells the kernel to
release the Waker before re-entering the kernel to run the Guest.

Also, to make a slightly-decent network transmit routine, the Launcher
would suppress further network interrupts while it set a timer: that
signal handler would write to a pipe, which would rouse the Waker
which would prod the Launcher out of the kernel to check the network
device again.

Now we can convert all our virtqueues to separate threads: each one has
a separate eventfd for when the Guest pokes the device, and can trigger
interrupts in the Guest directly.

The linecount shows how much this simplifies, but to really bring it
home, here's an strace analysis of single Guest->Host ping before:

* Guest sends packet, notifies xmit vq, return control to Launcher
* Launcher clears notification flag on xmit ring
* Launcher writes packet to TUN device
	writev(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"\366\r\224`\2058\272m\224vf\274\10\0E\0\0T\0\0@\0@\1\265"..., 98}], 2) = 108
* Launcher sets up interrupt for Guest (xmit ring is empty)
	write(10, "\2\0\0\0\3\0\0\0", 8) = 0
* Launcher sets up timer for interrupt mitigation
	setitimer(ITIMER_REAL, {it_interval={0, 0}, it_value={0, 505}}, NULL) = 0
* Launcher re-runs guest
	pread64(10, 0xbfa5f4d4, 4, 0) ...
* Waker notices reply packet in tun device (it was in select)
	select(12, [0 3 4 6 11], NULL, NULL, NULL) = 1 (in [4])
* Waker kicks Launcher out of guest:
	pwrite64(10, "\3\0\0\0\1\0\0\0", 8, 0) = 0
* Launcher returns from running guest:
	... = -1 EAGAIN (Resource temporarily unavailable)
* Launcher looks at input fds:
	select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 1 (in [4], left {0, 0})
* Launcher reads pong from tun device:
	readv(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"\272m\224vf\274\366\r\224`\2058\10\0E\0\0T\364\26\0\0@"..., 1518}], 2) = 108
* Launcher injects guest notification:
	write(10, "\2\0\0\0\2\0\0\0", 8) = 0
* Launcher rechecks fds:
	select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 0 (Timeout)
* Launcher clears Waker:
	pwrite64(10, "\3\0\0\0\0\0\0\0", 8, 0) = 0
* Launcher reruns Guest:
	pread64(10, 0xbfa5f4d4, 4, 0) = ? ERESTARTSYS (To be restarted)
* Signal comes in, uses pipe to wake up Launcher:
	--- SIGALRM (Alarm clock) @ 0 (0) ---
	write(8, "\0", 1)       = 1
	sigreturn()             = ? (mask now [])
* Waker sees write on pipe:
	select(12, [0 3 4 6 11], NULL, NULL, NULL) = 1 (in [6])
* Waker kicks Launcher out of Guest:
	pwrite64(10, "\3\0\0\0\1\0\0\0", 8, 0) = 0
* Launcher exits from kernel:
	pread64(10, 0xbfa5f4d4, 4, 0) = -1 EAGAIN (Resource temporarily unavailable)
* Launcher looks to see what fd woke it:
	select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 1 (in [6], left {0, 0})
* Launcher reads timeout fd, sets notification flag on xmit ring
	read(6, "\0", 32)       = 1
* Launcher rechecks fds:
	select(7, [0 3 4 6], NULL, NULL, {0, 0}) = 0 (Timeout)
* Launcher clears Waker:
	pwrite64(10, "\3\0\0\0\0\0\0\0", 8, 0) = 0
* Launcher resumes Guest:
	pread64(10, "\0p\0\4", 4, 0) ....

strace analysis of single Guest->Host ping after:

* Guest sends packet, notifies xmit vq, creates event on eventfd.
* Network xmit thread wakes from read on eventfd:
	read(7, "\1\0\0\0\0\0\0\0", 8)          = 8
* Network xmit thread writes packet to TUN device
	writev(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"J\217\232FI\37j\27\375\276\0\304\10\0E\0\0T\0\0@\0@\1\265"..., 98}], 2) = 108
* Network recv thread wakes up from read on tunfd:
	readv(4, [{"\0\0\0\0\0\0\0\0\0\0", 10}, {"j\27\375\276\0\304J\217\232FI\37\10\0E\0\0TiO\0\0@\1\214"..., 1518}], 2) = 108
* Network recv thread sets up interrupt for the Guest
	write(6, "\2\0\0\0\2\0\0\0", 8) = 0
* Network recv thread goes back to reading tunfd
	13:39:42.460285 readv(4,  <unfinished ...>
* Network xmit thread sets up interrupt for Guest (xmit ring is empty)
	write(6, "\2\0\0\0\3\0\0\0", 8) = 0
* Network xmit thread goes back to reading from eventfd
	read(7, <unfinished ...>
Signed-off-by: default avatarRusty Russell <rusty@rustcorp.com.au>
parent df60aeef
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/wait.h> #include <sys/wait.h>
#include <sys/eventfd.h>
#include <fcntl.h> #include <fcntl.h>
#include <stdbool.h> #include <stdbool.h>
#include <errno.h> #include <errno.h>
...@@ -59,7 +60,6 @@ typedef uint8_t u8; ...@@ -59,7 +60,6 @@ typedef uint8_t u8;
/*:*/ /*:*/
#define PAGE_PRESENT 0x7 /* Present, RW, Execute */ #define PAGE_PRESENT 0x7 /* Present, RW, Execute */
#define NET_PEERNUM 1
#define BRIDGE_PFX "bridge:" #define BRIDGE_PFX "bridge:"
#ifndef SIOCBRADDIF #ifndef SIOCBRADDIF
#define SIOCBRADDIF 0x89a2 /* add interface to bridge */ #define SIOCBRADDIF 0x89a2 /* add interface to bridge */
...@@ -76,18 +76,10 @@ static bool verbose; ...@@ -76,18 +76,10 @@ static bool verbose;
do { if (verbose) printf(args); } while(0) do { if (verbose) printf(args); } while(0)
/*:*/ /*:*/
/* File descriptors for the Waker. */
struct {
int pipe[2];
} waker_fds;
/* The pointer to the start of guest memory. */ /* The pointer to the start of guest memory. */
static void *guest_base; static void *guest_base;
/* The maximum guest physical address allowed, and maximum possible. */ /* The maximum guest physical address allowed, and maximum possible. */
static unsigned long guest_limit, guest_max; static unsigned long guest_limit, guest_max;
/* The pipe for signal hander to write to. */
static int timeoutpipe[2];
static unsigned int timeout_usec = 500;
/* The /dev/lguest file descriptor. */ /* The /dev/lguest file descriptor. */
static int lguest_fd; static int lguest_fd;
...@@ -97,11 +89,6 @@ static unsigned int __thread cpu_id; ...@@ -97,11 +89,6 @@ static unsigned int __thread cpu_id;
/* This is our list of devices. */ /* This is our list of devices. */
struct device_list struct device_list
{ {
/* Summary information about the devices in our list: ready to pass to
* select() to ask which need servicing.*/
fd_set infds;
int max_infd;
/* Counter to assign interrupt numbers. */ /* Counter to assign interrupt numbers. */
unsigned int next_irq; unsigned int next_irq;
...@@ -137,16 +124,11 @@ struct device ...@@ -137,16 +124,11 @@ struct device
/* The name of this device, for --verbose. */ /* The name of this device, for --verbose. */
const char *name; const char *name;
/* If handle_input is set, it wants to be called when this file
* descriptor is ready. */
int fd;
bool (*handle_input)(struct device *me);
/* Any queues attached to this device */ /* Any queues attached to this device */
struct virtqueue *vq; struct virtqueue *vq;
/* Handle status being finalized (ie. feature bits stable). */ /* Is it operational */
void (*ready)(struct device *me); bool running;
/* Device-specific data. */ /* Device-specific data. */
void *priv; void *priv;
...@@ -169,16 +151,20 @@ struct virtqueue ...@@ -169,16 +151,20 @@ struct virtqueue
/* Last available index we saw. */ /* Last available index we saw. */
u16 last_avail_idx; u16 last_avail_idx;
/* The routine to call when the Guest pings us, or timeout. */ /* Eventfd where Guest notifications arrive. */
void (*handle_output)(struct virtqueue *me, bool timeout); int eventfd;
/* Is this blocked awaiting a timer? */ /* Function for the thread which is servicing this virtqueue. */
bool blocked; void (*service)(struct virtqueue *vq);
pid_t thread;
}; };
/* Remember the arguments to the program so we can "reboot" */ /* Remember the arguments to the program so we can "reboot" */
static char **main_args; static char **main_args;
/* The original tty settings to restore on exit. */
static struct termios orig_term;
/* We have to be careful with barriers: our devices are all run in separate /* We have to be careful with barriers: our devices are all run in separate
* threads and so we need to make sure that changes visible to the Guest happen * threads and so we need to make sure that changes visible to the Guest happen
* in precise order. */ * in precise order. */
...@@ -521,78 +507,6 @@ static void tell_kernel(unsigned long start) ...@@ -521,78 +507,6 @@ static void tell_kernel(unsigned long start)
} }
/*:*/ /*:*/
static void add_device_fd(int fd)
{
FD_SET(fd, &devices.infds);
if (fd > devices.max_infd)
devices.max_infd = fd;
}
/*L:200
* The Waker.
*
* With console, block and network devices, we can have lots of input which we
* need to process. We could try to tell the kernel what file descriptors to
* watch, but handing a file descriptor mask through to the kernel is fairly
* icky.
*
* Instead, we clone off a thread which watches the file descriptors and writes
* the LHREQ_BREAK command to the /dev/lguest file descriptor to tell the Host
* stop running the Guest. This causes the Launcher to return from the
* /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
* the LHREQ_BREAK and wake us up again.
*
* This, of course, is merely a different *kind* of icky.
*
* Given my well-known antipathy to threads, I'd prefer to use processes. But
* it's easier to share Guest memory with threads, and trivial to share the
* devices.infds as the Launcher changes it.
*/
static int waker(void *unused)
{
/* Close the write end of the pipe: only the Launcher has it open. */
close(waker_fds.pipe[1]);
for (;;) {
fd_set rfds = devices.infds;
unsigned long args[] = { LHREQ_BREAK, 1 };
unsigned int maxfd = devices.max_infd;
/* We also listen to the pipe from the Launcher. */
FD_SET(waker_fds.pipe[0], &rfds);
if (waker_fds.pipe[0] > maxfd)
maxfd = waker_fds.pipe[0];
/* Wait until input is ready from one of the devices. */
select(maxfd+1, &rfds, NULL, NULL, NULL);
/* Message from Launcher? */
if (FD_ISSET(waker_fds.pipe[0], &rfds)) {
char c;
/* If this fails, then assume Launcher has exited.
* Don't do anything on exit: we're just a thread! */
if (read(waker_fds.pipe[0], &c, 1) != 1)
_exit(0);
continue;
}
/* Send LHREQ_BREAK command to snap the Launcher out of it. */
pwrite(lguest_fd, args, sizeof(args), cpu_id);
}
return 0;
}
/* This routine just sets up a pipe to the Waker process. */
static void setup_waker(void)
{
/* This pipe is closed when Launcher dies, telling Waker. */
if (pipe(waker_fds.pipe) != 0)
err(1, "Creating pipe for Waker");
if (clone(waker, malloc(4096) + 4096, CLONE_VM | SIGCHLD, NULL) == -1)
err(1, "Creating Waker");
}
/* /*
* Device Handling. * Device Handling.
* *
...@@ -642,25 +556,27 @@ static unsigned next_desc(struct virtqueue *vq, unsigned int i) ...@@ -642,25 +556,27 @@ static unsigned next_desc(struct virtqueue *vq, unsigned int i)
* number of output then some number of input descriptors, it's actually two * number of output then some number of input descriptors, it's actually two
* iovecs, but we pack them into one and note how many of each there were. * iovecs, but we pack them into one and note how many of each there were.
* *
* This function returns the descriptor number found, or vq->vring.num (which * This function returns the descriptor number found. */
* is never a valid descriptor number) if none was found. */ static unsigned wait_for_vq_desc(struct virtqueue *vq,
static unsigned get_vq_desc(struct virtqueue *vq,
struct iovec iov[], struct iovec iov[],
unsigned int *out_num, unsigned int *in_num) unsigned int *out_num, unsigned int *in_num)
{ {
unsigned int i, head; unsigned int i, head;
u16 last_avail; u16 last_avail = lg_last_avail(vq);
while (last_avail == vq->vring.avail->idx) {
u64 event;
/* Nothing new? Wait for eventfd to tell us they refilled. */
if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
errx(1, "Event read failed?");
}
/* Check it isn't doing very strange things with descriptor numbers. */ /* Check it isn't doing very strange things with descriptor numbers. */
last_avail = lg_last_avail(vq);
if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num) if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
errx(1, "Guest moved used index from %u to %u", errx(1, "Guest moved used index from %u to %u",
last_avail, vq->vring.avail->idx); last_avail, vq->vring.avail->idx);
/* If there's nothing new since last we looked, return invalid. */
if (vq->vring.avail->idx == last_avail)
return vq->vring.num;
/* Grab the next descriptor number they're advertising, and increment /* Grab the next descriptor number they're advertising, and increment
* the index we've seen. */ * the index we've seen. */
head = vq->vring.avail->ring[last_avail % vq->vring.num]; head = vq->vring.avail->ring[last_avail % vq->vring.num];
...@@ -740,15 +656,7 @@ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) ...@@ -740,15 +656,7 @@ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
/* /*
* The Console * The Console
* *
* Here is the input terminal setting we save, and the routine to restore them * We associate some data with the console for our exit hack. */
* on exit so the user gets their terminal back. */
static struct termios orig_term;
static void restore_term(void)
{
tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
}
/* We associate some data with the console for our exit hack. */
struct console_abort struct console_abort
{ {
/* How many times have they hit ^C? */ /* How many times have they hit ^C? */
...@@ -758,84 +666,64 @@ struct console_abort ...@@ -758,84 +666,64 @@ struct console_abort
}; };
/* This is the routine which handles console input (ie. stdin). */ /* This is the routine which handles console input (ie. stdin). */
static bool handle_console_input(struct device *dev) static void console_input(struct virtqueue *vq)
{ {
int len; int len;
unsigned int head, in_num, out_num; unsigned int head, in_num, out_num;
struct iovec iov[dev->vq->vring.num]; struct console_abort *abort = vq->dev->priv;
struct console_abort *abort = dev->priv; struct iovec iov[vq->vring.num];
/* First we need a console buffer from the Guests's input virtqueue. */
head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
/* If they're not ready for input, stop listening to this file
* descriptor. We'll start again once they add an input buffer. */
if (head == dev->vq->vring.num)
return false;
/* Make sure there's a descriptor waiting. */
head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
if (out_num) if (out_num)
errx(1, "Output buffers in console in queue?"); errx(1, "Output buffers in console in queue?");
/* This is why we convert to iovecs: the readv() call uses them, and so /* Read it in. */
* it reads straight into the Guest's buffer. */ len = readv(STDIN_FILENO, iov, in_num);
len = readv(dev->fd, iov, in_num);
if (len <= 0) { if (len <= 0) {
/* This implies that the console is closed, is /dev/null, or /* Ran out of input? */
* something went terribly wrong. */
warnx("Failed to get console input, ignoring console."); warnx("Failed to get console input, ignoring console.");
/* Put the input terminal back. */ /* For simplicity, dying threads kill the whole Launcher. So
restore_term(); * just nap here. */
/* Remove callback from input vq, so it doesn't restart us. */ for (;;)
dev->vq->handle_output = NULL; pause();
/* Stop listening to this fd: don't call us again. */
return false;
} }
/* Tell the Guest about the new input. */ add_used_and_trigger(vq, head, len);
add_used_and_trigger(dev->vq, head, len);
/* Three ^C within one second? Exit. /* Three ^C within one second? Exit.
* *
* This is such a hack, but works surprisingly well. Each ^C has to be * This is such a hack, but works surprisingly well. Each ^C has to
* in a buffer by itself, so they can't be too fast. But we check that * be in a buffer by itself, so they can't be too fast. But we check
* we get three within about a second, so they can't be too slow. */ * that we get three within about a second, so they can't be too
if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) { * slow. */
if (!abort->count++) if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
abort->count = 0;
return;
}
abort->count++;
if (abort->count == 1)
gettimeofday(&abort->start, NULL); gettimeofday(&abort->start, NULL);
else if (abort->count == 3) { else if (abort->count == 3) {
struct timeval now; struct timeval now;
gettimeofday(&now, NULL); gettimeofday(&now, NULL);
if (now.tv_sec <= abort->start.tv_sec+1) { /* Kill all Launcher processes with SIGINT, like normal ^C */
unsigned long args[] = { LHREQ_BREAK, 0 }; if (now.tv_sec <= abort->start.tv_sec+1)
/* Close the fd so Waker will know it has to kill(0, SIGINT);
* exit. */
close(waker_fds.pipe[1]);
/* Just in case Waker is blocked in BREAK, send
* unbreak now. */
write(lguest_fd, args, sizeof(args));
exit(2);
}
abort->count = 0; abort->count = 0;
} }
} else
/* Any other key resets the abort counter. */
abort->count = 0;
/* Everything went OK! */
return true;
} }
/* Handling output for console is simple: we just get all the output buffers /* This is the routine which handles console output (ie. stdout). */
* and write them to stdout. */ static void console_output(struct virtqueue *vq)
static void handle_console_output(struct virtqueue *vq, bool timeout)
{ {
unsigned int head, out, in; unsigned int head, out, in;
struct iovec iov[vq->vring.num]; struct iovec iov[vq->vring.num];
/* Keep getting output buffers from the Guest until we run out. */ head = wait_for_vq_desc(vq, iov, &out, &in);
while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
if (in) if (in)
errx(1, "Input buffers in output queue?"); errx(1, "Input buffers in console output queue?");
while (!iov_empty(iov, out)) { while (!iov_empty(iov, out)) {
int len = writev(STDOUT_FILENO, iov, out); int len = writev(STDOUT_FILENO, iov, out);
if (len <= 0) if (len <= 0)
...@@ -843,149 +731,129 @@ static void handle_console_output(struct virtqueue *vq, bool timeout) ...@@ -843,149 +731,129 @@ static void handle_console_output(struct virtqueue *vq, bool timeout)
iov_consume(iov, out, len); iov_consume(iov, out, len);
} }
add_used_and_trigger(vq, head, 0); add_used_and_trigger(vq, head, 0);
}
}
/* This is called when we no longer want to hear about Guest changes to a
* virtqueue. This is more efficient in high-traffic cases, but it means we
* have to set a timer to check if any more changes have occurred. */
static void block_vq(struct virtqueue *vq)
{
struct itimerval itm;
vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
vq->blocked = true;
itm.it_interval.tv_sec = 0;
itm.it_interval.tv_usec = 0;
itm.it_value.tv_sec = 0;
itm.it_value.tv_usec = timeout_usec;
setitimer(ITIMER_REAL, &itm, NULL);
} }
/* /*
* The Network * The Network
* *
* Handling output for network is also simple: we get all the output buffers * Handling output for network is also simple: we get all the output buffers
* and write them (ignoring the first element) to this device's file descriptor * and write them to /dev/net/tun.
* (/dev/net/tun).
*/ */
static void handle_net_output(struct virtqueue *vq, bool timeout) struct net_info {
int tunfd;
};
static void net_output(struct virtqueue *vq)
{ {
unsigned int head, out, in, num = 0; struct net_info *net_info = vq->dev->priv;
unsigned int head, out, in;
struct iovec iov[vq->vring.num]; struct iovec iov[vq->vring.num];
static int last_timeout_num;
/* Keep getting output buffers from the Guest until we run out. */ head = wait_for_vq_desc(vq, iov, &out, &in);
while ((head = get_vq_desc(vq, iov, &out, &in)) != vq->vring.num) {
if (in) if (in)
errx(1, "Input buffers in output queue?"); errx(1, "Input buffers in net output queue?");
if (writev(vq->dev->fd, iov, out) < 0) if (writev(net_info->tunfd, iov, out) < 0)
err(1, "Writing network packet to tun"); errx(1, "Write to tun failed?");
add_used_and_trigger(vq, head, 0); add_used_and_trigger(vq, head, 0);
num++;
}
/* Block further kicks and set up a timer if we saw anything. */
if (!timeout && num)
block_vq(vq);
/* We never quite know how long should we wait before we check the
* queue again for more packets. We start at 500 microseconds, and if
* we get fewer packets than last time, we assume we made the timeout
* too small and increase it by 10 microseconds. Otherwise, we drop it
* by one microsecond every time. It seems to work well enough. */
if (timeout) {
if (num < last_timeout_num)
timeout_usec += 10;
else if (timeout_usec > 1)
timeout_usec--;
last_timeout_num = num;
}
} }
/* This is where we handle a packet coming in from the tun device to our /* This is where we handle packets coming in from the tun device to our
* Guest. */ * Guest. */
static bool handle_tun_input(struct device *dev) static void net_input(struct virtqueue *vq)
{ {
unsigned int head, in_num, out_num;
int len; int len;
struct iovec iov[dev->vq->vring.num]; unsigned int head, out, in;
struct iovec iov[vq->vring.num];
/* First we need a network buffer from the Guests's recv virtqueue. */ struct net_info *net_info = vq->dev->priv;
head = get_vq_desc(dev->vq, iov, &out_num, &in_num);
if (head == dev->vq->vring.num) {
/* Now, it's expected that if we try to send a packet too
* early, the Guest won't be ready yet. Wait until the device
* status says it's ready. */
/* FIXME: Actually want DRIVER_ACTIVE here. */
/* Now tell it we want to know if new things appear. */
dev->vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
wmb();
/* We'll turn this back on if input buffers are registered. */
return false;
} else if (out_num)
errx(1, "Output buffers in network recv queue?");
/* Read the packet from the device directly into the Guest's buffer. */ head = wait_for_vq_desc(vq, iov, &out, &in);
len = readv(dev->fd, iov, in_num); if (out)
errx(1, "Output buffers in net input queue?");
len = readv(net_info->tunfd, iov, in);
if (len <= 0) if (len <= 0)
err(1, "reading network"); err(1, "Failed to read from tun.");
add_used_and_trigger(vq, head, len);
/* Tell the Guest about the new packet. */
add_used_and_trigger(dev->vq, head, len);
verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
((u8 *)iov[1].iov_base)[0], ((u8 *)iov[1].iov_base)[1],
head != dev->vq->vring.num ? "sent" : "discarded");
/* All good. */
return true;
} }
/*L:215 This is the callback attached to the network and console input /* This is the helper to create threads. */
* virtqueues: it ensures we try again, in case we stopped console or net static int do_thread(void *_vq)
* delivery because Guest didn't have any buffers. */
static void enable_fd(struct virtqueue *vq, bool timeout)
{ {
add_device_fd(vq->dev->fd); struct virtqueue *vq = _vq;
/* Snap the Waker out of its select loop. */
write(waker_fds.pipe[1], "", 1); for (;;)
vq->service(vq);
return 0;
} }
static void net_enable_fd(struct virtqueue *vq, bool timeout) /* When a child dies, we kill our entire process group with SIGTERM. This
* also has the side effect that the shell restores the console for us! */
static void kill_launcher(int signal)
{ {
/* We don't need to know again when Guest refills receive buffer. */ kill(0, SIGTERM);
vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
enable_fd(vq, timeout);
} }
/* When the Guest tells us they updated the status field, we handle it. */ static void reset_device(struct device *dev)
static void update_device_status(struct device *dev)
{ {
struct virtqueue *vq; struct virtqueue *vq;
/* This is a reset. */
if (dev->desc->status == 0) {
verbose("Resetting device %s\n", dev->name); verbose("Resetting device %s\n", dev->name);
/* Clear any features they've acked. */ /* Clear any features they've acked. */
memset(get_feature_bits(dev) + dev->feature_len, 0, memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len);
dev->feature_len);
/* Zero out the virtqueues. */ /* We're going to be explicitly killing threads, so ignore them. */
signal(SIGCHLD, SIG_IGN);
/* Zero out the virtqueues, get rid of their threads */
for (vq = dev->vq; vq; vq = vq->next) { for (vq = dev->vq; vq; vq = vq->next) {
if (vq->thread != (pid_t)-1) {
kill(vq->thread, SIGTERM);
waitpid(vq->thread, NULL, 0);
vq->thread = (pid_t)-1;
}
memset(vq->vring.desc, 0, memset(vq->vring.desc, 0,
vring_size(vq->config.num, LGUEST_VRING_ALIGN)); vring_size(vq->config.num, LGUEST_VRING_ALIGN));
lg_last_avail(vq) = 0; lg_last_avail(vq) = 0;
} }
} else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) { dev->running = false;
warnx("Device %s configuration FAILED", dev->name);
} else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) { /* Now we care if threads die. */
signal(SIGCHLD, (void *)kill_launcher);
}
static void create_thread(struct virtqueue *vq)
{
/* Create stack for thread and run it. Since stack grows
* upwards, we point the stack pointer to the end of this
* region. */
char *stack = malloc(32768);
unsigned long args[] = { LHREQ_EVENTFD,
vq->config.pfn*getpagesize(), 0 };
/* Create a zero-initialized eventfd. */
vq->eventfd = eventfd(0, 0);
if (vq->eventfd < 0)
err(1, "Creating eventfd");
args[2] = vq->eventfd;
/* Attach an eventfd to this virtqueue: it will go off
* when the Guest does an LHCALL_NOTIFY for this vq. */
if (write(lguest_fd, &args, sizeof(args)) != 0)
err(1, "Attaching eventfd");
/* CLONE_VM: because it has to access the Guest memory, and
* SIGCHLD so we get a signal if it dies. */
vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
if (vq->thread == (pid_t)-1)
err(1, "Creating clone");
/* We close our local copy, now the child has it. */
close(vq->eventfd);
}
static void start_device(struct device *dev)
{
unsigned int i; unsigned int i;
struct virtqueue *vq;
verbose("Device %s OK: offered", dev->name); verbose("Device %s OK: offered", dev->name);
for (i = 0; i < dev->feature_len; i++) for (i = 0; i < dev->feature_len; i++)
...@@ -995,8 +863,38 @@ static void update_device_status(struct device *dev) ...@@ -995,8 +863,38 @@ static void update_device_status(struct device *dev)
verbose(" %02x", get_feature_bits(dev) verbose(" %02x", get_feature_bits(dev)
[dev->feature_len+i]); [dev->feature_len+i]);
if (dev->ready) for (vq = dev->vq; vq; vq = vq->next) {
dev->ready(dev); if (vq->service)
create_thread(vq);
}
dev->running = true;
}
static void cleanup_devices(void)
{
struct device *dev;
for (dev = devices.dev; dev; dev = dev->next)
reset_device(dev);
/* If we saved off the original terminal settings, restore them now. */
if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
}
/* When the Guest tells us they updated the status field, we handle it. */
static void update_device_status(struct device *dev)
{
/* A zero status is a reset, otherwise it's a set of flags. */
if (dev->desc->status == 0)
reset_device(dev);
else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
warnx("Device %s configuration FAILED", dev->name);
if (dev->running)
reset_device(dev);
} else if (dev->desc->status & VIRTIO_CONFIG_S_DRIVER_OK) {
if (!dev->running)
start_device(dev);
} }
} }
...@@ -1004,32 +902,24 @@ static void update_device_status(struct device *dev) ...@@ -1004,32 +902,24 @@ static void update_device_status(struct device *dev)
static void handle_output(unsigned long addr) static void handle_output(unsigned long addr)
{ {
struct device *i; struct device *i;
struct virtqueue *vq;
/* Check each device and virtqueue. */ /* Check each device. */
for (i = devices.dev; i; i = i->next) { for (i = devices.dev; i; i = i->next) {
struct virtqueue *vq;
/* Notifications to device descriptors update device status. */ /* Notifications to device descriptors update device status. */
if (from_guest_phys(addr) == i->desc) { if (from_guest_phys(addr) == i->desc) {
update_device_status(i); update_device_status(i);
return; return;
} }
/* Notifications to virtqueues mean output has occurred. */ /* Devices *can* be used before status is set to DRIVER_OK. */
for (vq = i->vq; vq; vq = vq->next) { for (vq = i->vq; vq; vq = vq->next) {
if (vq->config.pfn != addr/getpagesize()) if (addr != vq->config.pfn*getpagesize())
continue; continue;
if (i->running)
/* Guest should acknowledge (and set features!) before errx(1, "Notification on running %s", i->name);
* using the device. */ start_device(i);
if (i->desc->status == 0) {
warnx("%s gave early output", i->name);
return;
}
if (strcmp(vq->dev->name, "console") != 0)
verbose("Output to %s\n", vq->dev->name);
if (vq->handle_output)
vq->handle_output(vq, false);
return; return;
} }
} }
...@@ -1043,71 +933,6 @@ static void handle_output(unsigned long addr) ...@@ -1043,71 +933,6 @@ static void handle_output(unsigned long addr)
strnlen(from_guest_phys(addr), guest_limit - addr)); strnlen(from_guest_phys(addr), guest_limit - addr));
} }
static void handle_timeout(void)
{
char buf[32];
struct device *i;
struct virtqueue *vq;
/* Clear the pipe */
read(timeoutpipe[0], buf, sizeof(buf));
/* Check each device and virtqueue: flush blocked ones. */
for (i = devices.dev; i; i = i->next) {
for (vq = i->vq; vq; vq = vq->next) {
if (!vq->blocked)
continue;
vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
vq->blocked = false;
if (vq->handle_output)
vq->handle_output(vq, true);
}
}
}
/* This is called when the Waker wakes us up: check for incoming file
* descriptors. */
static void handle_input(void)
{
/* select() wants a zeroed timeval to mean "don't wait". */
struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
for (;;) {
struct device *i;
fd_set fds = devices.infds;
int num;
num = select(devices.max_infd+1, &fds, NULL, NULL, &poll);
/* Could get interrupted */
if (num < 0)
continue;
/* If nothing is ready, we're done. */
if (num == 0)
break;
/* Otherwise, call the device(s) which have readable file
* descriptors and a method of handling them. */
for (i = devices.dev; i; i = i->next) {
if (i->handle_input && FD_ISSET(i->fd, &fds)) {
if (i->handle_input(i))
continue;
/* If handle_input() returns false, it means we
* should no longer service it. Networking and
* console do this when there's no input
* buffers to deliver into. Console also uses
* it when it discovers that stdin is closed. */
FD_CLR(i->fd, &devices.infds);
}
}
/* Is this the timeout fd? */
if (FD_ISSET(timeoutpipe[0], &fds))
handle_timeout();
}
}
/*L:190 /*L:190
* Device Setup * Device Setup
* *
...@@ -1153,7 +978,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type) ...@@ -1153,7 +978,7 @@ static struct lguest_device_desc *new_dev_desc(u16 type)
/* Each device descriptor is followed by the description of its virtqueues. We /* Each device descriptor is followed by the description of its virtqueues. We
* specify how many descriptors the virtqueue is to have. */ * specify how many descriptors the virtqueue is to have. */
static void add_virtqueue(struct device *dev, unsigned int num_descs, static void add_virtqueue(struct device *dev, unsigned int num_descs,
void (*handle_output)(struct virtqueue *, bool)) void (*service)(struct virtqueue *))
{ {
unsigned int pages; unsigned int pages;
struct virtqueue **i, *vq = malloc(sizeof(*vq)); struct virtqueue **i, *vq = malloc(sizeof(*vq));
...@@ -1168,7 +993,8 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, ...@@ -1168,7 +993,8 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
vq->next = NULL; vq->next = NULL;
vq->last_avail_idx = 0; vq->last_avail_idx = 0;
vq->dev = dev; vq->dev = dev;
vq->blocked = false; vq->service = service;
vq->thread = (pid_t)-1;
/* Initialize the configuration. */ /* Initialize the configuration. */
vq->config.num = num_descs; vq->config.num = num_descs;
...@@ -1193,15 +1019,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, ...@@ -1193,15 +1019,6 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs,
* second. */ * second. */
for (i = &dev->vq; *i; i = &(*i)->next); for (i = &dev->vq; *i; i = &(*i)->next);
*i = vq; *i = vq;
/* Set the routine to call when the Guest does something to this
* virtqueue. */
vq->handle_output = handle_output;
/* As an optimization, set the advisory "Don't Notify Me" flag if we
* don't have a handler */
if (!handle_output)
vq->vring.used->flags = VRING_USED_F_NO_NOTIFY;
} }
/* The first half of the feature bitmask is for us to advertise features. The /* The first half of the feature bitmask is for us to advertise features. The
...@@ -1237,24 +1054,17 @@ static void set_config(struct device *dev, unsigned len, const void *conf) ...@@ -1237,24 +1054,17 @@ static void set_config(struct device *dev, unsigned len, const void *conf)
* calling new_dev_desc() to allocate the descriptor and device memory. * calling new_dev_desc() to allocate the descriptor and device memory.
* *
* See what I mean about userspace being boring? */ * See what I mean about userspace being boring? */
static struct device *new_device(const char *name, u16 type, int fd, static struct device *new_device(const char *name, u16 type)
bool (*handle_input)(struct device *))
{ {
struct device *dev = malloc(sizeof(*dev)); struct device *dev = malloc(sizeof(*dev));
/* Now we populate the fields one at a time. */ /* Now we populate the fields one at a time. */
dev->fd = fd;
/* If we have an input handler for this file descriptor, then we add it
* to the device_list's fdset and maxfd. */
if (handle_input)
add_device_fd(dev->fd);
dev->desc = new_dev_desc(type); dev->desc = new_dev_desc(type);
dev->handle_input = handle_input;
dev->name = name; dev->name = name;
dev->vq = NULL; dev->vq = NULL;
dev->ready = NULL;
dev->feature_len = 0; dev->feature_len = 0;
dev->num_vq = 0; dev->num_vq = 0;
dev->running = false;
/* Append to device list. Prepending to a single-linked list is /* Append to device list. Prepending to a single-linked list is
* easier, but the user expects the devices to be arranged on the bus * easier, but the user expects the devices to be arranged on the bus
...@@ -1282,13 +1092,10 @@ static void setup_console(void) ...@@ -1282,13 +1092,10 @@ static void setup_console(void)
* raw input stream to the Guest. */ * raw input stream to the Guest. */
term.c_lflag &= ~(ISIG|ICANON|ECHO); term.c_lflag &= ~(ISIG|ICANON|ECHO);
tcsetattr(STDIN_FILENO, TCSANOW, &term); tcsetattr(STDIN_FILENO, TCSANOW, &term);
/* If we exit gracefully, the original settings will be
* restored so the user can see what they're typing. */
atexit(restore_term);
} }
dev = new_device("console", VIRTIO_ID_CONSOLE, dev = new_device("console", VIRTIO_ID_CONSOLE);
STDIN_FILENO, handle_console_input);
/* We store the console state in dev->priv, and initialize it. */ /* We store the console state in dev->priv, and initialize it. */
dev->priv = malloc(sizeof(struct console_abort)); dev->priv = malloc(sizeof(struct console_abort));
((struct console_abort *)dev->priv)->count = 0; ((struct console_abort *)dev->priv)->count = 0;
...@@ -1297,31 +1104,13 @@ static void setup_console(void) ...@@ -1297,31 +1104,13 @@ static void setup_console(void)
* they put something the input queue, we make sure we're listening to * they put something the input queue, we make sure we're listening to
* stdin. When they put something in the output queue, we write it to * stdin. When they put something in the output queue, we write it to
* stdout. */ * stdout. */
add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
add_virtqueue(dev, VIRTQUEUE_NUM, handle_console_output); add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
verbose("device %u: console\n", devices.device_num++); verbose("device %u: console\n", ++devices.device_num);
} }
/*:*/ /*:*/
static void timeout_alarm(int sig)
{
write(timeoutpipe[1], "", 1);
}
static void setup_timeout(void)
{
if (pipe(timeoutpipe) != 0)
err(1, "Creating timeout pipe");
if (fcntl(timeoutpipe[1], F_SETFL,
fcntl(timeoutpipe[1], F_GETFL) | O_NONBLOCK) != 0)
err(1, "Making timeout pipe nonblocking");
add_device_fd(timeoutpipe[0]);
signal(SIGALRM, timeout_alarm);
}
/*M:010 Inter-guest networking is an interesting area. Simplest is to have a /*M:010 Inter-guest networking is an interesting area. Simplest is to have a
* --sharenet=<name> option which opens or creates a named pipe. This can be * --sharenet=<name> option which opens or creates a named pipe. This can be
* used to send packets to another guest in a 1:1 manner. * used to send packets to another guest in a 1:1 manner.
...@@ -1443,21 +1232,23 @@ static int get_tun_device(char tapif[IFNAMSIZ]) ...@@ -1443,21 +1232,23 @@ static int get_tun_device(char tapif[IFNAMSIZ])
static void setup_tun_net(char *arg) static void setup_tun_net(char *arg)
{ {
struct device *dev; struct device *dev;
int netfd, ipfd; struct net_info *net_info = malloc(sizeof(*net_info));
int ipfd;
u32 ip = INADDR_ANY; u32 ip = INADDR_ANY;
bool bridging = false; bool bridging = false;
char tapif[IFNAMSIZ], *p; char tapif[IFNAMSIZ], *p;
struct virtio_net_config conf; struct virtio_net_config conf;
netfd = get_tun_device(tapif); net_info->tunfd = get_tun_device(tapif);
/* First we create a new network device. */ /* First we create a new network device. */
dev = new_device("net", VIRTIO_ID_NET, netfd, handle_tun_input); dev = new_device("net", VIRTIO_ID_NET);
dev->priv = net_info;
/* Network devices need a receive and a send queue, just like /* Network devices need a receive and a send queue, just like
* console. */ * console. */
add_virtqueue(dev, VIRTQUEUE_NUM, net_enable_fd); add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
add_virtqueue(dev, VIRTQUEUE_NUM, handle_net_output); add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
/* We need a socket to perform the magic network ioctls to bring up the /* We need a socket to perform the magic network ioctls to bring up the
* tap interface, connect to the bridge etc. Any socket will do! */ * tap interface, connect to the bridge etc. Any socket will do! */
...@@ -1546,20 +1337,18 @@ struct vblk_info ...@@ -1546,20 +1337,18 @@ struct vblk_info
* Remember that the block device is handled by a separate I/O thread. We head * Remember that the block device is handled by a separate I/O thread. We head
* straight into the core of that thread here: * straight into the core of that thread here:
*/ */
static bool service_io(struct device *dev) static void blk_request(struct virtqueue *vq)
{ {
struct vblk_info *vblk = dev->priv; struct vblk_info *vblk = vq->dev->priv;
unsigned int head, out_num, in_num, wlen; unsigned int head, out_num, in_num, wlen;
int ret; int ret;
u8 *in; u8 *in;
struct virtio_blk_outhdr *out; struct virtio_blk_outhdr *out;
struct iovec iov[dev->vq->vring.num]; struct iovec iov[vq->vring.num];
off64_t off; off64_t off;
/* See if there's a request waiting. If not, nothing to do. */ /* Get the next request. */
head = get_vq_desc(dev->vq, iov, &out_num, &in_num); head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
if (head == dev->vq->vring.num)
return false;
/* Every block request should contain at least one output buffer /* Every block request should contain at least one output buffer
* (detailing the location on disk and the type of request) and one * (detailing the location on disk and the type of request) and one
...@@ -1633,83 +1422,21 @@ static bool service_io(struct device *dev) ...@@ -1633,83 +1422,21 @@ static bool service_io(struct device *dev)
if (out->type & VIRTIO_BLK_T_BARRIER) if (out->type & VIRTIO_BLK_T_BARRIER)
fdatasync(vblk->fd); fdatasync(vblk->fd);
/* We can't trigger an IRQ, because we're not the Launcher. It does add_used_and_trigger(vq, head, wlen);
* that when we tell it we're done. */
add_used(dev->vq, head, wlen);
return true;
}
/* This is the thread which actually services the I/O. */
static int io_thread(void *_dev)
{
struct device *dev = _dev;
struct vblk_info *vblk = dev->priv;
char c;
/* Close other side of workpipe so we get 0 read when main dies. */
close(vblk->workpipe[1]);
/* Close the other side of the done_fd pipe. */
close(dev->fd);
/* When this read fails, it means Launcher died, so we follow. */
while (read(vblk->workpipe[0], &c, 1) == 1) {
/* We acknowledge each request immediately to reduce latency,
* rather than waiting until we've done them all. I haven't
* measured to see if it makes any difference.
*
* That would be an interesting test, wouldn't it? You could
* also try having more than one I/O thread. */
while (service_io(dev))
write(vblk->done_fd, &c, 1);
}
return 0;
}
/* Now we've seen the I/O thread, we return to the Launcher to see what happens
* when that thread tells us it's completed some I/O. */
static bool handle_io_finish(struct device *dev)
{
char c;
/* If the I/O thread died, presumably it printed the error, so we
* simply exit. */
if (read(dev->fd, &c, 1) != 1)
exit(1);
/* It did some work, so trigger the irq. */
trigger_irq(dev->vq);
return true;
}
/* When the Guest submits some I/O, we just need to wake the I/O thread. */
static void handle_virtblk_output(struct virtqueue *vq, bool timeout)
{
struct vblk_info *vblk = vq->dev->priv;
char c = 0;
/* Wake up I/O thread and tell it to go to work! */
if (write(vblk->workpipe[1], &c, 1) != 1)
/* Presumably it indicated why it died. */
exit(1);
} }
/*L:198 This actually sets up a virtual block device. */ /*L:198 This actually sets up a virtual block device. */
static void setup_block_file(const char *filename) static void setup_block_file(const char *filename)
{ {
int p[2];
struct device *dev; struct device *dev;
struct vblk_info *vblk; struct vblk_info *vblk;
void *stack;
struct virtio_blk_config conf; struct virtio_blk_config conf;
/* This is the pipe the I/O thread will use to tell us I/O is done. */
pipe(p);
/* The device responds to return from I/O thread. */ /* The device responds to return from I/O thread. */
dev = new_device("block", VIRTIO_ID_BLOCK, p[0], handle_io_finish); dev = new_device("block", VIRTIO_ID_BLOCK);
/* The device has one virtqueue, where the Guest places requests. */ /* The device has one virtqueue, where the Guest places requests. */
add_virtqueue(dev, VIRTQUEUE_NUM, handle_virtblk_output); add_virtqueue(dev, VIRTQUEUE_NUM, blk_request);
/* Allocate the room for our own bookkeeping */ /* Allocate the room for our own bookkeeping */
vblk = dev->priv = malloc(sizeof(*vblk)); vblk = dev->priv = malloc(sizeof(*vblk));
...@@ -1731,49 +1458,29 @@ static void setup_block_file(const char *filename) ...@@ -1731,49 +1458,29 @@ static void setup_block_file(const char *filename)
set_config(dev, sizeof(conf), &conf); set_config(dev, sizeof(conf), &conf);
/* The I/O thread writes to this end of the pipe when done. */
vblk->done_fd = p[1];
/* This is the second pipe, which is how we tell the I/O thread about
* more work. */
pipe(vblk->workpipe);
/* Create stack for thread and run it. Since stack grows upwards, we
* point the stack pointer to the end of this region. */
stack = malloc(32768);
/* SIGCHLD - We dont "wait" for our cloned thread, so prevent it from
* becoming a zombie. */
if (clone(io_thread, stack + 32768, CLONE_VM | SIGCHLD, dev) == -1)
err(1, "Creating clone");
/* We don't need to keep the I/O thread's end of the pipes open. */
close(vblk->done_fd);
close(vblk->workpipe[0]);
verbose("device %u: virtblock %llu sectors\n", verbose("device %u: virtblock %llu sectors\n",
devices.device_num, le64_to_cpu(conf.capacity)); ++devices.device_num, le64_to_cpu(conf.capacity));
} }
struct rng_info {
int rfd;
};
/* Our random number generator device reads from /dev/random into the Guest's /* Our random number generator device reads from /dev/random into the Guest's
* input buffers. The usual case is that the Guest doesn't want random numbers * input buffers. The usual case is that the Guest doesn't want random numbers
* and so has no buffers although /dev/random is still readable, whereas * and so has no buffers although /dev/random is still readable, whereas
* console is the reverse. * console is the reverse.
* *
* The same logic applies, however. */ * The same logic applies, however. */
static bool handle_rng_input(struct device *dev) static void rng_input(struct virtqueue *vq)
{ {
int len; int len;
unsigned int head, in_num, out_num, totlen = 0; unsigned int head, in_num, out_num, totlen = 0;
struct iovec iov[dev->vq->vring.num]; struct rng_info *rng_info = vq->dev->priv;
struct iovec iov[vq->vring.num];
/* First we need a buffer from the Guests's virtqueue. */ /* First we need a buffer from the Guests's virtqueue. */
head = get_vq_desc(dev->vq, iov, &out_num, &in_num); head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
/* If they're not ready for input, stop listening to this file
* descriptor. We'll start again once they add an input buffer. */
if (head == dev->vq->vring.num)
return false;
if (out_num) if (out_num)
errx(1, "Output buffers in rng?"); errx(1, "Output buffers in rng?");
...@@ -1781,7 +1488,7 @@ static bool handle_rng_input(struct device *dev) ...@@ -1781,7 +1488,7 @@ static bool handle_rng_input(struct device *dev)
* it reads straight into the Guest's buffer. We loop to make sure we * it reads straight into the Guest's buffer. We loop to make sure we
* fill it. */ * fill it. */
while (!iov_empty(iov, in_num)) { while (!iov_empty(iov, in_num)) {
len = readv(dev->fd, iov, in_num); len = readv(rng_info->rfd, iov, in_num);
if (len <= 0) if (len <= 0)
err(1, "Read from /dev/random gave %i", len); err(1, "Read from /dev/random gave %i", len);
iov_consume(iov, in_num, len); iov_consume(iov, in_num, len);
...@@ -1789,25 +1496,23 @@ static bool handle_rng_input(struct device *dev) ...@@ -1789,25 +1496,23 @@ static bool handle_rng_input(struct device *dev)
} }
/* Tell the Guest about the new input. */ /* Tell the Guest about the new input. */
add_used_and_trigger(dev->vq, head, totlen); add_used_and_trigger(vq, head, totlen);
/* Everything went OK! */
return true;
} }
/* And this creates a "hardware" random number device for the Guest. */ /* And this creates a "hardware" random number device for the Guest. */
static void setup_rng(void) static void setup_rng(void)
{ {
struct device *dev; struct device *dev;
int fd; struct rng_info *rng_info = malloc(sizeof(*rng_info));
fd = open_or_die("/dev/random", O_RDONLY); rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
/* The device responds to return from I/O thread. */ /* The device responds to return from I/O thread. */
dev = new_device("rng", VIRTIO_ID_RNG, fd, handle_rng_input); dev = new_device("rng", VIRTIO_ID_RNG);
dev->priv = rng_info;
/* The device has one virtqueue, where the Guest places inbufs. */ /* The device has one virtqueue, where the Guest places inbufs. */
add_virtqueue(dev, VIRTQUEUE_NUM, enable_fd); add_virtqueue(dev, VIRTQUEUE_NUM, rng_input);
verbose("device %u: rng\n", devices.device_num++); verbose("device %u: rng\n", devices.device_num++);
} }
...@@ -1823,7 +1528,9 @@ static void __attribute__((noreturn)) restart_guest(void) ...@@ -1823,7 +1528,9 @@ static void __attribute__((noreturn)) restart_guest(void)
for (i = 3; i < FD_SETSIZE; i++) for (i = 3; i < FD_SETSIZE; i++)
close(i); close(i);
/* The exec automatically gets rid of the I/O and Waker threads. */ /* Reset all the devices (kills all threads). */
cleanup_devices();
execv(main_args[0], main_args); execv(main_args[0], main_args);
err(1, "Could not exec %s", main_args[0]); err(1, "Could not exec %s", main_args[0]);
} }
...@@ -1833,7 +1540,6 @@ static void __attribute__((noreturn)) restart_guest(void) ...@@ -1833,7 +1540,6 @@ static void __attribute__((noreturn)) restart_guest(void)
static void __attribute__((noreturn)) run_guest(void) static void __attribute__((noreturn)) run_guest(void)
{ {
for (;;) { for (;;) {
unsigned long args[] = { LHREQ_BREAK, 0 };
unsigned long notify_addr; unsigned long notify_addr;
int readval; int readval;
...@@ -1845,7 +1551,6 @@ static void __attribute__((noreturn)) run_guest(void) ...@@ -1845,7 +1551,6 @@ static void __attribute__((noreturn)) run_guest(void)
if (readval == sizeof(notify_addr)) { if (readval == sizeof(notify_addr)) {
verbose("Notify on address %#lx\n", notify_addr); verbose("Notify on address %#lx\n", notify_addr);
handle_output(notify_addr); handle_output(notify_addr);
continue;
/* ENOENT means the Guest died. Reading tells us why. */ /* ENOENT means the Guest died. Reading tells us why. */
} else if (errno == ENOENT) { } else if (errno == ENOENT) {
char reason[1024] = { 0 }; char reason[1024] = { 0 };
...@@ -1854,19 +1559,9 @@ static void __attribute__((noreturn)) run_guest(void) ...@@ -1854,19 +1559,9 @@ static void __attribute__((noreturn)) run_guest(void)
/* ERESTART means that we need to reboot the guest */ /* ERESTART means that we need to reboot the guest */
} else if (errno == ERESTART) { } else if (errno == ERESTART) {
restart_guest(); restart_guest();
/* EAGAIN means a signal (timeout). /* Anything else means a bug or incompatible change. */
* Anything else means a bug or incompatible change. */ } else
} else if (errno != EAGAIN)
err(1, "Running guest failed"); err(1, "Running guest failed");
/* Only service input on thread for CPU 0. */
if (cpu_id != 0)
continue;
/* Service input, then unset the BREAK to release the Waker. */
handle_input();
if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
err(1, "Resetting break");
} }
} }
/*L:240 /*L:240
...@@ -1909,18 +1604,10 @@ int main(int argc, char *argv[]) ...@@ -1909,18 +1604,10 @@ int main(int argc, char *argv[])
/* Save the args: we "reboot" by execing ourselves again. */ /* Save the args: we "reboot" by execing ourselves again. */
main_args = argv; main_args = argv;
/* We don't "wait" for the children, so prevent them from becoming
* zombies. */
signal(SIGCHLD, SIG_IGN);
/* First we initialize the device list. Since console and network /* First we initialize the device list. We keep a pointer to the last
* device receive input from a file descriptor, we keep an fdset * device, and the next interrupt number to use for devices (1:
* (infds) and the maximum fd number (max_infd) with the head of the * remember that 0 is used by the timer). */
* list. We also keep a pointer to the last device. Finally, we keep
* the next interrupt number to use for devices (1: remember that 0 is
* used by the timer). */
FD_ZERO(&devices.infds);
devices.max_infd = -1;
devices.lastdev = NULL; devices.lastdev = NULL;
devices.next_irq = 1; devices.next_irq = 1;
...@@ -1978,9 +1665,6 @@ int main(int argc, char *argv[]) ...@@ -1978,9 +1665,6 @@ int main(int argc, char *argv[])
/* We always have a console device */ /* We always have a console device */
setup_console(); setup_console();
/* We can timeout waiting for Guest network transmit. */
setup_timeout();
/* Now we load the kernel */ /* Now we load the kernel */
start = load_kernel(open_or_die(argv[optind+1], O_RDONLY)); start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
...@@ -2021,10 +1705,11 @@ int main(int argc, char *argv[]) ...@@ -2021,10 +1705,11 @@ int main(int argc, char *argv[])
* /dev/lguest file descriptor. */ * /dev/lguest file descriptor. */
tell_kernel(start); tell_kernel(start);
/* We clone off a thread, which wakes the Launcher whenever one of the /* Ensure that we terminate if a child dies. */
* input file descriptors needs attention. We call this the Waker, and signal(SIGCHLD, kill_launcher);
* we'll cover it in a moment. */
setup_waker(); /* If we exit via err(), this kills all the threads, restores tty. */
atexit(cleanup_devices);
/* Finally, run the Guest. This doesn't return. */ /* Finally, run the Guest. This doesn't return. */
run_guest(); run_guest();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment