author    Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>  2023-10-10 11:40:56 +0000
committer Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>  2023-10-10 11:40:56 +0000
commit    e02cda008591317b1625707ff8e115a4841aa889 (patch)
tree      aee302e3cf8b59ec2d32ec481be3d1afddfc8968 /util
parent    cc668e6b7e0ffd8c9d130513d12053cf5eda1d3b (diff)
Introduce Virtio-loopback epsilon release:
The epsilon release introduces a new compatibility layer which makes the virtio-loopback design work with QEMU and the rust-vmm vhost-user backend without requiring any changes.

Signed-off-by: Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>
Change-Id: I52e57563e08a7d0bdc002f8e928ee61ba0c53dd9
Diffstat (limited to 'util')
-rw-r--r--  util/aio-posix.c                 730
-rw-r--r--  util/aio-posix.h                  81
-rw-r--r--  util/aio-wait.c                   72
-rw-r--r--  util/aio-win32.c                 447
-rw-r--r--  util/aiocb.c                      55
-rw-r--r--  util/async.c                     690
-rw-r--r--  util/atomic64.c                   83
-rw-r--r--  util/base64.c                     60
-rw-r--r--  util/bitmap.c                    489
-rw-r--r--  util/bitops.c                    157
-rw-r--r--  util/block-helpers.c              46
-rw-r--r--  util/block-helpers.h              19
-rw-r--r--  util/buffer.c                    173
-rw-r--r--  util/bufferiszero.c              355
-rw-r--r--  util/cacheflush.c                146
-rw-r--r--  util/cacheinfo.c                 199
-rw-r--r--  util/compatfd.c                  106
-rw-r--r--  util/coroutine-sigaltstack.c     304
-rw-r--r--  util/coroutine-ucontext.c        332
-rw-r--r--  util/coroutine-win32.c           102
-rw-r--r--  util/crc-ccitt.c                 127
-rw-r--r--  util/crc32c.c                    115
-rw-r--r--  util/cutils.c                   1059
-rw-r--r--  util/dbus.c                       57
-rw-r--r--  util/drm.c                        75
-rw-r--r--  util/envlist.c                   232
-rw-r--r--  util/error.c                     305
-rw-r--r--  util/event_notifier-posix.c      140
-rw-r--r--  util/event_notifier-win32.c       50
-rw-r--r--  util/fdmon-epoll.c               155
-rw-r--r--  util/fdmon-io_uring.c            361
-rw-r--r--  util/fdmon-poll.c                108
-rw-r--r--  util/fifo8.c                     118
-rw-r--r--  util/filemonitor-inotify.c       339
-rw-r--r--  util/filemonitor-stub.c           59
-rw-r--r--  util/getauxval.c                 118
-rw-r--r--  util/guest-random.c              101
-rw-r--r--  util/hbitmap.c                   933
-rw-r--r--  util/hexdump.c                    65
-rw-r--r--  util/host-utils.c                268
-rw-r--r--  util/id.c                         69
-rw-r--r--  util/iov.c                       764
-rw-r--r--  util/iova-tree.c                 114
-rw-r--r--  util/keyval.c                    577
-rw-r--r--  util/lockcnt.c                   399
-rw-r--r--  util/log.c                       389
-rw-r--r--  util/main-loop.c                 594
-rw-r--r--  util/memfd.c                     206
-rw-r--r--  util/meson.build                  89
-rw-r--r--  util/mmap-alloc.c                306
-rw-r--r--  util/module.c                    387
-rw-r--r--  util/notify.c                     76
-rw-r--r--  util/nvdimm-utils.c               30
-rw-r--r--  util/osdep.c                     617
-rw-r--r--  util/oslib-posix.c               860
-rw-r--r--  util/oslib-win32.c               654
-rw-r--r--  util/pagesize.c                   18
-rw-r--r--  util/path.c                       70
-rw-r--r--  util/qdist.c                     397
-rw-r--r--  util/qemu-co-shared-resource.c    90
-rw-r--r--  util/qemu-config.c               565
-rw-r--r--  util/qemu-coroutine-io.c          93
-rw-r--r--  util/qemu-coroutine-lock.c       467
-rw-r--r--  util/qemu-coroutine-sleep.c       80
-rw-r--r--  util/qemu-coroutine.c            204
-rw-r--r--  util/qemu-error.c                413
-rw-r--r--  util/qemu-openpty.c              139
-rw-r--r--  util/qemu-option.c              1226
-rw-r--r--  util/qemu-print.c                 70
-rw-r--r--  util/qemu-progress.c             162
-rw-r--r--  util/qemu-sockets.c             1482
-rw-r--r--  util/qemu-thread-common.h         54
-rw-r--r--  util/qemu-thread-posix.c         632
-rw-r--r--  util/qemu-thread-win32.c         461
-rw-r--r--  util/qemu-timer-common.c          63
-rw-r--r--  util/qemu-timer.c                674
-rw-r--r--  util/qht.c                       963
-rw-r--r--  util/qsp.c                       813
-rw-r--r--  util/range.c                      72
-rw-r--r--  util/rcu.c                       455
-rw-r--r--  util/readline.c                  549
-rw-r--r--  util/selfmap.c                    83
-rw-r--r--  util/stats64.c                   137
-rw-r--r--  util/sys_membarrier.c             50
-rw-r--r--  util/systemd.c                    79
-rw-r--r--  util/thread-pool.c               352
-rw-r--r--  util/throttle.c                  637
-rw-r--r--  util/timed-average.c             231
-rw-r--r--  util/trace-events                106
-rw-r--r--  util/trace.h                       1
-rw-r--r--  util/transactions.c              100
-rw-r--r--  util/unicode.c                   156
-rw-r--r--  util/uri.c                      2314
-rw-r--r--  util/userfaultfd.c               345
-rw-r--r--  util/uuid.c                      118
-rw-r--r--  util/vfio-helpers.c              861
-rw-r--r--  util/vhost-user-server.c         446
-rw-r--r--  util/yank.c                      199
98 files changed, 31989 insertions, 0 deletions
diff --git a/util/aio-posix.c b/util/aio-posix.c
new file mode 100644
index 000000000..2b86777e9
--- /dev/null
+++ b/util/aio-posix.c
@@ -0,0 +1,730 @@
+/*
+ * QEMU aio implementation
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "qemu/main-loop.h"
+#include "qemu/rcu.h"
+#include "qemu/rcu_queue.h"
+#include "qemu/sockets.h"
+#include "qemu/cutils.h"
+#include "trace.h"
+#include "aio-posix.h"
+
+/* Stop userspace polling on a handler if it isn't active for some time */
+#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
+
+bool aio_poll_disabled(AioContext *ctx)
+{
+ return qatomic_read(&ctx->poll_disable_cnt);
+}
+
+void aio_add_ready_handler(AioHandlerList *ready_list,
+ AioHandler *node,
+ int revents)
+{
+ QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
+ node->pfd.revents = revents;
+ QLIST_INSERT_HEAD(ready_list, node, node_ready);
+}
+
+static AioHandler *find_aio_handler(AioContext *ctx, int fd)
+{
+ AioHandler *node;
+
+ QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+ if (node->pfd.fd == fd) {
+ if (!QLIST_IS_INSERTED(node, node_deleted)) {
+ return node;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
+{
+ /* If the GSource is in the process of being destroyed then
+ * g_source_remove_poll() causes an assertion failure. Skip
+ * removal in that case, because glib cleans up its state during
+ * destruction anyway.
+ */
+ if (!g_source_is_destroyed(&ctx->source)) {
+ g_source_remove_poll(&ctx->source, &node->pfd);
+ }
+
+ node->pfd.revents = 0;
+
+ /* If the fd monitor has already marked it deleted, leave it alone */
+ if (QLIST_IS_INSERTED(node, node_deleted)) {
+ return false;
+ }
+
+ /* If a read is in progress, just mark the node as deleted */
+ if (qemu_lockcnt_count(&ctx->list_lock)) {
+ QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
+ return false;
+ }
+ /* Otherwise, delete it for real. We can't just mark it as
+ * deleted because deleted nodes are only cleaned up while
+ * no one is walking the handlers list.
+ */
+ QLIST_SAFE_REMOVE(node, node_poll);
+ QLIST_REMOVE(node, node);
+ return true;
+}
+
+void aio_set_fd_handler(AioContext *ctx,
+ int fd,
+ bool is_external,
+ IOHandler *io_read,
+ IOHandler *io_write,
+ AioPollFn *io_poll,
+ void *opaque)
+{
+ AioHandler *node;
+ AioHandler *new_node = NULL;
+ bool is_new = false;
+ bool deleted = false;
+ int poll_disable_change;
+
+ qemu_lockcnt_lock(&ctx->list_lock);
+
+ node = find_aio_handler(ctx, fd);
+
+ /* Are we deleting the fd handler? */
+ if (!io_read && !io_write && !io_poll) {
+ if (node == NULL) {
+ qemu_lockcnt_unlock(&ctx->list_lock);
+ return;
+ }
+ /* Clean events in order to unregister fd from the ctx epoll. */
+ node->pfd.events = 0;
+
+ poll_disable_change = -!node->io_poll;
+ } else {
+ poll_disable_change = !io_poll - (node && !node->io_poll);
+ if (node == NULL) {
+ is_new = true;
+ }
+ /* Alloc and insert if it's not already there */
+ new_node = g_new0(AioHandler, 1);
+
+ /* Update handler with latest information */
+ new_node->io_read = io_read;
+ new_node->io_write = io_write;
+ new_node->io_poll = io_poll;
+ new_node->opaque = opaque;
+ new_node->is_external = is_external;
+
+ if (is_new) {
+ new_node->pfd.fd = fd;
+ } else {
+ new_node->pfd = node->pfd;
+ }
+ g_source_add_poll(&ctx->source, &new_node->pfd);
+
+ new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
+ new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
+
+ QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
+ }
+
+ /* No need to order poll_disable_cnt writes against other updates;
+ * the counter is only used to avoid wasting time and latency on
+ * iterated polling when the system call will be ultimately necessary.
+ * Changing handlers is a rare event, and a little wasted polling until
+ * the aio_notify below is not an issue.
+ */
+ qatomic_set(&ctx->poll_disable_cnt,
+ qatomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
+
+ ctx->fdmon_ops->update(ctx, node, new_node);
+ if (node) {
+ deleted = aio_remove_fd_handler(ctx, node);
+ }
+ qemu_lockcnt_unlock(&ctx->list_lock);
+ aio_notify(ctx);
+
+ if (deleted) {
+ g_free(node);
+ }
+}
+
+void aio_set_fd_poll(AioContext *ctx, int fd,
+ IOHandler *io_poll_begin,
+ IOHandler *io_poll_end)
+{
+ AioHandler *node = find_aio_handler(ctx, fd);
+
+ if (!node) {
+ return;
+ }
+
+ node->io_poll_begin = io_poll_begin;
+ node->io_poll_end = io_poll_end;
+}
+
+void aio_set_event_notifier(AioContext *ctx,
+ EventNotifier *notifier,
+ bool is_external,
+ EventNotifierHandler *io_read,
+ AioPollFn *io_poll)
+{
+ aio_set_fd_handler(ctx, event_notifier_get_fd(notifier), is_external,
+ (IOHandler *)io_read, NULL, io_poll, notifier);
+}
+
+void aio_set_event_notifier_poll(AioContext *ctx,
+ EventNotifier *notifier,
+ EventNotifierHandler *io_poll_begin,
+ EventNotifierHandler *io_poll_end)
+{
+ aio_set_fd_poll(ctx, event_notifier_get_fd(notifier),
+ (IOHandler *)io_poll_begin,
+ (IOHandler *)io_poll_end);
+}
+
+static bool poll_set_started(AioContext *ctx, bool started)
+{
+ AioHandler *node;
+ bool progress = false;
+
+ if (started == ctx->poll_started) {
+ return false;
+ }
+
+ ctx->poll_started = started;
+
+ qemu_lockcnt_inc(&ctx->list_lock);
+ QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
+ IOHandler *fn;
+
+ if (QLIST_IS_INSERTED(node, node_deleted)) {
+ continue;
+ }
+
+ if (started) {
+ fn = node->io_poll_begin;
+ } else {
+ fn = node->io_poll_end;
+ }
+
+ if (fn) {
+ fn(node->opaque);
+ }
+
+ /* Poll one last time in case ->io_poll_end() raced with the event */
+ if (!started) {
+ progress = node->io_poll(node->opaque) || progress;
+ }
+ }
+ qemu_lockcnt_dec(&ctx->list_lock);
+
+ return progress;
+}
+
+
+bool aio_prepare(AioContext *ctx)
+{
+ /* Poll mode cannot be used with glib's event loop, disable it. */
+ poll_set_started(ctx, false);
+
+ return false;
+}
+
+bool aio_pending(AioContext *ctx)
+{
+ AioHandler *node;
+ bool result = false;
+
+ /*
+ * We have to walk very carefully in case aio_set_fd_handler is
+ * called while we're walking.
+ */
+ qemu_lockcnt_inc(&ctx->list_lock);
+
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ int revents;
+
+ revents = node->pfd.revents & node->pfd.events;
+ if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read &&
+ aio_node_check(ctx, node->is_external)) {
+ result = true;
+ break;
+ }
+ if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write &&
+ aio_node_check(ctx, node->is_external)) {
+ result = true;
+ break;
+ }
+ }
+ qemu_lockcnt_dec(&ctx->list_lock);
+
+ return result;
+}
+
+static void aio_free_deleted_handlers(AioContext *ctx)
+{
+ AioHandler *node;
+
+ if (QLIST_EMPTY_RCU(&ctx->deleted_aio_handlers)) {
+ return;
+ }
+ if (!qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
+ return; /* we are nested, let the parent do the freeing */
+ }
+
+ while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
+ QLIST_REMOVE(node, node);
+ QLIST_REMOVE(node, node_deleted);
+ QLIST_SAFE_REMOVE(node, node_poll);
+ g_free(node);
+ }
+
+ qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
+}
+
+static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
+{
+ bool progress = false;
+ int revents;
+
+ revents = node->pfd.revents & node->pfd.events;
+ node->pfd.revents = 0;
+
+ /*
+ * Start polling AioHandlers when they become ready because activity is
+ * likely to continue. Note that starvation is theoretically possible when
+ * fdmon_supports_polling(), but only until the fd fires for the first
+ * time.
+ */
+ if (!QLIST_IS_INSERTED(node, node_deleted) &&
+ !QLIST_IS_INSERTED(node, node_poll) &&
+ node->io_poll) {
+ trace_poll_add(ctx, node, node->pfd.fd, revents);
+ if (ctx->poll_started && node->io_poll_begin) {
+ node->io_poll_begin(node->opaque);
+ }
+ QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
+ }
+
+ if (!QLIST_IS_INSERTED(node, node_deleted) &&
+ (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
+ aio_node_check(ctx, node->is_external) &&
+ node->io_read) {
+ node->io_read(node->opaque);
+
+ /* aio_notify() does not count as progress */
+ if (node->opaque != &ctx->notifier) {
+ progress = true;
+ }
+ }
+ if (!QLIST_IS_INSERTED(node, node_deleted) &&
+ (revents & (G_IO_OUT | G_IO_ERR)) &&
+ aio_node_check(ctx, node->is_external) &&
+ node->io_write) {
+ node->io_write(node->opaque);
+ progress = true;
+ }
+
+ return progress;
+}
+
+/*
+ * If we have a list of ready handlers then this is more efficient than
+ * scanning all handlers with aio_dispatch_handlers().
+ */
+static bool aio_dispatch_ready_handlers(AioContext *ctx,
+ AioHandlerList *ready_list)
+{
+ bool progress = false;
+ AioHandler *node;
+
+ while ((node = QLIST_FIRST(ready_list))) {
+ QLIST_REMOVE(node, node_ready);
+ progress = aio_dispatch_handler(ctx, node) || progress;
+ }
+
+ return progress;
+}
+
+/* Slower than aio_dispatch_ready_handlers() but only used via glib */
+static bool aio_dispatch_handlers(AioContext *ctx)
+{
+ AioHandler *node, *tmp;
+ bool progress = false;
+
+ QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
+ progress = aio_dispatch_handler(ctx, node) || progress;
+ }
+
+ return progress;
+}
+
+void aio_dispatch(AioContext *ctx)
+{
+ qemu_lockcnt_inc(&ctx->list_lock);
+ aio_bh_poll(ctx);
+ aio_dispatch_handlers(ctx);
+ aio_free_deleted_handlers(ctx);
+ qemu_lockcnt_dec(&ctx->list_lock);
+
+ timerlistgroup_run_timers(&ctx->tlg);
+}
+
+static bool run_poll_handlers_once(AioContext *ctx,
+ int64_t now,
+ int64_t *timeout)
+{
+ bool progress = false;
+ AioHandler *node;
+ AioHandler *tmp;
+
+ QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+ if (aio_node_check(ctx, node->is_external) &&
+ node->io_poll(node->opaque)) {
+ node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+
+ /*
+ * Polling was successful, exit try_poll_mode immediately
+ * to adjust the next polling time.
+ */
+ *timeout = 0;
+ if (node->opaque != &ctx->notifier) {
+ progress = true;
+ }
+ }
+
+ /* Caller handles freeing deleted nodes. Don't do it here. */
+ }
+
+ return progress;
+}
+
+static bool fdmon_supports_polling(AioContext *ctx)
+{
+ return ctx->fdmon_ops->need_wait != aio_poll_disabled;
+}
+
+static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now)
+{
+ AioHandler *node;
+ AioHandler *tmp;
+ bool progress = false;
+
+ /*
+ * File descriptor monitoring implementations without userspace polling
+ * support suffer from starvation when a subset of handlers is polled
+ * because fds will not be processed in a timely fashion. Don't remove
+ * idle poll handlers.
+ */
+ if (!fdmon_supports_polling(ctx)) {
+ return false;
+ }
+
+ QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+ if (node->poll_idle_timeout == 0LL) {
+ node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+ } else if (now >= node->poll_idle_timeout) {
+ trace_poll_remove(ctx, node, node->pfd.fd);
+ node->poll_idle_timeout = 0LL;
+ QLIST_SAFE_REMOVE(node, node_poll);
+ if (ctx->poll_started && node->io_poll_end) {
+ node->io_poll_end(node->opaque);
+
+ /*
+ * Final poll in case ->io_poll_end() races with an event.
+ * Never mind about re-adding the handler in the rare case where
+ * this causes progress.
+ */
+ progress = node->io_poll(node->opaque) || progress;
+ }
+ }
+ }
+
+ return progress;
+}
+
+/* run_poll_handlers:
+ * @ctx: the AioContext
+ * @max_ns: maximum time to poll for, in nanoseconds
+ *
+ * Polls for a given time.
+ *
+ * Note that the caller must have incremented ctx->list_lock.
+ *
+ * Returns: true if progress was made, false otherwise
+ */
+static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
+{
+ bool progress;
+ int64_t start_time, elapsed_time;
+
+ assert(qemu_lockcnt_count(&ctx->list_lock) > 0);
+
+ trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
+
+ /*
+ * Optimization: ->io_poll() handlers often contain RCU read critical
+ * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
+ * -> rcu_read_lock() -> ... sequences with expensive memory
+ * synchronization primitives. Make the entire polling loop an RCU
+ * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
+ * are cheap.
+ */
+ RCU_READ_LOCK_GUARD();
+
+ start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+ do {
+ progress = run_poll_handlers_once(ctx, start_time, timeout);
+ elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
+ max_ns = qemu_soonest_timeout(*timeout, max_ns);
+ assert(!(max_ns && progress));
+ } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
+
+ if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) {
+ *timeout = 0;
+ progress = true;
+ }
+
+ /* If time has passed with no successful polling, adjust *timeout to
+ * keep the same ending time.
+ */
+ if (*timeout != -1) {
+ *timeout -= MIN(*timeout, elapsed_time);
+ }
+
+ trace_run_poll_handlers_end(ctx, progress, *timeout);
+ return progress;
+}
+
+/* try_poll_mode:
+ * @ctx: the AioContext
+ * @timeout: timeout for blocking wait, computed by the caller and updated if
+ * polling succeeds.
+ *
+ * Note that the caller must have incremented ctx->list_lock.
+ *
+ * Returns: true if progress was made, false otherwise
+ */
+static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
+{
+ int64_t max_ns;
+
+ if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
+ return false;
+ }
+
+ max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
+ if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
+ poll_set_started(ctx, true);
+
+ if (run_poll_handlers(ctx, max_ns, timeout)) {
+ return true;
+ }
+ }
+
+ if (poll_set_started(ctx, false)) {
+ *timeout = 0;
+ return true;
+ }
+
+ return false;
+}
+
+bool aio_poll(AioContext *ctx, bool blocking)
+{
+ AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
+ int ret = 0;
+ bool progress;
+ bool use_notify_me;
+ int64_t timeout;
+ int64_t start = 0;
+
+ /*
+ * There cannot be two concurrent aio_poll calls for the same AioContext (or
+ * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
+ * We rely on this below to avoid slow locked accesses to ctx->notify_me.
+ *
+ * aio_poll() may only be called in the AioContext's thread. iohandler_ctx
+ * is special in that it runs in the main thread, but that thread's context
+ * is qemu_aio_context.
+ */
+ assert(in_aio_context_home_thread(ctx == iohandler_get_aio_context() ?
+ qemu_get_aio_context() : ctx));
+
+ qemu_lockcnt_inc(&ctx->list_lock);
+
+ if (ctx->poll_max_ns) {
+ start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+ }
+
+ timeout = blocking ? aio_compute_timeout(ctx) : 0;
+ progress = try_poll_mode(ctx, &timeout);
+ assert(!(timeout && progress));
+
+ /*
+ * aio_notify can avoid the expensive event_notifier_set if
+ * everything (file descriptors, bottom halves, timers) will
+ * be re-evaluated before the next blocking poll(). This is
+ * already true when aio_poll is called with blocking == false;
+ * if blocking == true, it is only true after poll() returns,
+ * so disable the optimization now.
+ */
+ use_notify_me = timeout != 0;
+ if (use_notify_me) {
+ qatomic_set(&ctx->notify_me, qatomic_read(&ctx->notify_me) + 2);
+ /*
+ * Write ctx->notify_me before reading ctx->notified. Pairs with
+ * smp_mb in aio_notify().
+ */
+ smp_mb();
+
+ /* Don't block if aio_notify() was called */
+ if (qatomic_read(&ctx->notified)) {
+ timeout = 0;
+ }
+ }
+
+ /* If polling is allowed, non-blocking aio_poll does not need the
+ * system call---a single round of run_poll_handlers_once suffices.
+ */
+ if (timeout || ctx->fdmon_ops->need_wait(ctx)) {
+ ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
+ }
+
+ if (use_notify_me) {
+ /* Finish the poll before clearing the flag. */
+ qatomic_store_release(&ctx->notify_me,
+ qatomic_read(&ctx->notify_me) - 2);
+ }
+
+ aio_notify_accept(ctx);
+
+ /* Adjust polling time */
+ if (ctx->poll_max_ns) {
+ int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
+
+ if (block_ns <= ctx->poll_ns) {
+ /* This is the sweet spot, no adjustment needed */
+ } else if (block_ns > ctx->poll_max_ns) {
+ /* We'd have to poll for too long, poll less */
+ int64_t old = ctx->poll_ns;
+
+ if (ctx->poll_shrink) {
+ ctx->poll_ns /= ctx->poll_shrink;
+ } else {
+ ctx->poll_ns = 0;
+ }
+
+ trace_poll_shrink(ctx, old, ctx->poll_ns);
+ } else if (ctx->poll_ns < ctx->poll_max_ns &&
+ block_ns < ctx->poll_max_ns) {
+ /* There is room to grow, poll longer */
+ int64_t old = ctx->poll_ns;
+ int64_t grow = ctx->poll_grow;
+
+ if (grow == 0) {
+ grow = 2;
+ }
+
+ if (ctx->poll_ns) {
+ ctx->poll_ns *= grow;
+ } else {
+ ctx->poll_ns = 4000; /* start polling at 4 microseconds */
+ }
+
+ if (ctx->poll_ns > ctx->poll_max_ns) {
+ ctx->poll_ns = ctx->poll_max_ns;
+ }
+
+ trace_poll_grow(ctx, old, ctx->poll_ns);
+ }
+ }
+
+ progress |= aio_bh_poll(ctx);
+
+ if (ret > 0) {
+ progress |= aio_dispatch_ready_handlers(ctx, &ready_list);
+ }
+
+ aio_free_deleted_handlers(ctx);
+
+ qemu_lockcnt_dec(&ctx->list_lock);
+
+ progress |= timerlistgroup_run_timers(&ctx->tlg);
+
+ return progress;
+}
+
+void aio_context_setup(AioContext *ctx)
+{
+ ctx->fdmon_ops = &fdmon_poll_ops;
+ ctx->epollfd = -1;
+
+ /* Use the fastest fd monitoring implementation if available */
+ if (fdmon_io_uring_setup(ctx)) {
+ return;
+ }
+
+ fdmon_epoll_setup(ctx);
+}
+
+void aio_context_destroy(AioContext *ctx)
+{
+ fdmon_io_uring_destroy(ctx);
+ fdmon_epoll_disable(ctx);
+ aio_free_deleted_handlers(ctx);
+}
+
+void aio_context_use_g_source(AioContext *ctx)
+{
+ /*
+ * Disable io_uring when the glib main loop is used because it doesn't
+ * support mixed glib/aio_poll() usage. It relies on aio_poll() being
+ * called regularly so that changes to the monitored file descriptors are
+ * submitted, otherwise a list of pending fd handlers builds up.
+ */
+ fdmon_io_uring_destroy(ctx);
+ aio_free_deleted_handlers(ctx);
+}
+
+void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
+ int64_t grow, int64_t shrink, Error **errp)
+{
+ /* No thread synchronization here, it doesn't matter if an incorrect value
+ * is used once.
+ */
+ ctx->poll_max_ns = max_ns;
+ ctx->poll_ns = 0;
+ ctx->poll_grow = grow;
+ ctx->poll_shrink = shrink;
+
+ aio_notify(ctx);
+}
+
+void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
+ Error **errp)
+{
+ /*
+ * No thread synchronization here, it doesn't matter if an incorrect value
+ * is used once.
+ */
+ ctx->aio_max_batch = max_batch;
+
+ aio_notify(ctx);
+}
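
For context, here is a minimal usage sketch of the fd-handler API implemented above. It is not part of the patch, and the callback and function names are hypothetical; it only assumes the aio_set_fd_handler() signature shown in this file, where passing NULL for all callbacks unregisters the fd:

#include "qemu/osdep.h"
#include "block/aio.h"

/* Hypothetical callback: runs in ctx's thread when the fd is readable */
static void my_read_cb(void *opaque)
{
}

static void watch_fd(AioContext *ctx, int fd)
{
    /* io_read set; io_write and io_poll left NULL */
    aio_set_fd_handler(ctx, fd, false /* is_external */,
                       my_read_cb, NULL, NULL, NULL /* opaque */);
}

static void unwatch_fd(AioContext *ctx, int fd)
{
    /* all callbacks NULL: the handler is removed */
    aio_set_fd_handler(ctx, fd, false, NULL, NULL, NULL, NULL);
}
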
diff --git a/util/aio-posix.h b/util/aio-posix.h
new file mode 100644
index 000000000..c80c04506
--- /dev/null
+++ b/util/aio-posix.h
@@ -0,0 +1,81 @@
+/*
+ * AioContext POSIX event loop implementation internal APIs
+ *
+ * Copyright IBM, Corp. 2008
+ * Copyright Red Hat, Inc. 2020
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef AIO_POSIX_H
+#define AIO_POSIX_H
+
+#include "block/aio.h"
+
+struct AioHandler {
+ GPollFD pfd;
+ IOHandler *io_read;
+ IOHandler *io_write;
+ AioPollFn *io_poll;
+ IOHandler *io_poll_begin;
+ IOHandler *io_poll_end;
+ void *opaque;
+ QLIST_ENTRY(AioHandler) node;
+ QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
+ QLIST_ENTRY(AioHandler) node_deleted;
+ QLIST_ENTRY(AioHandler) node_poll;
+#ifdef CONFIG_LINUX_IO_URING
+ QSLIST_ENTRY(AioHandler) node_submitted;
+ unsigned flags; /* see fdmon-io_uring.c */
+#endif
+ int64_t poll_idle_timeout; /* when to stop userspace polling */
+ bool is_external;
+};
+
+/* Add a handler to a ready list */
+void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node,
+ int revents);
+
+extern const FDMonOps fdmon_poll_ops;
+
+#ifdef CONFIG_EPOLL_CREATE1
+bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd);
+void fdmon_epoll_setup(AioContext *ctx);
+void fdmon_epoll_disable(AioContext *ctx);
+#else
+static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
+{
+ return false;
+}
+
+static inline void fdmon_epoll_setup(AioContext *ctx)
+{
+}
+
+static inline void fdmon_epoll_disable(AioContext *ctx)
+{
+}
+#endif /* !CONFIG_EPOLL_CREATE1 */
+
+#ifdef CONFIG_LINUX_IO_URING
+bool fdmon_io_uring_setup(AioContext *ctx);
+void fdmon_io_uring_destroy(AioContext *ctx);
+#else
+static inline bool fdmon_io_uring_setup(AioContext *ctx)
+{
+ return false;
+}
+
+static inline void fdmon_io_uring_destroy(AioContext *ctx)
+{
+}
+#endif /* !CONFIG_LINUX_IO_URING */
+
+#endif /* AIO_POSIX_H */
diff --git a/util/aio-wait.c b/util/aio-wait.c
new file mode 100644
index 000000000..bdb3d3af2
--- /dev/null
+++ b/util/aio-wait.c
@@ -0,0 +1,72 @@
+/*
+ * AioContext wait support
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "block/aio-wait.h"
+
+AioWait global_aio_wait;
+
+static void dummy_bh_cb(void *opaque)
+{
+ /* The point is to make AIO_WAIT_WHILE()'s aio_poll() return */
+}
+
+void aio_wait_kick(void)
+{
+ /* The barrier (or an atomic op) is in the caller. */
+ if (qatomic_read(&global_aio_wait.num_waiters)) {
+ aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
+ }
+}
+
+typedef struct {
+ bool done;
+ QEMUBHFunc *cb;
+ void *opaque;
+} AioWaitBHData;
+
+/* Context: BH in IOThread */
+static void aio_wait_bh(void *opaque)
+{
+ AioWaitBHData *data = opaque;
+
+ data->cb(data->opaque);
+
+ data->done = true;
+ aio_wait_kick();
+}
+
+void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
+{
+ AioWaitBHData data = {
+ .cb = cb,
+ .opaque = opaque,
+ };
+
+ assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+
+ aio_bh_schedule_oneshot(ctx, aio_wait_bh, &data);
+ AIO_WAIT_WHILE(ctx, !data.done);
+}
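
A sketch of how aio_wait_bh_oneshot() above is typically used: run a function in another AioContext's thread and block the caller until it completes. The callback name is hypothetical; the caller must be the main loop thread, per the assert in aio_wait_bh_oneshot():

#include "qemu/osdep.h"
#include "block/aio-wait.h"

/* Hypothetical callback: runs in ctx's thread (e.g. an IOThread) */
static void do_work_cb(void *opaque)
{
}

/* Called from the main loop thread; returns only after do_work_cb
 * has run in ctx */
static void run_in_ctx_and_wait(AioContext *ctx)
{
    aio_wait_bh_oneshot(ctx, do_work_cb, NULL);
}
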
diff --git a/util/aio-win32.c b/util/aio-win32.c
new file mode 100644
index 000000000..d5b09a119
--- /dev/null
+++ b/util/aio-win32.c
@@ -0,0 +1,447 @@
+/*
+ * QEMU aio implementation
+ *
+ * Copyright IBM Corp., 2008
+ * Copyright Red Hat Inc., 2012
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "block/block.h"
+#include "qemu/main-loop.h"
+#include "qemu/queue.h"
+#include "qemu/sockets.h"
+#include "qapi/error.h"
+#include "qemu/rcu_queue.h"
+
+struct AioHandler {
+ EventNotifier *e;
+ IOHandler *io_read;
+ IOHandler *io_write;
+ EventNotifierHandler *io_notify;
+ GPollFD pfd;
+ int deleted;
+ void *opaque;
+ bool is_external;
+ QLIST_ENTRY(AioHandler) node;
+};
+
+static void aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
+{
+ /*
+ * If the GSource is in the process of being destroyed then
+ * g_source_remove_poll() causes an assertion failure. Skip
+ * removal in that case, because glib cleans up its state during
+ * destruction anyway.
+ */
+ if (!g_source_is_destroyed(&ctx->source)) {
+ g_source_remove_poll(&ctx->source, &node->pfd);
+ }
+
+ /* If aio_poll is in progress, just mark the node as deleted */
+ if (qemu_lockcnt_count(&ctx->list_lock)) {
+ node->deleted = 1;
+ node->pfd.revents = 0;
+ } else {
+ /* Otherwise, delete it for real. We can't just mark it as
+ * deleted because deleted nodes are only cleaned up after
+ * releasing the list_lock.
+ */
+ QLIST_REMOVE(node, node);
+ g_free(node);
+ }
+}
+
+void aio_set_fd_handler(AioContext *ctx,
+ int fd,
+ bool is_external,
+ IOHandler *io_read,
+ IOHandler *io_write,
+ AioPollFn *io_poll,
+ void *opaque)
+{
+ /* fd is a SOCKET in our case */
+ AioHandler *old_node;
+ AioHandler *node = NULL;
+
+ qemu_lockcnt_lock(&ctx->list_lock);
+ QLIST_FOREACH(old_node, &ctx->aio_handlers, node) {
+ if (old_node->pfd.fd == fd && !old_node->deleted) {
+ break;
+ }
+ }
+
+ if (io_read || io_write) {
+ HANDLE event;
+ long bitmask = 0;
+
+ /* Alloc and insert if it's not already there */
+ node = g_new0(AioHandler, 1);
+ node->pfd.fd = fd;
+
+ node->pfd.events = 0;
+ /* Check the new callbacks, not node's fields: node was just
+ * zero-allocated, so node->io_read/io_write are still NULL here. */
+ if (io_read) {
+ node->pfd.events |= G_IO_IN;
+ }
+ if (io_write) {
+ node->pfd.events |= G_IO_OUT;
+ }
+
+ node->e = &ctx->notifier;
+
+ /* Update handler with latest information */
+ node->opaque = opaque;
+ node->io_read = io_read;
+ node->io_write = io_write;
+ node->is_external = is_external;
+
+ if (io_read) {
+ bitmask |= FD_READ | FD_ACCEPT | FD_CLOSE;
+ }
+
+ if (io_write) {
+ bitmask |= FD_WRITE | FD_CONNECT;
+ }
+
+ QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, node, node);
+ event = event_notifier_get_handle(&ctx->notifier);
+ WSAEventSelect(node->pfd.fd, event, bitmask);
+ }
+ if (old_node) {
+ aio_remove_fd_handler(ctx, old_node);
+ }
+
+ qemu_lockcnt_unlock(&ctx->list_lock);
+ aio_notify(ctx);
+}
+
+void aio_set_fd_poll(AioContext *ctx, int fd,
+ IOHandler *io_poll_begin,
+ IOHandler *io_poll_end)
+{
+ /* Not implemented */
+}
+
+void aio_set_event_notifier(AioContext *ctx,
+ EventNotifier *e,
+ bool is_external,
+ EventNotifierHandler *io_notify,
+ AioPollFn *io_poll)
+{
+ AioHandler *node;
+
+ qemu_lockcnt_lock(&ctx->list_lock);
+ QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+ if (node->e == e && !node->deleted) {
+ break;
+ }
+ }
+
+ /* Are we deleting the fd handler? */
+ if (!io_notify) {
+ if (node) {
+ aio_remove_fd_handler(ctx, node);
+ }
+ } else {
+ if (node == NULL) {
+ /* Alloc and insert if it's not already there */
+ node = g_new0(AioHandler, 1);
+ node->e = e;
+ node->pfd.fd = (uintptr_t)event_notifier_get_handle(e);
+ node->pfd.events = G_IO_IN;
+ node->is_external = is_external;
+ QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, node, node);
+
+ g_source_add_poll(&ctx->source, &node->pfd);
+ }
+ /* Update handler with latest information */
+ node->io_notify = io_notify;
+ }
+
+ qemu_lockcnt_unlock(&ctx->list_lock);
+ aio_notify(ctx);
+}
+
+void aio_set_event_notifier_poll(AioContext *ctx,
+ EventNotifier *notifier,
+ EventNotifierHandler *io_poll_begin,
+ EventNotifierHandler *io_poll_end)
+{
+ /* Not implemented */
+}
+
+bool aio_prepare(AioContext *ctx)
+{
+ static struct timeval tv0;
+ AioHandler *node;
+ bool have_select_revents = false;
+ fd_set rfds, wfds;
+
+ /*
+ * We have to walk very carefully in case aio_set_fd_handler is
+ * called while we're walking.
+ */
+ qemu_lockcnt_inc(&ctx->list_lock);
+
+ /* fill fd sets */
+ FD_ZERO(&rfds);
+ FD_ZERO(&wfds);
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ if (node->io_read) {
+ FD_SET ((SOCKET)node->pfd.fd, &rfds);
+ }
+ if (node->io_write) {
+ FD_SET ((SOCKET)node->pfd.fd, &wfds);
+ }
+ }
+
+ if (select(0, &rfds, &wfds, NULL, &tv0) > 0) {
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ node->pfd.revents = 0;
+ if (FD_ISSET(node->pfd.fd, &rfds)) {
+ node->pfd.revents |= G_IO_IN;
+ have_select_revents = true;
+ }
+
+ if (FD_ISSET(node->pfd.fd, &wfds)) {
+ node->pfd.revents |= G_IO_OUT;
+ have_select_revents = true;
+ }
+ }
+ }
+
+ qemu_lockcnt_dec(&ctx->list_lock);
+ return have_select_revents;
+}
+
+bool aio_pending(AioContext *ctx)
+{
+ AioHandler *node;
+ bool result = false;
+
+ /*
+ * We have to walk very carefully in case aio_set_fd_handler is
+ * called while we're walking.
+ */
+ qemu_lockcnt_inc(&ctx->list_lock);
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ if (node->pfd.revents && node->io_notify) {
+ result = true;
+ break;
+ }
+
+ if ((node->pfd.revents & G_IO_IN) && node->io_read) {
+ result = true;
+ break;
+ }
+ if ((node->pfd.revents & G_IO_OUT) && node->io_write) {
+ result = true;
+ break;
+ }
+ }
+
+ qemu_lockcnt_dec(&ctx->list_lock);
+ return result;
+}
+
+static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
+{
+ AioHandler *node;
+ bool progress = false;
+ AioHandler *tmp;
+
+ /*
+ * We have to walk very carefully in case aio_set_fd_handler is
+ * called while we're walking.
+ */
+ QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
+ int revents = node->pfd.revents;
+
+ if (!node->deleted &&
+ (revents || event_notifier_get_handle(node->e) == event) &&
+ node->io_notify) {
+ node->pfd.revents = 0;
+ node->io_notify(node->e);
+
+ /* aio_notify() does not count as progress */
+ if (node->e != &ctx->notifier) {
+ progress = true;
+ }
+ }
+
+ if (!node->deleted &&
+ (node->io_read || node->io_write)) {
+ node->pfd.revents = 0;
+ if ((revents & G_IO_IN) && node->io_read) {
+ node->io_read(node->opaque);
+ progress = true;
+ }
+ if ((revents & G_IO_OUT) && node->io_write) {
+ node->io_write(node->opaque);
+ progress = true;
+ }
+
+ /* if the next select() will return an event, we have progressed */
+ if (event == event_notifier_get_handle(&ctx->notifier)) {
+ WSANETWORKEVENTS ev;
+ WSAEnumNetworkEvents(node->pfd.fd, event, &ev);
+ if (ev.lNetworkEvents) {
+ progress = true;
+ }
+ }
+ }
+
+ if (node->deleted) {
+ if (qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
+ QLIST_REMOVE(node, node);
+ g_free(node);
+ qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
+ }
+ }
+ }
+
+ return progress;
+}
+
+void aio_dispatch(AioContext *ctx)
+{
+ qemu_lockcnt_inc(&ctx->list_lock);
+ aio_bh_poll(ctx);
+ aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
+ qemu_lockcnt_dec(&ctx->list_lock);
+ timerlistgroup_run_timers(&ctx->tlg);
+}
+
+bool aio_poll(AioContext *ctx, bool blocking)
+{
+ AioHandler *node;
+ HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
+ bool progress, have_select_revents, first;
+ int count;
+ int timeout;
+
+ /*
+ * There cannot be two concurrent aio_poll calls for the same AioContext (or
+ * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
+ * We rely on this below to avoid slow locked accesses to ctx->notify_me.
+ *
+ * aio_poll() may only be called in the AioContext's thread. iohandler_ctx
+ * is special in that it runs in the main thread, but that thread's context
+ * is qemu_aio_context.
+ */
+ assert(in_aio_context_home_thread(ctx == iohandler_get_aio_context() ?
+ qemu_get_aio_context() : ctx));
+ progress = false;
+
+ /* aio_notify can avoid the expensive event_notifier_set if
+ * everything (file descriptors, bottom halves, timers) will
+ * be re-evaluated before the next blocking poll(). This is
+ * already true when aio_poll is called with blocking == false;
+ * if blocking == true, it is only true after poll() returns,
+ * so disable the optimization now.
+ */
+ if (blocking) {
+ qatomic_set(&ctx->notify_me, qatomic_read(&ctx->notify_me) + 2);
+ /*
+ * Write ctx->notify_me before computing the timeout
+ * (reading bottom half flags, etc.). Pairs with
+ * smp_mb in aio_notify().
+ */
+ smp_mb();
+ }
+
+ qemu_lockcnt_inc(&ctx->list_lock);
+ have_select_revents = aio_prepare(ctx);
+
+ /* fill fd sets */
+ count = 0;
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ if (!node->deleted && node->io_notify
+ && aio_node_check(ctx, node->is_external)) {
+ events[count++] = event_notifier_get_handle(node->e);
+ }
+ }
+
+ first = true;
+
+ /* ctx->notifier is always registered. */
+ assert(count > 0);
+
+ /* Multiple iterations, all of them non-blocking except the first,
+ * may be necessary to process all pending events. After the first
+ * WaitForMultipleObjects call ctx->notify_me will be decremented.
+ */
+ do {
+ HANDLE event;
+ int ret;
+
+ timeout = blocking && !have_select_revents
+ ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0;
+ ret = WaitForMultipleObjects(count, events, FALSE, timeout);
+ if (blocking) {
+ assert(first);
+ qatomic_store_release(&ctx->notify_me,
+ qatomic_read(&ctx->notify_me) - 2);
+ aio_notify_accept(ctx);
+ }
+
+ if (first) {
+ progress |= aio_bh_poll(ctx);
+ first = false;
+ }
+
+ /* if we have any signaled events, dispatch event */
+ event = NULL;
+ if ((DWORD) (ret - WAIT_OBJECT_0) < count) {
+ event = events[ret - WAIT_OBJECT_0];
+ events[ret - WAIT_OBJECT_0] = events[--count];
+ } else if (!have_select_revents) {
+ break;
+ }
+
+ have_select_revents = false;
+ blocking = false;
+
+ progress |= aio_dispatch_handlers(ctx, event);
+ } while (count > 0);
+
+ qemu_lockcnt_dec(&ctx->list_lock);
+
+ progress |= timerlistgroup_run_timers(&ctx->tlg);
+ return progress;
+}
+
+void aio_context_setup(AioContext *ctx)
+{
+}
+
+void aio_context_destroy(AioContext *ctx)
+{
+}
+
+void aio_context_use_g_source(AioContext *ctx)
+{
+}
+
+void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
+ int64_t grow, int64_t shrink, Error **errp)
+{
+ if (max_ns) {
+ error_setg(errp, "AioContext polling is not implemented on Windows");
+ }
+}
+
+void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
+ Error **errp)
+{
+}
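
On Windows, event notifiers are the primary wakeup mechanism (see the WaitForMultipleObjects() loop above). A hedged sketch of registering one, with hypothetical names, assuming only the aio_set_event_notifier() signature shown in this file:

#include "qemu/osdep.h"
#include "block/aio.h"

static EventNotifier my_notifier;        /* hypothetical */

static void my_notifier_cb(EventNotifier *e)
{
    event_notifier_test_and_clear(e);
    /* ... react to the wakeup ... */
}

static void setup_notifier(AioContext *ctx)
{
    event_notifier_init(&my_notifier, 0);
    aio_set_event_notifier(ctx, &my_notifier, false /* is_external */,
                           my_notifier_cb, NULL /* io_poll */);
}
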
diff --git a/util/aiocb.c b/util/aiocb.c
new file mode 100644
index 000000000..5aef3a069
--- /dev/null
+++ b/util/aiocb.c
@@ -0,0 +1,55 @@
+/*
+ * BlockAIOCB allocation
+ *
+ * Copyright (c) 2003-2017 Fabrice Bellard and other QEMU contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "block/aio.h"
+
+void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ BlockAIOCB *acb;
+
+ acb = g_malloc(aiocb_info->aiocb_size);
+ acb->aiocb_info = aiocb_info;
+ acb->bs = bs;
+ acb->cb = cb;
+ acb->opaque = opaque;
+ acb->refcnt = 1;
+ return acb;
+}
+
+void qemu_aio_ref(void *p)
+{
+ BlockAIOCB *acb = p;
+ acb->refcnt++;
+}
+
+void qemu_aio_unref(void *p)
+{
+ BlockAIOCB *acb = p;
+ assert(acb->refcnt > 0);
+ if (--acb->refcnt == 0) {
+ g_free(acb);
+ }
+}
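
Callers embed BlockAIOCB as the first field of their own AIOCB struct and describe its size via AIOCBInfo; qemu_aio_get() then allocates the whole structure. A sketch under those assumptions (MyAIOCB and my_aio_start are hypothetical):

#include "qemu/osdep.h"
#include "block/aio.h"

typedef struct MyAIOCB {
    BlockAIOCB common;      /* must be first so pointer casts work */
    int my_state;           /* hypothetical per-request state */
} MyAIOCB;

static const AIOCBInfo my_aiocb_info = {
    .aiocb_size = sizeof(MyAIOCB),
};

static BlockAIOCB *my_aio_start(BlockDriverState *bs,
                                BlockCompletionFunc *cb, void *opaque)
{
    MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);

    acb->my_state = 0;
    /* ... kick off the request; on completion invoke acb->common.cb()
     * and then qemu_aio_unref(acb) to drop the initial reference ... */
    return &acb->common;
}
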
diff --git a/util/async.c b/util/async.c
new file mode 100644
index 000000000..6f6717a34
--- /dev/null
+++ b/util/async.c
@@ -0,0 +1,690 @@
+/*
+ * Data plane event loop
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2009-2017 QEMU contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "block/aio.h"
+#include "block/thread-pool.h"
+#include "qemu/main-loop.h"
+#include "qemu/atomic.h"
+#include "qemu/rcu_queue.h"
+#include "block/raw-aio.h"
+#include "qemu/coroutine_int.h"
+#include "trace.h"
+
+/***********************************************************/
+/* bottom halves (can be seen as timers which expire ASAP) */
+
+/* QEMUBH::flags values */
+enum {
+ /* Already enqueued and waiting for aio_bh_poll() */
+ BH_PENDING = (1 << 0),
+
+ /* Invoke the callback */
+ BH_SCHEDULED = (1 << 1),
+
+ /* Delete without invoking callback */
+ BH_DELETED = (1 << 2),
+
+ /* Delete after invoking callback */
+ BH_ONESHOT = (1 << 3),
+
+ /* Schedule periodically when the event loop is idle */
+ BH_IDLE = (1 << 4),
+};
+
+struct QEMUBH {
+ AioContext *ctx;
+ const char *name;
+ QEMUBHFunc *cb;
+ void *opaque;
+ QSLIST_ENTRY(QEMUBH) next;
+ unsigned flags;
+};
+
+/* Called concurrently from any thread */
+static void aio_bh_enqueue(QEMUBH *bh, unsigned new_flags)
+{
+ AioContext *ctx = bh->ctx;
+ unsigned old_flags;
+
+ /*
+ * The memory barrier implicit in qatomic_fetch_or makes sure that:
+ * 1. idle & any writes needed by the callback are done before the
+ * locations are read in the aio_bh_poll.
+ * 2. ctx is loaded before the callback has a chance to execute and bh
+ * could be freed.
+ */
+ old_flags = qatomic_fetch_or(&bh->flags, BH_PENDING | new_flags);
+ if (!(old_flags & BH_PENDING)) {
+ QSLIST_INSERT_HEAD_ATOMIC(&ctx->bh_list, bh, next);
+ }
+
+ aio_notify(ctx);
+}
+
+/* Only called from aio_bh_poll() and aio_ctx_finalize() */
+static QEMUBH *aio_bh_dequeue(BHList *head, unsigned *flags)
+{
+ QEMUBH *bh = QSLIST_FIRST_RCU(head);
+
+ if (!bh) {
+ return NULL;
+ }
+
+ QSLIST_REMOVE_HEAD(head, next);
+
+ /*
+ * The qatomic_and is paired with aio_bh_enqueue(). The implicit memory
+ * barrier ensures that the callback sees all writes done by the scheduling
+ * thread. It also ensures that the scheduling thread sees the cleared
+ * flag before bh->cb has run, and thus will call aio_notify again if
+ * necessary.
+ */
+ *flags = qatomic_fetch_and(&bh->flags,
+ ~(BH_PENDING | BH_SCHEDULED | BH_IDLE));
+ return bh;
+}
+
+void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb,
+ void *opaque, const char *name)
+{
+ QEMUBH *bh;
+ bh = g_new(QEMUBH, 1);
+ *bh = (QEMUBH){
+ .ctx = ctx,
+ .cb = cb,
+ .opaque = opaque,
+ .name = name,
+ };
+ aio_bh_enqueue(bh, BH_SCHEDULED | BH_ONESHOT);
+}
+
+QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
+ const char *name)
+{
+ QEMUBH *bh;
+ bh = g_new(QEMUBH, 1);
+ *bh = (QEMUBH){
+ .ctx = ctx,
+ .cb = cb,
+ .opaque = opaque,
+ .name = name,
+ };
+ return bh;
+}
+
+void aio_bh_call(QEMUBH *bh)
+{
+ bh->cb(bh->opaque);
+}
+
+/* aio_bh_poll() must not be called concurrently with itself. */
+int aio_bh_poll(AioContext *ctx)
+{
+ BHListSlice slice;
+ BHListSlice *s;
+ int ret = 0;
+
+ QSLIST_MOVE_ATOMIC(&slice.bh_list, &ctx->bh_list);
+ QSIMPLEQ_INSERT_TAIL(&ctx->bh_slice_list, &slice, next);
+
+ while ((s = QSIMPLEQ_FIRST(&ctx->bh_slice_list))) {
+ QEMUBH *bh;
+ unsigned flags;
+
+ bh = aio_bh_dequeue(&s->bh_list, &flags);
+ if (!bh) {
+ QSIMPLEQ_REMOVE_HEAD(&ctx->bh_slice_list, next);
+ continue;
+ }
+
+ if ((flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
+ /* Idle BHs don't count as progress */
+ if (!(flags & BH_IDLE)) {
+ ret = 1;
+ }
+ aio_bh_call(bh);
+ }
+ if (flags & (BH_DELETED | BH_ONESHOT)) {
+ g_free(bh);
+ }
+ }
+
+ return ret;
+}
+
+void qemu_bh_schedule_idle(QEMUBH *bh)
+{
+ aio_bh_enqueue(bh, BH_SCHEDULED | BH_IDLE);
+}
+
+void qemu_bh_schedule(QEMUBH *bh)
+{
+ aio_bh_enqueue(bh, BH_SCHEDULED);
+}
+
+/* This function is asynchronous. */
+void qemu_bh_cancel(QEMUBH *bh)
+{
+ qatomic_and(&bh->flags, ~BH_SCHEDULED);
+}
+
+/* This function is asynchronous. The bottom half performs the actual
+ * deletion at the very end.
+ */
+void qemu_bh_delete(QEMUBH *bh)
+{
+ aio_bh_enqueue(bh, BH_DELETED);
+}
+
+static int64_t aio_compute_bh_timeout(BHList *head, int timeout)
+{
+ QEMUBH *bh;
+
+ QSLIST_FOREACH_RCU(bh, head, next) {
+ if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
+ if (bh->flags & BH_IDLE) {
+ /* idle bottom halves will be polled at least
+ * every 10ms */
+ timeout = 10000000;
+ } else {
+ /* non-idle bottom halves will be executed
+ * immediately */
+ return 0;
+ }
+ }
+ }
+
+ return timeout;
+}
+
+int64_t
+aio_compute_timeout(AioContext *ctx)
+{
+ BHListSlice *s;
+ int64_t deadline;
+ int timeout = -1;
+
+ timeout = aio_compute_bh_timeout(&ctx->bh_list, timeout);
+ if (timeout == 0) {
+ return 0;
+ }
+
+ QSIMPLEQ_FOREACH(s, &ctx->bh_slice_list, next) {
+ timeout = aio_compute_bh_timeout(&s->bh_list, timeout);
+ if (timeout == 0) {
+ return 0;
+ }
+ }
+
+ deadline = timerlistgroup_deadline_ns(&ctx->tlg);
+ if (deadline == 0) {
+ return 0;
+ } else {
+ return qemu_soonest_timeout(timeout, deadline);
+ }
+}
+
+static gboolean
+aio_ctx_prepare(GSource *source, gint *timeout)
+{
+ AioContext *ctx = (AioContext *) source;
+
+ qatomic_set(&ctx->notify_me, qatomic_read(&ctx->notify_me) | 1);
+
+ /*
+ * Write ctx->notify_me before computing the timeout
+ * (reading bottom half flags, etc.). Pairs with
+ * smp_mb in aio_notify().
+ */
+ smp_mb();
+
+ /* We assume there is no timeout already supplied */
+ *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
+
+ if (aio_prepare(ctx)) {
+ *timeout = 0;
+ }
+
+ return *timeout == 0;
+}
+
+static gboolean
+aio_ctx_check(GSource *source)
+{
+ AioContext *ctx = (AioContext *) source;
+ QEMUBH *bh;
+ BHListSlice *s;
+
+ /* Finish computing the timeout before clearing the flag. */
+ qatomic_store_release(&ctx->notify_me, qatomic_read(&ctx->notify_me) & ~1);
+ aio_notify_accept(ctx);
+
+ QSLIST_FOREACH_RCU(bh, &ctx->bh_list, next) {
+ if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
+ return true;
+ }
+ }
+
+ QSIMPLEQ_FOREACH(s, &ctx->bh_slice_list, next) {
+ QSLIST_FOREACH_RCU(bh, &s->bh_list, next) {
+ if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
+ return true;
+ }
+ }
+ }
+ return aio_pending(ctx) || (timerlistgroup_deadline_ns(&ctx->tlg) == 0);
+}
+
+static gboolean
+aio_ctx_dispatch(GSource *source,
+ GSourceFunc callback,
+ gpointer user_data)
+{
+ AioContext *ctx = (AioContext *) source;
+
+ assert(callback == NULL);
+ aio_dispatch(ctx);
+ return true;
+}
+
+static void
+aio_ctx_finalize(GSource *source)
+{
+ AioContext *ctx = (AioContext *) source;
+ QEMUBH *bh;
+ unsigned flags;
+
+ thread_pool_free(ctx->thread_pool);
+
+#ifdef CONFIG_LINUX_AIO
+ if (ctx->linux_aio) {
+ laio_detach_aio_context(ctx->linux_aio, ctx);
+ laio_cleanup(ctx->linux_aio);
+ ctx->linux_aio = NULL;
+ }
+#endif
+
+#ifdef CONFIG_LINUX_IO_URING
+ if (ctx->linux_io_uring) {
+ luring_detach_aio_context(ctx->linux_io_uring, ctx);
+ luring_cleanup(ctx->linux_io_uring);
+ ctx->linux_io_uring = NULL;
+ }
+#endif
+
+ assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
+ qemu_bh_delete(ctx->co_schedule_bh);
+
+ /* There must be no aio_bh_poll() calls going on */
+ assert(QSIMPLEQ_EMPTY(&ctx->bh_slice_list));
+
+ while ((bh = aio_bh_dequeue(&ctx->bh_list, &flags))) {
+ /*
+ * qemu_bh_delete() must have been called on BHs in this AioContext. In
+ * many cases memory leaks, hangs, or inconsistent state occur when a
+ * BH is leaked because something still expects it to run.
+ *
+ * If you hit this, fix the lifecycle of the BH so that
+ * qemu_bh_delete() and any associated cleanup is called before the
+ * AioContext is finalized.
+ */
+ if (unlikely(!(flags & BH_DELETED))) {
+ fprintf(stderr, "%s: BH '%s' leaked, aborting...\n",
+ __func__, bh->name);
+ abort();
+ }
+
+ g_free(bh);
+ }
+
+ aio_set_event_notifier(ctx, &ctx->notifier, false, NULL, NULL);
+ event_notifier_cleanup(&ctx->notifier);
+ qemu_rec_mutex_destroy(&ctx->lock);
+ qemu_lockcnt_destroy(&ctx->list_lock);
+ timerlistgroup_deinit(&ctx->tlg);
+ aio_context_destroy(ctx);
+}
+
+static GSourceFuncs aio_source_funcs = {
+ aio_ctx_prepare,
+ aio_ctx_check,
+ aio_ctx_dispatch,
+ aio_ctx_finalize
+};
+
+GSource *aio_get_g_source(AioContext *ctx)
+{
+ aio_context_use_g_source(ctx);
+ g_source_ref(&ctx->source);
+ return &ctx->source;
+}
+
+ThreadPool *aio_get_thread_pool(AioContext *ctx)
+{
+ if (!ctx->thread_pool) {
+ ctx->thread_pool = thread_pool_new(ctx);
+ }
+ return ctx->thread_pool;
+}
+
+#ifdef CONFIG_LINUX_AIO
+LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp)
+{
+ if (!ctx->linux_aio) {
+ ctx->linux_aio = laio_init(errp);
+ if (ctx->linux_aio) {
+ laio_attach_aio_context(ctx->linux_aio, ctx);
+ }
+ }
+ return ctx->linux_aio;
+}
+
+LinuxAioState *aio_get_linux_aio(AioContext *ctx)
+{
+ assert(ctx->linux_aio);
+ return ctx->linux_aio;
+}
+#endif
+
+#ifdef CONFIG_LINUX_IO_URING
+LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp)
+{
+ if (ctx->linux_io_uring) {
+ return ctx->linux_io_uring;
+ }
+
+ ctx->linux_io_uring = luring_init(errp);
+ if (!ctx->linux_io_uring) {
+ return NULL;
+ }
+
+ luring_attach_aio_context(ctx->linux_io_uring, ctx);
+ return ctx->linux_io_uring;
+}
+
+LuringState *aio_get_linux_io_uring(AioContext *ctx)
+{
+ assert(ctx->linux_io_uring);
+ return ctx->linux_io_uring;
+}
+#endif
+
+void aio_notify(AioContext *ctx)
+{
+ /*
+ * Write e.g. bh->flags before writing ctx->notified. Pairs with smp_mb in
+ * aio_notify_accept.
+ */
+ smp_wmb();
+ qatomic_set(&ctx->notified, true);
+
+ /*
+ * Write ctx->notified before reading ctx->notify_me. Pairs
+ * with smp_mb in aio_ctx_prepare or aio_poll.
+ */
+ smp_mb();
+ if (qatomic_read(&ctx->notify_me)) {
+ event_notifier_set(&ctx->notifier);
+ }
+}
+
+void aio_notify_accept(AioContext *ctx)
+{
+ qatomic_set(&ctx->notified, false);
+
+ /*
+ * Write ctx->notified before reading e.g. bh->flags. Pairs with smp_wmb
+ * in aio_notify.
+ */
+ smp_mb();
+}
+
+static void aio_timerlist_notify(void *opaque, QEMUClockType type)
+{
+ aio_notify(opaque);
+}
+
+static void aio_context_notifier_cb(EventNotifier *e)
+{
+ AioContext *ctx = container_of(e, AioContext, notifier);
+
+ event_notifier_test_and_clear(&ctx->notifier);
+}
+
+/* Returns true if aio_notify() was called (e.g. a BH was scheduled) */
+static bool aio_context_notifier_poll(void *opaque)
+{
+ EventNotifier *e = opaque;
+ AioContext *ctx = container_of(e, AioContext, notifier);
+
+ return qatomic_read(&ctx->notified);
+}
+
+static void co_schedule_bh_cb(void *opaque)
+{
+ AioContext *ctx = opaque;
+ QSLIST_HEAD(, Coroutine) straight, reversed;
+
+ QSLIST_MOVE_ATOMIC(&reversed, &ctx->scheduled_coroutines);
+ QSLIST_INIT(&straight);
+
+ while (!QSLIST_EMPTY(&reversed)) {
+ Coroutine *co = QSLIST_FIRST(&reversed);
+ QSLIST_REMOVE_HEAD(&reversed, co_scheduled_next);
+ QSLIST_INSERT_HEAD(&straight, co, co_scheduled_next);
+ }
+
+ while (!QSLIST_EMPTY(&straight)) {
+ Coroutine *co = QSLIST_FIRST(&straight);
+ QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
+ trace_aio_co_schedule_bh_cb(ctx, co);
+ aio_context_acquire(ctx);
+
+ /* Protected by write barrier in qemu_aio_coroutine_enter */
+ qatomic_set(&co->scheduled, NULL);
+ qemu_aio_coroutine_enter(ctx, co);
+ aio_context_release(ctx);
+ }
+}
+
+AioContext *aio_context_new(Error **errp)
+{
+ int ret;
+ AioContext *ctx;
+
+ ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
+ QSLIST_INIT(&ctx->bh_list);
+ QSIMPLEQ_INIT(&ctx->bh_slice_list);
+ aio_context_setup(ctx);
+
+ ret = event_notifier_init(&ctx->notifier, false);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to initialize event notifier");
+ goto fail;
+ }
+ g_source_set_can_recurse(&ctx->source, true);
+ qemu_lockcnt_init(&ctx->list_lock);
+
+ ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx);
+ QSLIST_INIT(&ctx->scheduled_coroutines);
+
+ aio_set_event_notifier(ctx, &ctx->notifier,
+ false,
+ aio_context_notifier_cb,
+ aio_context_notifier_poll);
+#ifdef CONFIG_LINUX_AIO
+ ctx->linux_aio = NULL;
+#endif
+
+#ifdef CONFIG_LINUX_IO_URING
+ ctx->linux_io_uring = NULL;
+#endif
+
+ ctx->thread_pool = NULL;
+ qemu_rec_mutex_init(&ctx->lock);
+ timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
+
+ ctx->poll_ns = 0;
+ ctx->poll_max_ns = 0;
+ ctx->poll_grow = 0;
+ ctx->poll_shrink = 0;
+
+ ctx->aio_max_batch = 0;
+
+ return ctx;
+fail:
+ g_source_destroy(&ctx->source);
+ return NULL;
+}
+
+void aio_co_schedule(AioContext *ctx, Coroutine *co)
+{
+ trace_aio_co_schedule(ctx, co);
+ const char *scheduled = qatomic_cmpxchg(&co->scheduled, NULL,
+ __func__);
+
+ if (scheduled) {
+ fprintf(stderr,
+ "%s: Co-routine was already scheduled in '%s'\n",
+ __func__, scheduled);
+ abort();
+ }
+
+ /* The coroutine might run and release the last ctx reference before we
+ * invoke qemu_bh_schedule(). Take a reference to keep ctx alive until
+ * we're done.
+ */
+ aio_context_ref(ctx);
+
+ QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines,
+ co, co_scheduled_next);
+ qemu_bh_schedule(ctx->co_schedule_bh);
+
+ aio_context_unref(ctx);
+}
+
+typedef struct AioCoRescheduleSelf {
+ Coroutine *co;
+ AioContext *new_ctx;
+} AioCoRescheduleSelf;
+
+static void aio_co_reschedule_self_bh(void *opaque)
+{
+ AioCoRescheduleSelf *data = opaque;
+ aio_co_schedule(data->new_ctx, data->co);
+}
+
+void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx)
+{
+ AioContext *old_ctx = qemu_get_current_aio_context();
+
+ if (old_ctx != new_ctx) {
+ AioCoRescheduleSelf data = {
+ .co = qemu_coroutine_self(),
+ .new_ctx = new_ctx,
+ };
+ /*
+ * We can't directly schedule the coroutine in the target context
+ * because this would be racy: The other thread could try to enter the
+ * coroutine before it has yielded in this one.
+ */
+ aio_bh_schedule_oneshot(old_ctx, aio_co_reschedule_self_bh, &data);
+ qemu_coroutine_yield();
+ }
+}
+
+void aio_co_wake(struct Coroutine *co)
+{
+ AioContext *ctx;
+
+ /* Read coroutine before co->ctx. Matches smp_wmb in
+ * qemu_coroutine_enter.
+ */
+ smp_read_barrier_depends();
+ ctx = qatomic_read(&co->ctx);
+
+ aio_co_enter(ctx, co);
+}
+
+void aio_co_enter(AioContext *ctx, struct Coroutine *co)
+{
+ if (ctx != qemu_get_current_aio_context()) {
+ aio_co_schedule(ctx, co);
+ return;
+ }
+
+ if (qemu_in_coroutine()) {
+ Coroutine *self = qemu_coroutine_self();
+ assert(self != co);
+ QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next);
+ } else {
+ aio_context_acquire(ctx);
+ qemu_aio_coroutine_enter(ctx, co);
+ aio_context_release(ctx);
+ }
+}
+
+void aio_context_ref(AioContext *ctx)
+{
+ g_source_ref(&ctx->source);
+}
+
+void aio_context_unref(AioContext *ctx)
+{
+ g_source_unref(&ctx->source);
+}
+
+void aio_context_acquire(AioContext *ctx)
+{
+ qemu_rec_mutex_lock(&ctx->lock);
+}
+
+void aio_context_release(AioContext *ctx)
+{
+ qemu_rec_mutex_unlock(&ctx->lock);
+}
+
+static __thread AioContext *my_aiocontext;
+
+AioContext *qemu_get_current_aio_context(void)
+{
+ if (my_aiocontext) {
+ return my_aiocontext;
+ }
+ if (qemu_mutex_iothread_locked()) {
+ /* Possibly in a vCPU thread. */
+ return qemu_get_aio_context();
+ }
+ return NULL;
+}
+
+void qemu_set_current_aio_context(AioContext *ctx)
+{
+ assert(!my_aiocontext);
+ my_aiocontext = ctx;
+}
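
The bottom-half lifecycle in brief: create once, schedule any number of times, delete when done. A sketch using the APIs above (my_bh_cb is a hypothetical callback):

#include "qemu/osdep.h"
#include "block/aio.h"

/* Hypothetical callback: runs once per qemu_bh_schedule(), in ctx's thread */
static void my_bh_cb(void *opaque)
{
}

static void bh_example(AioContext *ctx)
{
    QEMUBH *bh = aio_bh_new(ctx, my_bh_cb, NULL);

    qemu_bh_schedule(bh);   /* safe to call from any thread */
    /* ... */
    qemu_bh_delete(bh);     /* deferred free via BH_DELETED, see above */
}

For one-shot callbacks, aio_bh_schedule_oneshot() above folds this create/schedule/delete sequence into a single call.
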
diff --git a/util/atomic64.c b/util/atomic64.c
new file mode 100644
index 000000000..93037d5b1
--- /dev/null
+++ b/util/atomic64.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
+ *
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qemu/atomic.h"
+#include "qemu/thread.h"
+
+#ifdef CONFIG_ATOMIC64
+#error This file must only be compiled if !CONFIG_ATOMIC64
+#endif
+
+/*
+ * When !CONFIG_ATOMIC64, we serialize both reads and writes with spinlocks.
+ * We use an array of spinlocks, with padding computed at run-time based on
+ * the host's dcache line size.
+ * We point to the array with a void * to simplify the padding's computation.
+ * Consecutive spinlocks are spaced lock_size bytes apart.
+ */
+static void *lock_array;
+static size_t lock_size;
+
+/*
+ * Systems without CONFIG_ATOMIC64 are unlikely to have many cores, so we use a
+ * small array of locks.
+ */
+#define NR_LOCKS 16
+
+static QemuSpin *addr_to_lock(const void *addr)
+{
+ uintptr_t a = (uintptr_t)addr;
+ uintptr_t idx;
+
+ idx = a >> qemu_dcache_linesize_log;
+ idx ^= (idx >> 8) ^ (idx >> 16);
+ idx &= NR_LOCKS - 1;
+ return lock_array + idx * lock_size;
+}
+
+#define GEN_READ(name, type) \
+ type name(const type *ptr) \
+ { \
+ QemuSpin *lock = addr_to_lock(ptr); \
+ type ret; \
+ \
+ qemu_spin_lock(lock); \
+ ret = *ptr; \
+ qemu_spin_unlock(lock); \
+ return ret; \
+ }
+
+GEN_READ(qatomic_read_i64, int64_t)
+GEN_READ(qatomic_read_u64, uint64_t)
+#undef GEN_READ
+
+#define GEN_SET(name, type) \
+ void name(type *ptr, type val) \
+ { \
+ QemuSpin *lock = addr_to_lock(ptr); \
+ \
+ qemu_spin_lock(lock); \
+ *ptr = val; \
+ qemu_spin_unlock(lock); \
+ }
+
+GEN_SET(qatomic_set_i64, int64_t)
+GEN_SET(qatomic_set_u64, uint64_t)
+#undef GEN_SET
+
+void qatomic64_init(void)
+{
+ int i;
+
+ lock_size = ROUND_UP(sizeof(QemuSpin), qemu_dcache_linesize);
+ lock_array = qemu_memalign(qemu_dcache_linesize, lock_size * NR_LOCKS);
+ for (i = 0; i < NR_LOCKS; i++) {
+ QemuSpin *lock = lock_array + i * lock_size;
+
+ qemu_spin_init(lock);
+ }
+}
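
A usage note on this fallback: each accessor call is individually atomic (serialized through the hashed spinlock), but a sequence of calls is not. A hedged sketch with a hypothetical counter:

    #include "qemu/osdep.h"
    #include "qemu/atomic.h"

    static int64_t counter;

    static void bump_counter(void)
    {
        /* Each call takes the spinlock covering &counter's cache line. */
        int64_t v = qatomic_read_i64(&counter);

        /* NOT an atomic increment: another thread can run between the
         * read and the set; a real counter would need its own lock. */
        qatomic_set_i64(&counter, v + 1);
    }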
diff --git a/util/base64.c b/util/base64.c
new file mode 100644
index 000000000..811111ac4
--- /dev/null
+++ b/util/base64.c
@@ -0,0 +1,60 @@
+/*
+ * QEMU base64 helpers
+ *
+ * Copyright (c) 2015 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/base64.h"
+
+static const char *base64_valid_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=\n";
+
+uint8_t *qbase64_decode(const char *input,
+ size_t in_len,
+ size_t *out_len,
+ Error **errp)
+{
+ *out_len = 0;
+
+ if (in_len != -1) {
+ /* Lack of NUL terminator is an error */
+ if (input[in_len] != '\0') {
+ error_setg(errp, "Base64 data is not NUL terminated");
+ return NULL;
+ }
+ /* Check there are no embedded NULs, since we expect
+ * this to be valid base64 data */
+ if (memchr(input, '\0', in_len) != NULL) {
+ error_setg(errp, "Base64 data contains embedded NUL characters");
+ return NULL;
+ }
+
+ /* Now we know it's a valid NUL-terminated string,
+ * so strspn is safe to use... */
+ } else {
+ in_len = strlen(input);
+ }
+
+ if (strspn(input, base64_valid_chars) != in_len) {
+ error_setg(errp, "Base64 data contains invalid characters");
+ return NULL;
+ }
+
+ return g_base64_decode(input, out_len);
+}
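
A typical caller, sketched under the assumption that the declaration comes from "qemu/base64.h" as included above; passing -1 as in_len asks the helper to take strlen() itself:

    #include "qemu/osdep.h"
    #include "qapi/error.h"
    #include "qemu/base64.h"

    static bool use_secret(const char *b64, Error **errp)
    {
        size_t len;
        g_autofree uint8_t *data = qbase64_decode(b64, -1, &len, errp);

        if (!data) {
            return false;       /* errp already set by qbase64_decode() */
        }
        /* ... consume data[0..len) ... */
        return true;
    }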
diff --git a/util/bitmap.c b/util/bitmap.c
new file mode 100644
index 000000000..1f201393a
--- /dev/null
+++ b/util/bitmap.c
@@ -0,0 +1,489 @@
+/*
+ * Bitmap Module
+ *
+ * Stolen from linux/src/lib/bitmap.c
+ *
+ * Copyright (C) 2010 Corentin Chary
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/bitops.h"
+#include "qemu/bitmap.h"
+#include "qemu/atomic.h"
+
+/*
+ * bitmaps provide an array of bits, implemented using an
+ * array of unsigned longs. The number of valid bits in a
+ * given bitmap does _not_ need to be an exact multiple of
+ * BITS_PER_LONG.
+ *
+ * The possible unused bits in the last, partially used word
+ * of a bitmap are 'don't care'. The implementation makes
+ * no particular effort to keep them zero. It ensures that
+ * their value will not affect the results of any operation.
+ * The bitmap operations that return Boolean (bitmap_empty,
+ * for example) or scalar (bitmap_weight, for example) results
+ * carefully filter out these unused bits from impacting their
+ * results.
+ *
+ * These operations actually hold to a slightly stronger rule:
+ * if you don't input any bitmaps to these ops that have some
+ * unused bits set, then they won't output any set unused bits
+ * in output bitmaps.
+ *
+ * The byte ordering of bitmaps is more natural on little
+ * endian architectures.
+ */
+
+int slow_bitmap_empty(const unsigned long *bitmap, long bits)
+{
+ long k, lim = bits/BITS_PER_LONG;
+
+ for (k = 0; k < lim; ++k) {
+ if (bitmap[k]) {
+ return 0;
+ }
+ }
+ if (bits % BITS_PER_LONG) {
+ if (bitmap[k] & BITMAP_LAST_WORD_MASK(bits)) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+int slow_bitmap_full(const unsigned long *bitmap, long bits)
+{
+ long k, lim = bits/BITS_PER_LONG;
+
+ for (k = 0; k < lim; ++k) {
+ if (~bitmap[k]) {
+ return 0;
+ }
+ }
+
+ if (bits % BITS_PER_LONG) {
+ if (~bitmap[k] & BITMAP_LAST_WORD_MASK(bits)) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+int slow_bitmap_equal(const unsigned long *bitmap1,
+ const unsigned long *bitmap2, long bits)
+{
+ long k, lim = bits/BITS_PER_LONG;
+
+ for (k = 0; k < lim; ++k) {
+ if (bitmap1[k] != bitmap2[k]) {
+ return 0;
+ }
+ }
+
+ if (bits % BITS_PER_LONG) {
+ if ((bitmap1[k] ^ bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+void slow_bitmap_complement(unsigned long *dst, const unsigned long *src,
+ long bits)
+{
+ long k, lim = bits/BITS_PER_LONG;
+
+ for (k = 0; k < lim; ++k) {
+ dst[k] = ~src[k];
+ }
+
+ if (bits % BITS_PER_LONG) {
+ dst[k] = ~src[k] & BITMAP_LAST_WORD_MASK(bits);
+ }
+}
+
+int slow_bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, long bits)
+{
+ long k;
+ long nr = BITS_TO_LONGS(bits);
+ unsigned long result = 0;
+
+ for (k = 0; k < nr; k++) {
+ result |= (dst[k] = bitmap1[k] & bitmap2[k]);
+ }
+ return result != 0;
+}
+
+void slow_bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, long bits)
+{
+ long k;
+ long nr = BITS_TO_LONGS(bits);
+
+ for (k = 0; k < nr; k++) {
+ dst[k] = bitmap1[k] | bitmap2[k];
+ }
+}
+
+void slow_bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, long bits)
+{
+ long k;
+ long nr = BITS_TO_LONGS(bits);
+
+ for (k = 0; k < nr; k++) {
+ dst[k] = bitmap1[k] ^ bitmap2[k];
+ }
+}
+
+int slow_bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, long bits)
+{
+ long k;
+ long nr = BITS_TO_LONGS(bits);
+ unsigned long result = 0;
+
+ for (k = 0; k < nr; k++) {
+ result |= (dst[k] = bitmap1[k] & ~bitmap2[k]);
+ }
+ return result != 0;
+}
+
+void bitmap_set(unsigned long *map, long start, long nr)
+{
+ unsigned long *p = map + BIT_WORD(start);
+ const long size = start + nr;
+ int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+ unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
+
+ assert(start >= 0 && nr >= 0);
+
+ while (nr - bits_to_set >= 0) {
+ *p |= mask_to_set;
+ nr -= bits_to_set;
+ bits_to_set = BITS_PER_LONG;
+ mask_to_set = ~0UL;
+ p++;
+ }
+ if (nr) {
+ mask_to_set &= BITMAP_LAST_WORD_MASK(size);
+ *p |= mask_to_set;
+ }
+}
+
+void bitmap_set_atomic(unsigned long *map, long start, long nr)
+{
+ unsigned long *p = map + BIT_WORD(start);
+ const long size = start + nr;
+ int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+ unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
+
+ assert(start >= 0 && nr >= 0);
+
+ /* First word */
+ if (nr - bits_to_set > 0) {
+ qatomic_or(p, mask_to_set);
+ nr -= bits_to_set;
+ bits_to_set = BITS_PER_LONG;
+ mask_to_set = ~0UL;
+ p++;
+ }
+
+ /* Full words */
+ if (bits_to_set == BITS_PER_LONG) {
+ while (nr >= BITS_PER_LONG) {
+ *p = ~0UL;
+ nr -= BITS_PER_LONG;
+ p++;
+ }
+ }
+
+ /* Last word */
+ if (nr) {
+ mask_to_set &= BITMAP_LAST_WORD_MASK(size);
+ qatomic_or(p, mask_to_set);
+ } else {
+ /* If we avoided the full barrier in qatomic_or(), issue a
+ * barrier to account for the assignments in the while loop.
+ */
+ smp_mb();
+ }
+}
+
+void bitmap_clear(unsigned long *map, long start, long nr)
+{
+ unsigned long *p = map + BIT_WORD(start);
+ const long size = start + nr;
+ int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+ unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+ assert(start >= 0 && nr >= 0);
+
+ while (nr - bits_to_clear >= 0) {
+ *p &= ~mask_to_clear;
+ nr -= bits_to_clear;
+ bits_to_clear = BITS_PER_LONG;
+ mask_to_clear = ~0UL;
+ p++;
+ }
+ if (nr) {
+ mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+ *p &= ~mask_to_clear;
+ }
+}
+
+bool bitmap_test_and_clear_atomic(unsigned long *map, long start, long nr)
+{
+ unsigned long *p = map + BIT_WORD(start);
+ const long size = start + nr;
+ int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+ unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+ unsigned long dirty = 0;
+ unsigned long old_bits;
+
+ assert(start >= 0 && nr >= 0);
+
+ /* First word */
+ if (nr - bits_to_clear > 0) {
+ old_bits = qatomic_fetch_and(p, ~mask_to_clear);
+ dirty |= old_bits & mask_to_clear;
+ nr -= bits_to_clear;
+ bits_to_clear = BITS_PER_LONG;
+ mask_to_clear = ~0UL;
+ p++;
+ }
+
+ /* Full words */
+ if (bits_to_clear == BITS_PER_LONG) {
+ while (nr >= BITS_PER_LONG) {
+ if (*p) {
+ old_bits = qatomic_xchg(p, 0);
+ dirty |= old_bits;
+ }
+ nr -= BITS_PER_LONG;
+ p++;
+ }
+ }
+
+ /* Last word */
+ if (nr) {
+ mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+ old_bits = qatomic_fetch_and(p, ~mask_to_clear);
+ dirty |= old_bits & mask_to_clear;
+ } else {
+ if (!dirty) {
+ smp_mb();
+ }
+ }
+
+ return dirty != 0;
+}
+
+void bitmap_copy_and_clear_atomic(unsigned long *dst, unsigned long *src,
+ long nr)
+{
+ while (nr > 0) {
+ *dst = qatomic_xchg(src, 0);
+ dst++;
+ src++;
+ nr -= BITS_PER_LONG;
+ }
+}
+
+#define ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
+
+/**
+ * bitmap_find_next_zero_area - find a contiguous aligned zero area
+ * @map: The address to base the search on
+ * @size: The bitmap size in bits
+ * @start: The bitnumber to start searching at
+ * @nr: The number of zeroed bits we're looking for
+ * @align_mask: Alignment mask for zero area
+ *
+ * The @align_mask should be one less than a power of 2; the effect is that
+ * the bit offsets of all zero areas this function finds are multiples of
+ * that power of 2. An @align_mask of 0 means no alignment is required.
+ */
+unsigned long bitmap_find_next_zero_area(unsigned long *map,
+ unsigned long size,
+ unsigned long start,
+ unsigned long nr,
+ unsigned long align_mask)
+{
+ unsigned long index, end, i;
+again:
+ index = find_next_zero_bit(map, size, start);
+
+ /* Align allocation */
+ index = ALIGN_MASK(index, align_mask);
+
+ end = index + nr;
+ if (end > size) {
+ return end;
+ }
+ i = find_next_bit(map, end, index);
+ if (i < end) {
+ start = i + 1;
+ goto again;
+ }
+ return index;
+}
+
+int slow_bitmap_intersects(const unsigned long *bitmap1,
+ const unsigned long *bitmap2, long bits)
+{
+ long k, lim = bits/BITS_PER_LONG;
+
+ for (k = 0; k < lim; ++k) {
+ if (bitmap1[k] & bitmap2[k]) {
+ return 1;
+ }
+ }
+
+ if (bits % BITS_PER_LONG) {
+ if ((bitmap1[k] & bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+long slow_bitmap_count_one(const unsigned long *bitmap, long nbits)
+{
+ long k, lim = nbits / BITS_PER_LONG, result = 0;
+
+ for (k = 0; k < lim; k++) {
+ result += ctpopl(bitmap[k]);
+ }
+
+ if (nbits % BITS_PER_LONG) {
+ result += ctpopl(bitmap[k] & BITMAP_LAST_WORD_MASK(nbits));
+ }
+
+ return result;
+}
+
+static void bitmap_to_from_le(unsigned long *dst,
+ const unsigned long *src, long nbits)
+{
+ long len = BITS_TO_LONGS(nbits);
+
+#ifdef HOST_WORDS_BIGENDIAN
+ long index;
+
+ for (index = 0; index < len; index++) {
+# if HOST_LONG_BITS == 64
+ dst[index] = bswap64(src[index]);
+# else
+ dst[index] = bswap32(src[index]);
+# endif
+ }
+#else
+ memcpy(dst, src, len * sizeof(unsigned long));
+#endif
+}
+
+void bitmap_from_le(unsigned long *dst, const unsigned long *src,
+ long nbits)
+{
+ bitmap_to_from_le(dst, src, nbits);
+}
+
+void bitmap_to_le(unsigned long *dst, const unsigned long *src,
+ long nbits)
+{
+ bitmap_to_from_le(dst, src, nbits);
+}
+
+/*
+ * Copy "src" bitmap with a positive offset and put it into the "dst"
+ * bitmap. The caller needs to make sure the bitmap size of "src"
+ * is bigger than (shift + nbits).
+ */
+void bitmap_copy_with_src_offset(unsigned long *dst, const unsigned long *src,
+ unsigned long shift, unsigned long nbits)
+{
+ unsigned long left_mask, right_mask, last_mask;
+
+ /* Shift the src pointer to the first word to copy from */
+ src += BIT_WORD(shift);
+ shift %= BITS_PER_LONG;
+
+ if (!shift) {
+ /* Fast path */
+ bitmap_copy(dst, src, nbits);
+ return;
+ }
+
+ right_mask = (1ul << shift) - 1;
+ left_mask = ~right_mask;
+
+ while (nbits >= BITS_PER_LONG) {
+ *dst = (*src & left_mask) >> shift;
+ *dst |= (src[1] & right_mask) << (BITS_PER_LONG - shift);
+ dst++;
+ src++;
+ nbits -= BITS_PER_LONG;
+ }
+
+ if (nbits > BITS_PER_LONG - shift) {
+ *dst = (*src & left_mask) >> shift;
+ nbits -= BITS_PER_LONG - shift;
+ last_mask = (1ul << nbits) - 1;
+ *dst |= (src[1] & last_mask) << (BITS_PER_LONG - shift);
+ } else if (nbits) {
+ last_mask = (1ul << nbits) - 1;
+ *dst = (*src >> shift) & last_mask;
+ }
+}
+
+/*
+ * Copy "src" bitmap into the "dst" bitmap with an offset in the
+ * "dst". The caller needs to make sure the bitmap size of "dst" is
+ * bigger than (shift + nbits).
+ */
+void bitmap_copy_with_dst_offset(unsigned long *dst, const unsigned long *src,
+ unsigned long shift, unsigned long nbits)
+{
+ unsigned long left_mask, right_mask, last_mask;
+
+ /* Shift the dst pointer to the first word to copy to */
+ dst += BIT_WORD(shift);
+ shift %= BITS_PER_LONG;
+
+ if (!shift) {
+ /* Fast path */
+ bitmap_copy(dst, src, nbits);
+ return;
+ }
+
+ right_mask = (1ul << (BITS_PER_LONG - shift)) - 1;
+ left_mask = ~right_mask;
+
+ *dst &= (1ul << shift) - 1;
+ while (nbits >= BITS_PER_LONG) {
+ *dst |= (*src & right_mask) << shift;
+ dst[1] = (*src & left_mask) >> (BITS_PER_LONG - shift);
+ dst++;
+ src++;
+ nbits -= BITS_PER_LONG;
+ }
+
+ if (nbits > BITS_PER_LONG - shift) {
+ *dst |= (*src & right_mask) << shift;
+ nbits -= BITS_PER_LONG - shift;
+ last_mask = ((1ul << nbits) - 1) << (BITS_PER_LONG - shift);
+ dst[1] = (*src & last_mask) >> (BITS_PER_LONG - shift);
+ } else if (nbits) {
+ last_mask = (1ul << nbits) - 1;
+ *dst |= (*src & last_mask) << shift;
+ }
+}
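
Taken together, bitmap_set(), bitmap_clear() and bitmap_find_next_zero_area() form a simple first-fit allocator. A sketch, assuming DECLARE_BITMAP() from "qemu/bitmap.h" (illustrative, not from this patch):

    #include "qemu/osdep.h"
    #include "qemu/bitmap.h"

    #define POOL_BITS 1024
    static DECLARE_BITMAP(pool, POOL_BITS);

    static long pool_alloc(unsigned long nr)
    {
        unsigned long idx;

        idx = bitmap_find_next_zero_area(pool, POOL_BITS, 0, nr,
                                         0 /* no alignment */);
        if (idx >= POOL_BITS) {
            return -1;          /* on failure the returned end is > size */
        }
        bitmap_set(pool, idx, nr);
        return idx;
    }

    static void pool_free(long idx, unsigned long nr)
    {
        bitmap_clear(pool, idx, nr);
    }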
diff --git a/util/bitops.c b/util/bitops.c
new file mode 100644
index 000000000..3fe6b1c4f
--- /dev/null
+++ b/util/bitops.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ * Copyright (C) 2008 IBM Corporation
+ * Written by Rusty Russell <rusty@rustcorp.com.au>
+ * (Inspired by David Howells' find_next_bit implementation)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/bitops.h"
+
+/*
+ * Find the next set bit in a memory region.
+ */
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ const unsigned long *p = addr + BIT_WORD(offset);
+ unsigned long result = offset & ~(BITS_PER_LONG-1);
+ unsigned long tmp;
+
+ if (offset >= size) {
+ return size;
+ }
+ size -= result;
+ offset %= BITS_PER_LONG;
+ if (offset) {
+ tmp = *(p++);
+ tmp &= (~0UL << offset);
+ if (size < BITS_PER_LONG) {
+ goto found_first;
+ }
+ if (tmp) {
+ goto found_middle;
+ }
+ size -= BITS_PER_LONG;
+ result += BITS_PER_LONG;
+ }
+ while (size >= 4*BITS_PER_LONG) {
+ unsigned long d1, d2, d3;
+ tmp = *p;
+ d1 = *(p+1);
+ d2 = *(p+2);
+ d3 = *(p+3);
+ if (tmp) {
+ goto found_middle;
+ }
+ if (d1 | d2 | d3) {
+ break;
+ }
+ p += 4;
+ result += 4*BITS_PER_LONG;
+ size -= 4*BITS_PER_LONG;
+ }
+ while (size >= BITS_PER_LONG) {
+ if ((tmp = *(p++))) {
+ goto found_middle;
+ }
+ result += BITS_PER_LONG;
+ size -= BITS_PER_LONG;
+ }
+ if (!size) {
+ return result;
+ }
+ tmp = *p;
+
+found_first:
+ tmp &= (~0UL >> (BITS_PER_LONG - size));
+ if (tmp == 0UL) { /* Are any bits set? */
+ return result + size; /* Nope. */
+ }
+found_middle:
+ return result + ctzl(tmp);
+}
+
+/*
+ * This implementation of find_{first,next}_zero_bit was stolen from
+ * Linus' asm-alpha/bitops.h.
+ */
+unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ const unsigned long *p = addr + BIT_WORD(offset);
+ unsigned long result = offset & ~(BITS_PER_LONG-1);
+ unsigned long tmp;
+
+ if (offset >= size) {
+ return size;
+ }
+ size -= result;
+ offset %= BITS_PER_LONG;
+ if (offset) {
+ tmp = *(p++);
+ tmp |= ~0UL >> (BITS_PER_LONG - offset);
+ if (size < BITS_PER_LONG) {
+ goto found_first;
+ }
+ if (~tmp) {
+ goto found_middle;
+ }
+ size -= BITS_PER_LONG;
+ result += BITS_PER_LONG;
+ }
+ while (size & ~(BITS_PER_LONG-1)) {
+ if (~(tmp = *(p++))) {
+ goto found_middle;
+ }
+ result += BITS_PER_LONG;
+ size -= BITS_PER_LONG;
+ }
+ if (!size) {
+ return result;
+ }
+ tmp = *p;
+
+found_first:
+ tmp |= ~0UL << size;
+ if (tmp == ~0UL) { /* Are any bits zero? */
+ return result + size; /* Nope. */
+ }
+found_middle:
+ return result + ctzl(~tmp);
+}
+
+unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+{
+ unsigned long words;
+ unsigned long tmp;
+
+ /* Start at final word. */
+ words = size / BITS_PER_LONG;
+
+ /* Partial final word? */
+ if (size & (BITS_PER_LONG-1)) {
+ tmp = (addr[words] & (~0UL >> (BITS_PER_LONG
+ - (size & (BITS_PER_LONG-1)))));
+ if (tmp) {
+ goto found;
+ }
+ }
+
+ while (words) {
+ tmp = addr[--words];
+ if (tmp) {
+ found:
+ return words * BITS_PER_LONG + BITS_PER_LONG - 1 - clzl(tmp);
+ }
+ }
+
+ /* Not found */
+ return size;
+}
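
The return convention of these scanners (the size is returned when nothing is found) makes iteration loops straightforward. A minimal sketch:

    #include "qemu/osdep.h"
    #include "qemu/bitops.h"

    static void dump_set_bits(const unsigned long *map, unsigned long nbits)
    {
        unsigned long bit;

        /* find_next_bit() returns nbits once no set bit remains. */
        for (bit = find_next_bit(map, nbits, 0);
             bit < nbits;
             bit = find_next_bit(map, nbits, bit + 1)) {
            printf("bit %lu is set\n", bit);
        }
    }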
diff --git a/util/block-helpers.c b/util/block-helpers.c
new file mode 100644
index 000000000..c4851432f
--- /dev/null
+++ b/util/block-helpers.c
@@ -0,0 +1,46 @@
+/*
+ * Block utility functions
+ *
+ * Copyright IBM, Corp. 2011
+ * Copyright (c) 2020 Coiby Xu <coiby.xu@gmail.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "block-helpers.h"
+
+/**
+ * check_block_size:
+ * @id: The unique ID of the object
+ * @name: The name of the property being validated
+ * @value: The block size in bytes
+ * @errp: A pointer to an area to store an error
+ *
+ * This function checks that the block size meets the following conditions:
+ * 1. At least MIN_BLOCK_SIZE
+ * 2. No larger than MAX_BLOCK_SIZE
+ * 3. A power of 2
+ */
+void check_block_size(const char *id, const char *name, int64_t value,
+ Error **errp)
+{
+ /* value of 0 means "unset" */
+ if (value && (value < MIN_BLOCK_SIZE || value > MAX_BLOCK_SIZE)) {
+ error_setg(errp, QERR_PROPERTY_VALUE_OUT_OF_RANGE,
+ id, name, value, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
+ return;
+ }
+
+ /* We rely on power-of-2 blocksizes for bitmasks */
+ if ((value & (value - 1)) != 0) {
+ error_setg(errp,
+ "Property %s.%s doesn't take value '%" PRId64
+ "', it's not a power of 2",
+ id, name, value);
+ return;
+ }
+}
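
A sketch of the intended calling pattern, with hypothetical id/property names; note that a value of 0 passes both checks (it means "unset"), while any non-zero value must be in range and a power of 2:

    #include "qemu/osdep.h"
    #include "qapi/error.h"
    #include "block-helpers.h"

    static bool set_logical_block_size(int64_t value, Error **errp)
    {
        Error *local_err = NULL;

        check_block_size("blk0", "logical_block_size", value, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return false;
        }
        return true;
    }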
diff --git a/util/block-helpers.h b/util/block-helpers.h
new file mode 100644
index 000000000..b53295a52
--- /dev/null
+++ b/util/block-helpers.h
@@ -0,0 +1,19 @@
+#ifndef BLOCK_HELPERS_H
+#define BLOCK_HELPERS_H
+
+#include "qemu/units.h"
+
+/* lower limit is sector size */
+#define MIN_BLOCK_SIZE INT64_C(512)
+#define MIN_BLOCK_SIZE_STR "512 B"
+/*
+ * upper limit is arbitrary, 2 MiB looks sufficient for all sensible uses, and
+ * matches qcow2 cluster size limit
+ */
+#define MAX_BLOCK_SIZE (2 * MiB)
+#define MAX_BLOCK_SIZE_STR "2 MiB"
+
+void check_block_size(const char *id, const char *name, int64_t value,
+ Error **errp);
+
+#endif /* BLOCK_HELPERS_H */
diff --git a/util/buffer.c b/util/buffer.c
new file mode 100644
index 000000000..743eaa930
--- /dev/null
+++ b/util/buffer.c
@@ -0,0 +1,173 @@
+/*
+ * QEMU generic buffers
+ *
+ * Copyright (c) 2015 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "qemu/buffer.h"
+#include "trace.h"
+
+#define BUFFER_MIN_INIT_SIZE 4096
+#define BUFFER_MIN_SHRINK_SIZE 65536
+
+/* Define the factor alpha for the exponential smoothing
+ * that is used in the average size calculation. A shift
+ * of 7 results in an alpha of 1/2^7. */
+
+static size_t buffer_req_size(Buffer *buffer, size_t len)
+{
+ return MAX(BUFFER_MIN_INIT_SIZE,
+ pow2ceil(buffer->offset + len));
+}
+
+static void buffer_adj_size(Buffer *buffer, size_t len)
+{
+ size_t old = buffer->capacity;
+ buffer->capacity = buffer_req_size(buffer, len);
+ buffer->buffer = g_realloc(buffer->buffer, buffer->capacity);
+ trace_buffer_resize(buffer->name ?: "unnamed",
+ old, buffer->capacity);
+
+ /* make it even harder for the buffer to shrink, reset average size
+ * to current capacity if it is larger than the average. */
+ buffer->avg_size = MAX(buffer->avg_size,
+ buffer->capacity << BUFFER_AVG_SIZE_SHIFT);
+}
+
+void buffer_init(Buffer *buffer, const char *name, ...)
+{
+ va_list ap;
+
+ va_start(ap, name);
+ buffer->name = g_strdup_vprintf(name, ap);
+ va_end(ap);
+}
+
+static uint64_t buffer_get_avg_size(Buffer *buffer)
+{
+ return buffer->avg_size >> BUFFER_AVG_SIZE_SHIFT;
+}
+
+void buffer_shrink(Buffer *buffer)
+{
+ size_t new;
+
+ /* Calculate the average size of the buffer as
+ * avg_size = avg_size * ( 1 - a ) + required_size * a
+ * where a is 1 / 2 ^ BUFFER_AVG_SIZE_SHIFT. */
+ buffer->avg_size *= (1 << BUFFER_AVG_SIZE_SHIFT) - 1;
+ buffer->avg_size >>= BUFFER_AVG_SIZE_SHIFT;
+ buffer->avg_size += buffer_req_size(buffer, 0);
+
+ /* And then only shrink if the average size of the buffer is much
+ * too big, to avoid bumping up & down the buffers all the time.
+ * realloc() isn't exactly cheap ... */
+ new = buffer_req_size(buffer, buffer_get_avg_size(buffer));
+ if (new < buffer->capacity >> 3 &&
+ new >= BUFFER_MIN_SHRINK_SIZE) {
+ buffer_adj_size(buffer, buffer_get_avg_size(buffer));
+ }
+
+ buffer_adj_size(buffer, 0);
+}
+
+void buffer_reserve(Buffer *buffer, size_t len)
+{
+ if ((buffer->capacity - buffer->offset) < len) {
+ buffer_adj_size(buffer, len);
+ }
+}
+
+gboolean buffer_empty(Buffer *buffer)
+{
+ return buffer->offset == 0;
+}
+
+uint8_t *buffer_end(Buffer *buffer)
+{
+ return buffer->buffer + buffer->offset;
+}
+
+void buffer_reset(Buffer *buffer)
+{
+ buffer->offset = 0;
+ buffer_shrink(buffer);
+}
+
+void buffer_free(Buffer *buffer)
+{
+ trace_buffer_free(buffer->name ?: "unnamed", buffer->capacity);
+ g_free(buffer->buffer);
+ g_free(buffer->name);
+ buffer->offset = 0;
+ buffer->capacity = 0;
+ buffer->buffer = NULL;
+ buffer->name = NULL;
+}
+
+void buffer_append(Buffer *buffer, const void *data, size_t len)
+{
+ memcpy(buffer->buffer + buffer->offset, data, len);
+ buffer->offset += len;
+}
+
+void buffer_advance(Buffer *buffer, size_t len)
+{
+ memmove(buffer->buffer, buffer->buffer + len,
+ (buffer->offset - len));
+ buffer->offset -= len;
+ buffer_shrink(buffer);
+}
+
+void buffer_move_empty(Buffer *to, Buffer *from)
+{
+ trace_buffer_move_empty(to->name ?: "unnamed",
+ from->offset,
+ from->name ?: "unnamed");
+ assert(to->offset == 0);
+
+ g_free(to->buffer);
+ to->offset = from->offset;
+ to->capacity = from->capacity;
+ to->buffer = from->buffer;
+
+ from->offset = 0;
+ from->capacity = 0;
+ from->buffer = NULL;
+}
+
+void buffer_move(Buffer *to, Buffer *from)
+{
+ if (to->offset == 0) {
+ buffer_move_empty(to, from);
+ return;
+ }
+
+ trace_buffer_move(to->name ?: "unnamed",
+ from->offset,
+ from->name ?: "unnamed");
+ buffer_reserve(to, from->offset);
+ buffer_append(to, from->buffer, from->offset);
+
+ g_free(from->buffer);
+ from->offset = 0;
+ from->capacity = 0;
+ from->buffer = NULL;
+}
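
The lifecycle implied above, as a short sketch assuming the Buffer type from "qemu/buffer.h": callers reserve before appending, since buffer_append() itself never grows the allocation:

    #include "qemu/osdep.h"
    #include "qemu/buffer.h"

    static void buffer_demo(void)
    {
        Buffer buf = { 0 };

        buffer_init(&buf, "demo-%d", 1);   /* printf-style debug name */
        buffer_reserve(&buf, 5);           /* grow capacity first ...  */
        buffer_append(&buf, "hello", 5);   /* ... append assumes room  */

        while (!buffer_empty(&buf)) {
            buffer_advance(&buf, 1);       /* consume from the front   */
        }
        buffer_free(&buf);
    }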
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
new file mode 100644
index 000000000..695bb4ce2
--- /dev/null
+++ b/util/bufferiszero.c
@@ -0,0 +1,355 @@
+/*
+ * Simple C functions to supplement the C library
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "qemu/bswap.h"
+
+static bool
+buffer_zero_int(const void *buf, size_t len)
+{
+ if (unlikely(len < 8)) {
+ /* For a very small buffer, simply accumulate all the bytes. */
+ const unsigned char *p = buf;
+ const unsigned char *e = buf + len;
+ unsigned char t = 0;
+
+ do {
+ t |= *p++;
+ } while (p < e);
+
+ return t == 0;
+ } else {
+ /* Otherwise, use the unaligned memory access functions to
+ handle the beginning and end of the buffer, with a couple
+ of loops handling the middle aligned section. */
+ uint64_t t = ldq_he_p(buf);
+ const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8);
+ const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);
+
+ for (; p + 8 <= e; p += 8) {
+ __builtin_prefetch(p + 8);
+ if (t) {
+ return false;
+ }
+ t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
+ }
+ while (p < e) {
+ t |= *p++;
+ }
+ t |= ldq_he_p(buf + len - 8);
+
+ return t == 0;
+ }
+}
+
+#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
+/* Do not use push_options pragmas unnecessarily, because clang
+ * does not support them.
+ */
+#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#endif
+#include <emmintrin.h>
+
+/* Note that each of these vectorized functions require len >= 64. */
+
+static bool
+buffer_zero_sse2(const void *buf, size_t len)
+{
+ __m128i t = _mm_loadu_si128(buf);
+ __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
+ __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
+ __m128i zero = _mm_setzero_si128();
+
+ /* Loop over 16-byte aligned blocks of 64. */
+ while (likely(p <= e)) {
+ __builtin_prefetch(p);
+ t = _mm_cmpeq_epi8(t, zero);
+ if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
+ return false;
+ }
+ t = p[-4] | p[-3] | p[-2] | p[-1];
+ p += 4;
+ }
+
+ /* Finish the aligned tail. */
+ t |= e[-3];
+ t |= e[-2];
+ t |= e[-1];
+
+ /* Finish the unaligned tail. */
+ t |= _mm_loadu_si128(buf + len - 16);
+
+ return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
+}
+#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
+#pragma GCC pop_options
+#endif
+
+#ifdef CONFIG_AVX2_OPT
+/* Note that due to restrictions/bugs wrt __builtin functions in gcc <= 4.8,
+ * the includes have to be within the corresponding push_options region, and
+ * therefore the regions themselves have to be ordered with increasing ISA.
+ */
+#pragma GCC push_options
+#pragma GCC target("sse4")
+#include <smmintrin.h>
+
+static bool
+buffer_zero_sse4(const void *buf, size_t len)
+{
+ __m128i t = _mm_loadu_si128(buf);
+ __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
+ __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
+
+ /* Loop over 16-byte aligned blocks of 64. */
+ while (likely(p <= e)) {
+ __builtin_prefetch(p);
+ if (unlikely(!_mm_testz_si128(t, t))) {
+ return false;
+ }
+ t = p[-4] | p[-3] | p[-2] | p[-1];
+ p += 4;
+ }
+
+ /* Finish the aligned tail. */
+ t |= e[-3];
+ t |= e[-2];
+ t |= e[-1];
+
+ /* Finish the unaligned tail. */
+ t |= _mm_loadu_si128(buf + len - 16);
+
+ return _mm_testz_si128(t, t);
+}
+
+#pragma GCC pop_options
+#pragma GCC push_options
+#pragma GCC target("avx2")
+#include <immintrin.h>
+
+static bool
+buffer_zero_avx2(const void *buf, size_t len)
+{
+ /* Begin with an unaligned head of 32 bytes. */
+ __m256i t = _mm256_loadu_si256(buf);
+ __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
+ __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);
+
+ /* Loop over 32-byte aligned blocks of 128. */
+ while (p <= e) {
+ __builtin_prefetch(p);
+ if (unlikely(!_mm256_testz_si256(t, t))) {
+ return false;
+ }
+ t = p[-4] | p[-3] | p[-2] | p[-1];
+ p += 4;
+ }
+
+ /* Finish the last block of 128 unaligned. */
+ t |= _mm256_loadu_si256(buf + len - 4 * 32);
+ t |= _mm256_loadu_si256(buf + len - 3 * 32);
+ t |= _mm256_loadu_si256(buf + len - 2 * 32);
+ t |= _mm256_loadu_si256(buf + len - 1 * 32);
+
+ return _mm256_testz_si256(t, t);
+}
+#pragma GCC pop_options
+#endif /* CONFIG_AVX2_OPT */
+
+#ifdef CONFIG_AVX512F_OPT
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#include <immintrin.h>
+
+static bool
+buffer_zero_avx512(const void *buf, size_t len)
+{
+ /* Begin with an unaligned head of 64 bytes. */
+ __m512i t = _mm512_loadu_si512(buf);
+ __m512i *p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
+ __m512i *e = (__m512i *)(((uintptr_t)buf + len) & -64);
+
+ /* Loop over 64-byte aligned blocks of 256. */
+ while (p <= e) {
+ __builtin_prefetch(p);
+ if (unlikely(_mm512_test_epi64_mask(t, t))) {
+ return false;
+ }
+ t = p[-4] | p[-3] | p[-2] | p[-1];
+ p += 4;
+ }
+
+ t |= _mm512_loadu_si512(buf + len - 4 * 64);
+ t |= _mm512_loadu_si512(buf + len - 3 * 64);
+ t |= _mm512_loadu_si512(buf + len - 2 * 64);
+ t |= _mm512_loadu_si512(buf + len - 1 * 64);
+
+ return !_mm512_test_epi64_mask(t, t);
+
+}
+#pragma GCC pop_options
+#endif
+
+
+/* Note that for test_buffer_is_zero_next_accel, the most preferred
+ * ISA must occupy the least significant bit.
+ */
+#define CACHE_AVX512F 1
+#define CACHE_AVX2 2
+#define CACHE_SSE4 4
+#define CACHE_SSE2 8
+
+/* Make sure that these variables are appropriately initialized when
+ * SSE2 is enabled on the compiler command-line, but the compiler is
+ * too old to support CONFIG_AVX2_OPT.
+ */
+#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
+# define INIT_CACHE 0
+# define INIT_ACCEL buffer_zero_int
+#else
+# ifndef __SSE2__
+# error "ISA selection confusion"
+# endif
+# define INIT_CACHE CACHE_SSE2
+# define INIT_ACCEL buffer_zero_sse2
+#endif
+
+static unsigned cpuid_cache = INIT_CACHE;
+static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL;
+static int length_to_accel = 64;
+
+static void init_accel(unsigned cache)
+{
+ bool (*fn)(const void *, size_t) = buffer_zero_int;
+ if (cache & CACHE_SSE2) {
+ fn = buffer_zero_sse2;
+ length_to_accel = 64;
+ }
+#ifdef CONFIG_AVX2_OPT
+ if (cache & CACHE_SSE4) {
+ fn = buffer_zero_sse4;
+ length_to_accel = 64;
+ }
+ if (cache & CACHE_AVX2) {
+ fn = buffer_zero_avx2;
+ length_to_accel = 128;
+ }
+#endif
+#ifdef CONFIG_AVX512F_OPT
+ if (cache & CACHE_AVX512F) {
+ fn = buffer_zero_avx512;
+ length_to_accel = 256;
+ }
+#endif
+ buffer_accel = fn;
+}
+
+#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
+#include "qemu/cpuid.h"
+
+static void __attribute__((constructor)) init_cpuid_cache(void)
+{
+ int max = __get_cpuid_max(0, NULL);
+ int a, b, c, d;
+ unsigned cache = 0;
+
+ if (max >= 1) {
+ __cpuid(1, a, b, c, d);
+ if (d & bit_SSE2) {
+ cache |= CACHE_SSE2;
+ }
+ if (c & bit_SSE4_1) {
+ cache |= CACHE_SSE4;
+ }
+
+ /* We must check that AVX is not just available, but usable. */
+ if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+ int bv;
+ __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+ __cpuid_count(7, 0, a, b, c, d);
+ if ((bv & 0x6) == 0x6 && (b & bit_AVX2)) {
+ cache |= CACHE_AVX2;
+ }
+ /* 0xe6:
+ * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+ * and ZMM16-ZMM31 state are enabled by OS)
+ * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+ */
+ if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512F)) {
+ cache |= CACHE_AVX512F;
+ }
+ }
+ }
+ cpuid_cache = cache;
+ init_accel(cache);
+}
+#endif /* CONFIG_AVX512F_OPT || CONFIG_AVX2_OPT */
+
+bool test_buffer_is_zero_next_accel(void)
+{
+ /* If no bits are set, we just tested buffer_zero_int, and there
+ are no more acceleration options to test. */
+ if (cpuid_cache == 0) {
+ return false;
+ }
+ /* Disable the accelerator we used before and select a new one. */
+ cpuid_cache &= cpuid_cache - 1;
+ init_accel(cpuid_cache);
+ return true;
+}
+
+static bool select_accel_fn(const void *buf, size_t len)
+{
+ if (likely(len >= length_to_accel)) {
+ return buffer_accel(buf, len);
+ }
+ return buffer_zero_int(buf, len);
+}
+
+#else
+#define select_accel_fn buffer_zero_int
+bool test_buffer_is_zero_next_accel(void)
+{
+ return false;
+}
+#endif
+
+/*
+ * Checks if a buffer is all zeroes
+ */
+bool buffer_is_zero(const void *buf, size_t len)
+{
+ if (unlikely(len == 0)) {
+ return true;
+ }
+
+ /* Fetch the beginning of the buffer while we select the accelerator. */
+ __builtin_prefetch(buf);
+
+ /* Use an optimized zero check if possible. Note that this also
+ includes a check for an unrolled loop over 64-bit integers. */
+ return select_accel_fn(buf, len);
+}
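
The unit test for this file (not shown in this hunk) cycles through every accelerator the host supports using the hook above; the pattern looks roughly like this:

    #include "qemu/osdep.h"
    #include "qemu/cutils.h"

    static void check_with_all_accels(const void *buf, size_t len,
                                      bool expect_zero)
    {
        do {
            g_assert(buffer_is_zero(buf, len) == expect_zero);
            /* Knock out the current accelerator and retry with the next. */
        } while (test_buffer_is_zero_next_accel());
    }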
diff --git a/util/cacheflush.c b/util/cacheflush.c
new file mode 100644
index 000000000..933355b0c
--- /dev/null
+++ b/util/cacheflush.c
@@ -0,0 +1,146 @@
+/*
+ * Flush the host cpu caches.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cacheflush.h"
+#include "qemu/bitops.h"
+
+
+#if defined(__i386__) || defined(__x86_64__) || defined(__s390__)
+
+/* Caches are coherent and do not require flushing; the symbol is provided
+ * inline in the header. */
+
+#elif defined(__aarch64__)
+
+#ifdef CONFIG_DARWIN
+/* Apple does not expose CTR_EL0, so we must use system interfaces. */
+extern void sys_icache_invalidate(void *start, size_t len);
+extern void sys_dcache_flush(void *start, size_t len);
+void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len)
+{
+ sys_dcache_flush((void *)rw, len);
+ sys_icache_invalidate((void *)rx, len);
+}
+#else
+
+/*
+ * TODO: unify this with cacheinfo.c.
+ * We want to save the whole contents of CTR_EL0, so that we
+ * have more than the linesize, but also IDC and DIC.
+ */
+static uint64_t save_ctr_el0;
+static void __attribute__((constructor)) init_ctr_el0(void)
+{
+ asm volatile("mrs\t%0, ctr_el0" : "=r"(save_ctr_el0));
+}
+
+/*
+ * This is a copy of gcc's __aarch64_sync_cache_range, modified
+ * to fit this three-operand interface.
+ */
+void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len)
+{
+ const unsigned CTR_IDC = 1u << 28;
+ const unsigned CTR_DIC = 1u << 29;
+ const uint64_t ctr_el0 = save_ctr_el0;
+ const uintptr_t icache_lsize = 4 << extract64(ctr_el0, 0, 4);
+ const uintptr_t dcache_lsize = 4 << extract64(ctr_el0, 16, 4);
+ uintptr_t p;
+
+ /*
+ * If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification
+ * is not required for instruction to data coherence.
+ */
+ if (!(ctr_el0 & CTR_IDC)) {
+ /*
+ * Loop over the address range, clearing one cache line at once.
+ * Data cache must be flushed to unification first to make sure
+ * the instruction cache fetches the updated data.
+ */
+ for (p = rw & -dcache_lsize; p < rw + len; p += dcache_lsize) {
+ asm volatile("dc\tcvau, %0" : : "r" (p) : "memory");
+ }
+ asm volatile("dsb\tish" : : : "memory");
+ }
+
+ /*
+ * If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point
+ * of Unification is not required for instruction to data coherence.
+ */
+ if (!(ctr_el0 & CTR_DIC)) {
+ for (p = rx & -icache_lsize; p < rx + len; p += icache_lsize) {
+ asm volatile("ic\tivau, %0" : : "r"(p) : "memory");
+ }
+ asm volatile ("dsb\tish" : : : "memory");
+ }
+
+ asm volatile("isb" : : : "memory");
+}
+#endif /* CONFIG_DARWIN */
+
+#elif defined(__mips__)
+
+#ifdef __OpenBSD__
+#include <machine/sysarch.h>
+#else
+#include <sys/cachectl.h>
+#endif
+
+void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len)
+{
+ if (rx != rw) {
+ cacheflush((void *)rw, len, DCACHE);
+ }
+ cacheflush((void *)rx, len, ICACHE);
+}
+
+#elif defined(__powerpc__)
+
+void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len)
+{
+ uintptr_t p, b, e;
+ size_t dsize = qemu_dcache_linesize;
+ size_t isize = qemu_icache_linesize;
+
+ b = rw & ~(dsize - 1);
+ e = (rw + len + dsize - 1) & ~(dsize - 1);
+ for (p = b; p < e; p += dsize) {
+ asm volatile ("dcbst 0,%0" : : "r"(p) : "memory");
+ }
+ asm volatile ("sync" : : : "memory");
+
+ b = rx & ~(isize - 1);
+ e = (rx + len + isize - 1) & ~(isize - 1);
+ for (p = b; p < e; p += isize) {
+ asm volatile ("icbi 0,%0" : : "r"(p) : "memory");
+ }
+ asm volatile ("sync" : : : "memory");
+ asm volatile ("isync" : : : "memory");
+}
+
+#elif defined(__sparc__)
+
+void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len)
+{
+ /* No additional data flush to the RW virtual address required. */
+ uintptr_t p, end = (rx + len + 7) & -8;
+ for (p = rx & -8; p < end; p += 8) {
+ __asm__ __volatile__("flush\t%0" : : "r" (p));
+ }
+}
+
+#else
+
+void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len)
+{
+ if (rw != rx) {
+ __builtin___clear_cache((char *)rw, (char *)rw + len);
+ }
+ __builtin___clear_cache((char *)rx, (char *)rx + len);
+}
+
+#endif
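
The three-operand interface exists for JIT-style users that write code through one mapping and execute it through another. A hedged sketch of such a call site (hypothetical helper, not from this patch):

    #include "qemu/osdep.h"
    #include "qemu/cacheflush.h"

    /* @rw and @rx alias the same physical pages; only @rw is writable. */
    static void publish_code(uintptr_t rx, uintptr_t rw,
                             const void *code, size_t len)
    {
        memcpy((void *)rw, code, len);
        /* Clean the dcache via @rw, invalidate the icache via @rx. */
        flush_idcache_range(rx, rw, len);
    }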
diff --git a/util/cacheinfo.c b/util/cacheinfo.c
new file mode 100644
index 000000000..b182f0b69
--- /dev/null
+++ b/util/cacheinfo.c
@@ -0,0 +1,199 @@
+/*
+ * cacheinfo.c - helpers to query the host about its caches
+ *
+ * Copyright (C) 2017, Emilio G. Cota <cota@braap.org>
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "qemu/atomic.h"
+
+int qemu_icache_linesize = 0;
+int qemu_icache_linesize_log;
+int qemu_dcache_linesize = 0;
+int qemu_dcache_linesize_log;
+
+/*
+ * Operating system specific detection mechanisms.
+ */
+
+#if defined(_WIN32)
+
+static void sys_cache_info(int *isize, int *dsize)
+{
+ SYSTEM_LOGICAL_PROCESSOR_INFORMATION *buf;
+ DWORD size = 0;
+ BOOL success;
+ size_t i, n;
+
+ /* Check for the required buffer size first. Note that if the zero
+ size we use for the probe results in success, then there is no
+ data available; fail in that case. */
+ success = GetLogicalProcessorInformation(0, &size);
+ if (success || GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+ return;
+ }
+
+ n = size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+ size = n * sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+ buf = g_new0(SYSTEM_LOGICAL_PROCESSOR_INFORMATION, n);
+ if (!GetLogicalProcessorInformation(buf, &size)) {
+ goto fail;
+ }
+
+ for (i = 0; i < n; i++) {
+ if (buf[i].Relationship == RelationCache
+ && buf[i].Cache.Level == 1) {
+ switch (buf[i].Cache.Type) {
+ case CacheUnified:
+ *isize = *dsize = buf[i].Cache.LineSize;
+ break;
+ case CacheInstruction:
+ *isize = buf[i].Cache.LineSize;
+ break;
+ case CacheData:
+ *dsize = buf[i].Cache.LineSize;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ fail:
+ g_free(buf);
+}
+
+#elif defined(__APPLE__)
+# include <sys/sysctl.h>
+static void sys_cache_info(int *isize, int *dsize)
+{
+ /* There's only a single sysctl for both I/D cache line sizes. */
+ long size;
+ size_t len = sizeof(size);
+ if (!sysctlbyname("hw.cachelinesize", &size, &len, NULL, 0)) {
+ *isize = *dsize = size;
+ }
+}
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+# include <sys/sysctl.h>
+static void sys_cache_info(int *isize, int *dsize)
+{
+ /* There's only a single sysctl for both I/D cache line sizes. */
+ int size;
+ size_t len = sizeof(size);
+ if (!sysctlbyname("machdep.cacheline_size", &size, &len, NULL, 0)) {
+ *isize = *dsize = size;
+ }
+}
+#else
+/* POSIX */
+
+static void sys_cache_info(int *isize, int *dsize)
+{
+# ifdef _SC_LEVEL1_ICACHE_LINESIZE
+ int tmp_isize = (int) sysconf(_SC_LEVEL1_ICACHE_LINESIZE);
+ if (tmp_isize > 0) {
+ *isize = tmp_isize;
+ }
+# endif
+# ifdef _SC_LEVEL1_DCACHE_LINESIZE
+ int tmp_dsize = (int) sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+ if (tmp_dsize > 0) {
+ *dsize = tmp_dsize;
+ }
+# endif
+}
+#endif /* sys_cache_info */
+
+/*
+ * Architecture (+ OS) specific detection mechanisms.
+ */
+
+#if defined(__aarch64__)
+
+static void arch_cache_info(int *isize, int *dsize)
+{
+ if (*isize == 0 || *dsize == 0) {
+ uint64_t ctr;
+
+ /* The real cache geometry is in CCSIDR_EL1/CLIDR_EL1/CSSELR_EL1,
+ but (at least under Linux) these are marked protected by the
+ kernel. However, CTR_EL0 contains the minimum linesize in the
+ entire hierarchy, and is used by userspace cache flushing. */
+ asm volatile("mrs\t%0, ctr_el0" : "=r"(ctr));
+ if (*isize == 0) {
+ *isize = 4 << (ctr & 0xf);
+ }
+ if (*dsize == 0) {
+ *dsize = 4 << ((ctr >> 16) & 0xf);
+ }
+ }
+}
+
+#elif defined(_ARCH_PPC) && defined(__linux__)
+# include "elf.h"
+
+static void arch_cache_info(int *isize, int *dsize)
+{
+ if (*isize == 0) {
+ *isize = qemu_getauxval(AT_ICACHEBSIZE);
+ }
+ if (*dsize == 0) {
+ *dsize = qemu_getauxval(AT_DCACHEBSIZE);
+ }
+}
+
+#else
+static void arch_cache_info(int *isize, int *dsize) { }
+#endif /* arch_cache_info */
+
+/*
+ * ... and if all else fails ...
+ */
+
+static void fallback_cache_info(int *isize, int *dsize)
+{
+ /* If we can only find one of the two, assume they're the same. */
+ if (*isize) {
+ if (*dsize) {
+ /* Success! */
+ } else {
+ *dsize = *isize;
+ }
+ } else if (*dsize) {
+ *isize = *dsize;
+ } else {
+#if defined(_ARCH_PPC)
+ /*
+ * For PPC, we're going to use the cache sizes computed for
+ * flush_idcache_range. Which means that we must use the
+ * architecture minimum.
+ */
+ *isize = *dsize = 16;
+#else
+ /* Otherwise, 64 bytes is not uncommon. */
+ *isize = *dsize = 64;
+#endif
+ }
+}
+
+static void __attribute__((constructor)) init_cache_info(void)
+{
+ int isize = 0, dsize = 0;
+
+ sys_cache_info(&isize, &dsize);
+ arch_cache_info(&isize, &dsize);
+ fallback_cache_info(&isize, &dsize);
+
+ assert((isize & (isize - 1)) == 0);
+ assert((dsize & (dsize - 1)) == 0);
+
+ qemu_icache_linesize = isize;
+ qemu_icache_linesize_log = ctz32(isize);
+ qemu_dcache_linesize = dsize;
+ qemu_dcache_linesize_log = ctz32(dsize);
+
+ qatomic64_init();
+}
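
The exported line sizes are meant for exactly the kind of padding done in util/atomic64.c above. A sketch spacing per-thread counters one dcache line apart to avoid false sharing (hypothetical helper, not from this patch):

    #include "qemu/osdep.h"

    static void *counters;

    static void counters_init(unsigned nthreads)
    {
        /* qemu_dcache_linesize is set by the constructor above,
         * which runs before main(). */
        size_t stride = ROUND_UP(sizeof(unsigned long),
                                 qemu_dcache_linesize);

        counters = qemu_memalign(qemu_dcache_linesize, stride * nthreads);
        memset(counters, 0, stride * nthreads);
    }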
diff --git a/util/compatfd.c b/util/compatfd.c
new file mode 100644
index 000000000..ab810c42a
--- /dev/null
+++ b/util/compatfd.c
@@ -0,0 +1,106 @@
+/*
+ * signalfd/eventfd compatibility
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/thread.h"
+
+#if defined(CONFIG_SIGNALFD)
+#include <sys/signalfd.h>
+#endif
+
+struct sigfd_compat_info {
+ sigset_t mask;
+ int fd;
+};
+
+static void *sigwait_compat(void *opaque)
+{
+ struct sigfd_compat_info *info = opaque;
+
+ while (1) {
+ int sig;
+ int err;
+
+ err = sigwait(&info->mask, &sig);
+ if (err != 0) {
+ if (errno == EINTR) {
+ continue;
+ } else {
+ return NULL;
+ }
+ } else {
+ struct qemu_signalfd_siginfo buffer;
+ size_t offset = 0;
+
+ memset(&buffer, 0, sizeof(buffer));
+ buffer.ssi_signo = sig;
+
+ while (offset < sizeof(buffer)) {
+ ssize_t len;
+
+ len = write(info->fd, (char *)&buffer + offset,
+ sizeof(buffer) - offset);
+ if (len == -1 && errno == EINTR) {
+ continue;
+ }
+
+ if (len <= 0) {
+ return NULL;
+ }
+
+ offset += len;
+ }
+ }
+ }
+}
+
+static int qemu_signalfd_compat(const sigset_t *mask)
+{
+ struct sigfd_compat_info *info;
+ QemuThread thread;
+ int fds[2];
+
+ info = g_malloc(sizeof(*info));
+
+ if (pipe(fds) == -1) {
+ g_free(info);
+ return -1;
+ }
+
+ qemu_set_cloexec(fds[0]);
+ qemu_set_cloexec(fds[1]);
+
+ memcpy(&info->mask, mask, sizeof(*mask));
+ info->fd = fds[1];
+
+ qemu_thread_create(&thread, "signalfd_compat", sigwait_compat, info,
+ QEMU_THREAD_DETACHED);
+
+ return fds[0];
+}
+
+int qemu_signalfd(const sigset_t *mask)
+{
+#if defined(CONFIG_SIGNALFD)
+ int ret;
+
+ ret = signalfd(-1, mask, SFD_CLOEXEC);
+ if (ret != -1) {
+ return ret;
+ }
+#endif
+
+ return qemu_signalfd_compat(mask);
+}
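
A sketch of the intended use: the caller blocks the signal first, so it is only consumed via the returned descriptor (or, on hosts without signalfd, by the compat thread above); the qemu_signalfd() declaration is assumed to come from the matching header:

    #include "qemu/osdep.h"
    #include "qemu/thread.h"

    static int open_sigterm_fd(void)
    {
        sigset_t mask;

        sigemptyset(&mask);
        sigaddset(&mask, SIGTERM);
        pthread_sigmask(SIG_BLOCK, &mask, NULL);

        /* Each read() yields a qemu_signalfd_siginfo record. */
        return qemu_signalfd(&mask);
    }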
diff --git a/util/coroutine-sigaltstack.c b/util/coroutine-sigaltstack.c
new file mode 100644
index 000000000..e99b8a4f9
--- /dev/null
+++ b/util/coroutine-sigaltstack.c
@@ -0,0 +1,304 @@
+/*
+ * sigaltstack coroutine initialization code
+ *
+ * Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws>
+ * Copyright (C) 2011 Kevin Wolf <kwolf@redhat.com>
+ * Copyright (C) 2012 Alex Barcelo <abarcelo@ac.upc.edu>
+ * This file is partly based on pth_mctx.c, from the GNU Portable Threads
+ * Copyright (c) 1999-2006 Ralf S. Engelschall <rse@engelschall.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */
+#ifdef _FORTIFY_SOURCE
+#undef _FORTIFY_SOURCE
+#endif
+#include "qemu/osdep.h"
+#include <pthread.h>
+#include "qemu-common.h"
+#include "qemu/coroutine_int.h"
+
+#ifdef CONFIG_SAFESTACK
+#error "SafeStack is not compatible with code run in alternate signal stacks"
+#endif
+
+typedef struct {
+ Coroutine base;
+ void *stack;
+ size_t stack_size;
+ sigjmp_buf env;
+} CoroutineSigAltStack;
+
+/**
+ * Per-thread coroutine bookkeeping
+ */
+typedef struct {
+ /** Currently executing coroutine */
+ Coroutine *current;
+
+ /** The default coroutine */
+ CoroutineSigAltStack leader;
+
+ /** Information for the signal handler (trampoline) */
+ sigjmp_buf tr_reenter;
+ volatile sig_atomic_t tr_called;
+ void *tr_handler;
+} CoroutineThreadState;
+
+static pthread_key_t thread_state_key;
+
+static CoroutineThreadState *coroutine_get_thread_state(void)
+{
+ CoroutineThreadState *s = pthread_getspecific(thread_state_key);
+
+ if (!s) {
+ s = g_malloc0(sizeof(*s));
+ s->current = &s->leader.base;
+ pthread_setspecific(thread_state_key, s);
+ }
+ return s;
+}
+
+static void qemu_coroutine_thread_cleanup(void *opaque)
+{
+ CoroutineThreadState *s = opaque;
+
+ g_free(s);
+}
+
+static void __attribute__((constructor)) coroutine_init(void)
+{
+ int ret;
+
+ ret = pthread_key_create(&thread_state_key, qemu_coroutine_thread_cleanup);
+ if (ret != 0) {
+ fprintf(stderr, "unable to create leader key: %s\n", strerror(errno));
+ abort();
+ }
+}
+
+/* "boot" function
+ * This is what starts the coroutine, is called from the trampoline
+ * (from the signal handler when it is not signal handling, read ahead
+ * for more information).
+ */
+static void coroutine_bootstrap(CoroutineSigAltStack *self, Coroutine *co)
+{
+ /* Initialize longjmp environment and switch back the caller */
+ if (!sigsetjmp(self->env, 0)) {
+ siglongjmp(*(sigjmp_buf *)co->entry_arg, 1);
+ }
+
+ while (true) {
+ co->entry(co->entry_arg);
+ qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
+ }
+}
+
+/*
+ * This is used as the signal handler. This is called with the brand new stack
+ * (thanks to sigaltstack). We have to return, given that this is a signal
+ * handler and the sigmask and some other things are changed.
+ */
+static void coroutine_trampoline(int signal)
+{
+ CoroutineSigAltStack *self;
+ Coroutine *co;
+ CoroutineThreadState *coTS;
+
+ /* Get the thread specific information */
+ coTS = coroutine_get_thread_state();
+ self = coTS->tr_handler;
+ coTS->tr_called = 1;
+ co = &self->base;
+
+ /*
+ * Here we have to do a bit of a ping pong with the caller, given that
+ * this is a signal handler and we have to do a return "soon". Then the
+ * caller can reestablish everything and do a siglongjmp here again.
+ */
+ if (!sigsetjmp(coTS->tr_reenter, 0)) {
+ return;
+ }
+
+ /*
+ * Ok, the caller has siglongjmp'ed back to us, so now prepare
+ * us for the real machine state switching. We have to jump
+ * into another function here to get a new stack context for
+ * the auto variables (which have to be auto-variables
+ * because the start of the thread happens later). Otherwise, with
+ * PIC (i.e. Position Independent Code, which is used when PTH
+ * is built as a shared library), most platforms would
+ * core dump horribly, as experience showed.
+ */
+ coroutine_bootstrap(self, co);
+}
+
+Coroutine *qemu_coroutine_new(void)
+{
+ CoroutineSigAltStack *co;
+ CoroutineThreadState *coTS;
+ struct sigaction sa;
+ struct sigaction osa;
+ stack_t ss;
+ stack_t oss;
+ sigset_t sigs;
+ sigset_t osigs;
+ sigjmp_buf old_env;
+ static pthread_mutex_t sigusr2_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+ /* The way to manipulate the stack is with the sigaltstack function: we
+ * prepare a stack, deliver a signal to ourselves so that it runs on that
+ * stack, and then put sigsetjmp/siglongjmp where needed.
+ * This has been done keeping coroutine-ucontext as a model and with the
+ * pth ideas (GNU Portable Threads). See coroutine-ucontext for the basics
+ * of the coroutines and see pth_mctx.c (from the pth project) for the
+ * sigaltstack way of manipulating stacks.
+ */
+
+ co = g_malloc0(sizeof(*co));
+ co->stack_size = COROUTINE_STACK_SIZE;
+ co->stack = qemu_alloc_stack(&co->stack_size);
+ co->base.entry_arg = &old_env; /* stash away our jmp_buf */
+
+ coTS = coroutine_get_thread_state();
+ coTS->tr_handler = co;
+
+ /*
+ * Preserve the SIGUSR2 signal state, block SIGUSR2,
+ * and establish our signal handler. The signal will
+ * later transfer control onto the signal stack.
+ */
+ sigemptyset(&sigs);
+ sigaddset(&sigs, SIGUSR2);
+ pthread_sigmask(SIG_BLOCK, &sigs, &osigs);
+ sa.sa_handler = coroutine_trampoline;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_ONSTACK;
+
+ /*
+ * sigaction() is a process-global operation. We must not run
+ * this code in multiple threads at once.
+ */
+ pthread_mutex_lock(&sigusr2_mutex);
+ if (sigaction(SIGUSR2, &sa, &osa) != 0) {
+ abort();
+ }
+
+ /*
+ * Set the new stack.
+ */
+ ss.ss_sp = co->stack;
+ ss.ss_size = co->stack_size;
+ ss.ss_flags = 0;
+ if (sigaltstack(&ss, &oss) < 0) {
+ abort();
+ }
+
+ /*
+ * Now transfer control onto the signal stack and set it up.
+ * It will return immediately via "return" after the sigsetjmp()
+ * was performed. Be careful here with race conditions. The
+ * signal can be delivered the first time sigsuspend() is
+ * called.
+ */
+ coTS->tr_called = 0;
+ pthread_kill(pthread_self(), SIGUSR2);
+ sigfillset(&sigs);
+ sigdelset(&sigs, SIGUSR2);
+ while (!coTS->tr_called) {
+ sigsuspend(&sigs);
+ }
+
+ /*
+ * Inform the system that we are back off the signal stack by
+ * removing the alternative signal stack. Be careful here: It
+ * first has to be disabled, before it can be removed.
+ */
+ sigaltstack(NULL, &ss);
+ ss.ss_flags = SS_DISABLE;
+ if (sigaltstack(&ss, NULL) < 0) {
+ abort();
+ }
+ sigaltstack(NULL, &ss);
+ if (!(oss.ss_flags & SS_DISABLE)) {
+ sigaltstack(&oss, NULL);
+ }
+
+ /*
+ * Restore the old SIGUSR2 signal handler and mask
+ */
+ sigaction(SIGUSR2, &osa, NULL);
+ pthread_mutex_unlock(&sigusr2_mutex);
+
+ pthread_sigmask(SIG_SETMASK, &osigs, NULL);
+
+ /*
+ * Now enter the trampoline again, but this time not as a signal
+ * handler. Instead we jump into it directly. The functionally
+ * redundant ping-pong pointer arithmetic is necessary to avoid
+ * type-conversion warnings related to the `volatile' qualifier and
+ * the fact that `jmp_buf' usually is an array type.
+ */
+ if (!sigsetjmp(old_env, 0)) {
+ siglongjmp(coTS->tr_reenter, 1);
+ }
+
+ /*
+ * Ok, we returned again, so now we're finished
+ */
+
+ return &co->base;
+}
+
+void qemu_coroutine_delete(Coroutine *co_)
+{
+ CoroutineSigAltStack *co = DO_UPCAST(CoroutineSigAltStack, base, co_);
+
+ qemu_free_stack(co->stack, co->stack_size);
+ g_free(co);
+}
+
+CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+ CoroutineAction action)
+{
+ CoroutineSigAltStack *from = DO_UPCAST(CoroutineSigAltStack, base, from_);
+ CoroutineSigAltStack *to = DO_UPCAST(CoroutineSigAltStack, base, to_);
+ CoroutineThreadState *s = coroutine_get_thread_state();
+ int ret;
+
+ s->current = to_;
+
+ ret = sigsetjmp(from->env, 0);
+ if (ret == 0) {
+ siglongjmp(to->env, action);
+ }
+ return ret;
+}
+
+Coroutine *qemu_coroutine_self(void)
+{
+ CoroutineThreadState *s = coroutine_get_thread_state();
+
+ return s->current;
+}
+
+bool qemu_in_coroutine(void)
+{
+ CoroutineThreadState *s = pthread_getspecific(thread_state_key);
+
+ return s && s->current->caller;
+}
+
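
For orientation, the generic layer in util/qemu-coroutine.c (not part of this hunk) drives the three primitives defined here roughly as follows; a simplified sketch that ignores the real code's coroutine pool:

    #include "qemu/osdep.h"
    #include "qemu/coroutine_int.h"

    static void enter_once(Coroutine *caller, Coroutine *co)
    {
        co->caller = caller;

        /* Control transfers to @co; the action passed to the matching
         * switch back out of @co becomes our return value. */
        if (qemu_coroutine_switch(caller, co, COROUTINE_ENTER)
            == COROUTINE_TERMINATE) {
            qemu_coroutine_delete(co);
        }
    }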
diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c
new file mode 100644
index 000000000..904b37519
--- /dev/null
+++ b/util/coroutine-ucontext.c
@@ -0,0 +1,332 @@
+/*
+ * ucontext coroutine initialization code
+ *
+ * Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws>
+ * Copyright (C) 2011 Kevin Wolf <kwolf@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.0 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */
+#ifdef _FORTIFY_SOURCE
+#undef _FORTIFY_SOURCE
+#endif
+#include "qemu/osdep.h"
+#include <ucontext.h>
+#include "qemu/coroutine_int.h"
+
+#ifdef CONFIG_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
+
+#if defined(__SANITIZE_ADDRESS__) || __has_feature(address_sanitizer)
+#ifdef CONFIG_ASAN_IFACE_FIBER
+#define CONFIG_ASAN 1
+#include <sanitizer/asan_interface.h>
+#endif
+#endif
+
+#ifdef CONFIG_TSAN
+#include <sanitizer/tsan_interface.h>
+#endif
+
+typedef struct {
+ Coroutine base;
+ void *stack;
+ size_t stack_size;
+#ifdef CONFIG_SAFESTACK
+ /* Need an unsafe stack for each coroutine */
+ void *unsafe_stack;
+ size_t unsafe_stack_size;
+#endif
+ sigjmp_buf env;
+
+#ifdef CONFIG_TSAN
+ void *tsan_co_fiber;
+ void *tsan_caller_fiber;
+#endif
+
+#ifdef CONFIG_VALGRIND_H
+ unsigned int valgrind_stack_id;
+#endif
+
+} CoroutineUContext;
+
+/**
+ * Per-thread coroutine bookkeeping
+ */
+static __thread CoroutineUContext leader;
+static __thread Coroutine *current;
+
+/*
+ * va_args to makecontext() must be type 'int', so passing
+ * the pointer we need may require several int args. This
+ * union is a quick hack to let us do that
+ */
+union cc_arg {
+ void *p;
+ int i[2];
+};
+
+/*
+ * QEMU_ALWAYS_INLINE only forces inlining when __OPTIMIZE__ is defined,
+ * so we cannot use it here.
+ * always_inline is required to avoid TSan runtime fatal errors.
+ */
+static inline __attribute__((always_inline))
+void on_new_fiber(CoroutineUContext *co)
+{
+#ifdef CONFIG_TSAN
+ co->tsan_co_fiber = __tsan_create_fiber(0); /* flags: sync on switch */
+ co->tsan_caller_fiber = __tsan_get_current_fiber();
+#endif
+}
+
+/* always_inline is required to avoid TSan runtime fatal errors. */
+static inline __attribute__((always_inline))
+void finish_switch_fiber(void *fake_stack_save)
+{
+#ifdef CONFIG_ASAN
+ const void *bottom_old;
+ size_t size_old;
+
+ __sanitizer_finish_switch_fiber(fake_stack_save, &bottom_old, &size_old);
+
+ if (!leader.stack) {
+ leader.stack = (void *)bottom_old;
+ leader.stack_size = size_old;
+ }
+#endif
+#ifdef CONFIG_TSAN
+ if (fake_stack_save) {
+ __tsan_release(fake_stack_save);
+ __tsan_switch_to_fiber(fake_stack_save, 0); /* 0=synchronize */
+ }
+#endif
+}
+
+/* always_inline is required to avoid TSan runtime fatal errors. */
+static inline __attribute__((always_inline))
+void start_switch_fiber_asan(CoroutineAction action, void **fake_stack_save,
+ const void *bottom, size_t size)
+{
+#ifdef CONFIG_ASAN
+ __sanitizer_start_switch_fiber(
+ action == COROUTINE_TERMINATE ? NULL : fake_stack_save,
+ bottom, size);
+#endif
+}
+
+/* always_inline is required to avoid TSan runtime fatal errors. */
+static inline __attribute__((always_inline))
+void start_switch_fiber_tsan(void **fake_stack_save,
+ CoroutineUContext *co,
+ bool caller)
+{
+#ifdef CONFIG_TSAN
+ void *new_fiber = caller ?
+ co->tsan_caller_fiber :
+ co->tsan_co_fiber;
+ void *curr_fiber = __tsan_get_current_fiber();
+ __tsan_acquire(curr_fiber);
+
+ *fake_stack_save = curr_fiber;
+ __tsan_switch_to_fiber(new_fiber, 0); /* 0=synchronize */
+#endif
+}
+
+static void coroutine_trampoline(int i0, int i1)
+{
+ union cc_arg arg;
+ CoroutineUContext *self;
+ Coroutine *co;
+ void *fake_stack_save = NULL;
+
+ finish_switch_fiber(NULL);
+
+ arg.i[0] = i0;
+ arg.i[1] = i1;
+ self = arg.p;
+ co = &self->base;
+
+ /* Initialize longjmp environment and switch back the caller */
+ if (!sigsetjmp(self->env, 0)) {
+ start_switch_fiber_asan(COROUTINE_YIELD, &fake_stack_save, leader.stack,
+ leader.stack_size);
+ start_switch_fiber_tsan(&fake_stack_save, self, true); /* true=caller */
+ siglongjmp(*(sigjmp_buf *)co->entry_arg, 1);
+ }
+
+ finish_switch_fiber(fake_stack_save);
+
+ while (true) {
+ co->entry(co->entry_arg);
+ qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
+ }
+}
+
+Coroutine *qemu_coroutine_new(void)
+{
+ CoroutineUContext *co;
+ ucontext_t old_uc, uc;
+ sigjmp_buf old_env;
+ union cc_arg arg = {0};
+ void *fake_stack_save = NULL;
+
+ /* The ucontext functions preserve signal masks which incurs a
+ * system call overhead. sigsetjmp(buf, 0)/siglongjmp() does not
+ * preserve signal masks but only works on the current stack.
+ * Since we need a way to create and switch to a new stack, use
+ * the ucontext functions for that but sigsetjmp()/siglongjmp() for
+ * everything else.
+ */
+
+ if (getcontext(&uc) == -1) {
+ abort();
+ }
+
+ co = g_malloc0(sizeof(*co));
+ co->stack_size = COROUTINE_STACK_SIZE;
+ co->stack = qemu_alloc_stack(&co->stack_size);
+#ifdef CONFIG_SAFESTACK
+ co->unsafe_stack_size = COROUTINE_STACK_SIZE;
+ co->unsafe_stack = qemu_alloc_stack(&co->unsafe_stack_size);
+#endif
+ co->base.entry_arg = &old_env; /* stash away our jmp_buf */
+
+ uc.uc_link = &old_uc;
+ uc.uc_stack.ss_sp = co->stack;
+ uc.uc_stack.ss_size = co->stack_size;
+ uc.uc_stack.ss_flags = 0;
+
+#ifdef CONFIG_VALGRIND_H
+ co->valgrind_stack_id =
+ VALGRIND_STACK_REGISTER(co->stack, co->stack + co->stack_size);
+#endif
+
+ arg.p = co;
+
+ on_new_fiber(co);
+ makecontext(&uc, (void (*)(void))coroutine_trampoline,
+ 2, arg.i[0], arg.i[1]);
+
+ /* swapcontext() in, siglongjmp() back out */
+ if (!sigsetjmp(old_env, 0)) {
+ start_switch_fiber_asan(COROUTINE_YIELD, &fake_stack_save, co->stack,
+ co->stack_size);
+ start_switch_fiber_tsan(&fake_stack_save,
+ co, false); /* false=not caller */
+
+#ifdef CONFIG_SAFESTACK
+ /*
+ * Before we swap the context, set the new unsafe stack
+ * The unsafe stack grows just like the normal stack, so start from
+ * the last usable location of the memory area.
+ * NOTE: we don't have to re-set the usp afterwards because we are
+ * coming back to this context through a siglongjmp.
+ * The compiler already wrapped the corresponding sigsetjmp call with
+ * code that saves the usp on the (safe) stack before the call, and
+ * restores it right after (which is where we return with siglongjmp).
+ */
+ void *usp = co->unsafe_stack + co->unsafe_stack_size;
+ __safestack_unsafe_stack_ptr = usp;
+#endif
+
+ swapcontext(&old_uc, &uc);
+ }
+
+ finish_switch_fiber(fake_stack_save);
+
+ return &co->base;
+}
+
+#ifdef CONFIG_VALGRIND_H
+/* Work around an unused variable in the valgrind.h macro... */
+#if !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#endif
+static inline void valgrind_stack_deregister(CoroutineUContext *co)
+{
+ VALGRIND_STACK_DEREGISTER(co->valgrind_stack_id);
+}
+#if !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+#endif
+
+void qemu_coroutine_delete(Coroutine *co_)
+{
+ CoroutineUContext *co = DO_UPCAST(CoroutineUContext, base, co_);
+
+#ifdef CONFIG_VALGRIND_H
+ valgrind_stack_deregister(co);
+#endif
+
+ qemu_free_stack(co->stack, co->stack_size);
+#ifdef CONFIG_SAFESTACK
+ qemu_free_stack(co->unsafe_stack, co->unsafe_stack_size);
+#endif
+ g_free(co);
+}
+
+/* This function is marked noinline to prevent GCC from inlining it
+ * into coroutine_trampoline(). If we allow it to do that then it
+ * hoists the code to get the address of the TLS variable "current"
+ * out of the while() loop. This is an invalid transformation because
+ * the sigsetjmp() call may be called when running thread A but
+ * return in thread B, and so we might be in a different thread
+ * context each time round the loop.
+ */
+CoroutineAction __attribute__((noinline))
+qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+ CoroutineAction action)
+{
+ CoroutineUContext *from = DO_UPCAST(CoroutineUContext, base, from_);
+ CoroutineUContext *to = DO_UPCAST(CoroutineUContext, base, to_);
+ int ret;
+ void *fake_stack_save = NULL;
+
+ current = to_;
+
+ ret = sigsetjmp(from->env, 0);
+ if (ret == 0) {
+ start_switch_fiber_asan(action, &fake_stack_save, to->stack,
+ to->stack_size);
+ start_switch_fiber_tsan(&fake_stack_save,
+ to, false); /* false=not caller */
+ siglongjmp(to->env, action);
+ }
+
+ finish_switch_fiber(fake_stack_save);
+
+ return ret;
+}
+
+Coroutine *qemu_coroutine_self(void)
+{
+ if (!current) {
+ current = &leader.base;
+ }
+#ifdef CONFIG_TSAN
+ if (!leader.tsan_co_fiber) {
+ leader.tsan_co_fiber = __tsan_get_current_fiber();
+ }
+#endif
+ return current;
+}
+
+bool qemu_in_coroutine(void)
+{
+ return current && current->caller;
+}
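
The heart of the file above is the makecontext()/swapcontext() pattern plus the cc_arg union that smuggles a pointer through makecontext()'s 'int' varargs. A minimal self-contained sketch (illustrative only, names invented):

    #include <stdio.h>
    #include <stdlib.h>
    #include <ucontext.h>

    union cc_arg { void *p; int i[2]; };

    static ucontext_t main_uc, co_uc;

    static void entry(int i0, int i1)
    {
        union cc_arg arg;

        arg.i[0] = i0;
        arg.i[1] = i1;              /* reassemble the pointer */
        printf("coroutine arg: %s\n", (char *)arg.p);
        /* Returning resumes main_uc via uc_link. */
    }

    int main(void)
    {
        char *stack = malloc(64 * 1024);
        union cc_arg arg = {0};

        arg.p = "hello";
        getcontext(&co_uc);
        co_uc.uc_link = &main_uc;
        co_uc.uc_stack.ss_sp = stack;
        co_uc.uc_stack.ss_size = 64 * 1024;
        makecontext(&co_uc, (void (*)(void))entry, 2, arg.i[0], arg.i[1]);

        swapcontext(&main_uc, &co_uc);  /* run entry() on the new stack */
        printf("back in main\n");
        free(stack);
        return 0;
    }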
diff --git a/util/coroutine-win32.c b/util/coroutine-win32.c
new file mode 100644
index 000000000..de6bd4fd3
--- /dev/null
+++ b/util/coroutine-win32.c
@@ -0,0 +1,102 @@
+/*
+ * Win32 coroutine initialization code
+ *
+ * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/coroutine_int.h"
+
+typedef struct
+{
+ Coroutine base;
+
+ LPVOID fiber;
+ CoroutineAction action;
+} CoroutineWin32;
+
+static __thread CoroutineWin32 leader;
+static __thread Coroutine *current;
+
+/* This function is marked noinline to prevent GCC from inlining it
+ * into coroutine_trampoline(). If we allow it to do that then it
+ * hoists the code to get the address of the TLS variable "current"
+ * out of the while() loop. This is an invalid transformation because
+ * the SwitchToFiber() call may be called when running thread A but
+ * return in thread B, and so we might be in a different thread
+ * context each time round the loop.
+ */
+CoroutineAction __attribute__((noinline))
+qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+ CoroutineAction action)
+{
+ CoroutineWin32 *from = DO_UPCAST(CoroutineWin32, base, from_);
+ CoroutineWin32 *to = DO_UPCAST(CoroutineWin32, base, to_);
+
+ current = to_;
+
+ to->action = action;
+ SwitchToFiber(to->fiber);
+ return from->action;
+}
+
+static void CALLBACK coroutine_trampoline(void *co_)
+{
+ Coroutine *co = co_;
+
+ while (true) {
+ co->entry(co->entry_arg);
+ qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
+ }
+}
+
+Coroutine *qemu_coroutine_new(void)
+{
+ const size_t stack_size = COROUTINE_STACK_SIZE;
+ CoroutineWin32 *co;
+
+ co = g_malloc0(sizeof(*co));
+ co->fiber = CreateFiber(stack_size, coroutine_trampoline, &co->base);
+ return &co->base;
+}
+
+void qemu_coroutine_delete(Coroutine *co_)
+{
+ CoroutineWin32 *co = DO_UPCAST(CoroutineWin32, base, co_);
+
+ DeleteFiber(co->fiber);
+ g_free(co);
+}
+
+Coroutine *qemu_coroutine_self(void)
+{
+ if (!current) {
+ current = &leader.base;
+ leader.fiber = ConvertThreadToFiber(NULL);
+ }
+ return current;
+}
+
+bool qemu_in_coroutine(void)
+{
+ return current && current->caller;
+}
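
For comparison, the Win32 fiber API used above reduces to a few calls; a minimal sketch (illustrative only):

    #include <stdio.h>
    #include <windows.h>

    static LPVOID main_fiber;

    static void CALLBACK fiber_entry(void *arg)
    {
        printf("in fiber: %s\n", (char *)arg);
        SwitchToFiber(main_fiber);  /* fibers never return; always switch */
    }

    int main(void)
    {
        LPVOID co;

        main_fiber = ConvertThreadToFiber(NULL);
        co = CreateFiber(0, fiber_entry, "hello");  /* 0 = default stack */
        SwitchToFiber(co);
        printf("back in the main fiber\n");
        DeleteFiber(co);
        return 0;
    }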
diff --git a/util/crc-ccitt.c b/util/crc-ccitt.c
new file mode 100644
index 000000000..b981d8ac5
--- /dev/null
+++ b/util/crc-ccitt.c
@@ -0,0 +1,127 @@
+/*
+ * CRC16 (CCITT) Checksum Algorithm
+ *
+ * Copyright (c) 2021 Wind River Systems, Inc.
+ *
+ * Author:
+ * Bin Meng <bin.meng@windriver.com>
+ *
+ * From Linux kernel v5.10 lib/crc-ccitt.c
+ *
+ * SPDX-License-Identifier: GPL-2.0-only
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/crc-ccitt.h"
+
+/*
+ * This mysterious table is just the CRC of each possible byte. It can be
+ * computed using the standard bit-at-a-time methods. The polynomial can
+ * be seen in entry 128, 0x8408. This corresponds to x^0 + x^5 + x^12.
+ * Add the implicit x^16, and you have the standard CRC-CCITT.
+ */
+uint16_t const crc_ccitt_table[256] = {
+ 0x0000, 0x1189, 0x2312, 0x329b, 0x4624, 0x57ad, 0x6536, 0x74bf,
+ 0x8c48, 0x9dc1, 0xaf5a, 0xbed3, 0xca6c, 0xdbe5, 0xe97e, 0xf8f7,
+ 0x1081, 0x0108, 0x3393, 0x221a, 0x56a5, 0x472c, 0x75b7, 0x643e,
+ 0x9cc9, 0x8d40, 0xbfdb, 0xae52, 0xdaed, 0xcb64, 0xf9ff, 0xe876,
+ 0x2102, 0x308b, 0x0210, 0x1399, 0x6726, 0x76af, 0x4434, 0x55bd,
+ 0xad4a, 0xbcc3, 0x8e58, 0x9fd1, 0xeb6e, 0xfae7, 0xc87c, 0xd9f5,
+ 0x3183, 0x200a, 0x1291, 0x0318, 0x77a7, 0x662e, 0x54b5, 0x453c,
+ 0xbdcb, 0xac42, 0x9ed9, 0x8f50, 0xfbef, 0xea66, 0xd8fd, 0xc974,
+ 0x4204, 0x538d, 0x6116, 0x709f, 0x0420, 0x15a9, 0x2732, 0x36bb,
+ 0xce4c, 0xdfc5, 0xed5e, 0xfcd7, 0x8868, 0x99e1, 0xab7a, 0xbaf3,
+ 0x5285, 0x430c, 0x7197, 0x601e, 0x14a1, 0x0528, 0x37b3, 0x263a,
+ 0xdecd, 0xcf44, 0xfddf, 0xec56, 0x98e9, 0x8960, 0xbbfb, 0xaa72,
+ 0x6306, 0x728f, 0x4014, 0x519d, 0x2522, 0x34ab, 0x0630, 0x17b9,
+ 0xef4e, 0xfec7, 0xcc5c, 0xddd5, 0xa96a, 0xb8e3, 0x8a78, 0x9bf1,
+ 0x7387, 0x620e, 0x5095, 0x411c, 0x35a3, 0x242a, 0x16b1, 0x0738,
+ 0xffcf, 0xee46, 0xdcdd, 0xcd54, 0xb9eb, 0xa862, 0x9af9, 0x8b70,
+ 0x8408, 0x9581, 0xa71a, 0xb693, 0xc22c, 0xd3a5, 0xe13e, 0xf0b7,
+ 0x0840, 0x19c9, 0x2b52, 0x3adb, 0x4e64, 0x5fed, 0x6d76, 0x7cff,
+ 0x9489, 0x8500, 0xb79b, 0xa612, 0xd2ad, 0xc324, 0xf1bf, 0xe036,
+ 0x18c1, 0x0948, 0x3bd3, 0x2a5a, 0x5ee5, 0x4f6c, 0x7df7, 0x6c7e,
+ 0xa50a, 0xb483, 0x8618, 0x9791, 0xe32e, 0xf2a7, 0xc03c, 0xd1b5,
+ 0x2942, 0x38cb, 0x0a50, 0x1bd9, 0x6f66, 0x7eef, 0x4c74, 0x5dfd,
+ 0xb58b, 0xa402, 0x9699, 0x8710, 0xf3af, 0xe226, 0xd0bd, 0xc134,
+ 0x39c3, 0x284a, 0x1ad1, 0x0b58, 0x7fe7, 0x6e6e, 0x5cf5, 0x4d7c,
+ 0xc60c, 0xd785, 0xe51e, 0xf497, 0x8028, 0x91a1, 0xa33a, 0xb2b3,
+ 0x4a44, 0x5bcd, 0x6956, 0x78df, 0x0c60, 0x1de9, 0x2f72, 0x3efb,
+ 0xd68d, 0xc704, 0xf59f, 0xe416, 0x90a9, 0x8120, 0xb3bb, 0xa232,
+ 0x5ac5, 0x4b4c, 0x79d7, 0x685e, 0x1ce1, 0x0d68, 0x3ff3, 0x2e7a,
+ 0xe70e, 0xf687, 0xc41c, 0xd595, 0xa12a, 0xb0a3, 0x8238, 0x93b1,
+ 0x6b46, 0x7acf, 0x4854, 0x59dd, 0x2d62, 0x3ceb, 0x0e70, 0x1ff9,
+ 0xf78f, 0xe606, 0xd49d, 0xc514, 0xb1ab, 0xa022, 0x92b9, 0x8330,
+ 0x7bc7, 0x6a4e, 0x58d5, 0x495c, 0x3de3, 0x2c6a, 0x1ef1, 0x0f78
+};
+
+/*
+ * A similar table for the CRC16 variant known as CRC-CCITT-FALSE.
+ * Reflected bit order; does not augment the final value.
+ */
+uint16_t const crc_ccitt_false_table[256] = {
+ 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7,
+ 0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF,
+ 0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6,
+ 0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE,
+ 0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485,
+ 0xA56A, 0xB54B, 0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D,
+ 0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4,
+ 0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC,
+ 0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823,
+ 0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B,
+ 0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12,
+ 0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A,
+ 0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41,
+ 0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49,
+ 0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 0x3E13, 0x2E32, 0x1E51, 0x0E70,
+ 0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78,
+ 0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F,
+ 0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067,
+ 0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E,
+ 0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256,
+ 0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D,
+ 0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405,
+ 0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C,
+ 0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634,
+ 0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB,
+ 0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3,
+ 0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A,
+ 0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92,
+ 0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9,
+ 0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1,
+ 0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8,
+ 0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0
+};
+
+/**
+ * crc_ccitt - recompute the CRC (CRC-CCITT variant)
+ * for the data buffer
+ *
+ * @crc: previous CRC value
+ * @buffer: data pointer
+ * @len: number of bytes in the buffer
+ */
+uint16_t crc_ccitt(uint16_t crc, uint8_t const *buffer, size_t len)
+{
+ while (len--) {
+ crc = crc_ccitt_byte(crc, *buffer++);
+ }
+ return crc;
+}
+
+/**
+ * crc_ccitt_false - recompute the CRC (CRC-CCITT-FALSE variant)
+ * for the data buffer
+ *
+ * @crc: previous CRC value
+ * @buffer: data pointer
+ * @len: number of bytes in the buffer
+ */
+uint16_t crc_ccitt_false(uint16_t crc, uint8_t const *buffer, size_t len)
+{
+ while (len--) {
+ crc = crc_ccitt_false_byte(crc, *buffer++);
+ }
+ return crc;
+}
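
A usage sketch for the helpers above. The starting value is protocol-dependent and is an assumption of this example, not something the code enforces: 0xffff is the common X.25/HDLC convention for this reflected variant, while other users seed with 0.

    static uint16_t checksum_two_parts(const uint8_t *a, size_t alen,
                                       const uint8_t *b, size_t blen)
    {
        uint16_t crc = 0xffff;

        crc = crc_ccitt(crc, a, alen);
        crc = crc_ccitt(crc, b, blen);  /* same result as one pass */
        return crc;
    }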
diff --git a/util/crc32c.c b/util/crc32c.c
new file mode 100644
index 000000000..762657d85
--- /dev/null
+++ b/util/crc32c.c
@@ -0,0 +1,115 @@
+/*
+ * Castagnoli CRC32C Checksum Algorithm
+ *
+ * Polynomial: 0x11EDC6F41
+ *
+ * Castagnoli93: Guy Castagnoli and Stefan Braeuer and Martin Herrman
+ * "Optimization of Cyclic Redundancy-Check Codes with 24
+ * and 32 Parity Bits",IEEE Transactions on Communication,
+ * Volume 41, Number 6, June 1993
+ *
+ * Copyright (c) 2013 Red Hat, Inc.,
+ *
+ * Authors:
+ * Jeff Cody <jcody@redhat.com>
+ *
+ * Based on the Linux kernel cryptographic crc32c module,
+ *
+ * Copyright (c) 2004 Cisco Systems, Inc.
+ * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/crc32c.h"
+
+/*
+ * This is the CRC-32C table
+ * Generated with:
+ * width = 32 bits
+ * poly = 0x1EDC6F41
+ * reflect input bytes = true
+ * reflect output bytes = true
+ */
+
+static const uint32_t crc32c_table[256] = {
+ 0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
+ 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
+ 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
+ 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
+ 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
+ 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
+ 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
+ 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
+ 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
+ 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
+ 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
+ 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
+ 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
+ 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
+ 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
+ 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
+ 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
+ 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
+ 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
+ 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
+ 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
+ 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
+ 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
+ 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
+ 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
+ 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
+ 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
+ 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
+ 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
+ 0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
+ 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
+ 0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
+ 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
+ 0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
+ 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
+ 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
+ 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
+ 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
+ 0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
+ 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
+ 0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
+ 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
+ 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
+ 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
+ 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
+ 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
+ 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
+ 0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
+ 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
+ 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
+ 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
+ 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
+ 0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
+ 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
+ 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
+ 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
+ 0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
+ 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
+ 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
+ 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
+ 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
+ 0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
+ 0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+ 0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
+};
+
+
+uint32_t crc32c(uint32_t crc, const uint8_t *data, unsigned int length)
+{
+ while (length--) {
+ crc = crc32c_table[(crc ^ *data++) & 0xFFL] ^ (crc >> 8);
+ }
+    return crc ^ 0xffffffff;
+}
+
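
Unlike the CRC-CCITT helpers, crc32c() applies the final inversion itself, so callers conventionally seed it with 0xffffffff; chaining calls therefore requires re-inverting in between. A usage sketch:

    static uint32_t checksum_buffer(const uint8_t *buf, unsigned int len)
    {
        return crc32c(0xffffffff, buf, len);
    }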
diff --git a/util/cutils.c b/util/cutils.c
new file mode 100644
index 000000000..c9b91e753
--- /dev/null
+++ b/util/cutils.c
@@ -0,0 +1,1059 @@
+/*
+ * Simple C functions to supplement the C library
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include <math.h>
+
+#include "qemu-common.h"
+#include "qemu/sockets.h"
+#include "qemu/iov.h"
+#include "net/net.h"
+#include "qemu/ctype.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+
+void strpadcpy(char *buf, int buf_size, const char *str, char pad)
+{
+ int len = qemu_strnlen(str, buf_size);
+ memcpy(buf, str, len);
+ memset(buf + len, pad, buf_size - len);
+}
+
+void pstrcpy(char *buf, int buf_size, const char *str)
+{
+ int c;
+ char *q = buf;
+
+ if (buf_size <= 0)
+ return;
+
+ for(;;) {
+ c = *str++;
+ if (c == 0 || q >= buf + buf_size - 1)
+ break;
+ *q++ = c;
+ }
+ *q = '\0';
+}
+
+/* strcat and truncate. */
+char *pstrcat(char *buf, int buf_size, const char *s)
+{
+ int len;
+ len = strlen(buf);
+ if (len < buf_size)
+ pstrcpy(buf + len, buf_size - len, s);
+ return buf;
+}
+
+int strstart(const char *str, const char *val, const char **ptr)
+{
+ const char *p, *q;
+ p = str;
+ q = val;
+ while (*q != '\0') {
+ if (*p != *q)
+ return 0;
+ p++;
+ q++;
+ }
+ if (ptr)
+ *ptr = p;
+ return 1;
+}
+
+int stristart(const char *str, const char *val, const char **ptr)
+{
+ const char *p, *q;
+ p = str;
+ q = val;
+ while (*q != '\0') {
+ if (qemu_toupper(*p) != qemu_toupper(*q))
+ return 0;
+ p++;
+ q++;
+ }
+ if (ptr)
+ *ptr = p;
+ return 1;
+}
+
+/* XXX: use host strnlen if available ? */
+int qemu_strnlen(const char *s, int max_len)
+{
+ int i;
+
+ for(i = 0; i < max_len; i++) {
+ if (s[i] == '\0') {
+ break;
+ }
+ }
+ return i;
+}
+
+char *qemu_strsep(char **input, const char *delim)
+{
+ char *result = *input;
+ if (result != NULL) {
+ char *p;
+
+ for (p = result; *p != '\0'; p++) {
+ if (strchr(delim, *p)) {
+ break;
+ }
+ }
+ if (*p == '\0') {
+ *input = NULL;
+ } else {
+ *p = '\0';
+ *input = p + 1;
+ }
+ }
+ return result;
+}
+
+time_t mktimegm(struct tm *tm)
+{
+ time_t t;
+ int y = tm->tm_year + 1900, m = tm->tm_mon + 1, d = tm->tm_mday;
+ if (m < 3) {
+ m += 12;
+ y--;
+ }
+ t = 86400ULL * (d + (153 * m - 457) / 5 + 365 * y + y / 4 - y / 100 +
+ y / 400 - 719469);
+ t += 3600 * tm->tm_hour + 60 * tm->tm_min + tm->tm_sec;
+ return t;
+}
+
+/*
+ * Make sure data goes on disk, but if possible do not bother to
+ * write out the inode just for timestamp updates.
+ *
+ * Unfortunately, even in 2009 many operating systems do not support
+ * fdatasync, so we have to fall back to fsync.
+ */
+int qemu_fdatasync(int fd)
+{
+#ifdef CONFIG_FDATASYNC
+ return fdatasync(fd);
+#else
+ return fsync(fd);
+#endif
+}
+
+/**
+ * Sync changes made to the memory-mapped file back to the backing
+ * storage. On POSIX-compliant systems this falls back to a regular
+ * msync() call. Otherwise it triggers a whole-file sync (including
+ * the metadata, since there is no way to skip it otherwise).
+ *
+ * @addr - start of the memory area to be synced
+ * @length - length of the area to be synced
+ * @fd - file descriptor for the file to be synced
+ * (mandatory only for POSIX non-compliant systems)
+ */
+int qemu_msync(void *addr, size_t length, int fd)
+{
+#ifdef CONFIG_POSIX
+ size_t align_mask = ~(qemu_real_host_page_size - 1);
+
+    /**
+     * There are no strict requirements on the length of the mapping
+     * to be synced. Still, the length needs to follow the address
+     * alignment changes. Additionally, round the size up to a
+     * multiple of the host page size.
+     */
+ length += ((uintptr_t)addr & (qemu_real_host_page_size - 1));
+ length = (length + ~align_mask) & align_mask;
+
+ addr = (void *)((uintptr_t)addr & align_mask);
+
+ return msync(addr, length, MS_SYNC);
+#else /* CONFIG_POSIX */
+ /**
+ * Perform the sync based on the file descriptor
+ * The sync range will most probably be wider than the one
+ * requested - but it will still get the job done
+ */
+ return qemu_fdatasync(fd);
+#endif /* CONFIG_POSIX */
+}
+
+#ifndef _WIN32
+/* Sets a specific flag */
+int fcntl_setfl(int fd, int flag)
+{
+ int flags;
+
+ flags = fcntl(fd, F_GETFL);
+ if (flags == -1)
+ return -errno;
+
+ if (fcntl(fd, F_SETFL, flags | flag) == -1)
+ return -errno;
+
+ return 0;
+}
+#endif
+
+static int64_t suffix_mul(char suffix, int64_t unit)
+{
+ switch (qemu_toupper(suffix)) {
+ case 'B':
+ return 1;
+ case 'K':
+ return unit;
+ case 'M':
+ return unit * unit;
+ case 'G':
+ return unit * unit * unit;
+ case 'T':
+ return unit * unit * unit * unit;
+ case 'P':
+ return unit * unit * unit * unit * unit;
+ case 'E':
+ return unit * unit * unit * unit * unit * unit;
+ }
+ return -1;
+}
+
+/*
+ * Convert size string to bytes.
+ *
+ * The size parsing supports the following syntaxes
+ * - 12345 - decimal, scale determined by @default_suffix and @unit
+ * - 12345{bBkKmMgGtTpPeE} - decimal, scale determined by suffix and @unit
+ * - 12345.678{kKmMgGtTpPeE} - decimal, scale determined by suffix, and
+ * fractional portion is truncated to byte
+ * - 0x7fEE - hexadecimal, unit determined by @default_suffix
+ *
+ * The following cause a deprecation warning, and may be removed in the future
+ * - 0xabc{kKmMgGtTpP} - hex with scaling suffix
+ *
+ * The following are intentionally not supported
+ * - octal, such as 08
+ * - fractional hex, such as 0x1.8
+ * - floating point exponents, such as 1e3
+ *
+ * The end pointer will be returned in *end, if not NULL. If there is
+ * no fraction, the input can be decimal or hexadecimal; if there is a
+ * fraction, then the input must be decimal and there must be a suffix
+ * (possibly supplied by @default_suffix) larger than Byte, and the fractional
+ * portion may suffer from precision loss or rounding. The input must
+ * be positive.
+ *
+ * Return -ERANGE on overflow (with *@end advanced), and -EINVAL on
+ * other error (with *@end left unchanged).
+ */
+static int do_strtosz(const char *nptr, const char **end,
+ const char default_suffix, int64_t unit,
+ uint64_t *result)
+{
+ int retval;
+ const char *endptr, *f;
+ unsigned char c;
+ bool hex = false;
+ uint64_t val, valf = 0;
+ int64_t mul;
+
+ /* Parse integral portion as decimal. */
+ retval = qemu_strtou64(nptr, &endptr, 10, &val);
+ if (retval) {
+ goto out;
+ }
+ if (memchr(nptr, '-', endptr - nptr) != NULL) {
+ endptr = nptr;
+ retval = -EINVAL;
+ goto out;
+ }
+ if (val == 0 && (*endptr == 'x' || *endptr == 'X')) {
+ /* Input looks like hex, reparse, and insist on no fraction. */
+ retval = qemu_strtou64(nptr, &endptr, 16, &val);
+ if (retval) {
+ goto out;
+ }
+ if (*endptr == '.') {
+ endptr = nptr;
+ retval = -EINVAL;
+ goto out;
+ }
+ hex = true;
+ } else if (*endptr == '.') {
+ /*
+ * Input looks like a fraction. Make sure even 1.k works
+ * without fractional digits. If we see an exponent, treat
+ * the entire input as invalid instead.
+ */
+ double fraction;
+
+ f = endptr;
+ retval = qemu_strtod_finite(f, &endptr, &fraction);
+ if (retval) {
+ endptr++;
+ } else if (memchr(f, 'e', endptr - f) || memchr(f, 'E', endptr - f)) {
+ endptr = nptr;
+ retval = -EINVAL;
+ goto out;
+ } else {
+ /* Extract into a 64-bit fixed-point fraction. */
+ valf = (uint64_t)(fraction * 0x1p64);
+ }
+ }
+ c = *endptr;
+ mul = suffix_mul(c, unit);
+ if (mul > 0) {
+ if (hex) {
+ warn_report("Using a multiplier suffix on hex numbers "
+ "is deprecated: %s", nptr);
+ }
+ endptr++;
+ } else {
+ mul = suffix_mul(default_suffix, unit);
+ assert(mul > 0);
+ }
+ if (mul == 1) {
+ /* When a fraction is present, a scale is required. */
+ if (valf != 0) {
+ endptr = nptr;
+ retval = -EINVAL;
+ goto out;
+ }
+ } else {
+ uint64_t valh, tmp;
+
+ /* Compute exact result: 64.64 x 64.0 -> 128.64 fixed point */
+ mulu64(&val, &valh, val, mul);
+ mulu64(&valf, &tmp, valf, mul);
+ val += tmp;
+ valh += val < tmp;
+
+ /* Round 0.5 upward. */
+ tmp = valf >> 63;
+ val += tmp;
+ valh += val < tmp;
+
+ /* Report overflow. */
+ if (valh != 0) {
+ retval = -ERANGE;
+ goto out;
+ }
+ }
+
+ retval = 0;
+
+out:
+ if (end) {
+ *end = endptr;
+ } else if (*endptr) {
+ retval = -EINVAL;
+ }
+ if (retval == 0) {
+ *result = val;
+ }
+
+ return retval;
+}
+
+int qemu_strtosz(const char *nptr, const char **end, uint64_t *result)
+{
+ return do_strtosz(nptr, end, 'B', 1024, result);
+}
+
+int qemu_strtosz_MiB(const char *nptr, const char **end, uint64_t *result)
+{
+ return do_strtosz(nptr, end, 'M', 1024, result);
+}
+
+int qemu_strtosz_metric(const char *nptr, const char **end, uint64_t *result)
+{
+ return do_strtosz(nptr, end, 'B', 1000, result);
+}
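
A usage sketch for the size parsers, with results traced from the code above (binary units, so suffixes scale by powers of 1024):

    static void strtosz_example(void)
    {
        uint64_t bytes;
        const char *end;

        if (qemu_strtosz("1.5G", NULL, &bytes) == 0) {
            /* bytes == 1610612736, i.e. 1.5 * 1024 * 1024 * 1024 */
        }
        if (qemu_strtosz("4096x", &end, &bytes) == 0) {
            /* bytes == 4096 and end points at 'x'; the caller decides
             * what to do with the unconsumed tail */
        }
    }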
+
+/**
+ * Helper function for error checking after strtol() and the like
+ */
+static int check_strtox_error(const char *nptr, char *ep,
+ const char **endptr, bool check_zero,
+ int libc_errno)
+{
+ assert(ep >= nptr);
+
+ /* Windows has a bug in that it fails to parse 0 from "0x" in base 16 */
+ if (check_zero && ep == nptr && libc_errno == 0) {
+ char *tmp;
+
+ errno = 0;
+ if (strtol(nptr, &tmp, 10) == 0 && errno == 0 &&
+ (*tmp == 'x' || *tmp == 'X')) {
+ ep = tmp;
+ }
+ }
+
+ if (endptr) {
+ *endptr = ep;
+ }
+
+ /* Turn "no conversion" into an error */
+ if (libc_errno == 0 && ep == nptr) {
+ return -EINVAL;
+ }
+
+ /* Fail when we're expected to consume the string, but didn't */
+ if (!endptr && *ep) {
+ return -EINVAL;
+ }
+
+ return -libc_errno;
+}
+
+/**
+ * Convert string @nptr to an integer, and store it in @result.
+ *
+ * This is a wrapper around strtol() that is harder to misuse.
+ * Semantics of @nptr, @endptr, @base match strtol() with differences
+ * noted below.
+ *
+ * @nptr may be null, and no conversion is performed then.
+ *
+ * If no conversion is performed, store @nptr in *@endptr and return
+ * -EINVAL.
+ *
+ * If @endptr is null, and the string isn't fully converted, return
+ * -EINVAL. This is the case when the pointer that would be stored in
+ * a non-null @endptr points to a character other than '\0'.
+ *
+ * If the conversion overflows @result, store INT_MAX in @result,
+ * and return -ERANGE.
+ *
+ * If the conversion underflows @result, store INT_MIN in @result,
+ * and return -ERANGE.
+ *
+ * Else store the converted value in @result, and return zero.
+ */
+int qemu_strtoi(const char *nptr, const char **endptr, int base,
+ int *result)
+{
+ char *ep;
+ long long lresult;
+
+ assert((unsigned) base <= 36 && base != 1);
+ if (!nptr) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ return -EINVAL;
+ }
+
+ errno = 0;
+ lresult = strtoll(nptr, &ep, base);
+ if (lresult < INT_MIN) {
+ *result = INT_MIN;
+ errno = ERANGE;
+ } else if (lresult > INT_MAX) {
+ *result = INT_MAX;
+ errno = ERANGE;
+ } else {
+ *result = lresult;
+ }
+ return check_strtox_error(nptr, ep, endptr, lresult == 0, errno);
+}
+
+/**
+ * Convert string @nptr to an unsigned integer, and store it in @result.
+ *
+ * This is a wrapper around strtoul() that is harder to misuse.
+ * Semantics of @nptr, @endptr, @base match strtoul() with differences
+ * noted below.
+ *
+ * @nptr may be null, and no conversion is performed then.
+ *
+ * If no conversion is performed, store @nptr in *@endptr and return
+ * -EINVAL.
+ *
+ * If @endptr is null, and the string isn't fully converted, return
+ * -EINVAL. This is the case when the pointer that would be stored in
+ * a non-null @endptr points to a character other than '\0'.
+ *
+ * If the conversion overflows @result, store UINT_MAX in @result,
+ * and return -ERANGE.
+ *
+ * Else store the converted value in @result, and return zero.
+ *
+ * Note that a number with a leading minus sign gets converted without
+ * the minus sign, checked for overflow (see above), then negated (in
+ * @result's type). This is exactly how strtoul() works.
+ */
+int qemu_strtoui(const char *nptr, const char **endptr, int base,
+ unsigned int *result)
+{
+ char *ep;
+ long long lresult;
+
+ assert((unsigned) base <= 36 && base != 1);
+ if (!nptr) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ return -EINVAL;
+ }
+
+ errno = 0;
+ lresult = strtoull(nptr, &ep, base);
+
+ /* Windows returns 1 for negative out-of-range values. */
+ if (errno == ERANGE) {
+ *result = -1;
+ } else {
+ if (lresult > UINT_MAX) {
+ *result = UINT_MAX;
+ errno = ERANGE;
+ } else if (lresult < INT_MIN) {
+ *result = UINT_MAX;
+ errno = ERANGE;
+ } else {
+ *result = lresult;
+ }
+ }
+ return check_strtox_error(nptr, ep, endptr, lresult == 0, errno);
+}
+
+/**
+ * Convert string @nptr to a long integer, and store it in @result.
+ *
+ * This is a wrapper around strtol() that is harder to misuse.
+ * Semantics of @nptr, @endptr, @base match strtol() with differences
+ * noted below.
+ *
+ * @nptr may be null, and no conversion is performed then.
+ *
+ * If no conversion is performed, store @nptr in *@endptr and return
+ * -EINVAL.
+ *
+ * If @endptr is null, and the string isn't fully converted, return
+ * -EINVAL. This is the case when the pointer that would be stored in
+ * a non-null @endptr points to a character other than '\0'.
+ *
+ * If the conversion overflows @result, store LONG_MAX in @result,
+ * and return -ERANGE.
+ *
+ * If the conversion underflows @result, store LONG_MIN in @result,
+ * and return -ERANGE.
+ *
+ * Else store the converted value in @result, and return zero.
+ */
+int qemu_strtol(const char *nptr, const char **endptr, int base,
+ long *result)
+{
+ char *ep;
+
+ assert((unsigned) base <= 36 && base != 1);
+ if (!nptr) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ return -EINVAL;
+ }
+
+ errno = 0;
+ *result = strtol(nptr, &ep, base);
+ return check_strtox_error(nptr, ep, endptr, *result == 0, errno);
+}
+
+/**
+ * Convert string @nptr to an unsigned long, and store it in @result.
+ *
+ * This is a wrapper around strtoul() that is harder to misuse.
+ * Semantics of @nptr, @endptr, @base match strtoul() with differences
+ * noted below.
+ *
+ * @nptr may be null, and no conversion is performed then.
+ *
+ * If no conversion is performed, store @nptr in *@endptr and return
+ * -EINVAL.
+ *
+ * If @endptr is null, and the string isn't fully converted, return
+ * -EINVAL. This is the case when the pointer that would be stored in
+ * a non-null @endptr points to a character other than '\0'.
+ *
+ * If the conversion overflows @result, store ULONG_MAX in @result,
+ * and return -ERANGE.
+ *
+ * Else store the converted value in @result, and return zero.
+ *
+ * Note that a number with a leading minus sign gets converted without
+ * the minus sign, checked for overflow (see above), then negated (in
+ * @result's type). This is exactly how strtoul() works.
+ */
+int qemu_strtoul(const char *nptr, const char **endptr, int base,
+ unsigned long *result)
+{
+ char *ep;
+
+ assert((unsigned) base <= 36 && base != 1);
+ if (!nptr) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ return -EINVAL;
+ }
+
+ errno = 0;
+ *result = strtoul(nptr, &ep, base);
+ /* Windows returns 1 for negative out-of-range values. */
+ if (errno == ERANGE) {
+ *result = -1;
+ }
+ return check_strtox_error(nptr, ep, endptr, *result == 0, errno);
+}
+
+/**
+ * Convert string @nptr to an int64_t.
+ *
+ * Works like qemu_strtol(), except it stores INT64_MAX on overflow,
+ * and INT64_MIN on underflow.
+ */
+int qemu_strtoi64(const char *nptr, const char **endptr, int base,
+ int64_t *result)
+{
+ char *ep;
+
+ assert((unsigned) base <= 36 && base != 1);
+ if (!nptr) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ return -EINVAL;
+ }
+
+ /* This assumes int64_t is long long TODO relax */
+ QEMU_BUILD_BUG_ON(sizeof(int64_t) != sizeof(long long));
+ errno = 0;
+ *result = strtoll(nptr, &ep, base);
+ return check_strtox_error(nptr, ep, endptr, *result == 0, errno);
+}
+
+/**
+ * Convert string @nptr to an uint64_t.
+ *
+ * Works like qemu_strtoul(), except it stores UINT64_MAX on overflow.
+ */
+int qemu_strtou64(const char *nptr, const char **endptr, int base,
+ uint64_t *result)
+{
+ char *ep;
+
+ assert((unsigned) base <= 36 && base != 1);
+ if (!nptr) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ return -EINVAL;
+ }
+
+ /* This assumes uint64_t is unsigned long long TODO relax */
+ QEMU_BUILD_BUG_ON(sizeof(uint64_t) != sizeof(unsigned long long));
+ errno = 0;
+ *result = strtoull(nptr, &ep, base);
+ /* Windows returns 1 for negative out-of-range values. */
+ if (errno == ERANGE) {
+ *result = -1;
+ }
+ return check_strtox_error(nptr, ep, endptr, *result == 0, errno);
+}
+
+/**
+ * Convert string @nptr to a double.
+ *
+ * This is a wrapper around strtod() that is harder to misuse.
+ * Semantics of @nptr and @endptr match strtod() with differences
+ * noted below.
+ *
+ * @nptr may be null, and no conversion is performed then.
+ *
+ * If no conversion is performed, store @nptr in *@endptr and return
+ * -EINVAL.
+ *
+ * If @endptr is null, and the string isn't fully converted, return
+ * -EINVAL. This is the case when the pointer that would be stored in
+ * a non-null @endptr points to a character other than '\0'.
+ *
+ * If the conversion overflows, store +/-HUGE_VAL in @result, depending
+ * on the sign, and return -ERANGE.
+ *
+ * If the conversion underflows, store +/-0.0 in @result, depending on the
+ * sign, and return -ERANGE.
+ *
+ * Else store the converted value in @result, and return zero.
+ */
+int qemu_strtod(const char *nptr, const char **endptr, double *result)
+{
+ char *ep;
+
+ if (!nptr) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ return -EINVAL;
+ }
+
+ errno = 0;
+ *result = strtod(nptr, &ep);
+ return check_strtox_error(nptr, ep, endptr, false, errno);
+}
+
+/**
+ * Convert string @nptr to a finite double.
+ *
+ * Works like qemu_strtod(), except that "NaN" and "inf" are rejected
+ * with -EINVAL and no conversion is performed.
+ */
+int qemu_strtod_finite(const char *nptr, const char **endptr, double *result)
+{
+ double tmp;
+ int ret;
+
+ ret = qemu_strtod(nptr, endptr, &tmp);
+ if (!ret && !isfinite(tmp)) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ ret = -EINVAL;
+ }
+
+ if (ret != -EINVAL) {
+ *result = tmp;
+ }
+ return ret;
+}
+
+/**
+ * Searches for the first occurrence of 'c' in 's', and returns a pointer
+ * to the trailing null byte if none was found.
+ */
+#ifndef HAVE_STRCHRNUL
+const char *qemu_strchrnul(const char *s, int c)
+{
+ const char *e = strchr(s, c);
+ if (!e) {
+ e = s + strlen(s);
+ }
+ return e;
+}
+#endif
+
+/**
+ * parse_uint:
+ *
+ * @s: String to parse
+ * @value: Destination for parsed integer value
+ * @endptr: Destination for pointer to first character not consumed
+ * @base: integer base, between 2 and 36 inclusive, or 0
+ *
+ * Parse unsigned integer
+ *
+ * Parsed syntax is like strtoull()'s: arbitrary whitespace, a single optional
+ * '+' or '-', an optional "0x" if @base is 0 or 16, one or more digits.
+ *
+ * If @s is null, or @base is invalid, or @s doesn't start with an
+ * integer in the syntax above, set *@value to 0, *@endptr to @s, and
+ * return -EINVAL.
+ *
+ * Set *@endptr to point right beyond the parsed integer (even if the integer
+ * overflows or is negative, all digits will be parsed and *@endptr will
+ * point right beyond them).
+ *
+ * If the integer is negative, set *@value to 0, and return -ERANGE.
+ *
+ * If the integer overflows unsigned long long, set *@value to
+ * ULLONG_MAX, and return -ERANGE.
+ *
+ * Else, set *@value to the parsed integer, and return 0.
+ */
+int parse_uint(const char *s, unsigned long long *value, char **endptr,
+ int base)
+{
+ int r = 0;
+ char *endp = (char *)s;
+ unsigned long long val = 0;
+
+ assert((unsigned) base <= 36 && base != 1);
+ if (!s) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ errno = 0;
+ val = strtoull(s, &endp, base);
+ if (errno) {
+ r = -errno;
+ goto out;
+ }
+
+ if (endp == s) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ /* make sure we reject negative numbers: */
+ while (qemu_isspace(*s)) {
+ s++;
+ }
+ if (*s == '-') {
+ val = 0;
+ r = -ERANGE;
+ goto out;
+ }
+
+out:
+ *value = val;
+ *endptr = endp;
+ return r;
+}
+
+/**
+ * parse_uint_full:
+ *
+ * @s: String to parse
+ * @value: Destination for parsed integer value
+ * @base: integer base, between 2 and 36 inclusive, or 0
+ *
+ * Parse unsigned integer from entire string
+ *
+ * Has the same behavior as parse_uint(), but with an additional check
+ * that no extra data follows the parsed number. If extra characters
+ * are present after the parsed number, the function returns -EINVAL,
+ * and *@value is set to 0.
+ */
+int parse_uint_full(const char *s, unsigned long long *value, int base)
+{
+ char *endp;
+ int r;
+
+ r = parse_uint(s, value, &endp, base);
+ if (r < 0) {
+ return r;
+ }
+ if (*endp) {
+ *value = 0;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int qemu_parse_fd(const char *param)
+{
+ long fd;
+ char *endptr;
+
+ errno = 0;
+ fd = strtol(param, &endptr, 10);
+ if (param == endptr /* no conversion performed */ ||
+ errno != 0 /* not representable as long; possibly others */ ||
+ *endptr != '\0' /* final string not empty */ ||
+ fd < 0 /* invalid as file descriptor */ ||
+ fd > INT_MAX /* not representable as int */) {
+ return -1;
+ }
+ return fd;
+}
+
+/*
+ * Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128)
+ * Input is limited to 14-bit numbers
+ */
+int uleb128_encode_small(uint8_t *out, uint32_t n)
+{
+ g_assert(n <= 0x3fff);
+ if (n < 0x80) {
+ *out = n;
+ return 1;
+ } else {
+ *out++ = (n & 0x7f) | 0x80;
+ *out = n >> 7;
+ return 2;
+ }
+}
+
+int uleb128_decode_small(const uint8_t *in, uint32_t *n)
+{
+ if (!(*in & 0x80)) {
+ *n = *in;
+ return 1;
+ } else {
+ *n = *in++ & 0x7f;
+        /* the value exceeds the 14-bit limit */
+ if (*in & 0x80) {
+ return -1;
+ }
+ *n |= *in << 7;
+ return 2;
+ }
+}
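
A round-trip sketch for the two-byte ULEB128 helpers, with the values worked out from the code above:

    static void uleb_example(void)
    {
        uint8_t buf[2];
        uint32_t out;
        int n;

        n = uleb128_encode_small(buf, 300);
        /* n == 2, buf[0] == 0xac (low 7 bits | 0x80), buf[1] == 0x02 */
        n = uleb128_decode_small(buf, &out);
        /* n == 2, out == 300 again */
        (void)n;
        (void)out;
    }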
+
+/*
+ * helper to parse debug environment variables
+ */
+int parse_debug_env(const char *name, int max, int initial)
+{
+ char *debug_env = getenv(name);
+ char *inv = NULL;
+ long debug;
+
+ if (!debug_env) {
+ return initial;
+ }
+ errno = 0;
+ debug = strtol(debug_env, &inv, 10);
+ if (inv == debug_env) {
+ return initial;
+ }
+ if (debug < 0 || debug > max || errno != 0) {
+ warn_report("%s not in [0, %d]", name, max);
+ return initial;
+ }
+ return debug;
+}
+
+/*
+ * Helper to print ethernet mac address
+ */
+const char *qemu_ether_ntoa(const MACAddr *mac)
+{
+ static char ret[18];
+
+ snprintf(ret, sizeof(ret), "%02x:%02x:%02x:%02x:%02x:%02x",
+ mac->a[0], mac->a[1], mac->a[2], mac->a[3], mac->a[4], mac->a[5]);
+
+ return ret;
+}
+
+/*
+ * Return human readable string for size @val.
+ * @val can be anything that uint64_t allows (no more than "16 EiB").
+ * Use IEC binary units like KiB, MiB, and so forth.
+ * Caller is responsible for passing it to g_free().
+ */
+char *size_to_str(uint64_t val)
+{
+ static const char *suffixes[] = { "", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei" };
+ uint64_t div;
+ int i;
+
+ /*
+ * The exponent (returned in i) minus one gives us
+     * floor(log2(val * 1024 / 1000)). The correction makes us
+ * switch to the higher power when the integer part is >= 1000.
+ * (see e41b509d68afb1f for more info)
+ */
+ frexp(val / (1000.0 / 1024.0), &i);
+ i = (i - 1) / 10;
+ div = 1ULL << (i * 10);
+
+ return g_strdup_printf("%0.3g %sB", (double)val / div, suffixes[i]);
+}
+
+char *freq_to_str(uint64_t freq_hz)
+{
+ static const char *const suffixes[] = { "", "K", "M", "G", "T", "P", "E" };
+ double freq = freq_hz;
+ size_t idx = 0;
+
+ while (freq >= 1000.0) {
+ freq /= 1000.0;
+ idx++;
+ }
+ assert(idx < ARRAY_SIZE(suffixes));
+
+ return g_strdup_printf("%0.3g %sHz", freq, suffixes[idx]);
+}
+
+int qemu_pstrcmp0(const char **str1, const char **str2)
+{
+ return g_strcmp0(*str1, *str2);
+}
+
+static inline bool starts_with_prefix(const char *dir)
+{
+ size_t prefix_len = strlen(CONFIG_PREFIX);
+ return !memcmp(dir, CONFIG_PREFIX, prefix_len) &&
+ (!dir[prefix_len] || G_IS_DIR_SEPARATOR(dir[prefix_len]));
+}
+
+/* Return the next path component in dir, and store its length in *p_len. */
+static inline const char *next_component(const char *dir, int *p_len)
+{
+ int len;
+ while ((*dir && G_IS_DIR_SEPARATOR(*dir)) ||
+ (*dir == '.' && (G_IS_DIR_SEPARATOR(dir[1]) || dir[1] == '\0'))) {
+ dir++;
+ }
+ len = 0;
+ while (dir[len] && !G_IS_DIR_SEPARATOR(dir[len])) {
+ len++;
+ }
+ *p_len = len;
+ return dir;
+}
+
+char *get_relocated_path(const char *dir)
+{
+ size_t prefix_len = strlen(CONFIG_PREFIX);
+ const char *bindir = CONFIG_BINDIR;
+ const char *exec_dir = qemu_get_exec_dir();
+ GString *result;
+ int len_dir, len_bindir;
+
+ /* Fail if qemu_init_exec_dir was not called. */
+ assert(exec_dir[0]);
+ if (!starts_with_prefix(dir) || !starts_with_prefix(bindir)) {
+ return g_strdup(dir);
+ }
+
+ result = g_string_new(exec_dir);
+
+ /* Advance over common components. */
+ len_dir = len_bindir = prefix_len;
+ do {
+ dir += len_dir;
+ bindir += len_bindir;
+ dir = next_component(dir, &len_dir);
+ bindir = next_component(bindir, &len_bindir);
+ } while (len_dir && len_dir == len_bindir && !memcmp(dir, bindir, len_dir));
+
+ /* Ascend from bindir to the common prefix with dir. */
+ while (len_bindir) {
+ bindir += len_bindir;
+ g_string_append(result, "/..");
+ bindir = next_component(bindir, &len_bindir);
+ }
+
+ if (*dir) {
+ assert(G_IS_DIR_SEPARATOR(dir[-1]));
+ g_string_append(result, dir - 1);
+ }
+ return g_string_free(result, false);
+}
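
A worked example with hypothetical configure values: CONFIG_PREFIX "/usr", CONFIG_BINDIR "/usr/bin", and the binary actually running from /opt/qemu/bin.

    char *p = get_relocated_path("/usr/share/qemu");
    /* Walks past the common "/usr" component, ascends one level from
     * "bin", and yields "/opt/qemu/bin/../share/qemu". */
    g_free(p);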
diff --git a/util/dbus.c b/util/dbus.c
new file mode 100644
index 000000000..9099dc5b4
--- /dev/null
+++ b/util/dbus.c
@@ -0,0 +1,57 @@
+/*
+ * Helpers for using D-Bus
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/dbus.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+
+/*
+ * qemu_dbus_get_queued_owners() - return the list of queued unique names
+ * @connection: A GDBusConnection
+ * @name: a service name
+ *
+ * Return: a GStrv of unique names, or NULL on failure.
+ */
+GStrv
+qemu_dbus_get_queued_owners(GDBusConnection *connection, const char *name,
+ Error **errp)
+{
+ g_autoptr(GDBusProxy) proxy = NULL;
+ g_autoptr(GVariant) result = NULL;
+ g_autoptr(GVariant) child = NULL;
+ g_autoptr(GError) err = NULL;
+
+ proxy = g_dbus_proxy_new_sync(connection, G_DBUS_PROXY_FLAGS_NONE, NULL,
+ "org.freedesktop.DBus",
+ "/org/freedesktop/DBus",
+ "org.freedesktop.DBus",
+ NULL, &err);
+ if (!proxy) {
+ error_setg(errp, "Failed to create DBus proxy: %s", err->message);
+ return NULL;
+ }
+
+ result = g_dbus_proxy_call_sync(proxy, "ListQueuedOwners",
+ g_variant_new("(s)", name),
+ G_DBUS_CALL_FLAGS_NO_AUTO_START,
+ -1, NULL, &err);
+ if (!result) {
+ if (g_error_matches(err,
+ G_DBUS_ERROR,
+ G_DBUS_ERROR_NAME_HAS_NO_OWNER)) {
+ return g_new0(char *, 1);
+ }
+ error_setg(errp, "Failed to call ListQueuedOwners: %s", err->message);
+ return NULL;
+ }
+
+ child = g_variant_get_child_value(result, 0);
+ return g_variant_dup_strv(child, NULL);
+}
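
A usage sketch, assuming an existing GDBusConnection 'conn'; the well-known name is a made-up placeholder:

    static void list_owners(GDBusConnection *conn)
    {
        Error *err = NULL;
        GStrv owners;

        owners = qemu_dbus_get_queued_owners(conn, "org.example.Name", &err);
        if (!owners) {
            error_report_err(err);
            return;
        }
        for (char **p = owners; *p; p++) {
            printf("queued owner: %s\n", *p);
        }
        g_strfreev(owners);
    }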
diff --git a/util/drm.c b/util/drm.c
new file mode 100644
index 000000000..dae8ffebc
--- /dev/null
+++ b/util/drm.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2015-2016 Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "qemu/osdep.h"
+#include "qemu/drm.h"
+
+#include <glob.h>
+#include <dirent.h>
+
+int qemu_drm_rendernode_open(const char *rendernode)
+{
+ DIR *dir;
+ struct dirent *e;
+ struct stat st;
+ int r, fd, ret;
+ char *p;
+
+ if (rendernode) {
+ return open(rendernode, O_RDWR | O_CLOEXEC | O_NOCTTY | O_NONBLOCK);
+ }
+
+ dir = opendir("/dev/dri");
+ if (!dir) {
+ return -1;
+ }
+
+ fd = -1;
+ while ((e = readdir(dir))) {
+ if (strncmp(e->d_name, "renderD", 7)) {
+ continue;
+ }
+
+ p = g_strdup_printf("/dev/dri/%s", e->d_name);
+
+ r = open(p, O_RDWR | O_CLOEXEC | O_NOCTTY | O_NONBLOCK);
+ if (r < 0) {
+ g_free(p);
+ continue;
+ }
+
+ /*
+ * prefer fstat() over checking e->d_type == DT_CHR for
+ * portability reasons
+ */
+ ret = fstat(r, &st);
+ if (ret < 0 || (st.st_mode & S_IFMT) != S_IFCHR) {
+ close(r);
+ g_free(p);
+ continue;
+ }
+
+ fd = r;
+ g_free(p);
+ break;
+ }
+
+ closedir(dir);
+ if (fd < 0) {
+ return -1;
+ }
+ return fd;
+}
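
A usage sketch; passing NULL asks the helper to probe /dev/dri for the first usable render node (the wrapper name is invented):

    static int open_render_node(const char *node)   /* node may be NULL */
    {
        int fd = qemu_drm_rendernode_open(node);

        if (fd < 0) {
            error_report("no usable DRM render node");
            return -1;
        }
        /* ... hand fd to the GPU backend; close(fd) when done ... */
        return fd;
    }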
diff --git a/util/envlist.c b/util/envlist.c
new file mode 100644
index 000000000..2bcc13f09
--- /dev/null
+++ b/util/envlist.c
@@ -0,0 +1,232 @@
+#include "qemu/osdep.h"
+#include "qemu/queue.h"
+#include "qemu/envlist.h"
+
+struct envlist_entry {
+ const char *ev_var; /* actual env value */
+ QLIST_ENTRY(envlist_entry) ev_link;
+};
+
+struct envlist {
+ QLIST_HEAD(, envlist_entry) el_entries; /* actual entries */
+ size_t el_count; /* number of entries */
+};
+
+static int envlist_parse(envlist_t *envlist,
+ const char *env, int (*)(envlist_t *, const char *));
+
+/*
+ * Allocates a new envlist and returns a pointer to it.
+ */
+envlist_t *
+envlist_create(void)
+{
+ envlist_t *envlist;
+
+ envlist = g_malloc(sizeof(*envlist));
+
+ QLIST_INIT(&envlist->el_entries);
+ envlist->el_count = 0;
+
+ return (envlist);
+}
+
+/*
+ * Releases given envlist and its entries.
+ */
+void
+envlist_free(envlist_t *envlist)
+{
+ struct envlist_entry *entry;
+
+ assert(envlist != NULL);
+
+ while (envlist->el_entries.lh_first != NULL) {
+ entry = envlist->el_entries.lh_first;
+ QLIST_REMOVE(entry, ev_link);
+
+ g_free((char *)entry->ev_var);
+ g_free(entry);
+ }
+ g_free(envlist);
+}
+
+/*
+ * Parses comma separated list of set/modify environment
+ * variable entries and updates the given envlist accordingly.
+ *
+ * For example:
+ * envlist_parse(el, "HOME=foo,SHELL=/bin/sh");
+ *
+ * inserts/sets environment variables HOME and SHELL.
+ *
+ * Returns 0 on success, errno otherwise.
+ */
+int
+envlist_parse_set(envlist_t *envlist, const char *env)
+{
+ return (envlist_parse(envlist, env, &envlist_setenv));
+}
+
+/*
+ * Parses comma separated list of unset environment variable
+ * entries and removes given variables from given envlist.
+ *
+ * Returns 0 on success, errno otherwise.
+ */
+int
+envlist_parse_unset(envlist_t *envlist, const char *env)
+{
+ return (envlist_parse(envlist, env, &envlist_unsetenv));
+}
+
+/*
+ * Parses comma separated list of set, modify or unset entries
+ * and calls given callback for each entry.
+ *
+ * Returns 0 in case of success, errno otherwise.
+ */
+static int
+envlist_parse(envlist_t *envlist, const char *env,
+ int (*callback)(envlist_t *, const char *))
+{
+ char *tmpenv, *envvar;
+ char *envsave = NULL;
+ int ret = 0;
+ assert(callback != NULL);
+
+ if ((envlist == NULL) || (env == NULL))
+ return (EINVAL);
+
+ tmpenv = g_strdup(env);
+ envsave = tmpenv;
+
+ do {
+ envvar = strchr(tmpenv, ',');
+ if (envvar != NULL) {
+ *envvar = '\0';
+ }
+ if ((*callback)(envlist, tmpenv) != 0) {
+ ret = errno;
+ break;
+ }
+ tmpenv = envvar + 1;
+ } while (envvar != NULL);
+
+ g_free(envsave);
+ return ret;
+}
+
+/*
+ * Sets an environment value in the envlist, in a similar manner
+ * to putenv(3).
+ *
+ * Returns 0 on success, errno otherwise.
+ */
+int
+envlist_setenv(envlist_t *envlist, const char *env)
+{
+ struct envlist_entry *entry = NULL;
+ const char *eq_sign;
+ size_t envname_len;
+
+ if ((envlist == NULL) || (env == NULL))
+ return (EINVAL);
+
+ /* find out first equals sign in given env */
+ if ((eq_sign = strchr(env, '=')) == NULL)
+ return (EINVAL);
+ envname_len = eq_sign - env + 1;
+
+ /*
+     * If a variable with the given name already exists,
+ * we remove and release it before allocating a whole
+ * new entry.
+ */
+ for (entry = envlist->el_entries.lh_first; entry != NULL;
+ entry = entry->ev_link.le_next) {
+ if (strncmp(entry->ev_var, env, envname_len) == 0)
+ break;
+ }
+
+ if (entry != NULL) {
+ QLIST_REMOVE(entry, ev_link);
+ g_free((char *)entry->ev_var);
+ g_free(entry);
+ } else {
+ envlist->el_count++;
+ }
+
+ entry = g_malloc(sizeof(*entry));
+ entry->ev_var = g_strdup(env);
+ QLIST_INSERT_HEAD(&envlist->el_entries, entry, ev_link);
+
+ return (0);
+}
+
+/*
+ * Removes the given env value from the envlist, in a similar manner
+ * to unsetenv(3). Returns 0 on success, errno otherwise.
+ */
+int
+envlist_unsetenv(envlist_t *envlist, const char *env)
+{
+ struct envlist_entry *entry;
+ size_t envname_len;
+
+ if ((envlist == NULL) || (env == NULL))
+ return (EINVAL);
+
+ /* env is not allowed to contain '=' */
+ if (strchr(env, '=') != NULL)
+ return (EINVAL);
+
+ /*
+ * Find out the requested entry and remove
+ * it from the list.
+ */
+ envname_len = strlen(env);
+ for (entry = envlist->el_entries.lh_first; entry != NULL;
+ entry = entry->ev_link.le_next) {
+ if (strncmp(entry->ev_var, env, envname_len) == 0)
+ break;
+ }
+ if (entry != NULL) {
+ QLIST_REMOVE(entry, ev_link);
+ g_free((char *)entry->ev_var);
+ g_free(entry);
+
+ envlist->el_count--;
+ }
+ return (0);
+}
+
+/*
+ * Returns the given envlist as an array of strings (in the same form
+ * as the global variable environ). The caller must free the returned
+ * memory by calling g_free() on each element and on the array itself.
+ * The returned array and the given envlist are not related (no common
+ * references).
+ *
+ * If the caller provides a count pointer, the number of items in the
+ * array is stored there.
+ */
+char **
+envlist_to_environ(const envlist_t *envlist, size_t *count)
+{
+ struct envlist_entry *entry;
+ char **env, **penv;
+
+ penv = env = g_malloc((envlist->el_count + 1) * sizeof(char *));
+
+ for (entry = envlist->el_entries.lh_first; entry != NULL;
+ entry = entry->ev_link.le_next) {
+ *(penv++) = g_strdup(entry->ev_var);
+ }
+ *penv = NULL; /* NULL terminate the list */
+
+ if (count != NULL)
+ *count = envlist->el_count;
+
+ return (env);
+}
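
A minimal usage sketch of the envlist API above, assuming the
envlist_create()/envlist_free() constructors declared in qemu/envlist.h
(defined earlier in this file, outside this excerpt):

    #include "qemu/osdep.h"
    #include "qemu/envlist.h"

    static void envlist_demo(void)
    {
        envlist_t *el = envlist_create();
        size_t count;
        char **env, **p;

        envlist_setenv(el, "FOO=bar");        /* putenv(3)-style set */
        envlist_parse_set(el, "A=1,B=2");     /* comma-separated list */
        envlist_unsetenv(el, "FOO");

        /* Flatten to an environ-style, NULL-terminated array */
        env = envlist_to_environ(el, &count);
        for (p = env; *p != NULL; p++) {
            g_free(*p);
        }
        g_free(env);
        envlist_free(el);
    }
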
diff --git a/util/error.c b/util/error.c
new file mode 100644
index 000000000..b6c89d141
--- /dev/null
+++ b/util/error.c
@@ -0,0 +1,305 @@
+/*
+ * QEMU Error Objects
+ *
+ * Copyright IBM, Corp. 2011
+ * Copyright (C) 2011-2015 Red Hat, Inc.
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Markus Armbruster <armbru@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2. See
+ * the COPYING.LIB file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+
+struct Error
+{
+ char *msg;
+ ErrorClass err_class;
+ const char *src, *func;
+ int line;
+ GString *hint;
+};
+
+Error *error_abort;
+Error *error_fatal;
+
+static void error_handle_fatal(Error **errp, Error *err)
+{
+ if (errp == &error_abort) {
+ fprintf(stderr, "Unexpected error in %s() at %s:%d:\n",
+ err->func, err->src, err->line);
+ error_report("%s", error_get_pretty(err));
+ if (err->hint) {
+ error_printf("%s", err->hint->str);
+ }
+ abort();
+ }
+ if (errp == &error_fatal) {
+ error_report_err(err);
+ exit(1);
+ }
+}
+
+static void error_setv(Error **errp,
+ const char *src, int line, const char *func,
+ ErrorClass err_class, const char *fmt, va_list ap,
+ const char *suffix)
+{
+ Error *err;
+ int saved_errno = errno;
+
+ if (errp == NULL) {
+ return;
+ }
+ assert(*errp == NULL);
+
+ err = g_malloc0(sizeof(*err));
+ err->msg = g_strdup_vprintf(fmt, ap);
+ if (suffix) {
+ char *msg = err->msg;
+ err->msg = g_strdup_printf("%s: %s", msg, suffix);
+ g_free(msg);
+ }
+ err->err_class = err_class;
+ err->src = src;
+ err->line = line;
+ err->func = func;
+
+ error_handle_fatal(errp, err);
+ *errp = err;
+
+ errno = saved_errno;
+}
+
+void error_set_internal(Error **errp,
+ const char *src, int line, const char *func,
+ ErrorClass err_class, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ error_setv(errp, src, line, func, err_class, fmt, ap, NULL);
+ va_end(ap);
+}
+
+void error_setg_internal(Error **errp,
+ const char *src, int line, const char *func,
+ const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ error_setv(errp, src, line, func, ERROR_CLASS_GENERIC_ERROR, fmt, ap, NULL);
+ va_end(ap);
+}
+
+void error_setg_errno_internal(Error **errp,
+ const char *src, int line, const char *func,
+ int os_errno, const char *fmt, ...)
+{
+ va_list ap;
+ int saved_errno = errno;
+
+ va_start(ap, fmt);
+ error_setv(errp, src, line, func, ERROR_CLASS_GENERIC_ERROR, fmt, ap,
+ os_errno != 0 ? strerror(os_errno) : NULL);
+ va_end(ap);
+
+ errno = saved_errno;
+}
+
+void error_setg_file_open_internal(Error **errp,
+ const char *src, int line, const char *func,
+ int os_errno, const char *filename)
+{
+ error_setg_errno_internal(errp, src, line, func, os_errno,
+ "Could not open '%s'", filename);
+}
+
+void error_vprepend(Error *const *errp, const char *fmt, va_list ap)
+{
+ GString *newmsg;
+
+ if (!errp) {
+ return;
+ }
+
+ newmsg = g_string_new(NULL);
+ g_string_vprintf(newmsg, fmt, ap);
+ g_string_append(newmsg, (*errp)->msg);
+ g_free((*errp)->msg);
+ (*errp)->msg = g_string_free(newmsg, 0);
+}
+
+void error_prepend(Error *const *errp, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ error_vprepend(errp, fmt, ap);
+ va_end(ap);
+}
+
+void error_append_hint(Error *const *errp, const char *fmt, ...)
+{
+ va_list ap;
+ int saved_errno = errno;
+ Error *err;
+
+ if (!errp) {
+ return;
+ }
+ err = *errp;
+ assert(err && errp != &error_abort && errp != &error_fatal);
+
+ if (!err->hint) {
+ err->hint = g_string_new(NULL);
+ }
+ va_start(ap, fmt);
+ g_string_append_vprintf(err->hint, fmt, ap);
+ va_end(ap);
+
+ errno = saved_errno;
+}
+
+#ifdef _WIN32
+
+void error_setg_win32_internal(Error **errp,
+ const char *src, int line, const char *func,
+ int win32_err, const char *fmt, ...)
+{
+ va_list ap;
+ char *suffix = NULL;
+
+ if (errp == NULL) {
+ return;
+ }
+
+ if (win32_err != 0) {
+ suffix = g_win32_error_message(win32_err);
+ }
+
+ va_start(ap, fmt);
+ error_setv(errp, src, line, func, ERROR_CLASS_GENERIC_ERROR,
+ fmt, ap, suffix);
+ va_end(ap);
+
+ g_free(suffix);
+}
+
+#endif
+
+Error *error_copy(const Error *err)
+{
+ Error *err_new;
+
+ err_new = g_malloc0(sizeof(*err));
+ err_new->msg = g_strdup(err->msg);
+ err_new->err_class = err->err_class;
+ err_new->src = err->src;
+ err_new->line = err->line;
+ err_new->func = err->func;
+ if (err->hint) {
+ err_new->hint = g_string_new(err->hint->str);
+ }
+
+ return err_new;
+}
+
+ErrorClass error_get_class(const Error *err)
+{
+ return err->err_class;
+}
+
+const char *error_get_pretty(const Error *err)
+{
+ return err->msg;
+}
+
+void error_report_err(Error *err)
+{
+ error_report("%s", error_get_pretty(err));
+ if (err->hint) {
+ error_printf("%s", err->hint->str);
+ }
+ error_free(err);
+}
+
+void warn_report_err(Error *err)
+{
+ warn_report("%s", error_get_pretty(err));
+ if (err->hint) {
+ error_printf("%s", err->hint->str);
+ }
+ error_free(err);
+}
+
+void error_reportf_err(Error *err, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ error_vprepend(&err, fmt, ap);
+ va_end(ap);
+ error_report_err(err);
+}
+
+
+void warn_reportf_err(Error *err, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ error_vprepend(&err, fmt, ap);
+ va_end(ap);
+ warn_report_err(err);
+}
+
+void error_free(Error *err)
+{
+ if (err) {
+ g_free(err->msg);
+ if (err->hint) {
+ g_string_free(err->hint, true);
+ }
+ g_free(err);
+ }
+}
+
+void error_free_or_abort(Error **errp)
+{
+ assert(errp && *errp);
+ error_free(*errp);
+ *errp = NULL;
+}
+
+void error_propagate(Error **dst_errp, Error *local_err)
+{
+ if (!local_err) {
+ return;
+ }
+ error_handle_fatal(dst_errp, local_err);
+ if (dst_errp && !*dst_errp) {
+ *dst_errp = local_err;
+ } else {
+ error_free(local_err);
+ }
+}
+
+void error_propagate_prepend(Error **dst_errp, Error *err,
+ const char *fmt, ...)
+{
+ va_list ap;
+
+ if (dst_errp && !*dst_errp) {
+ va_start(ap, fmt);
+ error_vprepend(&err, fmt, ap);
+ va_end(ap);
+ } /* else error is being ignored, don't bother with prepending */
+ error_propagate(dst_errp, err);
+}
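
The API above is normally used through the errp-passing convention: a
callee fills in *errp, and the caller either inspects a local error or
lets it propagate. A minimal sketch under that convention (do_foo() and
foo_init() are hypothetical names):

    #include "qemu/osdep.h"
    #include "qapi/error.h"

    static int do_foo(int fd, Error **errp)
    {
        ERRP_GUARD();  /* needed when appending hints to *errp */

        if (fd < 0) {
            error_setg(errp, "invalid file descriptor %d", fd);
            error_append_hint(errp, "Pass an open file descriptor.\n");
            return -1;
        }
        return 0;
    }

    static void foo_init(Error **errp)
    {
        Error *local_err = NULL;

        if (do_foo(-1, &local_err) < 0) {
            error_propagate_prepend(errp, local_err, "foo setup failed: ");
            return;
        }
    }
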
diff --git a/util/event_notifier-posix.c b/util/event_notifier-posix.c
new file mode 100644
index 000000000..8307013c5
--- /dev/null
+++ b/util/event_notifier-posix.c
@@ -0,0 +1,140 @@
+/*
+ * event notifier support
+ *
+ * Copyright Red Hat, Inc. 2010
+ *
+ * Authors:
+ * Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/cutils.h"
+#include "qemu/event_notifier.h"
+#include "qemu/main-loop.h"
+
+#ifdef CONFIG_EVENTFD
+#include <sys/eventfd.h>
+#endif
+
+#ifdef CONFIG_EVENTFD
+/*
+ * Initialize @e with existing file descriptor @fd.
+ * @fd must be a genuine eventfd object, emulation with pipe won't do.
+ */
+void event_notifier_init_fd(EventNotifier *e, int fd)
+{
+ e->rfd = fd;
+ e->wfd = fd;
+ e->initialized = true;
+}
+#endif
+
+int event_notifier_init(EventNotifier *e, int active)
+{
+ int fds[2];
+ int ret;
+
+#ifdef CONFIG_EVENTFD
+ ret = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+#else
+ ret = -1;
+ errno = ENOSYS;
+#endif
+ if (ret >= 0) {
+ e->rfd = e->wfd = ret;
+ } else {
+ if (errno != ENOSYS) {
+ return -errno;
+ }
+ if (qemu_pipe(fds) < 0) {
+ return -errno;
+ }
+ ret = fcntl_setfl(fds[0], O_NONBLOCK);
+ if (ret < 0) {
+ ret = -errno;
+ goto fail;
+ }
+ ret = fcntl_setfl(fds[1], O_NONBLOCK);
+ if (ret < 0) {
+ ret = -errno;
+ goto fail;
+ }
+ e->rfd = fds[0];
+ e->wfd = fds[1];
+ }
+ e->initialized = true;
+ if (active) {
+ event_notifier_set(e);
+ }
+ return 0;
+
+fail:
+ close(fds[0]);
+ close(fds[1]);
+ return ret;
+}
+
+void event_notifier_cleanup(EventNotifier *e)
+{
+ if (!e->initialized) {
+ return;
+ }
+
+ if (e->rfd != e->wfd) {
+ close(e->rfd);
+ }
+
+ e->rfd = -1;
+ close(e->wfd);
+ e->wfd = -1;
+ e->initialized = false;
+}
+
+int event_notifier_get_fd(const EventNotifier *e)
+{
+ return e->rfd;
+}
+
+int event_notifier_set(EventNotifier *e)
+{
+ static const uint64_t value = 1;
+ ssize_t ret;
+
+ if (!e->initialized) {
+ return -1;
+ }
+
+ do {
+ ret = write(e->wfd, &value, sizeof(value));
+ } while (ret < 0 && errno == EINTR);
+
+ /* EAGAIN is fine, a read must be pending. */
+ if (ret < 0 && errno != EAGAIN) {
+ return -errno;
+ }
+ return 0;
+}
+
+int event_notifier_test_and_clear(EventNotifier *e)
+{
+ int value;
+ ssize_t len;
+ char buffer[512];
+
+ if (!e->initialized) {
+ return 0;
+ }
+
+ /* Drain the notify pipe. For eventfd, only 8 bytes will be read. */
+ value = 0;
+ do {
+ len = read(e->rfd, buffer, sizeof(buffer));
+ value |= (len > 0);
+ } while ((len == -1 && errno == EINTR) || len == sizeof(buffer));
+
+ return value;
+}
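
A short sketch of the notifier API implemented above (POSIX variant; the
signal would normally come from another thread or an ioeventfd):

    #include "qemu/osdep.h"
    #include "qemu/event_notifier.h"

    static void notifier_demo(void)
    {
        EventNotifier e;

        if (event_notifier_init(&e, 0) < 0) {   /* 0: start cleared */
            return;
        }

        event_notifier_set(&e);                 /* signal readiness */

        if (event_notifier_test_and_clear(&e)) {
            /* an event was pending and has now been consumed */
        }

        event_notifier_cleanup(&e);
    }
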
diff --git a/util/event_notifier-win32.c b/util/event_notifier-win32.c
new file mode 100644
index 000000000..62c53b0a9
--- /dev/null
+++ b/util/event_notifier-win32.c
@@ -0,0 +1,50 @@
+/*
+ * event notifier support
+ *
+ * Copyright Red Hat, Inc. 2010
+ *
+ * Authors:
+ * Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/event_notifier.h"
+#include "qemu/main-loop.h"
+
+int event_notifier_init(EventNotifier *e, int active)
+{
+ e->event = CreateEvent(NULL, TRUE, FALSE, NULL);
+ assert(e->event);
+ return 0;
+}
+
+void event_notifier_cleanup(EventNotifier *e)
+{
+ CloseHandle(e->event);
+ e->event = NULL;
+}
+
+HANDLE event_notifier_get_handle(EventNotifier *e)
+{
+ return e->event;
+}
+
+int event_notifier_set(EventNotifier *e)
+{
+ SetEvent(e->event);
+ return 0;
+}
+
+int event_notifier_test_and_clear(EventNotifier *e)
+{
+ int ret = WaitForSingleObject(e->event, 0);
+ if (ret == WAIT_OBJECT_0) {
+ ResetEvent(e->event);
+ return true;
+ }
+ return false;
+}
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
new file mode 100644
index 000000000..e11a8a022
--- /dev/null
+++ b/util/fdmon-epoll.c
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * epoll(7) file descriptor monitoring
+ */
+
+#include "qemu/osdep.h"
+#include <sys/epoll.h>
+#include "qemu/rcu_queue.h"
+#include "aio-posix.h"
+
+/* The fd number threshold to switch to epoll */
+#define EPOLL_ENABLE_THRESHOLD 64
+
+void fdmon_epoll_disable(AioContext *ctx)
+{
+ if (ctx->epollfd >= 0) {
+ close(ctx->epollfd);
+ ctx->epollfd = -1;
+ }
+
+ /* Switch back */
+ ctx->fdmon_ops = &fdmon_poll_ops;
+}
+
+static inline int epoll_events_from_pfd(int pfd_events)
+{
+ return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
+ (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
+ (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
+ (pfd_events & G_IO_ERR ? EPOLLERR : 0);
+}
+
+static void fdmon_epoll_update(AioContext *ctx,
+ AioHandler *old_node,
+ AioHandler *new_node)
+{
+ struct epoll_event event = {
+ .data.ptr = new_node,
+ .events = new_node ? epoll_events_from_pfd(new_node->pfd.events) : 0,
+ };
+ int r;
+
+ if (!new_node) {
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, &event);
+ } else if (!old_node) {
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, new_node->pfd.fd, &event);
+ } else {
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, new_node->pfd.fd, &event);
+ }
+
+ if (r) {
+ fdmon_epoll_disable(ctx);
+ }
+}
+
+static int fdmon_epoll_wait(AioContext *ctx, AioHandlerList *ready_list,
+ int64_t timeout)
+{
+ GPollFD pfd = {
+ .fd = ctx->epollfd,
+ .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
+ };
+ AioHandler *node;
+ int i, ret = 0;
+ struct epoll_event events[128];
+
+ /* Fall back while external clients are disabled */
+ if (qatomic_read(&ctx->external_disable_cnt)) {
+ return fdmon_poll_ops.wait(ctx, ready_list, timeout);
+ }
+
+ if (timeout > 0) {
+ ret = qemu_poll_ns(&pfd, 1, timeout);
+ if (ret > 0) {
+ timeout = 0;
+ }
+ }
+ if (timeout <= 0 || ret > 0) {
+ ret = epoll_wait(ctx->epollfd, events,
+ ARRAY_SIZE(events),
+ timeout);
+ if (ret <= 0) {
+ goto out;
+ }
+ for (i = 0; i < ret; i++) {
+ int ev = events[i].events;
+ int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
+ (ev & EPOLLOUT ? G_IO_OUT : 0) |
+ (ev & EPOLLHUP ? G_IO_HUP : 0) |
+ (ev & EPOLLERR ? G_IO_ERR : 0);
+
+ node = events[i].data.ptr;
+ aio_add_ready_handler(ready_list, node, revents);
+ }
+ }
+out:
+ return ret;
+}
+
+static const FDMonOps fdmon_epoll_ops = {
+ .update = fdmon_epoll_update,
+ .wait = fdmon_epoll_wait,
+ .need_wait = aio_poll_disabled,
+};
+
+static bool fdmon_epoll_try_enable(AioContext *ctx)
+{
+ AioHandler *node;
+ struct epoll_event event;
+
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ int r;
+ if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
+ continue;
+ }
+ event.events = epoll_events_from_pfd(node->pfd.events);
+ event.data.ptr = node;
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+ if (r) {
+ return false;
+ }
+ }
+
+ ctx->fdmon_ops = &fdmon_epoll_ops;
+ return true;
+}
+
+bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
+{
+ if (ctx->epollfd < 0) {
+ return false;
+ }
+
+ /* Do not upgrade while external clients are disabled */
+ if (qatomic_read(&ctx->external_disable_cnt)) {
+ return false;
+ }
+
+ if (npfd >= EPOLL_ENABLE_THRESHOLD) {
+ if (fdmon_epoll_try_enable(ctx)) {
+ return true;
+ } else {
+ fdmon_epoll_disable(ctx);
+ }
+ }
+ return false;
+}
+
+void fdmon_epoll_setup(AioContext *ctx)
+{
+ ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+ if (ctx->epollfd == -1) {
+ fprintf(stderr, "Failed to create epoll instance: %s\n", strerror(errno));
+ }
+}
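
Stripped of the AioContext plumbing, the kernel-facing pattern used by
fdmon_epoll_wait() reduces to plain epoll(7), roughly as in the
following standalone sketch (single fd, error paths shortened):

    #include <sys/epoll.h>
    #include <unistd.h>

    /* Wait up to timeout_ms for fd to become readable. */
    static int epoll_wait_once(int fd, int timeout_ms)
    {
        struct epoll_event ev = { .events = EPOLLIN, .data.fd = fd };
        struct epoll_event ready[8];
        int n, epfd = epoll_create1(EPOLL_CLOEXEC);

        if (epfd < 0 || epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) < 0) {
            if (epfd >= 0) {
                close(epfd);
            }
            return -1;
        }
        n = epoll_wait(epfd, ready, 8, timeout_ms);
        close(epfd);
        return n;   /* number of ready events, 0 on timeout */
    }
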
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
new file mode 100644
index 000000000..1461dfa40
--- /dev/null
+++ b/util/fdmon-io_uring.c
@@ -0,0 +1,361 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Linux io_uring file descriptor monitoring
+ *
+ * The Linux io_uring API supports file descriptor monitoring with a few
+ * advantages over existing APIs like poll(2) and epoll(7):
+ *
+ * 1. Userspace polling of events is possible because the completion queue (cq
+ * ring) is shared between the kernel and userspace. This allows
+ * applications that rely on userspace polling to also monitor file
+ * descriptors in the same userspace polling loop.
+ *
+ * 2. Submission and completion is batched and done together in a single system
+ * call. This minimizes the number of system calls.
+ *
+ * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than
+ * poll(2).
+ *
+ * 4. Nanosecond timeouts are supported so it requires fewer syscalls than
+ * epoll(7).
+ *
+ * This code only monitors file descriptors and does not do asynchronous disk
+ * I/O. Implementing disk I/O efficiently has other requirements and should
+ * use a separate io_uring so it does not make sense to unify the code.
+ *
+ * File descriptor monitoring is implemented using the following operations:
+ *
+ * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored.
+ * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored. When
+ * the poll mask changes for a file descriptor it is first removed and then
+ * re-added with the new poll mask, so this operation is also used as part
+ * of modifying an existing monitored file descriptor.
+ * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait
+ * for events. This operation self-cancels if another event completes
+ * before the timeout.
+ *
+ * io_uring calls the submission queue the "sq ring" and the completion queue
+ * the "cq ring". Ring entries are called "sqe" and "cqe", respectively.
+ *
+ * The code is structured so that sq/cq rings are only modified within
+ * fdmon_io_uring_wait(). Changes to AioHandlers are made by enqueuing them on
+ * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD
+ * and/or IORING_OP_POLL_REMOVE sqes for them.
+ */
+
+#include "qemu/osdep.h"
+#include <poll.h>
+#include "qemu/rcu_queue.h"
+#include "aio-posix.h"
+
+enum {
+ FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */
+
+ /* AioHandler::flags */
+ FDMON_IO_URING_PENDING = (1 << 0),
+ FDMON_IO_URING_ADD = (1 << 1),
+ FDMON_IO_URING_REMOVE = (1 << 2),
+};
+
+static inline int poll_events_from_pfd(int pfd_events)
+{
+ return (pfd_events & G_IO_IN ? POLLIN : 0) |
+ (pfd_events & G_IO_OUT ? POLLOUT : 0) |
+ (pfd_events & G_IO_HUP ? POLLHUP : 0) |
+ (pfd_events & G_IO_ERR ? POLLERR : 0);
+}
+
+static inline int pfd_events_from_poll(int poll_events)
+{
+ return (poll_events & POLLIN ? G_IO_IN : 0) |
+ (poll_events & POLLOUT ? G_IO_OUT : 0) |
+ (poll_events & POLLHUP ? G_IO_HUP : 0) |
+ (poll_events & POLLERR ? G_IO_ERR : 0);
+}
+
+/*
+ * Returns an sqe for submitting a request. May only be called within
+ * fdmon_io_uring_wait().
+ */
+static struct io_uring_sqe *get_sqe(AioContext *ctx)
+{
+ struct io_uring *ring = &ctx->fdmon_io_uring;
+ struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+ int ret;
+
+ if (likely(sqe)) {
+ return sqe;
+ }
+
+ /* No free sqes left, submit pending sqes first */
+ do {
+ ret = io_uring_submit(ring);
+ } while (ret == -EINTR);
+
+ assert(ret > 1);
+ sqe = io_uring_get_sqe(ring);
+ assert(sqe);
+ return sqe;
+}
+
+/* Atomically enqueue an AioHandler for sq ring submission */
+static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags)
+{
+ unsigned old_flags;
+
+ old_flags = qatomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags);
+ if (!(old_flags & FDMON_IO_URING_PENDING)) {
+ QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted);
+ }
+}
+
+/* Dequeue an AioHandler for sq ring submission. Called by fill_sq_ring(). */
+static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags)
+{
+ AioHandler *node = QSLIST_FIRST(head);
+
+ if (!node) {
+ return NULL;
+ }
+
+ /* Doesn't need to be atomic since fill_sq_ring() moves the list */
+ QSLIST_REMOVE_HEAD(head, node_submitted);
+
+ /*
+ * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two
+ * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and
+ * telling process_cqe() to delete the AioHandler when its
+ * IORING_OP_POLL_ADD completes.
+ */
+ *flags = qatomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING |
+ FDMON_IO_URING_ADD));
+ return node;
+}
+
+static void fdmon_io_uring_update(AioContext *ctx,
+ AioHandler *old_node,
+ AioHandler *new_node)
+{
+ if (new_node) {
+ enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD);
+ }
+
+ if (old_node) {
+ /*
+ * Deletion is tricky because IORING_OP_POLL_ADD and
+ * IORING_OP_POLL_REMOVE are async. We need to wait for the original
+ * IORING_OP_POLL_ADD to complete before this handler can be freed
+ * safely.
+ *
+ * It's possible that the file descriptor becomes ready and the
+ * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is
+ * submitted, too.
+ *
+ * Mark this handler deleted right now but don't place it on
+ * ctx->deleted_aio_handlers yet. Instead, manually fudge the list
+ * entry to make QLIST_IS_INSERTED() think this handler has been
+ * inserted and other code recognizes this AioHandler as deleted.
+ *
+ * Once the original IORING_OP_POLL_ADD completes we enqueue the
+ * handler on the real ctx->deleted_aio_handlers list to be freed.
+ */
+ assert(!QLIST_IS_INSERTED(old_node, node_deleted));
+ old_node->node_deleted.le_prev = &old_node->node_deleted.le_next;
+
+ enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE);
+ }
+}
+
+static void add_poll_add_sqe(AioContext *ctx, AioHandler *node)
+{
+ struct io_uring_sqe *sqe = get_sqe(ctx);
+ int events = poll_events_from_pfd(node->pfd.events);
+
+ io_uring_prep_poll_add(sqe, node->pfd.fd, events);
+ io_uring_sqe_set_data(sqe, node);
+}
+
+static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
+{
+ struct io_uring_sqe *sqe = get_sqe(ctx);
+
+ io_uring_prep_poll_remove(sqe, node);
+}
+
+/* Add a timeout that self-cancels when another cqe becomes ready */
+static void add_timeout_sqe(AioContext *ctx, int64_t ns)
+{
+ struct io_uring_sqe *sqe;
+ struct __kernel_timespec ts = {
+ .tv_sec = ns / NANOSECONDS_PER_SECOND,
+ .tv_nsec = ns % NANOSECONDS_PER_SECOND,
+ };
+
+ sqe = get_sqe(ctx);
+ io_uring_prep_timeout(sqe, &ts, 1, 0);
+}
+
+/* Add sqes from ctx->submit_list for submission */
+static void fill_sq_ring(AioContext *ctx)
+{
+ AioHandlerSList submit_list;
+ AioHandler *node;
+ unsigned flags;
+
+ QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list);
+
+ while ((node = dequeue(&submit_list, &flags))) {
+ /* Order matters, just in case both flags were set */
+ if (flags & FDMON_IO_URING_ADD) {
+ add_poll_add_sqe(ctx, node);
+ }
+ if (flags & FDMON_IO_URING_REMOVE) {
+ add_poll_remove_sqe(ctx, node);
+ }
+ }
+}
+
+/* Returns true if a handler became ready */
+static bool process_cqe(AioContext *ctx,
+ AioHandlerList *ready_list,
+ struct io_uring_cqe *cqe)
+{
+ AioHandler *node = io_uring_cqe_get_data(cqe);
+ unsigned flags;
+
+ /* poll_timeout and poll_remove have a zero user_data field */
+ if (!node) {
+ return false;
+ }
+
+ /*
+ * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race
+ * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
+ * bit before IORING_OP_POLL_REMOVE is submitted.
+ */
+ flags = qatomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
+ if (flags & FDMON_IO_URING_REMOVE) {
+ QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
+ return false;
+ }
+
+ aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res));
+
+ /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */
+ add_poll_add_sqe(ctx, node);
+ return true;
+}
+
+static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
+{
+ struct io_uring *ring = &ctx->fdmon_io_uring;
+ struct io_uring_cqe *cqe;
+ unsigned num_cqes = 0;
+ unsigned num_ready = 0;
+ unsigned head;
+
+ io_uring_for_each_cqe(ring, head, cqe) {
+ if (process_cqe(ctx, ready_list, cqe)) {
+ num_ready++;
+ }
+
+ num_cqes++;
+ }
+
+ io_uring_cq_advance(ring, num_cqes);
+ return num_ready;
+}
+
+static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
+ int64_t timeout)
+{
+ unsigned wait_nr = 1; /* block until at least one cqe is ready */
+ int ret;
+
+ /* Fall back while external clients are disabled */
+ if (qatomic_read(&ctx->external_disable_cnt)) {
+ return fdmon_poll_ops.wait(ctx, ready_list, timeout);
+ }
+
+ if (timeout == 0) {
+ wait_nr = 0; /* non-blocking */
+ } else if (timeout > 0) {
+ add_timeout_sqe(ctx, timeout);
+ }
+
+ fill_sq_ring(ctx);
+
+ do {
+ ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
+ } while (ret == -EINTR);
+
+ assert(ret >= 0);
+
+ return process_cq_ring(ctx, ready_list);
+}
+
+static bool fdmon_io_uring_need_wait(AioContext *ctx)
+{
+ /* Have io_uring events completed? */
+ if (io_uring_cq_ready(&ctx->fdmon_io_uring)) {
+ return true;
+ }
+
+ /* Are there pending sqes to submit? */
+ if (io_uring_sq_ready(&ctx->fdmon_io_uring)) {
+ return true;
+ }
+
+ /* Do we need to process AioHandlers for io_uring changes? */
+ if (!QSLIST_EMPTY_RCU(&ctx->submit_list)) {
+ return true;
+ }
+
+ /* Are we falling back to fdmon-poll? */
+ return qatomic_read(&ctx->external_disable_cnt);
+}
+
+static const FDMonOps fdmon_io_uring_ops = {
+ .update = fdmon_io_uring_update,
+ .wait = fdmon_io_uring_wait,
+ .need_wait = fdmon_io_uring_need_wait,
+};
+
+bool fdmon_io_uring_setup(AioContext *ctx)
+{
+ int ret;
+
+ ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
+ if (ret != 0) {
+ return false;
+ }
+
+ QSLIST_INIT(&ctx->submit_list);
+ ctx->fdmon_ops = &fdmon_io_uring_ops;
+ return true;
+}
+
+void fdmon_io_uring_destroy(AioContext *ctx)
+{
+ if (ctx->fdmon_ops == &fdmon_io_uring_ops) {
+ AioHandler *node;
+
+ io_uring_queue_exit(&ctx->fdmon_io_uring);
+
+ /* Move handlers due to be removed onto the deleted list */
+ while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
+ unsigned flags = qatomic_fetch_and(&node->flags,
+ ~(FDMON_IO_URING_PENDING |
+ FDMON_IO_URING_ADD |
+ FDMON_IO_URING_REMOVE));
+
+ if (flags & FDMON_IO_URING_REMOVE) {
+ QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
+ }
+
+ QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
+ }
+
+ ctx->fdmon_ops = &fdmon_poll_ops;
+ }
+}
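
The IORING_OP_POLL_ADD flow described in the header comment, reduced to
a standalone liburing sketch (one-shot poll of a single fd; error
handling elided for brevity):

    #include <poll.h>
    #include <liburing.h>

    /* Block until fd is readable; returns revents or a negative errno. */
    static int uring_poll_once(int fd)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        int res;

        if (io_uring_queue_init(8, &ring, 0) < 0) {
            return -1;
        }

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_poll_add(sqe, fd, POLLIN);  /* one-shot, as above */

        io_uring_submit_and_wait(&ring, 1);       /* batch submit + wait */

        io_uring_wait_cqe(&ring, &cqe);
        res = cqe->res;                           /* revents on success */
        io_uring_cqe_seen(&ring, cqe);

        io_uring_queue_exit(&ring);
        return res;
    }
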
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
new file mode 100644
index 000000000..5fe3b4786
--- /dev/null
+++ b/util/fdmon-poll.c
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * poll(2) file descriptor monitoring
+ *
+ * Uses ppoll(2) when available, g_poll() otherwise.
+ */
+
+#include "qemu/osdep.h"
+#include "aio-posix.h"
+#include "qemu/rcu_queue.h"
+
+/*
+ * These thread-local variables are used only in fdmon_poll_wait() around the
+ * call to the poll() system call. In particular they are not used while
+ * aio_poll is performing callbacks, which makes it much easier to think about
+ * reentrancy!
+ *
+ * Stack-allocated arrays would be perfect but they have size limitations;
+ * heap allocation is expensive enough that we want to reuse arrays across
+ * calls to aio_poll(). And because poll() has to be called without holding
+ * any lock, the arrays cannot be stored in AioContext. Thread-local data
+ * has none of the disadvantages of these three options.
+ */
+static __thread GPollFD *pollfds;
+static __thread AioHandler **nodes;
+static __thread unsigned npfd, nalloc;
+static __thread Notifier pollfds_cleanup_notifier;
+
+static void pollfds_cleanup(Notifier *n, void *unused)
+{
+ g_assert(npfd == 0);
+ g_free(pollfds);
+ g_free(nodes);
+ nalloc = 0;
+}
+
+static void add_pollfd(AioHandler *node)
+{
+ if (npfd == nalloc) {
+ if (nalloc == 0) {
+ pollfds_cleanup_notifier.notify = pollfds_cleanup;
+ qemu_thread_atexit_add(&pollfds_cleanup_notifier);
+ nalloc = 8;
+ } else {
+ g_assert(nalloc <= INT_MAX);
+ nalloc *= 2;
+ }
+ pollfds = g_renew(GPollFD, pollfds, nalloc);
+ nodes = g_renew(AioHandler *, nodes, nalloc);
+ }
+ nodes[npfd] = node;
+ pollfds[npfd] = (GPollFD) {
+ .fd = node->pfd.fd,
+ .events = node->pfd.events,
+ };
+ npfd++;
+}
+
+static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
+ int64_t timeout)
+{
+ AioHandler *node;
+ int ret;
+
+ assert(npfd == 0);
+
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
+ && aio_node_check(ctx, node->is_external)) {
+ add_pollfd(node);
+ }
+ }
+
+ /* epoll(7) is faster above a certain number of fds */
+ if (fdmon_epoll_try_upgrade(ctx, npfd)) {
+ npfd = 0; /* we won't need pollfds[], reset npfd */
+ return ctx->fdmon_ops->wait(ctx, ready_list, timeout);
+ }
+
+ ret = qemu_poll_ns(pollfds, npfd, timeout);
+ if (ret > 0) {
+ int i;
+
+ for (i = 0; i < npfd; i++) {
+ int revents = pollfds[i].revents;
+
+ if (revents) {
+ aio_add_ready_handler(ready_list, nodes[i], revents);
+ }
+ }
+ }
+
+ npfd = 0;
+ return ret;
+}
+
+static void fdmon_poll_update(AioContext *ctx,
+ AioHandler *old_node,
+ AioHandler *new_node)
+{
+ /* Do nothing, AioHandler already contains the state we'll need */
+}
+
+const FDMonOps fdmon_poll_ops = {
+ .update = fdmon_poll_update,
+ .wait = fdmon_poll_wait,
+ .need_wait = aio_poll_disabled,
+};
diff --git a/util/fifo8.c b/util/fifo8.c
new file mode 100644
index 000000000..d4d1c135e
--- /dev/null
+++ b/util/fifo8.c
@@ -0,0 +1,118 @@
+/*
+ * Generic FIFO component, implemented as a circular buffer.
+ *
+ * Copyright (c) 2012 Peter A. G. Crosthwaite
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "migration/vmstate.h"
+#include "qemu/fifo8.h"
+
+void fifo8_create(Fifo8 *fifo, uint32_t capacity)
+{
+ fifo->data = g_new(uint8_t, capacity);
+ fifo->capacity = capacity;
+ fifo->head = 0;
+ fifo->num = 0;
+}
+
+void fifo8_destroy(Fifo8 *fifo)
+{
+ g_free(fifo->data);
+}
+
+void fifo8_push(Fifo8 *fifo, uint8_t data)
+{
+ assert(fifo->num < fifo->capacity);
+ fifo->data[(fifo->head + fifo->num) % fifo->capacity] = data;
+ fifo->num++;
+}
+
+void fifo8_push_all(Fifo8 *fifo, const uint8_t *data, uint32_t num)
+{
+ uint32_t start, avail;
+
+ assert(fifo->num + num <= fifo->capacity);
+
+ start = (fifo->head + fifo->num) % fifo->capacity;
+
+ if (start + num <= fifo->capacity) {
+ memcpy(&fifo->data[start], data, num);
+ } else {
+ avail = fifo->capacity - start;
+ memcpy(&fifo->data[start], data, avail);
+ memcpy(&fifo->data[0], &data[avail], num - avail);
+ }
+
+ fifo->num += num;
+}
+
+uint8_t fifo8_pop(Fifo8 *fifo)
+{
+ uint8_t ret;
+
+ assert(fifo->num > 0);
+ ret = fifo->data[fifo->head++];
+ fifo->head %= fifo->capacity;
+ fifo->num--;
+ return ret;
+}
+
+const uint8_t *fifo8_pop_buf(Fifo8 *fifo, uint32_t max, uint32_t *num)
+{
+ uint8_t *ret;
+
+ assert(max > 0 && max <= fifo->num);
+ *num = MIN(fifo->capacity - fifo->head, max);
+ ret = &fifo->data[fifo->head];
+ fifo->head += *num;
+ fifo->head %= fifo->capacity;
+ fifo->num -= *num;
+ return ret;
+}
+
+void fifo8_reset(Fifo8 *fifo)
+{
+ fifo->num = 0;
+ fifo->head = 0;
+}
+
+bool fifo8_is_empty(Fifo8 *fifo)
+{
+ return (fifo->num == 0);
+}
+
+bool fifo8_is_full(Fifo8 *fifo)
+{
+ return (fifo->num == fifo->capacity);
+}
+
+uint32_t fifo8_num_free(Fifo8 *fifo)
+{
+ return fifo->capacity - fifo->num;
+}
+
+uint32_t fifo8_num_used(Fifo8 *fifo)
+{
+ return fifo->num;
+}
+
+const VMStateDescription vmstate_fifo8 = {
+ .name = "Fifo8",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_VBUFFER_UINT32(data, Fifo8, 1, NULL, capacity),
+ VMSTATE_UINT32(head, Fifo8),
+ VMSTATE_UINT32(num, Fifo8),
+ VMSTATE_END_OF_LIST()
+ }
+};
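
A short usage sketch of the FIFO above. Note that fifo8_pop_buf()
returns a pointer into the ring buffer, so a region that wraps around
the end of the buffer is delivered across two calls:

    #include "qemu/osdep.h"
    #include "qemu/fifo8.h"

    static void fifo8_demo(void)
    {
        Fifo8 fifo;
        const uint8_t in[4] = { 1, 2, 3, 4 };
        uint32_t n;

        fifo8_create(&fifo, 8);
        fifo8_push_all(&fifo, in, sizeof(in));

        while (!fifo8_is_empty(&fifo)) {
            /* may return fewer than requested bytes at the wrap point */
            const uint8_t *buf = fifo8_pop_buf(&fifo,
                                               fifo8_num_used(&fifo), &n);
            (void)buf;  /* consume n bytes starting at buf */
        }

        fifo8_destroy(&fifo);
    }
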
diff --git a/util/filemonitor-inotify.c b/util/filemonitor-inotify.c
new file mode 100644
index 000000000..2c45f7f17
--- /dev/null
+++ b/util/filemonitor-inotify.c
@@ -0,0 +1,339 @@
+/*
+ * QEMU file monitor Linux inotify impl
+ *
+ * Copyright (c) 2018 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/filemonitor.h"
+#include "qemu/main-loop.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "trace.h"
+
+#include <sys/inotify.h>
+
+struct QFileMonitor {
+ int fd;
+ QemuMutex lock; /* protects dirs & idmap */
+ GHashTable *dirs; /* dirname => QFileMonitorDir */
+ GHashTable *idmap; /* inotify ID => dirname */
+};
+
+
+typedef struct {
+ int64_t id; /* watch ID */
+ char *filename; /* optional filter */
+ QFileMonitorHandler cb;
+ void *opaque;
+} QFileMonitorWatch;
+
+
+typedef struct {
+ char *path;
+ int inotify_id; /* inotify ID */
+ int next_file_id; /* file ID counter */
+ GArray *watches; /* QFileMonitorWatch elements */
+} QFileMonitorDir;
+
+
+static void qemu_file_monitor_watch(void *arg)
+{
+ QFileMonitor *mon = arg;
+ char buf[4096]
+ __attribute__ ((aligned(__alignof__(struct inotify_event))));
+ int used = 0;
+ int len;
+
+ qemu_mutex_lock(&mon->lock);
+
+ if (mon->fd == -1) {
+ qemu_mutex_unlock(&mon->lock);
+ return;
+ }
+
+ len = read(mon->fd, buf, sizeof(buf));
+
+ if (len < 0) {
+ if (errno != EAGAIN) {
+ error_report("Failure monitoring inotify FD '%s',"
+ "disabling events", strerror(errno));
+ goto cleanup;
+ }
+
+ /* no more events right now */
+ goto cleanup;
+ }
+
+ /* Loop over all events in the buffer */
+ while (used < len) {
+ struct inotify_event *ev =
+ (struct inotify_event *)(buf + used);
+ const char *name = ev->len ? ev->name : "";
+ QFileMonitorDir *dir = g_hash_table_lookup(mon->idmap,
+ GINT_TO_POINTER(ev->wd));
+ uint32_t iev = ev->mask &
+ (IN_CREATE | IN_MODIFY | IN_DELETE | IN_IGNORED |
+ IN_MOVED_TO | IN_MOVED_FROM | IN_ATTRIB);
+ int qev;
+ gsize i;
+
+ used += sizeof(struct inotify_event) + ev->len;
+
+ if (!dir) {
+ continue;
+ }
+
+ /*
+ * During a rename operation, the old name gets
+ * IN_MOVED_FROM and the new name gets IN_MOVED_TO.
+ * To simplify life for callers, we turn these into
+ * DELETED and CREATED events
+ */
+ switch (iev) {
+ case IN_CREATE:
+ case IN_MOVED_TO:
+ qev = QFILE_MONITOR_EVENT_CREATED;
+ break;
+ case IN_MODIFY:
+ qev = QFILE_MONITOR_EVENT_MODIFIED;
+ break;
+ case IN_DELETE:
+ case IN_MOVED_FROM:
+ qev = QFILE_MONITOR_EVENT_DELETED;
+ break;
+ case IN_ATTRIB:
+ qev = QFILE_MONITOR_EVENT_ATTRIBUTES;
+ break;
+ case IN_IGNORED:
+ qev = QFILE_MONITOR_EVENT_IGNORED;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ trace_qemu_file_monitor_event(mon, dir->path, name, ev->mask,
+ dir->inotify_id);
+ for (i = 0; i < dir->watches->len; i++) {
+ QFileMonitorWatch *watch = &g_array_index(dir->watches,
+ QFileMonitorWatch,
+ i);
+
+ if (watch->filename == NULL ||
+ (name && g_str_equal(watch->filename, name))) {
+ trace_qemu_file_monitor_dispatch(mon, dir->path, name,
+ qev, watch->cb,
+ watch->opaque, watch->id);
+ watch->cb(watch->id, qev, name, watch->opaque);
+ }
+ }
+ }
+
+ cleanup:
+ qemu_mutex_unlock(&mon->lock);
+}
+
+
+static void
+qemu_file_monitor_dir_free(void *data)
+{
+ QFileMonitorDir *dir = data;
+ gsize i;
+
+ for (i = 0; i < dir->watches->len; i++) {
+ QFileMonitorWatch *watch = &g_array_index(dir->watches,
+ QFileMonitorWatch, i);
+ g_free(watch->filename);
+ }
+ g_array_unref(dir->watches);
+ g_free(dir->path);
+ g_free(dir);
+}
+
+
+QFileMonitor *
+qemu_file_monitor_new(Error **errp)
+{
+ int fd;
+ QFileMonitor *mon;
+
+ fd = inotify_init1(IN_NONBLOCK);
+ if (fd < 0) {
+ error_setg_errno(errp, errno,
+ "Unable to initialize inotify");
+ return NULL;
+ }
+
+ mon = g_new0(QFileMonitor, 1);
+ qemu_mutex_init(&mon->lock);
+ mon->fd = fd;
+
+ mon->dirs = g_hash_table_new_full(g_str_hash, g_str_equal, NULL,
+ qemu_file_monitor_dir_free);
+ mon->idmap = g_hash_table_new(g_direct_hash, g_direct_equal);
+
+ trace_qemu_file_monitor_new(mon, mon->fd);
+
+ return mon;
+}
+
+static gboolean
+qemu_file_monitor_free_idle(void *opaque)
+{
+ QFileMonitor *mon = opaque;
+
+ if (!mon) {
+ return G_SOURCE_REMOVE;
+ }
+
+ qemu_mutex_lock(&mon->lock);
+
+ g_hash_table_unref(mon->idmap);
+ g_hash_table_unref(mon->dirs);
+
+ qemu_mutex_unlock(&mon->lock);
+
+ qemu_mutex_destroy(&mon->lock);
+ g_free(mon);
+
+ return G_SOURCE_REMOVE;
+}
+
+void
+qemu_file_monitor_free(QFileMonitor *mon)
+{
+ if (!mon) {
+ return;
+ }
+
+ qemu_mutex_lock(&mon->lock);
+ if (mon->fd != -1) {
+ qemu_set_fd_handler(mon->fd, NULL, NULL, NULL);
+ close(mon->fd);
+ mon->fd = -1;
+ }
+ qemu_mutex_unlock(&mon->lock);
+
+ /*
+ * Can't free it yet, because another thread
+ * may be running event loop, so the inotify
+ * callback might be pending. Using an idle
+ * source ensures we'll only free after the
+ * pending callback is done
+ */
+ g_idle_add((GSourceFunc)qemu_file_monitor_free_idle, mon);
+}
+
+int64_t
+qemu_file_monitor_add_watch(QFileMonitor *mon,
+ const char *dirpath,
+ const char *filename,
+ QFileMonitorHandler cb,
+ void *opaque,
+ Error **errp)
+{
+ QFileMonitorDir *dir;
+ QFileMonitorWatch watch;
+ int64_t ret = -1;
+
+ qemu_mutex_lock(&mon->lock);
+ dir = g_hash_table_lookup(mon->dirs, dirpath);
+ if (!dir) {
+ int rv = inotify_add_watch(mon->fd, dirpath,
+ IN_CREATE | IN_DELETE | IN_MODIFY |
+ IN_MOVED_TO | IN_MOVED_FROM | IN_ATTRIB);
+
+ if (rv < 0) {
+ error_setg_errno(errp, errno, "Unable to watch '%s'", dirpath);
+ goto cleanup;
+ }
+
+ trace_qemu_file_monitor_enable_watch(mon, dirpath, rv);
+
+ dir = g_new0(QFileMonitorDir, 1);
+ dir->path = g_strdup(dirpath);
+ dir->inotify_id = rv;
+ dir->watches = g_array_new(FALSE, TRUE, sizeof(QFileMonitorWatch));
+
+ g_hash_table_insert(mon->dirs, dir->path, dir);
+ g_hash_table_insert(mon->idmap, GINT_TO_POINTER(rv), dir);
+
+ if (g_hash_table_size(mon->dirs) == 1) {
+ qemu_set_fd_handler(mon->fd, qemu_file_monitor_watch, NULL, mon);
+ }
+ }
+
+ watch.id = (((int64_t)dir->inotify_id) << 32) | dir->next_file_id++;
+ watch.filename = g_strdup(filename);
+ watch.cb = cb;
+ watch.opaque = opaque;
+
+ g_array_append_val(dir->watches, watch);
+
+ trace_qemu_file_monitor_add_watch(mon, dirpath,
+ filename ? filename : "<none>",
+ cb, opaque, watch.id);
+
+ ret = watch.id;
+
+ cleanup:
+ qemu_mutex_unlock(&mon->lock);
+ return ret;
+}
+
+
+void qemu_file_monitor_remove_watch(QFileMonitor *mon,
+ const char *dirpath,
+ int64_t id)
+{
+ QFileMonitorDir *dir;
+ gsize i;
+
+ qemu_mutex_lock(&mon->lock);
+
+ trace_qemu_file_monitor_remove_watch(mon, dirpath, id);
+
+ dir = g_hash_table_lookup(mon->dirs, dirpath);
+ if (!dir) {
+ goto cleanup;
+ }
+
+ for (i = 0; i < dir->watches->len; i++) {
+ QFileMonitorWatch *watch = &g_array_index(dir->watches,
+ QFileMonitorWatch, i);
+ if (watch->id == id) {
+ g_free(watch->filename);
+ g_array_remove_index(dir->watches, i);
+ break;
+ }
+ }
+
+ if (dir->watches->len == 0) {
+ inotify_rm_watch(mon->fd, dir->inotify_id);
+ trace_qemu_file_monitor_disable_watch(mon, dir->path, dir->inotify_id);
+
+ g_hash_table_remove(mon->idmap, GINT_TO_POINTER(dir->inotify_id));
+ g_hash_table_remove(mon->dirs, dir->path);
+
+ if (g_hash_table_size(mon->dirs) == 0) {
+ qemu_set_fd_handler(mon->fd, NULL, NULL, NULL);
+ }
+ }
+
+ cleanup:
+ qemu_mutex_unlock(&mon->lock);
+}
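
A sketch of how a caller might drive the monitor above (the handler
signature follows the dispatch in qemu_file_monitor_watch(); the path
and file names here are placeholders):

    #include "qemu/osdep.h"
    #include "qemu/filemonitor.h"
    #include "qapi/error.h"

    static void on_change(int64_t id, QFileMonitorEvent ev,
                          const char *name, void *opaque)
    {
        /* e.g. reload when ev == QFILE_MONITOR_EVENT_MODIFIED */
    }

    static void monitor_demo(Error **errp)
    {
        QFileMonitor *mon = qemu_file_monitor_new(errp);
        int64_t id;

        if (!mon) {
            return;
        }

        /* a NULL filename would watch every file in the directory */
        id = qemu_file_monitor_add_watch(mon, "/tmp", "demo.conf",
                                         on_change, NULL, errp);
        if (id < 0) {
            qemu_file_monitor_free(mon);
            return;
        }

        /* ... run the event loop; events arrive via on_change() ... */

        qemu_file_monitor_remove_watch(mon, "/tmp", id);
        qemu_file_monitor_free(mon);
    }
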
diff --git a/util/filemonitor-stub.c b/util/filemonitor-stub.c
new file mode 100644
index 000000000..93fef6534
--- /dev/null
+++ b/util/filemonitor-stub.c
@@ -0,0 +1,59 @@
+/*
+ * QEMU file monitor stub impl
+ *
+ * Copyright (c) 2018 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/filemonitor.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+
+
+QFileMonitor *
+qemu_file_monitor_new(Error **errp)
+{
+ error_setg(errp, "File monitoring not available on this platform");
+ return NULL;
+}
+
+
+void
+qemu_file_monitor_free(QFileMonitor *mon G_GNUC_UNUSED)
+{
+}
+
+
+int64_t
+qemu_file_monitor_add_watch(QFileMonitor *mon G_GNUC_UNUSED,
+ const char *dirpath G_GNUC_UNUSED,
+ const char *filename G_GNUC_UNUSED,
+ QFileMonitorHandler cb G_GNUC_UNUSED,
+ void *opaque G_GNUC_UNUSED,
+ Error **errp)
+{
+ error_setg(errp, "File monitoring not available on this platform");
+ return -1;
+}
+
+
+void
+qemu_file_monitor_remove_watch(QFileMonitor *mon G_GNUC_UNUSED,
+ const char *dirpath G_GNUC_UNUSED,
+ int64_t id G_GNUC_UNUSED)
+{
+}
diff --git a/util/getauxval.c b/util/getauxval.c
new file mode 100644
index 000000000..b124107d6
--- /dev/null
+++ b/util/getauxval.c
@@ -0,0 +1,118 @@
+/*
+ * QEMU access to the auxiliary vector
+ *
+ * Copyright (C) 2013 Red Hat, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+
+#ifdef CONFIG_GETAUXVAL
+/* Don't inline this in qemu/osdep.h, because pulling in <sys/auxv.h> for
+ the system declaration of getauxval pulls in the system <elf.h>, which
+ conflicts with qemu's version. */
+
+#include <sys/auxv.h>
+
+unsigned long qemu_getauxval(unsigned long key)
+{
+ return getauxval(key);
+}
+#elif defined(__linux__)
+#include "elf.h"
+
+/* Our elf.h doesn't contain Elf32_auxv_t and Elf64_auxv_t, which is ok because
+ that just makes it easier to define it properly for the host here. */
+typedef struct {
+ unsigned long a_type;
+ unsigned long a_val;
+} ElfW_auxv_t;
+
+static const ElfW_auxv_t *auxv;
+
+static const ElfW_auxv_t *qemu_init_auxval(void)
+{
+ ElfW_auxv_t *a;
+ ssize_t size = 512, r, ofs;
+ int fd;
+
+ /* Allocate some initial storage. Make sure the first entry is set
+ to end-of-list, so that we've got a valid list in case of error. */
+ auxv = a = g_malloc(size);
+ a[0].a_type = 0;
+ a[0].a_val = 0;
+
+ fd = open("/proc/self/auxv", O_RDONLY);
+ if (fd < 0) {
+ return a;
+ }
+
+ /* Read the first SIZE bytes. Hopefully, this covers everything. */
+ r = read(fd, a, size);
+
+ if (r == size) {
+ /* Continue to expand until we do get a partial read. */
+ do {
+ ofs = size;
+ size *= 2;
+ auxv = a = g_realloc(a, size);
+ r = read(fd, (char *)a + ofs, ofs);
+ } while (r == ofs);
+ }
+
+ close(fd);
+ return a;
+}
+
+unsigned long qemu_getauxval(unsigned long type)
+{
+ const ElfW_auxv_t *a = auxv;
+
+ if (unlikely(a == NULL)) {
+ a = qemu_init_auxval();
+ }
+
+ for (; a->a_type != 0; a++) {
+ if (a->a_type == type) {
+ return a->a_val;
+ }
+ }
+
+ return 0;
+}
+
+#elif defined(__FreeBSD__)
+#include <sys/auxv.h>
+
+unsigned long qemu_getauxval(unsigned long type)
+{
+ unsigned long aux = 0;
+ elf_aux_info(type, &aux, sizeof(aux));
+ return aux;
+}
+
+#else
+
+unsigned long qemu_getauxval(unsigned long type)
+{
+ return 0;
+}
+
+#endif
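
On hosts where the libc helper exists, the whole lookup is a single
call; a minimal standalone example using getauxval(3):

    #include <sys/auxv.h>
    #include <stdio.h>

    int main(void)
    {
        /* AT_PAGESZ is present on all Linux systems; 0 means not found */
        unsigned long pagesz = getauxval(AT_PAGESZ);

        printf("page size from auxv: %lu\n", pagesz);
        return 0;
    }
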
diff --git a/util/guest-random.c b/util/guest-random.c
new file mode 100644
index 000000000..23643f86c
--- /dev/null
+++ b/util/guest-random.c
@@ -0,0 +1,101 @@
+/*
+ * QEMU guest-visible random functions
+ *
+ * Copyright 2019 Linaro, Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+#include "qemu/guest-random.h"
+#include "crypto/random.h"
+#include "sysemu/replay.h"
+
+
+static __thread GRand *thread_rand;
+static bool deterministic;
+
+
+static int glib_random_bytes(void *buf, size_t len)
+{
+ GRand *rand = thread_rand;
+ size_t i;
+ uint32_t x;
+
+ if (unlikely(rand == NULL)) {
+ /* Thread not initialized for a cpu, or main w/o -seed. */
+ thread_rand = rand = g_rand_new();
+ }
+
+ for (i = 0; i + 4 <= len; i += 4) {
+ x = g_rand_int(rand);
+ __builtin_memcpy(buf + i, &x, 4);
+ }
+ if (i < len) {
+ x = g_rand_int(rand);
+ __builtin_memcpy(buf + i, &x, len - i);
+ }
+ return 0;
+}
+
+int qemu_guest_getrandom(void *buf, size_t len, Error **errp)
+{
+ int ret;
+ if (replay_mode == REPLAY_MODE_PLAY) {
+ return replay_read_random(buf, len);
+ }
+ if (unlikely(deterministic)) {
+ /* Deterministic implementation using Glib's Mersenne Twister. */
+ ret = glib_random_bytes(buf, len);
+ } else {
+ /* Non-deterministic implementation using crypto routines. */
+ ret = qcrypto_random_bytes(buf, len, errp);
+ }
+ if (replay_mode == REPLAY_MODE_RECORD) {
+ replay_save_random(ret, buf, len);
+ }
+ return ret;
+}
+
+void qemu_guest_getrandom_nofail(void *buf, size_t len)
+{
+ (void)qemu_guest_getrandom(buf, len, &error_fatal);
+}
+
+uint64_t qemu_guest_random_seed_thread_part1(void)
+{
+ if (deterministic) {
+ uint64_t ret;
+ glib_random_bytes(&ret, sizeof(ret));
+ return ret;
+ }
+ return 0;
+}
+
+void qemu_guest_random_seed_thread_part2(uint64_t seed)
+{
+ g_assert(thread_rand == NULL);
+ if (deterministic) {
+ thread_rand =
+ g_rand_new_with_seed_array((const guint32 *)&seed,
+ sizeof(seed) / sizeof(guint32));
+ }
+}
+
+int qemu_guest_random_seed_main(const char *optarg, Error **errp)
+{
+ unsigned long long seed;
+ if (parse_uint_full(optarg, &seed, 0)) {
+ error_setg(errp, "Invalid seed number: %s", optarg);
+ return -1;
+ } else {
+ deterministic = true;
+ qemu_guest_random_seed_thread_part2(seed);
+ return 0;
+ }
+}
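
The deterministic path above leans on GLib's Mersenne Twister; the
seeding and byte-filling pattern looks like this in isolation
(standalone sketch, mirroring glib_random_bytes()):

    #include <glib.h>
    #include <string.h>

    /* Fill buf with len pseudo-random bytes from a fixed 64-bit seed. */
    static void seeded_bytes(guint64 seed, void *buf, gsize len)
    {
        GRand *rand = g_rand_new_with_seed_array((const guint32 *)&seed,
                                                 sizeof(seed) /
                                                 sizeof(guint32));
        gsize i;

        for (i = 0; i + 4 <= len; i += 4) {
            guint32 x = g_rand_int(rand);
            memcpy((char *)buf + i, &x, 4);
        }
        if (i < len) {
            guint32 x = g_rand_int(rand);
            memcpy((char *)buf + i, &x, len - i);
        }
        g_rand_free(rand);
    }
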
diff --git a/util/hbitmap.c b/util/hbitmap.c
new file mode 100644
index 000000000..305b894a6
--- /dev/null
+++ b/util/hbitmap.c
@@ -0,0 +1,933 @@
+/*
+ * Hierarchical Bitmap Data Type
+ *
+ * Copyright Red Hat, Inc., 2012
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/hbitmap.h"
+#include "qemu/host-utils.h"
+#include "trace.h"
+#include "crypto/hash.h"
+
+/* HBitmap provides an array of bits. The bits are stored as usual in an
+ * array of unsigned longs, but HBitmap is also optimized to provide fast
+ * iteration over set bits; going from one bit to the next is O(logB n)
+ * worst case, with B = sizeof(long) * CHAR_BIT: the result is low enough
+ * that the number of levels is in fact fixed.
+ *
+ * In order to do this, it stacks multiple bitmaps with progressively coarser
+ * granularity; in all levels except the last, bit N is set iff the N-th
+ * unsigned long is nonzero in the immediately next level. When iteration
+ * completes on the last level it can examine the 2nd-last level to quickly
+ * skip entire words, and even do so recursively to skip blocks of 64 words or
+ * powers thereof (32 on 32-bit machines).
+ *
+ * Given an index in the bitmap, it can be split in group of bits like
+ * this (for the 64-bit case):
+ *
+ * bits 0-57 => word in the last bitmap | bits 58-63 => bit in the word
+ * bits 0-51 => word in the 2nd-last bitmap | bits 52-57 => bit in the word
+ * bits 0-45 => word in the 3rd-last bitmap | bits 46-51 => bit in the word
+ *
+ * So it is easy to move up simply by shifting the index right by
+ * log2(BITS_PER_LONG) bits. To move down, you shift the index left
+ * similarly, and add the word index within the group. Iteration uses
+ * ffs (find first set bit) to find the next word to examine; this
+ * operation can be done in constant time in most current architectures.
+ *
+ * When setting or clearing a range of m bits on all levels, the work to perform
+ * is O(m + m/W + m/W^2 + ...), which is O(m) like on a regular bitmap.
+ *
+ * When iterating on a bitmap, each bit (on any level) is only visited
+ * once. Hence, the total cost of visiting a bitmap with m bits in it is
+ * the number of bits that are set in all bitmaps. Unless the bitmap is
+ * extremely sparse, this is also O(m + m/W + m/W^2 + ...), so the amortized
+ * cost of advancing from one bit to the next is usually constant (worst case
+ * O(logB n) as in the non-amortized complexity).
+ */
+
+struct HBitmap {
+ /*
+ * Size of the bitmap, as requested in hbitmap_alloc or in hbitmap_truncate.
+ */
+ uint64_t orig_size;
+
+ /* Number of total bits in the bottom level. */
+ uint64_t size;
+
+ /* Number of set bits in the bottom level. */
+ uint64_t count;
+
+ /* A scaling factor. Given a granularity of G, each bit in the bitmap
+ * actually represents a group of 2^G elements. Each operation on a
+ * range of bits first rounds the bits to determine which group they land
+ * in, and then affects the entire group; iteration will only visit the first
+ * bit of each group. Here is an example of operations in a size-16,
+ * granularity-1 HBitmap:
+ *
+ * initial state 00000000
+ * set(start=0, count=9) 11111000 (iter: 0, 2, 4, 6, 8)
+ * reset(start=1, count=3) 00111000 (iter: 4, 6, 8)
+ * set(start=9, count=2) 00111100 (iter: 4, 6, 8, 10)
+ * reset(start=5, count=5) 00000000
+ *
+ * From an implementation point of view, when setting or resetting bits,
+ * the bitmap will scale bit numbers right by this amount of bits. When
+ * iterating, the bitmap will scale bit numbers left by this amount of
+ * bits.
+ */
+ int granularity;
+
+ /* A meta dirty bitmap to track the dirtiness of bits in this HBitmap. */
+ HBitmap *meta;
+
+ /* A number of progressively less coarse bitmaps (i.e. level 0 is the
+ * coarsest). Each bit in level N represents a word in level N+1 that
+ * has a set bit, except the last level where each bit represents the
+ * actual bitmap.
+ *
+ * Note that all bitmaps have the same number of levels. Even a 1-bit
+ * bitmap will still allocate HBITMAP_LEVELS arrays.
+ */
+ unsigned long *levels[HBITMAP_LEVELS];
+
+ /* The length of each levels[] array. */
+ uint64_t sizes[HBITMAP_LEVELS];
+};
+
+/* Advance hbi to the next nonzero word and return it. hbi->pos
+ * is updated. Returns zero if we reach the end of the bitmap.
+ */
+static unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi)
+{
+ size_t pos = hbi->pos;
+ const HBitmap *hb = hbi->hb;
+ unsigned i = HBITMAP_LEVELS - 1;
+
+ unsigned long cur;
+ do {
+ i--;
+ pos >>= BITS_PER_LEVEL;
+ cur = hbi->cur[i] & hb->levels[i][pos];
+ } while (cur == 0);
+
+ /* Check for end of iteration. We always use fewer than BITS_PER_LONG
+ * bits in the level 0 bitmap; thus we can repurpose the most significant
+ * bit as a sentinel. The sentinel is set in hbitmap_alloc and ensures
+ * that the above loop ends even without an explicit check on i.
+ */
+
+ if (i == 0 && cur == (1UL << (BITS_PER_LONG - 1))) {
+ return 0;
+ }
+ for (; i < HBITMAP_LEVELS - 1; i++) {
+ /* Shift back pos to the left, matching the right shifts above.
+ * The index of this word's least significant set bit provides
+ * the low-order bits.
+ */
+ assert(cur);
+ pos = (pos << BITS_PER_LEVEL) + ctzl(cur);
+ hbi->cur[i] = cur & (cur - 1);
+
+ /* Set up next level for iteration. */
+ cur = hb->levels[i + 1][pos];
+ }
+
+ hbi->pos = pos;
+ trace_hbitmap_iter_skip_words(hbi->hb, hbi, pos, cur);
+
+ assert(cur);
+ return cur;
+}
+
+int64_t hbitmap_iter_next(HBitmapIter *hbi)
+{
+ unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1] &
+ hbi->hb->levels[HBITMAP_LEVELS - 1][hbi->pos];
+ int64_t item;
+
+ if (cur == 0) {
+ cur = hbitmap_iter_skip_words(hbi);
+ if (cur == 0) {
+ return -1;
+ }
+ }
+
+ /* The next call will resume work from the next bit. */
+ hbi->cur[HBITMAP_LEVELS - 1] = cur & (cur - 1);
+ item = ((uint64_t)hbi->pos << BITS_PER_LEVEL) + ctzl(cur);
+
+ return item << hbi->granularity;
+}
+
+void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first)
+{
+ unsigned i, bit;
+ uint64_t pos;
+
+ hbi->hb = hb;
+ pos = first >> hb->granularity;
+ assert(pos < hb->size);
+ hbi->pos = pos >> BITS_PER_LEVEL;
+ hbi->granularity = hb->granularity;
+
+ for (i = HBITMAP_LEVELS; i-- > 0; ) {
+ bit = pos & (BITS_PER_LONG - 1);
+ pos >>= BITS_PER_LEVEL;
+
+ /* Drop bits representing items before first. */
+ hbi->cur[i] = hb->levels[i][pos] & ~((1UL << bit) - 1);
+
+ /* We have already added level i+1, so the lowest set bit has
+ * been processed. Clear it.
+ */
+ if (i != HBITMAP_LEVELS - 1) {
+ hbi->cur[i] &= ~(1UL << bit);
+ }
+ }
+}
+
+int64_t hbitmap_next_dirty(const HBitmap *hb, int64_t start, int64_t count)
+{
+ HBitmapIter hbi;
+ int64_t first_dirty_off;
+ uint64_t end;
+
+ assert(start >= 0 && count >= 0);
+
+ if (start >= hb->orig_size || count == 0) {
+ return -1;
+ }
+
+ end = count > hb->orig_size - start ? hb->orig_size : start + count;
+
+ hbitmap_iter_init(&hbi, hb, start);
+ first_dirty_off = hbitmap_iter_next(&hbi);
+
+ if (first_dirty_off < 0 || first_dirty_off >= end) {
+ return -1;
+ }
+
+ return MAX(start, first_dirty_off);
+}
+
+int64_t hbitmap_next_zero(const HBitmap *hb, int64_t start, int64_t count)
+{
+ size_t pos = (start >> hb->granularity) >> BITS_PER_LEVEL;
+ unsigned long *last_lev = hb->levels[HBITMAP_LEVELS - 1];
+ unsigned long cur = last_lev[pos];
+ unsigned start_bit_offset;
+ uint64_t end_bit, sz;
+ int64_t res;
+
+ assert(start >= 0 && count >= 0);
+
+ if (start >= hb->orig_size || count == 0) {
+ return -1;
+ }
+
+ end_bit = count > hb->orig_size - start ?
+ hb->size :
+ ((start + count - 1) >> hb->granularity) + 1;
+ sz = (end_bit + BITS_PER_LONG - 1) >> BITS_PER_LEVEL;
+
+ /* There may be some zero bits in @cur before @start. We are not interested
+ * in them, so set them.
+ */
+ start_bit_offset = (start >> hb->granularity) & (BITS_PER_LONG - 1);
+ cur |= (1UL << start_bit_offset) - 1;
+ assert((start >> hb->granularity) < hb->size);
+
+ if (cur == (unsigned long)-1) {
+ do {
+ pos++;
+ } while (pos < sz && last_lev[pos] == (unsigned long)-1);
+
+ if (pos >= sz) {
+ return -1;
+ }
+
+ cur = last_lev[pos];
+ }
+
+ res = (pos << BITS_PER_LEVEL) + ctzl(cur);
+ if (res >= end_bit) {
+ return -1;
+ }
+
+ res = res << hb->granularity;
+ if (res < start) {
+ assert(((start - res) >> hb->granularity) == 0);
+ return start;
+ }
+
+ return res;
+}
+
+bool hbitmap_next_dirty_area(const HBitmap *hb, int64_t start, int64_t end,
+ int64_t max_dirty_count,
+ int64_t *dirty_start, int64_t *dirty_count)
+{
+ int64_t next_zero;
+
+ assert(start >= 0 && end >= 0 && max_dirty_count > 0);
+
+ end = MIN(end, hb->orig_size);
+ if (start >= end) {
+ return false;
+ }
+
+ start = hbitmap_next_dirty(hb, start, end - start);
+ if (start < 0) {
+ return false;
+ }
+
+ end = start + MIN(end - start, max_dirty_count);
+
+ next_zero = hbitmap_next_zero(hb, start, end - start);
+ if (next_zero >= 0) {
+ end = next_zero;
+ }
+
+ *dirty_start = start;
+ *dirty_count = end - start;
+
+ return true;
+}
+
+bool hbitmap_empty(const HBitmap *hb)
+{
+ return hb->count == 0;
+}
+
+int hbitmap_granularity(const HBitmap *hb)
+{
+ return hb->granularity;
+}
+
+uint64_t hbitmap_count(const HBitmap *hb)
+{
+ return hb->count << hb->granularity;
+}
+
+/**
+ * hbitmap_iter_next_word:
+ * @hbi: HBitmapIter to operate on.
+ * @p_cur: Location where to store the next non-zero word.
+ *
+ * Return the index of the next nonzero word that is set in @hbi's
+ * associated HBitmap, and set *p_cur to the content of that word
+ * (bits before the index that was passed to hbitmap_iter_init are
+ * trimmed on the first call). Return -1, and set *p_cur to zero,
+ * if all remaining words are zero.
+ */
+static size_t hbitmap_iter_next_word(HBitmapIter *hbi, unsigned long *p_cur)
+{
+ unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1];
+
+ if (cur == 0) {
+ cur = hbitmap_iter_skip_words(hbi);
+ if (cur == 0) {
+ *p_cur = 0;
+ return -1;
+ }
+ }
+
+ /* The next call will resume work from the next word. */
+ hbi->cur[HBITMAP_LEVELS - 1] = 0;
+ *p_cur = cur;
+ return hbi->pos;
+}
+
+/* Count the number of set bits between start and end, not accounting for
+ * the granularity. Also an example of how to use hbitmap_iter_next_word.
+ */
+static uint64_t hb_count_between(HBitmap *hb, uint64_t start, uint64_t last)
+{
+ HBitmapIter hbi;
+ uint64_t count = 0;
+ uint64_t end = last + 1;
+ unsigned long cur;
+ size_t pos;
+
+ hbitmap_iter_init(&hbi, hb, start << hb->granularity);
+ for (;;) {
+ pos = hbitmap_iter_next_word(&hbi, &cur);
+ if (pos >= (end >> BITS_PER_LEVEL)) {
+ break;
+ }
+ count += ctpopl(cur);
+ }
+
+ if (pos == (end >> BITS_PER_LEVEL)) {
+ /* Drop bits representing the END-th and subsequent items. */
+ int bit = end & (BITS_PER_LONG - 1);
+ cur &= (1UL << bit) - 1;
+ count += ctpopl(cur);
+ }
+
+ return count;
+}
+
+/* Setting starts at the last layer and propagates up if an element
+ * changes.
+ */
+static inline bool hb_set_elem(unsigned long *elem, uint64_t start, uint64_t last)
+{
+ unsigned long mask;
+ unsigned long old;
+
+ assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL));
+ assert(start <= last);
+
+ mask = 2UL << (last & (BITS_PER_LONG - 1));
+ mask -= 1UL << (start & (BITS_PER_LONG - 1));
+ old = *elem;
+ *elem |= mask;
+ return old != *elem;
+}
+
+/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)...
+ * Returns true if at least one bit is changed. */
+static bool hb_set_between(HBitmap *hb, int level, uint64_t start,
+ uint64_t last)
+{
+ size_t pos = start >> BITS_PER_LEVEL;
+ size_t lastpos = last >> BITS_PER_LEVEL;
+ bool changed = false;
+ size_t i;
+
+ i = pos;
+ if (i < lastpos) {
+ uint64_t next = (start | (BITS_PER_LONG - 1)) + 1;
+ changed |= hb_set_elem(&hb->levels[level][i], start, next - 1);
+ for (;;) {
+ start = next;
+ next += BITS_PER_LONG;
+ if (++i == lastpos) {
+ break;
+ }
+ changed |= (hb->levels[level][i] == 0);
+ hb->levels[level][i] = ~0UL;
+ }
+ }
+ changed |= hb_set_elem(&hb->levels[level][i], start, last);
+
+ /* If there was any change in this layer, we may have to update
+ * the one above.
+ */
+ if (level > 0 && changed) {
+ hb_set_between(hb, level - 1, pos, lastpos);
+ }
+ return changed;
+}
+
+void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count)
+{
+ /* Compute range in the last layer. */
+ uint64_t first, n;
+ uint64_t last = start + count - 1;
+
+ if (count == 0) {
+ return;
+ }
+
+ trace_hbitmap_set(hb, start, count,
+ start >> hb->granularity, last >> hb->granularity);
+
+ first = start >> hb->granularity;
+ last >>= hb->granularity;
+ assert(last < hb->size);
+ n = last - first + 1;
+
+ hb->count += n - hb_count_between(hb, first, last);
+ if (hb_set_between(hb, HBITMAP_LEVELS - 1, first, last) &&
+ hb->meta) {
+ hbitmap_set(hb->meta, start, count);
+ }
+}
+
+/* Resetting works the other way round: propagate up if the new
+ * value is zero.
+ */
+static inline bool hb_reset_elem(unsigned long *elem, uint64_t start, uint64_t last)
+{
+ unsigned long mask;
+ bool blanked;
+
+ assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL));
+ assert(start <= last);
+
+ mask = 2UL << (last & (BITS_PER_LONG - 1));
+ mask -= 1UL << (start & (BITS_PER_LONG - 1));
+ blanked = *elem != 0 && ((*elem & ~mask) == 0);
+ *elem &= ~mask;
+ return blanked;
+}
+
+/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)...
+ * Returns true if at least one bit is changed. */
+static bool hb_reset_between(HBitmap *hb, int level, uint64_t start,
+ uint64_t last)
+{
+ size_t pos = start >> BITS_PER_LEVEL;
+ size_t lastpos = last >> BITS_PER_LEVEL;
+ bool changed = false;
+ size_t i;
+
+ i = pos;
+ if (i < lastpos) {
+ uint64_t next = (start | (BITS_PER_LONG - 1)) + 1;
+
+ /* Here we need a more complex test than when setting bits. Even if
+ * something was changed, we must not blank bits in the upper level
+ * unless the lower-level word became entirely zero. So, remove pos
+ * from the upper-level range if bits remain set.
+ */
+ if (hb_reset_elem(&hb->levels[level][i], start, next - 1)) {
+ changed = true;
+ } else {
+ pos++;
+ }
+
+ for (;;) {
+ start = next;
+ next += BITS_PER_LONG;
+ if (++i == lastpos) {
+ break;
+ }
+ changed |= (hb->levels[level][i] != 0);
+ hb->levels[level][i] = 0UL;
+ }
+ }
+
+ /* Same as above, this time for lastpos. */
+ if (hb_reset_elem(&hb->levels[level][i], start, last)) {
+ changed = true;
+ } else {
+ lastpos--;
+ }
+
+ if (level > 0 && changed) {
+ hb_reset_between(hb, level - 1, pos, lastpos);
+ }
+
+ return changed;
+}
+
+void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count)
+{
+ /* Compute range in the last layer. */
+ uint64_t first;
+ uint64_t last = start + count - 1;
+ uint64_t gran = 1ULL << hb->granularity;
+
+ if (count == 0) {
+ return;
+ }
+
+ assert(QEMU_IS_ALIGNED(start, gran));
+ assert(QEMU_IS_ALIGNED(count, gran) || (start + count == hb->orig_size));
+
+ trace_hbitmap_reset(hb, start, count,
+ start >> hb->granularity, last >> hb->granularity);
+
+ first = start >> hb->granularity;
+ last >>= hb->granularity;
+ assert(last < hb->size);
+
+ hb->count -= hb_count_between(hb, first, last);
+ if (hb_reset_between(hb, HBITMAP_LEVELS - 1, first, last) &&
+ hb->meta) {
+ hbitmap_set(hb->meta, start, count);
+ }
+}
+
+void hbitmap_reset_all(HBitmap *hb)
+{
+ unsigned int i;
+
+ /* Same as hbitmap_alloc() except for memset() instead of malloc() */
+ for (i = HBITMAP_LEVELS; --i >= 1; ) {
+ memset(hb->levels[i], 0, hb->sizes[i] * sizeof(unsigned long));
+ }
+
+ hb->levels[0][0] = 1UL << (BITS_PER_LONG - 1);
+ hb->count = 0;
+}
+
+bool hbitmap_is_serializable(const HBitmap *hb)
+{
+ /* Every serialized chunk must be aligned to 64 bits so that endianness
+ * requirements can be fulfilled on both 64 bit and 32 bit hosts.
+ * We have hbitmap_serialization_align() which converts this
+ * alignment requirement from bitmap bits to items covered (e.g. sectors).
+ * That value is:
+ * 64 << hb->granularity
+ * Since this value must not exceed UINT64_MAX, hb->granularity must be
+ * less than 58 (== 64 - 6, where 6 is log2(64), i.e. 1 << 6 == 64).
+ *
+ * In order for hbitmap_serialization_align() to always return a
+ * meaningful value, bitmaps that are to be serialized must have a
+ * granularity of less than 58. */
+
+ return hb->granularity < 58;
+}
+
+bool hbitmap_get(const HBitmap *hb, uint64_t item)
+{
+ /* Compute position and bit in the last layer. */
+ uint64_t pos = item >> hb->granularity;
+ unsigned long bit = 1UL << (pos & (BITS_PER_LONG - 1));
+ assert(pos < hb->size);
+
+ return (hb->levels[HBITMAP_LEVELS - 1][pos >> BITS_PER_LEVEL] & bit) != 0;
+}
+
+uint64_t hbitmap_serialization_align(const HBitmap *hb)
+{
+ assert(hbitmap_is_serializable(hb));
+
+ /* Require at least 64 bit granularity to be safe on both 64 bit and 32 bit
+ * hosts. */
+ return UINT64_C(64) << hb->granularity;
+}
+
+/* Start should be aligned to serialization granularity, chunk size should be
+ * aligned to serialization granularity too, except for last chunk.
+ */
+static void serialization_chunk(const HBitmap *hb,
+ uint64_t start, uint64_t count,
+ unsigned long **first_el, uint64_t *el_count)
+{
+ uint64_t last = start + count - 1;
+ uint64_t gran = hbitmap_serialization_align(hb);
+
+ assert((start & (gran - 1)) == 0);
+ assert((last >> hb->granularity) < hb->size);
+ if ((last >> hb->granularity) != hb->size - 1) {
+ assert((count & (gran - 1)) == 0);
+ }
+
+ start = (start >> hb->granularity) >> BITS_PER_LEVEL;
+ last = (last >> hb->granularity) >> BITS_PER_LEVEL;
+
+ *first_el = &hb->levels[HBITMAP_LEVELS - 1][start];
+ *el_count = last - start + 1;
+}
+
+uint64_t hbitmap_serialization_size(const HBitmap *hb,
+ uint64_t start, uint64_t count)
+{
+ uint64_t el_count;
+ unsigned long *cur;
+
+ if (!count) {
+ return 0;
+ }
+ serialization_chunk(hb, start, count, &cur, &el_count);
+
+ return el_count * sizeof(unsigned long);
+}
+
+void hbitmap_serialize_part(const HBitmap *hb, uint8_t *buf,
+ uint64_t start, uint64_t count)
+{
+ uint64_t el_count;
+ unsigned long *cur, *end;
+
+ if (!count) {
+ return;
+ }
+ serialization_chunk(hb, start, count, &cur, &el_count);
+ end = cur + el_count;
+
+ while (cur != end) {
+ unsigned long el =
+ (BITS_PER_LONG == 32 ? cpu_to_le32(*cur) : cpu_to_le64(*cur));
+
+ memcpy(buf, &el, sizeof(el));
+ buf += sizeof(el);
+ cur++;
+ }
+}
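+
+/* Illustrative sketch (not built): serializing a whole bitmap into one
+ * chunk, assuming @nb_items covers the full bitmap (so the start, 0, is
+ * trivially aligned and the chunk is allowed to be the unaligned last
+ * one). example_serialize_all is a hypothetical name.
+ */
+#if 0
+static uint8_t *example_serialize_all(const HBitmap *hb, uint64_t nb_items,
+                                      uint64_t *size_out)
+{
+    uint64_t size = hbitmap_serialization_size(hb, 0, nb_items);
+    uint8_t *buf = g_malloc(size);
+
+    hbitmap_serialize_part(hb, buf, 0, nb_items);
+    *size_out = size;
+    return buf;
+}
+#endif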
+
+void hbitmap_deserialize_part(HBitmap *hb, uint8_t *buf,
+ uint64_t start, uint64_t count,
+ bool finish)
+{
+ uint64_t el_count;
+ unsigned long *cur, *end;
+
+ if (!count) {
+ return;
+ }
+ serialization_chunk(hb, start, count, &cur, &el_count);
+ end = cur + el_count;
+
+ while (cur != end) {
+ memcpy(cur, buf, sizeof(*cur));
+
+ if (BITS_PER_LONG == 32) {
+ le32_to_cpus((uint32_t *)cur);
+ } else {
+ le64_to_cpus((uint64_t *)cur);
+ }
+
+ buf += sizeof(unsigned long);
+ cur++;
+ }
+ if (finish) {
+ hbitmap_deserialize_finish(hb);
+ }
+}
+
+void hbitmap_deserialize_zeroes(HBitmap *hb, uint64_t start, uint64_t count,
+ bool finish)
+{
+ uint64_t el_count;
+ unsigned long *first;
+
+ if (!count) {
+ return;
+ }
+ serialization_chunk(hb, start, count, &first, &el_count);
+
+ memset(first, 0, el_count * sizeof(unsigned long));
+ if (finish) {
+ hbitmap_deserialize_finish(hb);
+ }
+}
+
+void hbitmap_deserialize_ones(HBitmap *hb, uint64_t start, uint64_t count,
+ bool finish)
+{
+ uint64_t el_count;
+ unsigned long *first;
+
+ if (!count) {
+ return;
+ }
+ serialization_chunk(hb, start, count, &first, &el_count);
+
+ memset(first, 0xff, el_count * sizeof(unsigned long));
+ if (finish) {
+ hbitmap_deserialize_finish(hb);
+ }
+}
+
+void hbitmap_deserialize_finish(HBitmap *bitmap)
+{
+ int64_t i, size, prev_size;
+ int lev;
+
+ /* restore levels starting from penultimate to zero level, assuming
+ * that the last level is ok */
+ size = MAX((bitmap->size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1);
+ for (lev = HBITMAP_LEVELS - 1; lev-- > 0; ) {
+ prev_size = size;
+ size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1);
+ memset(bitmap->levels[lev], 0, size * sizeof(unsigned long));
+
+ for (i = 0; i < prev_size; ++i) {
+ if (bitmap->levels[lev + 1][i]) {
+ bitmap->levels[lev][i >> BITS_PER_LEVEL] |=
+ 1UL << (i & (BITS_PER_LONG - 1));
+ }
+ }
+ }
+
+ bitmap->levels[0][0] |= 1UL << (BITS_PER_LONG - 1);
+ bitmap->count = hb_count_between(bitmap, 0, bitmap->size - 1);
+}
+
+void hbitmap_free(HBitmap *hb)
+{
+ unsigned i;
+ assert(!hb->meta);
+ for (i = HBITMAP_LEVELS; i-- > 0; ) {
+ g_free(hb->levels[i]);
+ }
+ g_free(hb);
+}
+
+HBitmap *hbitmap_alloc(uint64_t size, int granularity)
+{
+ HBitmap *hb = g_new0(struct HBitmap, 1);
+ unsigned i;
+
+ assert(size <= INT64_MAX);
+ hb->orig_size = size;
+
+ assert(granularity >= 0 && granularity < 64);
+ size = (size + (1ULL << granularity) - 1) >> granularity;
+ assert(size <= ((uint64_t)1 << HBITMAP_LOG_MAX_SIZE));
+
+ hb->size = size;
+ hb->granularity = granularity;
+ for (i = HBITMAP_LEVELS; i-- > 0; ) {
+ size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1);
+ hb->sizes[i] = size;
+ hb->levels[i] = g_new0(unsigned long, size);
+ }
+
+ /* We necessarily have free bits in level 0 due to the definition
+ * of HBITMAP_LEVELS, so use one for a sentinel. This speeds up
+ * hbitmap_iter_skip_words.
+ */
+ assert(size == 1);
+ hb->levels[0][0] |= 1UL << (BITS_PER_LONG - 1);
+ return hb;
+}
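+
+/* Illustrative sketch (not built): a bitmap tracking 1 MiB of data in
+ * 4 KiB (1 << 12) chunks. Setting any byte range marks whole chunks
+ * dirty, and hbitmap_count() reports the result scaled back to items.
+ */
+#if 0
+static void example_alloc(void)
+{
+    HBitmap *map = hbitmap_alloc(1 << 20, 12);
+
+    hbitmap_set(map, 0x1000, 0x2000); /* two 4 KiB chunks become dirty */
+    assert(hbitmap_count(map) == 0x2000);
+    hbitmap_free(map);
+}
+#endif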
+
+void hbitmap_truncate(HBitmap *hb, uint64_t size)
+{
+ bool shrink;
+ unsigned i;
+ uint64_t num_elements = size;
+ uint64_t old;
+
+ assert(size <= INT64_MAX);
+ hb->orig_size = size;
+
+ /* Size comes in as logical elements, adjust for granularity. */
+ size = (size + (1ULL << hb->granularity) - 1) >> hb->granularity;
+ assert(size <= ((uint64_t)1 << HBITMAP_LOG_MAX_SIZE));
+ shrink = size < hb->size;
+
+ /* bit sizes are identical; nothing to do. */
+ if (size == hb->size) {
+ return;
+ }
+
+ /* If we're losing bits, let's clear those bits before we invalidate all of
+ * our invariants. This helps keep the bitcount consistent, and will prevent
+ * us from carrying around garbage bits beyond the end of the map.
+ */
+ if (shrink) {
+ /* Don't clear partial granularity groups;
+ * start at the first full one. */
+ uint64_t start = ROUND_UP(num_elements, UINT64_C(1) << hb->granularity);
+ uint64_t fix_count = (hb->size << hb->granularity) - start;
+
+ assert(fix_count);
+ hbitmap_reset(hb, start, fix_count);
+ }
+
+ hb->size = size;
+ for (i = HBITMAP_LEVELS; i-- > 0; ) {
+ size = MAX(BITS_TO_LONGS(size), 1);
+ if (hb->sizes[i] == size) {
+ break;
+ }
+ old = hb->sizes[i];
+ hb->sizes[i] = size;
+ hb->levels[i] = g_realloc(hb->levels[i], size * sizeof(unsigned long));
+ if (!shrink) {
+ memset(&hb->levels[i][old], 0x00,
+ (size - old) * sizeof(*hb->levels[i]));
+ }
+ }
+ if (hb->meta) {
+ hbitmap_truncate(hb->meta, hb->size << hb->granularity);
+ }
+}
+
+bool hbitmap_can_merge(const HBitmap *a, const HBitmap *b)
+{
+ return (a->orig_size == b->orig_size);
+}
+
+/**
+ * hbitmap_sparse_merge: perform dst = dst | src.
+ * Works with differing granularities, and is best used when src is
+ * sparsely populated.
+ */
+static void hbitmap_sparse_merge(HBitmap *dst, const HBitmap *src)
+{
+ int64_t offset;
+ int64_t count;
+
+ for (offset = 0;
+ hbitmap_next_dirty_area(src, offset, src->orig_size, INT64_MAX,
+ &offset, &count);
+ offset += count)
+ {
+ hbitmap_set(dst, offset, count);
+ }
+}
+
+/**
+ * Given HBitmaps A and B, let R := A (BITOR) B.
+ * Bitmaps A and B will not be modified,
+ * except when bitmap R is an alias of A or B.
+ *
+ * @return true if the merge was successful,
+ * false if it was not attempted.
+ */
+bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result)
+{
+ int i;
+ uint64_t j;
+
+ if (!hbitmap_can_merge(a, b) || !hbitmap_can_merge(a, result)) {
+ return false;
+ }
+ assert(hbitmap_can_merge(b, result));
+
+ if ((!hbitmap_count(a) && result == b) ||
+ (!hbitmap_count(b) && result == a)) {
+ return true;
+ }
+
+ if (!hbitmap_count(a) && !hbitmap_count(b)) {
+ hbitmap_reset_all(result);
+ return true;
+ }
+
+ if (a->granularity != b->granularity) {
+ if ((a != result) && (b != result)) {
+ hbitmap_reset_all(result);
+ }
+ if (a != result) {
+ hbitmap_sparse_merge(result, a);
+ }
+ if (b != result) {
+ hbitmap_sparse_merge(result, b);
+ }
+ return true;
+ }
+
+ /* This merge is O(size), as BITS_PER_LONG and HBITMAP_LEVELS are constant.
+ * It may be possible to improve running times for sparsely populated maps
+ * by using hbitmap_iter_next, but this is suboptimal for dense maps.
+ */
+ assert(a->size == b->size);
+ for (i = HBITMAP_LEVELS - 1; i >= 0; i--) {
+ for (j = 0; j < a->sizes[i]; j++) {
+ result->levels[i][j] = a->levels[i][j] | b->levels[i][j];
+ }
+ }
+
+ /* Recompute the dirty count */
+ result->count = hb_count_between(result, 0, result->size - 1);
+
+ return true;
+}
+
+char *hbitmap_sha256(const HBitmap *bitmap, Error **errp)
+{
+ size_t size = bitmap->sizes[HBITMAP_LEVELS - 1] * sizeof(unsigned long);
+ char *data = (char *)bitmap->levels[HBITMAP_LEVELS - 1];
+ char *hash = NULL;
+ qcrypto_hash_digest(QCRYPTO_HASH_ALG_SHA256, data, size, &hash, errp);
+
+ return hash;
+}
diff --git a/util/hexdump.c b/util/hexdump.c
new file mode 100644
index 000000000..2c105a884
--- /dev/null
+++ b/util/hexdump.c
@@ -0,0 +1,65 @@
+/*
+ * Helper to hexdump a buffer
+ *
+ * Copyright (c) 2013 Red Hat, Inc.
+ * Copyright (c) 2013 Gerd Hoffmann <kraxel@redhat.com>
+ * Copyright (c) 2013 Peter Crosthwaite <peter.crosthwaite@xilinx.com>
+ * Copyright (c) 2013 Xilinx, Inc
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+void qemu_hexdump_line(char *line, unsigned int b, const void *bufptr,
+ unsigned int len, bool ascii)
+{
+ const char *buf = bufptr;
+ int i, c;
+
+ if (len > QEMU_HEXDUMP_LINE_BYTES) {
+ len = QEMU_HEXDUMP_LINE_BYTES;
+ }
+
+ line += snprintf(line, 6, "%04x:", b);
+ for (i = 0; i < QEMU_HEXDUMP_LINE_BYTES; i++) {
+ if ((i % 4) == 0) {
+ *line++ = ' ';
+ }
+ if (i < len) {
+ line += sprintf(line, " %02x", (unsigned char)buf[b + i]);
+ } else {
+ line += sprintf(line, " ");
+ }
+ }
+ if (ascii) {
+ *line++ = ' ';
+ for (i = 0; i < len; i++) {
+ c = buf[b + i];
+ if (c < ' ' || c > '~') {
+ c = '.';
+ }
+ *line++ = c;
+ }
+ }
+ *line = '\0';
+}
+
+void qemu_hexdump(FILE *fp, const char *prefix,
+ const void *bufptr, size_t size)
+{
+ unsigned int b, len;
+ char line[QEMU_HEXDUMP_LINE_LEN];
+
+ for (b = 0; b < size; b += QEMU_HEXDUMP_LINE_BYTES) {
+ len = size - b;
+ qemu_hexdump_line(line, b, bufptr, len, true);
+ fprintf(fp, "%s: %s\n", prefix, line);
+ }
+}
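+
+/* Illustrative sketch (not built): dumping a buffer to stderr, one
+ * prefixed line per QEMU_HEXDUMP_LINE_BYTES of input. The "virtio"
+ * prefix is just an example.
+ */
+#if 0
+static void example_hexdump(const uint8_t *pkt, size_t len)
+{
+    qemu_hexdump(stderr, "virtio", pkt, len);
+}
+#endif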
diff --git a/util/host-utils.c b/util/host-utils.c
new file mode 100644
index 000000000..bcc772b8e
--- /dev/null
+++ b/util/host-utils.c
@@ -0,0 +1,268 @@
+/*
+ * Utility compute operations used by translated code.
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ * Copyright (c) 2007 Aurelien Jarno
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+
+#ifndef CONFIG_INT128
+/* Long integer helpers */
+static inline void mul64(uint64_t *plow, uint64_t *phigh,
+ uint64_t a, uint64_t b)
+{
+ typedef union {
+ uint64_t ll;
+ struct {
+#ifdef HOST_WORDS_BIGENDIAN
+ uint32_t high, low;
+#else
+ uint32_t low, high;
+#endif
+ } l;
+ } LL;
+ LL rl, rm, rn, rh, a0, b0;
+ uint64_t c;
+
+ a0.ll = a;
+ b0.ll = b;
+
+ rl.ll = (uint64_t)a0.l.low * b0.l.low;
+ rm.ll = (uint64_t)a0.l.low * b0.l.high;
+ rn.ll = (uint64_t)a0.l.high * b0.l.low;
+ rh.ll = (uint64_t)a0.l.high * b0.l.high;
+
+ c = (uint64_t)rl.l.high + rm.l.low + rn.l.low;
+ rl.l.high = c;
+ c >>= 32;
+ c = c + rm.l.high + rn.l.high + rh.l.low;
+ rh.l.low = c;
+ rh.l.high += (uint32_t)(c >> 32);
+
+ *plow = rl.ll;
+ *phigh = rh.ll;
+}
+
+/* Unsigned 64x64 -> 128 multiplication */
+void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b)
+{
+ mul64(plow, phigh, a, b);
+}
+
+/* Signed 64x64 -> 128 multiplication */
+void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
+{
+ uint64_t rh;
+
+ mul64(plow, &rh, a, b);
+
+ /* Adjust for signs. */
+ if (b < 0) {
+ rh -= a;
+ }
+ if (a < 0) {
+ rh -= b;
+ }
+ *phigh = rh;
+}
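+
+/* Illustrative sketch (not built): mulu64() produces the full 128-bit
+ * product, e.g. (2^64 - 1) * 2 == 2^65 - 2, split across *phigh/*plow.
+ */
+#if 0
+static void example_mulu64(void)
+{
+    uint64_t lo, hi;
+
+    mulu64(&lo, &hi, UINT64_MAX, 2);
+    assert(hi == 1 && lo == UINT64_MAX - 1);
+}
+#endif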
+
+/*
+ * Unsigned 128-by-64 division.
+ * Returns the quotient via plow and phigh.
+ * Returns the remainder via the function return value.
+ */
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+{
+ uint64_t dhi = *phigh;
+ uint64_t dlo = *plow;
+ uint64_t rem, dhighest;
+ int sh;
+
+ if (divisor == 0 || dhi == 0) {
+ *plow = dlo / divisor;
+ *phigh = 0;
+ return dlo % divisor;
+ } else {
+ sh = clz64(divisor);
+
+ if (dhi < divisor) {
+ if (sh != 0) {
+ /* normalize the divisor, shifting the dividend accordingly */
+ divisor <<= sh;
+ dhi = (dhi << sh) | (dlo >> (64 - sh));
+ dlo <<= sh;
+ }
+
+ *phigh = 0;
+ *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
+ } else {
+ if (sh != 0) {
+ /* normalize the divisor, shifting the dividend accordingly */
+ divisor <<= sh;
+ dhighest = dhi >> (64 - sh);
+ dhi = (dhi << sh) | (dlo >> (64 - sh));
+ dlo <<= sh;
+
+ *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
+ } else {
+ /**
+ * dhi >= divisor
+ * Since the MSB of divisor is set (sh == 0),
+ * (dhi - divisor) < divisor
+ *
+ * Thus, the high part of the quotient is 1, and we can
+ * calculate the low part with a single call to udiv_qrnnd
+ * after subtracting divisor from dhi
+ */
+ dhi -= divisor;
+ *phigh = 1;
+ }
+
+ *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
+ }
+
+ /*
+ * since the dividend/divisor might have been normalized,
+ * the remainder might also have to be shifted back
+ */
+ return rem >> sh;
+ }
+}
+
+/*
+ * Signed 128-by-64 division.
+ * Returns the quotient via plow and phigh.
+ * Returns the remainder via the function return value.
+ */
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
+{
+ bool neg_quotient = false, neg_remainder = false;
+ uint64_t unsig_hi = *phigh, unsig_lo = *plow;
+ uint64_t rem;
+
+ if (*phigh < 0) {
+ neg_quotient = !neg_quotient;
+ neg_remainder = !neg_remainder;
+
+ if (unsig_lo == 0) {
+ unsig_hi = -unsig_hi;
+ } else {
+ unsig_hi = ~unsig_hi;
+ unsig_lo = -unsig_lo;
+ }
+ }
+
+ if (divisor < 0) {
+ neg_quotient = !neg_quotient;
+
+ divisor = -divisor;
+ }
+
+ rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
+
+ if (neg_quotient) {
+ if (unsig_lo == 0) {
+ *phigh = -unsig_hi;
+ *plow = 0;
+ } else {
+ *phigh = ~unsig_hi;
+ *plow = -unsig_lo;
+ }
+ } else {
+ *phigh = unsig_hi;
+ *plow = unsig_lo;
+ }
+
+ if (neg_remainder) {
+ return -rem;
+ } else {
+ return rem;
+ }
+}
+#endif
+
+/**
+ * urshift - 128-bit Unsigned Right Shift.
+ * @plow: in/out - lower 64-bit integer.
+ * @phigh: in/out - higher 64-bit integer.
+ * @shift: in - bits to shift, between 0 and 127.
+ *
+ * Result is zero-extended and stored in plow/phigh, which are
+ * input/output variables. Shift values outside the range are
+ * reduced modulo 128; the caller is responsible for validating
+ * the shift range and the plow/phigh pointers.
+ */
+void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift)
+{
+ shift &= 127;
+ if (shift == 0) {
+ return;
+ }
+
+ uint64_t h = *phigh >> (shift & 63);
+ if (shift >= 64) {
+ *plow = h;
+ *phigh = 0;
+ } else {
+ *plow = (*plow >> (shift & 63)) | (*phigh << (64 - (shift & 63)));
+ *phigh = h;
+ }
+}
+
+/**
+ * ulshift - 128-bit Unsigned Left Shift.
+ * @plow: in/out - lower 64-bit integer.
+ * @phigh: in/out - higher 64-bit integer.
+ * @shift: in - bits to shift, between 0 and 127.
+ * @overflow: out - true if any 1-bit is shifted out.
+ *
+ * Result is zero-extended and stored in plow/phigh, which are
+ * input/output variables. Shift values outside the range are
+ * reduced modulo 128; the caller is responsible for validating
+ * the shift range and the plow/phigh pointers.
+ */
+void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow)
+{
+ uint64_t low = *plow;
+ uint64_t high = *phigh;
+
+ shift &= 127;
+ if (shift == 0) {
+ return;
+ }
+
+ /* check if any bit will be shifted out */
+ urshift(&low, &high, 128 - shift);
+ if (low | high) {
+ *overflow = true;
+ }
+
+ if (shift >= 64) {
+ *phigh = *plow << (shift & 63);
+ *plow = 0;
+ } else {
+ *phigh = (*plow >> (64 - (shift & 63))) | (*phigh << (shift & 63));
+ *plow = *plow << shift;
+ }
+}
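+
+/* Illustrative sketch (not built): a 128-bit left shift and the matching
+ * right shift restore the original value when nothing overflows.
+ */
+#if 0
+static void example_shift128(void)
+{
+    uint64_t lo = 0x123456789abcdef0ULL, hi = 0;
+    bool overflow = false;
+
+    ulshift(&lo, &hi, 4, &overflow); /* hi:lo == 0x1:0x23456789abcdef00 */
+    assert(!overflow);
+    urshift(&lo, &hi, 4);
+    assert(lo == 0x123456789abcdef0ULL && hi == 0);
+}
+#endif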
diff --git a/util/id.c b/util/id.c
new file mode 100644
index 000000000..ded41c502
--- /dev/null
+++ b/util/id.c
@@ -0,0 +1,69 @@
+/*
+ * Dealing with identifiers
+ *
+ * Copyright (C) 2014 Red Hat, Inc.
+ *
+ * Authors:
+ * Markus Armbruster <armbru@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1
+ * or later. See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/ctype.h"
+#include "qemu/id.h"
+
+bool id_wellformed(const char *id)
+{
+ int i;
+
+ if (!qemu_isalpha(id[0])) {
+ return false;
+ }
+ for (i = 1; id[i]; i++) {
+ if (!qemu_isalnum(id[i]) && !strchr("-._", id[i])) {
+ return false;
+ }
+ }
+ return true;
+}
+
+#define ID_SPECIAL_CHAR '#'
+
+static const char *const id_subsys_str[ID_MAX] = {
+ [ID_QDEV] = "qdev",
+ [ID_BLOCK] = "block",
+ [ID_CHR] = "chr",
+ [ID_NET] = "net",
+};
+
+/*
+ * Generates an ID of the form PREFIX SUBSYSTEM NUMBER
+ * where:
+ *
+ * - PREFIX is the reserved character '#'
+ * - SUBSYSTEM identifies the subsystem creating the ID
+ * - NUMBER is a monotonically increasing per-SUBSYSTEM counter,
+ *   followed by two random decimal digits.
+ *
+ * Example: "#block146"
+ *
+ * Note that these IDs do not satisfy id_wellformed().
+ *
+ * The caller is responsible for freeing the returned string with g_free()
+ */
+char *id_generate(IdSubSystems id)
+{
+ static uint64_t id_counters[ID_MAX];
+ uint32_t rnd;
+
+ assert(id < ARRAY_SIZE(id_subsys_str));
+ assert(id_subsys_str[id]);
+
+ rnd = g_random_int_range(0, 100);
+
+ return g_strdup_printf("%c%s%" PRIu64 "%02" PRId32, ID_SPECIAL_CHAR,
+ id_subsys_str[id],
+ id_counters[id]++,
+ rnd);
+}
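+
+/* Illustrative sketch (not built): generating and releasing an internal
+ * ID such as "#block146" (counter 1, random suffix 46).
+ */
+#if 0
+static void example_id(void)
+{
+    char *id = id_generate(ID_BLOCK);
+
+    /* ... use id ... */
+    g_free(id);
+}
+#endif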
diff --git a/util/iov.c b/util/iov.c
new file mode 100644
index 000000000..58c7b3eee
--- /dev/null
+++ b/util/iov.c
@@ -0,0 +1,764 @@
+/*
+ * Helpers for getting linearized buffers from iov / filling buffers into iovs
+ *
+ * Copyright IBM, Corp. 2007, 2008
+ * Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Author(s):
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Amit Shah <amit.shah@redhat.com>
+ * Michael Tokarev <mjt@tls.msk.ru>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/iov.h"
+#include "qemu/sockets.h"
+#include "qemu/cutils.h"
+
+size_t iov_from_buf_full(const struct iovec *iov, unsigned int iov_cnt,
+ size_t offset, const void *buf, size_t bytes)
+{
+ size_t done;
+ unsigned int i;
+ for (i = 0, done = 0; (offset || done < bytes) && i < iov_cnt; i++) {
+ if (offset < iov[i].iov_len) {
+ size_t len = MIN(iov[i].iov_len - offset, bytes - done);
+ memcpy(iov[i].iov_base + offset, buf + done, len);
+ done += len;
+ offset = 0;
+ } else {
+ offset -= iov[i].iov_len;
+ }
+ }
+ assert(offset == 0);
+ return done;
+}
+
+size_t iov_to_buf_full(const struct iovec *iov, const unsigned int iov_cnt,
+ size_t offset, void *buf, size_t bytes)
+{
+ size_t done;
+ unsigned int i;
+ for (i = 0, done = 0; (offset || done < bytes) && i < iov_cnt; i++) {
+ if (offset < iov[i].iov_len) {
+ size_t len = MIN(iov[i].iov_len - offset, bytes - done);
+ memcpy(buf + done, iov[i].iov_base + offset, len);
+ done += len;
+ offset = 0;
+ } else {
+ offset -= iov[i].iov_len;
+ }
+ }
+ assert(offset == 0);
+ return done;
+}
+
+size_t iov_memset(const struct iovec *iov, const unsigned int iov_cnt,
+ size_t offset, int fillc, size_t bytes)
+{
+ size_t done;
+ unsigned int i;
+ for (i = 0, done = 0; (offset || done < bytes) && i < iov_cnt; i++) {
+ if (offset < iov[i].iov_len) {
+ size_t len = MIN(iov[i].iov_len - offset, bytes - done);
+ memset(iov[i].iov_base + offset, fillc, len);
+ done += len;
+ offset = 0;
+ } else {
+ offset -= iov[i].iov_len;
+ }
+ }
+ assert(offset == 0);
+ return done;
+}
+
+size_t iov_size(const struct iovec *iov, const unsigned int iov_cnt)
+{
+ size_t len;
+ unsigned int i;
+
+ len = 0;
+ for (i = 0; i < iov_cnt; i++) {
+ len += iov[i].iov_len;
+ }
+ return len;
+}
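+
+/* Illustrative sketch (not built): linearizing a two-element vector with
+ * iov_to_buf(), the qemu/iov.h helper wrapping iov_to_buf_full() above.
+ */
+#if 0
+static void example_iov(void)
+{
+    char a[] = "abcd", b[] = "efgh", flat[8];
+    struct iovec v[2] = {
+        { .iov_base = a, .iov_len = 4 },
+        { .iov_base = b, .iov_len = 4 },
+    };
+
+    assert(iov_size(v, 2) == 8);
+    iov_to_buf(v, 2, 0, flat, sizeof(flat)); /* flat holds "abcdefgh" */
+}
+#endif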
+
+/* helper function for iov_send_recv() */
+static ssize_t
+do_send_recv(int sockfd, struct iovec *iov, unsigned iov_cnt, bool do_send)
+{
+#ifdef CONFIG_POSIX
+ ssize_t ret;
+ struct msghdr msg;
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_iov = iov;
+ msg.msg_iovlen = iov_cnt;
+ do {
+ ret = do_send
+ ? sendmsg(sockfd, &msg, 0)
+ : recvmsg(sockfd, &msg, 0);
+ } while (ret < 0 && errno == EINTR);
+ return ret;
+#else
+ /* else send piece-by-piece */
+ /* XXX Note: Windows has WSASend() and WSARecv() */
+ unsigned i = 0;
+ ssize_t ret = 0;
+ while (i < iov_cnt) {
+ ssize_t r = do_send
+ ? send(sockfd, iov[i].iov_base, iov[i].iov_len, 0)
+ : recv(sockfd, iov[i].iov_base, iov[i].iov_len, 0);
+ if (r > 0) {
+ ret += r;
+ } else if (!r) {
+ break;
+ } else if (errno == EINTR) {
+ continue;
+ } else {
+ /* else it is some "other" error,
+ * only return if there was no data processed. */
+ if (ret == 0) {
+ ret = -1;
+ }
+ break;
+ }
+ i++;
+ }
+ return ret;
+#endif
+}
+
+ssize_t iov_send_recv(int sockfd, const struct iovec *_iov, unsigned iov_cnt,
+ size_t offset, size_t bytes,
+ bool do_send)
+{
+ ssize_t total = 0;
+ ssize_t ret;
+ size_t orig_len, tail;
+ unsigned niov;
+ struct iovec *local_iov, *iov;
+
+ if (bytes == 0) {
+ return 0;
+ }
+
+ local_iov = g_new0(struct iovec, iov_cnt);
+ iov_copy(local_iov, iov_cnt, _iov, iov_cnt, offset, bytes);
+ offset = 0;
+ iov = local_iov;
+
+ while (bytes > 0) {
+ /* Find the start position, skipping `offset' bytes:
+ * first, skip all full-sized vector elements, */
+ for (niov = 0; niov < iov_cnt && offset >= iov[niov].iov_len; ++niov) {
+ offset -= iov[niov].iov_len;
+ }
+
+ /* niov == iov_cnt would only be valid if bytes == 0, which
+ * we already ruled out in the loop condition. */
+ assert(niov < iov_cnt);
+ iov += niov;
+ iov_cnt -= niov;
+
+ if (offset) {
+ /* second, skip `offset' bytes from the (now) first element,
+ * undo it on exit */
+ iov[0].iov_base += offset;
+ iov[0].iov_len -= offset;
+ }
+ /* Find the end position skipping `bytes' bytes: */
+ /* first, skip all full-sized elements */
+ tail = bytes;
+ for (niov = 0; niov < iov_cnt && iov[niov].iov_len <= tail; ++niov) {
+ tail -= iov[niov].iov_len;
+ }
+ if (tail) {
+ /* second, fixup the last element, and remember the original
+ * length */
+ assert(niov < iov_cnt);
+ assert(iov[niov].iov_len > tail);
+ orig_len = iov[niov].iov_len;
+ iov[niov++].iov_len = tail;
+ ret = do_send_recv(sockfd, iov, niov, do_send);
+ /* Undo the changes above before checking for errors */
+ iov[niov-1].iov_len = orig_len;
+ } else {
+ ret = do_send_recv(sockfd, iov, niov, do_send);
+ }
+ if (offset) {
+ iov[0].iov_base -= offset;
+ iov[0].iov_len += offset;
+ }
+
+ if (ret < 0) {
+ assert(errno != EINTR);
+ g_free(local_iov);
+ if (errno == EAGAIN && total > 0) {
+ return total;
+ }
+ return -1;
+ }
+
+ if (ret == 0 && !do_send) {
+ /* recv returns 0 when the peer has performed an orderly
+ * shutdown. */
+ break;
+ }
+
+ /* Prepare for the next iteration */
+ offset += ret;
+ total += ret;
+ bytes -= ret;
+ }
+
+ g_free(local_iov);
+ return total;
+}
+
+
+void iov_hexdump(const struct iovec *iov, const unsigned int iov_cnt,
+ FILE *fp, const char *prefix, size_t limit)
+{
+ int v;
+ size_t size = 0;
+ char *buf;
+
+ for (v = 0; v < iov_cnt; v++) {
+ size += iov[v].iov_len;
+ }
+ size = size > limit ? limit : size;
+ buf = g_malloc(size);
+ iov_to_buf(iov, iov_cnt, 0, buf, size);
+ qemu_hexdump(fp, prefix, buf, size);
+ g_free(buf);
+}
+
+unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt,
+ const struct iovec *iov, unsigned int iov_cnt,
+ size_t offset, size_t bytes)
+{
+ size_t len;
+ unsigned int i, j;
+ for (i = 0, j = 0;
+ i < iov_cnt && j < dst_iov_cnt && (offset || bytes); i++) {
+ if (offset >= iov[i].iov_len) {
+ offset -= iov[i].iov_len;
+ continue;
+ }
+ len = MIN(bytes, iov[i].iov_len - offset);
+
+ dst_iov[j].iov_base = iov[i].iov_base + offset;
+ dst_iov[j].iov_len = len;
+ j++;
+ bytes -= len;
+ offset = 0;
+ }
+ assert(offset == 0);
+ return j;
+}
+
+/* io vectors */
+
+void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint)
+{
+ qiov->iov = g_new(struct iovec, alloc_hint);
+ qiov->niov = 0;
+ qiov->nalloc = alloc_hint;
+ qiov->size = 0;
+}
+
+void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov)
+{
+ int i;
+
+ qiov->iov = iov;
+ qiov->niov = niov;
+ qiov->nalloc = -1;
+ qiov->size = 0;
+ for (i = 0; i < niov; i++)
+ qiov->size += iov[i].iov_len;
+}
+
+void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len)
+{
+ assert(qiov->nalloc != -1);
+
+ if (qiov->niov == qiov->nalloc) {
+ qiov->nalloc = 2 * qiov->nalloc + 1;
+ qiov->iov = g_renew(struct iovec, qiov->iov, qiov->nalloc);
+ }
+ qiov->iov[qiov->niov].iov_base = base;
+ qiov->iov[qiov->niov].iov_len = len;
+ qiov->size += len;
+ ++qiov->niov;
+}
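+
+/* Illustrative sketch (not built): building a QEMUIOVector from two
+ * scattered buffers; buf1/len1 and buf2/len2 stand in for caller-owned
+ * data.
+ */
+#if 0
+static void example_qiov(void *buf1, size_t len1, void *buf2, size_t len2)
+{
+    QEMUIOVector qiov;
+
+    qemu_iovec_init(&qiov, 2);
+    qemu_iovec_add(&qiov, buf1, len1);
+    qemu_iovec_add(&qiov, buf2, len2);
+    /* ... submit qiov for I/O ... */
+    qemu_iovec_destroy(&qiov);
+}
+#endif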
+
+/*
+ * Concatenates (partial) iovecs from src_iov to the end of dst.
+ * It starts copying after skipping `soffset' bytes at the
+ * beginning of src_iov and adds individual vectors from src_iov
+ * to dst, copying up to `sbytes' bytes total, or up to the end
+ * of src_iov if that comes first. This way, it is okay to specify
+ * a very large value for `sbytes' to indicate "up to the end
+ * of src_iov".
+ * Only vector pointers are processed, not the actual data buffers.
+ */
+size_t qemu_iovec_concat_iov(QEMUIOVector *dst,
+ struct iovec *src_iov, unsigned int src_cnt,
+ size_t soffset, size_t sbytes)
+{
+ int i;
+ size_t done;
+
+ if (!sbytes) {
+ return 0;
+ }
+ assert(dst->nalloc != -1);
+ for (i = 0, done = 0; done < sbytes && i < src_cnt; i++) {
+ if (soffset < src_iov[i].iov_len) {
+ size_t len = MIN(src_iov[i].iov_len - soffset, sbytes - done);
+ qemu_iovec_add(dst, src_iov[i].iov_base + soffset, len);
+ done += len;
+ soffset = 0;
+ } else {
+ soffset -= src_iov[i].iov_len;
+ }
+ }
+ assert(soffset == 0); /* offset beyond end of src */
+
+ return done;
+}
+
+/*
+ * Concatenates (partial) iovecs from src to the end of dst.
+ * It starts copying after skipping `soffset' bytes at the
+ * beginning of src and adds individual vectors from src to
+ * dst, copying up to `sbytes' bytes total, or up to the end
+ * of src if that comes first. This way, it is okay to specify
+ * a very large value for `sbytes' to indicate "up to the end
+ * of src".
+ * Only vector pointers are processed, not the actual data buffers.
+ */
+void qemu_iovec_concat(QEMUIOVector *dst,
+ QEMUIOVector *src, size_t soffset, size_t sbytes)
+{
+ qemu_iovec_concat_iov(dst, src->iov, src->niov, soffset, sbytes);
+}
+
+/*
+ * iov_skip_offset
+ *
+ * Return a pointer to the iovec element that contains the byte at @offset
+ * in the original vector @iov.
+ * Set *@remaining_offset to the offset of that same byte within the
+ * returned iovec.
+ */
+static struct iovec *iov_skip_offset(struct iovec *iov, size_t offset,
+ size_t *remaining_offset)
+{
+ while (offset > 0 && offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ iov++;
+ }
+ *remaining_offset = offset;
+
+ return iov;
+}
+
+/*
+ * qiov_slice
+ *
+ * Find the subarray of iovecs containing the requested range. *@head is
+ * set to the offset into the first returned iovec, and *@tail to the
+ * count of extra bytes in the last one (returned iov + *@niov - 1).
+ */
+static struct iovec *qiov_slice(QEMUIOVector *qiov,
+ size_t offset, size_t len,
+ size_t *head, size_t *tail, int *niov)
+{
+ struct iovec *iov, *end_iov;
+
+ assert(offset + len <= qiov->size);
+
+ iov = iov_skip_offset(qiov->iov, offset, head);
+ end_iov = iov_skip_offset(iov, *head + len, tail);
+
+ if (*tail > 0) {
+ assert(*tail < end_iov->iov_len);
+ *tail = end_iov->iov_len - *tail;
+ end_iov++;
+ }
+
+ *niov = end_iov - iov;
+
+ return iov;
+}
+
+int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len)
+{
+ size_t head, tail;
+ int niov;
+
+ qiov_slice(qiov, offset, len, &head, &tail, &niov);
+
+ return niov;
+}
+
+/*
+ * Compile new iovec, combining @head_buf buffer, sub-qiov of @mid_qiov,
+ * and @tail_buf buffer into new qiov.
+ */
+int qemu_iovec_init_extended(
+ QEMUIOVector *qiov,
+ void *head_buf, size_t head_len,
+ QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len,
+ void *tail_buf, size_t tail_len)
+{
+ size_t mid_head, mid_tail;
+ int total_niov, mid_niov = 0;
+ struct iovec *p, *mid_iov = NULL;
+
+ assert(mid_qiov->niov <= IOV_MAX);
+
+ if (SIZE_MAX - head_len < mid_len ||
+ SIZE_MAX - head_len - mid_len < tail_len)
+ {
+ return -EINVAL;
+ }
+
+ if (mid_len) {
+ mid_iov = qiov_slice(mid_qiov, mid_offset, mid_len,
+ &mid_head, &mid_tail, &mid_niov);
+ }
+
+ total_niov = !!head_len + mid_niov + !!tail_len;
+ if (total_niov > IOV_MAX) {
+ return -EINVAL;
+ }
+
+ if (total_niov == 1) {
+ qemu_iovec_init_buf(qiov, NULL, 0);
+ p = &qiov->local_iov;
+ } else {
+ qiov->niov = qiov->nalloc = total_niov;
+ qiov->size = head_len + mid_len + tail_len;
+ p = qiov->iov = g_new(struct iovec, qiov->niov);
+ }
+
+ if (head_len) {
+ p->iov_base = head_buf;
+ p->iov_len = head_len;
+ p++;
+ }
+
+ assert(!mid_niov == !mid_len);
+ if (mid_niov) {
+ memcpy(p, mid_iov, mid_niov * sizeof(*p));
+ p[0].iov_base = (uint8_t *)p[0].iov_base + mid_head;
+ p[0].iov_len -= mid_head;
+ p[mid_niov - 1].iov_len -= mid_tail;
+ p += mid_niov;
+ }
+
+ if (tail_len) {
+ p->iov_base = tail_buf;
+ p->iov_len = tail_len;
+ }
+
+ return 0;
+}
+
+/*
+ * Check if the contents of subrange of qiov data is all zeroes.
+ */
+bool qemu_iovec_is_zero(QEMUIOVector *qiov, size_t offset, size_t bytes)
+{
+ struct iovec *iov;
+ size_t current_offset;
+
+ assert(offset + bytes <= qiov->size);
+
+ iov = iov_skip_offset(qiov->iov, offset, &current_offset);
+
+ while (bytes) {
+ uint8_t *base = (uint8_t *)iov->iov_base + current_offset;
+ size_t len = MIN(iov->iov_len - current_offset, bytes);
+
+ if (!buffer_is_zero(base, len)) {
+ return false;
+ }
+
+ current_offset = 0;
+ bytes -= len;
+ iov++;
+ }
+
+ return true;
+}
+
+void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
+ size_t offset, size_t len)
+{
+ int ret;
+
+ assert(source->size >= len);
+ assert(source->size - len >= offset);
+
+ /* We shrink the request, so we can overflow neither size_t nor IOV_MAX */
+ ret = qemu_iovec_init_extended(qiov, NULL, 0, source, offset, len, NULL, 0);
+ assert(ret == 0);
+}
+
+void qemu_iovec_destroy(QEMUIOVector *qiov)
+{
+ if (qiov->nalloc != -1) {
+ g_free(qiov->iov);
+ }
+
+ memset(qiov, 0, sizeof(*qiov));
+}
+
+void qemu_iovec_reset(QEMUIOVector *qiov)
+{
+ assert(qiov->nalloc != -1);
+
+ qiov->niov = 0;
+ qiov->size = 0;
+}
+
+size_t qemu_iovec_to_buf(QEMUIOVector *qiov, size_t offset,
+ void *buf, size_t bytes)
+{
+ return iov_to_buf(qiov->iov, qiov->niov, offset, buf, bytes);
+}
+
+size_t qemu_iovec_from_buf(QEMUIOVector *qiov, size_t offset,
+ const void *buf, size_t bytes)
+{
+ return iov_from_buf(qiov->iov, qiov->niov, offset, buf, bytes);
+}
+
+size_t qemu_iovec_memset(QEMUIOVector *qiov, size_t offset,
+ int fillc, size_t bytes)
+{
+ return iov_memset(qiov->iov, qiov->niov, offset, fillc, bytes);
+}
+
+/**
+ * Check that I/O vector contents are identical
+ *
+ * The IO vectors must have the same structure (same length of all parts).
+ * A typical usage is to compare vectors created with qemu_iovec_clone().
+ *
+ * @a: I/O vector
+ * @b: I/O vector
+ * @ret: Offset of the first mismatching byte, or -1 if the vectors match
+ */
+ssize_t qemu_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
+{
+ int i;
+ ssize_t offset = 0;
+
+ assert(a->niov == b->niov);
+ for (i = 0; i < a->niov; i++) {
+ size_t len = 0;
+ uint8_t *p = (uint8_t *)a->iov[i].iov_base;
+ uint8_t *q = (uint8_t *)b->iov[i].iov_base;
+
+ assert(a->iov[i].iov_len == b->iov[i].iov_len);
+ while (len < a->iov[i].iov_len && *p++ == *q++) {
+ len++;
+ }
+
+ offset += len;
+
+ if (len != a->iov[i].iov_len) {
+ return offset;
+ }
+ }
+ return -1;
+}
+
+typedef struct {
+ int src_index;
+ struct iovec *src_iov;
+ void *dest_base;
+} IOVectorSortElem;
+
+static int sortelem_cmp_src_base(const void *a, const void *b)
+{
+ const IOVectorSortElem *elem_a = a;
+ const IOVectorSortElem *elem_b = b;
+
+ /* Compare rather than subtract, to avoid pointer-difference overflow */
+ if (elem_a->src_iov->iov_base < elem_b->src_iov->iov_base) {
+ return -1;
+ } else if (elem_a->src_iov->iov_base > elem_b->src_iov->iov_base) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+static int sortelem_cmp_src_index(const void *a, const void *b)
+{
+ const IOVectorSortElem *elem_a = a;
+ const IOVectorSortElem *elem_b = b;
+
+ return elem_a->src_index - elem_b->src_index;
+}
+
+/**
+ * Copy contents of I/O vector
+ *
+ * The relative relationships of overlapping iovecs are preserved. This is
+ * necessary to ensure identical semantics in the cloned I/O vector.
+ */
+void qemu_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src, void *buf)
+{
+ IOVectorSortElem sortelems[src->niov];
+ void *last_end;
+ int i;
+
+ /* Sort source iovecs by base address */
+ for (i = 0; i < src->niov; i++) {
+ sortelems[i].src_index = i;
+ sortelems[i].src_iov = &src->iov[i];
+ }
+ qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_base);
+
+ /* Allocate buffer space taking into account overlapping iovecs */
+ last_end = NULL;
+ for (i = 0; i < src->niov; i++) {
+ struct iovec *cur = sortelems[i].src_iov;
+ ptrdiff_t rewind = 0;
+
+ /* Detect overlap */
+ if (last_end && last_end > cur->iov_base) {
+ rewind = last_end - cur->iov_base;
+ }
+
+ sortelems[i].dest_base = buf - rewind;
+ buf += cur->iov_len - MIN(rewind, cur->iov_len);
+ last_end = MAX(cur->iov_base + cur->iov_len, last_end);
+ }
+
+ /* Sort by source iovec index and build destination iovec */
+ qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_index);
+ for (i = 0; i < src->niov; i++) {
+ qemu_iovec_add(dest, sortelems[i].dest_base, src->iov[i].iov_len);
+ }
+}
+
+void iov_discard_undo(IOVDiscardUndo *undo)
+{
+ /* Restore original iovec if it was modified */
+ if (undo->modified_iov) {
+ *undo->modified_iov = undo->orig;
+ }
+}
+
+size_t iov_discard_front_undoable(struct iovec **iov,
+ unsigned int *iov_cnt,
+ size_t bytes,
+ IOVDiscardUndo *undo)
+{
+ size_t total = 0;
+ struct iovec *cur;
+
+ if (undo) {
+ undo->modified_iov = NULL;
+ }
+
+ for (cur = *iov; *iov_cnt > 0; cur++) {
+ if (cur->iov_len > bytes) {
+ if (undo) {
+ undo->modified_iov = cur;
+ undo->orig = *cur;
+ }
+
+ cur->iov_base += bytes;
+ cur->iov_len -= bytes;
+ total += bytes;
+ break;
+ }
+
+ bytes -= cur->iov_len;
+ total += cur->iov_len;
+ *iov_cnt -= 1;
+ }
+
+ *iov = cur;
+ return total;
+}
+
+size_t iov_discard_front(struct iovec **iov, unsigned int *iov_cnt,
+ size_t bytes)
+{
+ return iov_discard_front_undoable(iov, iov_cnt, bytes, NULL);
+}
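+
+/* Illustrative sketch (not built): temporarily clipping a header off a
+ * vector and undoing it. The caller keeps its own iov/iov_cnt copies;
+ * the undo only restores the iovec element that was modified in place.
+ */
+#if 0
+static void example_discard(struct iovec *iov, unsigned int iov_cnt,
+                            size_t hdr_len)
+{
+    struct iovec *cur = iov;
+    unsigned int cnt = iov_cnt;
+    IOVDiscardUndo undo;
+
+    iov_discard_front_undoable(&cur, &cnt, hdr_len, &undo);
+    /* ... read the payload from cur[0 .. cnt) ... */
+    iov_discard_undo(&undo);
+}
+#endif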
+
+size_t iov_discard_back_undoable(struct iovec *iov,
+ unsigned int *iov_cnt,
+ size_t bytes,
+ IOVDiscardUndo *undo)
+{
+ size_t total = 0;
+ struct iovec *cur;
+
+ if (undo) {
+ undo->modified_iov = NULL;
+ }
+
+ if (*iov_cnt == 0) {
+ return 0;
+ }
+
+ cur = iov + (*iov_cnt - 1);
+
+ while (*iov_cnt > 0) {
+ if (cur->iov_len > bytes) {
+ if (undo) {
+ undo->modified_iov = cur;
+ undo->orig = *cur;
+ }
+
+ cur->iov_len -= bytes;
+ total += bytes;
+ break;
+ }
+
+ bytes -= cur->iov_len;
+ total += cur->iov_len;
+ cur--;
+ *iov_cnt -= 1;
+ }
+
+ return total;
+}
+
+size_t iov_discard_back(struct iovec *iov, unsigned int *iov_cnt,
+ size_t bytes)
+{
+ return iov_discard_back_undoable(iov, iov_cnt, bytes, NULL);
+}
+
+void qemu_iovec_discard_back(QEMUIOVector *qiov, size_t bytes)
+{
+ size_t total;
+ unsigned int niov = qiov->niov;
+
+ assert(qiov->size >= bytes);
+ total = iov_discard_back(qiov->iov, &niov, bytes);
+ assert(total == bytes);
+
+ qiov->niov = niov;
+ qiov->size -= bytes;
+}
diff --git a/util/iova-tree.c b/util/iova-tree.c
new file mode 100644
index 000000000..23ea35b7a
--- /dev/null
+++ b/util/iova-tree.c
@@ -0,0 +1,114 @@
+/*
+ * IOVA tree implementation based on GTree.
+ *
+ * Copyright 2018 Red Hat, Inc.
+ *
+ * Authors:
+ * Peter Xu <peterx@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/iova-tree.h"
+
+struct IOVATree {
+ GTree *tree;
+};
+
+static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data)
+{
+ const DMAMap *m1 = a, *m2 = b;
+
+ if (m1->iova > m2->iova + m2->size) {
+ return 1;
+ }
+
+ if (m1->iova + m1->size < m2->iova) {
+ return -1;
+ }
+
+ /* Overlapped */
+ return 0;
+}
+
+IOVATree *iova_tree_new(void)
+{
+ IOVATree *iova_tree = g_new0(IOVATree, 1);
+
+ /* The key and value share the same DMAMap allocation (see
+ * iova_tree_insert_internal), so only the key needs a destroy function */
+ iova_tree->tree = g_tree_new_full(iova_tree_compare, NULL, g_free, NULL);
+
+ return iova_tree;
+}
+
+const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map)
+{
+ return g_tree_lookup(tree->tree, map);
+}
+
+const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova)
+{
+ const DMAMap map = { .iova = iova, .size = 0 };
+
+ return iova_tree_find(tree, &map);
+}
+
+static inline void iova_tree_insert_internal(GTree *gtree, DMAMap *range)
+{
+ /* Key and value are sharing the same range data */
+ g_tree_insert(gtree, range, range);
+}
+
+int iova_tree_insert(IOVATree *tree, const DMAMap *map)
+{
+ DMAMap *new;
+
+ if (map->iova + map->size < map->iova || map->perm == IOMMU_NONE) {
+ return IOVA_ERR_INVALID;
+ }
+
+ /* We don't allow inserting a range that overlaps an existing one */
+ if (iova_tree_find(tree, map)) {
+ return IOVA_ERR_OVERLAP;
+ }
+
+ new = g_new0(DMAMap, 1);
+ memcpy(new, map, sizeof(*new));
+ iova_tree_insert_internal(tree->tree, new);
+
+ return IOVA_OK;
+}
+
+static gboolean iova_tree_traverse(gpointer key, gpointer value,
+ gpointer data)
+{
+ iova_tree_iterator iterator = data;
+ DMAMap *map = key;
+
+ g_assert(key == value);
+
+ return iterator(map);
+}
+
+void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator)
+{
+ g_tree_foreach(tree->tree, iova_tree_traverse, iterator);
+}
+
+int iova_tree_remove(IOVATree *tree, const DMAMap *map)
+{
+ const DMAMap *overlap;
+
+ while ((overlap = iova_tree_find(tree, map))) {
+ g_tree_remove(tree->tree, overlap);
+ }
+
+ return IOVA_OK;
+}
+
+void iova_tree_destroy(IOVATree *tree)
+{
+ g_tree_destroy(tree->tree);
+ g_free(tree);
+}
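+
+/* Illustrative sketch (not built): mapping the IOVA range
+ * [0x1000, 0x1fff] (DMAMap.size counts the last byte inclusively here,
+ * see iova_tree_compare) and looking an address up again.
+ */
+#if 0
+static void example_iova_tree(void)
+{
+    IOVATree *tree = iova_tree_new();
+    DMAMap map = { .iova = 0x1000, .size = 0xfff, .perm = IOMMU_RW };
+
+    assert(iova_tree_insert(tree, &map) == IOVA_OK);
+    assert(iova_tree_find_address(tree, 0x1234) != NULL);
+    iova_tree_destroy(tree);
+}
+#endif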
diff --git a/util/keyval.c b/util/keyval.c
new file mode 100644
index 000000000..904337c8a
--- /dev/null
+++ b/util/keyval.c
@@ -0,0 +1,577 @@
+/*
+ * Parsing KEY=VALUE,... strings
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ *
+ * Authors:
+ * Markus Armbruster <armbru@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*
+ * KEY=VALUE,... syntax:
+ *
+ * key-vals = [ key-val { ',' key-val } [ ',' ] ]
+ * key-val = key '=' val | help
+ * key = key-fragment { '.' key-fragment }
+ * key-fragment = / [^=,.]+ /
+ * val = { / [^,]+ / | ',,' }
+ * help = 'help' | '?'
+ *
+ * Semantics defined by reduction to JSON:
+ *
+ * key-vals specifies a JSON object, i.e. a tree whose root is an
+ * object, inner nodes other than the root are objects or arrays,
+ * and leaves are strings.
+ *
+ * Each key-val = key-fragment '.' ... '=' val specifies a path from
+ * root to a leaf (left of '='), and the leaf's value (right of
+ * '=').
+ *
+ * A path from the root is defined recursively:
+ * L '.' key-fragment is a child of the node denoted by path L
+ * key-fragment is a child of the tree root
+ * If key-fragment is numeric, the parent is an array and the child
+ * is its key-fragment-th member, counting from zero.
+ * Else, the parent is an object, and the child is its member named
+ * key-fragment.
+ *
+ * This constrains inner nodes to be either array or object. The
+ * constraints must be satisfiable. Counter-example: a.b=1,a=2 is
+ * not, because root.a must be an object to satisfy a.b=1 and a
+ * string to satisfy a=2.
+ *
+ * Array subscripts can occur in any order, but the set of
+ * subscripts must not have gaps. For instance, a.1=v is not okay,
+ * because root.a[0] is missing.
+ *
+ * If multiple key-val denote the same leaf, the last one determines
+ * the value.
+ *
+ * Key-fragments must be valid QAPI names or consist only of decimal
+ * digits.
+ *
+ * The length of any key-fragment must be between 1 and 127.
+ *
+ * If any key-val is help, the object is to be treated as a help
+ * request.
+ *
+ * Design flaw: there is no way to denote an empty array or non-root
+ * object. While interpreting "key absent" as empty seems natural
+ * (removing a key-val from the input string removes the member when
+ * there are more, so why not when it's the last), it doesn't work:
+ * "key absent" already means "optional object/array absent", which
+ * isn't the same as "empty object/array present".
+ *
+ * Design flaw: scalar values can only be strings; there is no way to
+ * denote numbers, true, false or null. The special QObject input
+ * visitor returned by qobject_input_visitor_new_keyval() mostly hides
+ * this by automatically converting strings to the type the visitor
+ * expects. Breaks down for type 'any', where the visitor's
+ * expectation isn't clear. Code visiting 'any' needs to do the
+ * conversion itself, but only when using this keyval visitor.
+ * Awkward. Note that we carefully restrict alternate types to avoid
+ * similar ambiguity.
+ *
+ * Alternative syntax for use with an implied key:
+ *
+ * key-vals = [ key-val-1st { ',' key-val } [ ',' ] ]
+ * key-val-1st = val-no-key | key-val
+ * val-no-key = / [^=,]+ / - help
+ *
+ * where val-no-key is syntactic sugar for implied-key=val-no-key.
+ *
+ * Note that you can't use the sugared form when the value contains
+ * '=' or ','.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qlist.h"
+#include "qapi/qmp/qstring.h"
+#include "qemu/cutils.h"
+#include "qemu/help_option.h"
+#include "qemu/option.h"
+
+/*
+ * Convert @key to a list index.
+ * Convert all leading decimal digits to a (non-negative) number,
+ * capped at INT_MAX.
+ * If @end is non-null, assign a pointer to the first character after
+ * the number to *@end.
+ * Else, fail if any characters follow.
+ * On success, return the converted number.
+ * On failure, return a negative value.
+ * Note: since only digits are converted, no two keys can map to the
+ * same number, except by overflow to INT_MAX.
+ */
+static int key_to_index(const char *key, const char **end)
+{
+ int ret;
+ unsigned long index;
+
+ if (*key < '0' || *key > '9') {
+ return -EINVAL;
+ }
+ ret = qemu_strtoul(key, end, 10, &index);
+ if (ret) {
+ return ret == -ERANGE ? INT_MAX : ret;
+ }
+ return index <= INT_MAX ? index : INT_MAX;
+}
+
+/*
+ * Ensure @cur maps @key_in_cur the right way.
+ * If @value is null, it needs to map to a QDict, else to this
+ * QString.
+ * If @cur doesn't have @key_in_cur, put an empty QDict or @value,
+ * respectively.
+ * Else, if it needs to map to a QDict, and already does, do nothing.
+ * Else, if it needs to map to this QString, and already maps to a
+ * QString, replace it by @value.
+ * Else, fail because we have conflicting needs on how to map
+ * @key_in_cur.
+ * In any case, take over the reference to @value, i.e. if the caller
+ * wants to hold on to a reference, it needs to qobject_ref().
+ * Use @key up to @key_cursor to identify the key in error messages.
+ * On success, return the mapped value.
+ * On failure, store an error through @errp and return NULL.
+ */
+static QObject *keyval_parse_put(QDict *cur,
+ const char *key_in_cur, QString *value,
+ const char *key, const char *key_cursor,
+ Error **errp)
+{
+ QObject *old, *new;
+
+ old = qdict_get(cur, key_in_cur);
+ if (old) {
+ if (qobject_type(old) != (value ? QTYPE_QSTRING : QTYPE_QDICT)) {
+ error_setg(errp, "Parameters '%.*s.*' used inconsistently",
+ (int)(key_cursor - key), key);
+ qobject_unref(value);
+ return NULL;
+ }
+ if (!value) {
+ return old; /* already QDict, do nothing */
+ }
+ new = QOBJECT(value); /* replacement */
+ } else {
+ new = value ? QOBJECT(value) : QOBJECT(qdict_new());
+ }
+ qdict_put_obj(cur, key_in_cur, new);
+ return new;
+}
+
+/*
+ * Parse one parameter from @params.
+ *
+ * If we're looking at KEY=VALUE, store result in @qdict.
+ * The first fragment of KEY applies to @qdict. Subsequent fragments
+ * apply to nested QDicts, which are created on demand. @implied_key
+ * is as in keyval_parse().
+ *
+ * If we're looking at "help" or "?", set *help to true.
+ *
+ * On success, return a pointer to the next parameter, or else to '\0'.
+ * On failure, return NULL.
+ */
+static const char *keyval_parse_one(QDict *qdict, const char *params,
+ const char *implied_key, bool *help,
+ Error **errp)
+{
+ const char *key, *key_end, *val_end, *s, *end;
+ size_t len;
+ char key_in_cur[128];
+ QDict *cur;
+ int ret;
+ QObject *next;
+ GString *val;
+
+ key = params;
+ val_end = NULL;
+ len = strcspn(params, "=,");
+ if (len && key[len] != '=') {
+ if (starts_with_help_option(key) == len) {
+ *help = true;
+ s = key + len;
+ if (*s == ',') {
+ s++;
+ }
+ return s;
+ }
+ if (implied_key) {
+ /* Desugar implied key */
+ key = implied_key;
+ val_end = params + len;
+ len = strlen(implied_key);
+ }
+ }
+ key_end = key + len;
+
+ /*
+ * Loop over key fragments: @s points to current fragment, it
+ * applies to @cur. @key_in_cur[] holds the previous fragment.
+ */
+ cur = qdict;
+ s = key;
+ for (;;) {
+ /* Want a key index (unless it's first) or a QAPI name */
+ if (s != key && key_to_index(s, &end) >= 0) {
+ len = end - s;
+ } else {
+ ret = parse_qapi_name(s, false);
+ len = ret < 0 ? 0 : ret;
+ }
+ assert(s + len <= key_end);
+ if (!len || (s + len < key_end && s[len] != '.')) {
+ assert(key != implied_key);
+ error_setg(errp, "Invalid parameter '%.*s'",
+ (int)(key_end - key), key);
+ return NULL;
+ }
+ if (len >= sizeof(key_in_cur)) {
+ assert(key != implied_key);
+ error_setg(errp, "Parameter%s '%.*s' is too long",
+ s != key || s + len != key_end ? " fragment" : "",
+ (int)len, s);
+ return NULL;
+ }
+
+ if (s != key) {
+ next = keyval_parse_put(cur, key_in_cur, NULL,
+ key, s - 1, errp);
+ if (!next) {
+ return NULL;
+ }
+ cur = qobject_to(QDict, next);
+ assert(cur);
+ }
+
+ memcpy(key_in_cur, s, len);
+ key_in_cur[len] = 0;
+ s += len;
+
+ if (*s != '.') {
+ break;
+ }
+ s++;
+ }
+
+ if (key == implied_key) {
+ assert(!*s);
+ val = g_string_new_len(params, val_end - params);
+ s = val_end;
+ if (*s == ',') {
+ s++;
+ }
+ } else {
+ if (*s != '=') {
+ error_setg(errp, "Expected '=' after parameter '%.*s'",
+ (int)(s - key), key);
+ return NULL;
+ }
+ s++;
+
+ val = g_string_new(NULL);
+ for (;;) {
+ if (!*s) {
+ break;
+ } else if (*s == ',') {
+ s++;
+ if (*s != ',') {
+ break;
+ }
+ }
+ g_string_append_c(val, *s++);
+ }
+ }
+
+ if (!keyval_parse_put(cur, key_in_cur, qstring_from_gstring(val),
+ key, key_end, errp)) {
+ return NULL;
+ }
+ return s;
+}
+
+static char *reassemble_key(GSList *key)
+{
+ GString *s = g_string_new("");
+ GSList *p;
+
+ for (p = key; p; p = p->next) {
+ g_string_prepend_c(s, '.');
+ g_string_prepend(s, (char *)p->data);
+ }
+
+ return g_string_free(s, FALSE);
+}
+
+/*
+ * Recursive worker for keyval_merge.
+ *
+ * @str is the path that led to the current dictionary (to be used for
+ * error messages). It is modified internally but restored before the
+ * function returns.
+ */
+static void keyval_do_merge(QDict *dest, const QDict *merged, GString *str, Error **errp)
+{
+ size_t save_len = str->len;
+ const QDictEntry *ent;
+ QObject *old_value;
+
+ for (ent = qdict_first(merged); ent; ent = qdict_next(merged, ent)) {
+ old_value = qdict_get(dest, ent->key);
+ if (old_value) {
+ if (qobject_type(old_value) != qobject_type(ent->value)) {
+ error_setg(errp, "Parameter '%s%s' used inconsistently",
+ str->str, ent->key);
+ return;
+ } else if (qobject_type(ent->value) == QTYPE_QDICT) {
+ /* Merge sub-dictionaries. */
+ g_string_append(str, ent->key);
+ g_string_append_c(str, '.');
+ keyval_do_merge(qobject_to(QDict, old_value),
+ qobject_to(QDict, ent->value),
+ str, errp);
+ g_string_truncate(str, save_len);
+ continue;
+ } else if (qobject_type(ent->value) == QTYPE_QLIST) {
+ /* Append to old list. */
+ QList *old = qobject_to(QList, old_value);
+ QList *new = qobject_to(QList, ent->value);
+ const QListEntry *item;
+ QLIST_FOREACH_ENTRY(new, item) {
+ qobject_ref(item->value);
+ qlist_append_obj(old, item->value);
+ }
+ continue;
+ } else {
+ assert(qobject_type(ent->value) == QTYPE_QSTRING);
+ }
+ }
+
+ qobject_ref(ent->value);
+ qdict_put_obj(dest, ent->key, ent->value);
+ }
+}
+
+/* Merge the @merged dictionary into @dest.
+ *
+ * The dictionaries are expected to be returned by the keyval parser, and
+ * therefore the only expected scalar type is the string. In case the same
+ * path is present in both @dest and @merged, the semantics are as follows:
+ *
+ * - lists are concatenated
+ *
+ * - dictionaries are merged recursively
+ *
+ * - for scalar values, @merged wins
+ *
+ * In case an error is reported, @dest may already have been modified.
+ *
+ * This function can be used to implement semantics analogous to QemuOpts's
+ * .merge_lists = true case, or to implement -set for options backed by QDicts.
+ *
+ * Note: while QemuOpts is commonly used so that repeated keys overwrite
+ * ("last one wins"), it can also be used so that repeated keys build up
+ * a list. keyval_merge() can only be used when the options' semantics are
+ * the former, not the latter.
+ */
+void keyval_merge(QDict *dest, const QDict *merged, Error **errp)
+{
+ GString *str;
+
+ str = g_string_new("");
+ keyval_do_merge(dest, merged, str, errp);
+ g_string_free(str, TRUE);
+}
+
+/*
+ * Listify @cur recursively.
+ * Replace QDicts whose keys are all valid list indexes by QLists.
+ * @key_of_cur is the list of key fragments leading up to @cur.
+ * On success, return either @cur or its replacement.
+ * On failure, store an error through @errp and return NULL.
+ */
+static QObject *keyval_listify(QDict *cur, GSList *key_of_cur, Error **errp)
+{
+ GSList key_node;
+ bool has_index, has_member;
+ const QDictEntry *ent;
+ QDict *qdict;
+ QObject *val;
+ char *key;
+ size_t nelt;
+ QObject **elt;
+ int index, max_index, i;
+ QList *list;
+
+ key_node.next = key_of_cur;
+
+ /*
+ * Recursively listify @cur's members, and figure out whether @cur
+ * itself is to be listified.
+ */
+ has_index = false;
+ has_member = false;
+ for (ent = qdict_first(cur); ent; ent = qdict_next(cur, ent)) {
+ if (key_to_index(ent->key, NULL) >= 0) {
+ has_index = true;
+ } else {
+ has_member = true;
+ }
+
+ qdict = qobject_to(QDict, ent->value);
+ if (!qdict) {
+ continue;
+ }
+
+ key_node.data = ent->key;
+ val = keyval_listify(qdict, &key_node, errp);
+ if (!val) {
+ return NULL;
+ }
+ if (val != ent->value) {
+ qdict_put_obj(cur, ent->key, val);
+ }
+ }
+
+ if (has_index && has_member) {
+ key = reassemble_key(key_of_cur);
+ error_setg(errp, "Parameters '%s*' used inconsistently", key);
+ g_free(key);
+ return NULL;
+ }
+ if (!has_index) {
+ return QOBJECT(cur);
+ }
+
+ /* Copy @cur's values to @elt[] */
+ nelt = qdict_size(cur) + 1; /* one extra, for use as sentinel */
+ elt = g_new0(QObject *, nelt);
+ max_index = -1;
+ for (ent = qdict_first(cur); ent; ent = qdict_next(cur, ent)) {
+ index = key_to_index(ent->key, NULL);
+ assert(index >= 0);
+ if (index > max_index) {
+ max_index = index;
+ }
+ /*
+ * We iterate @nelt times. If we get one exceeding @nelt
+ * here, we will put less than @nelt values into @elt[],
+ * triggering the error in the next loop.
+ */
+ if ((size_t)index >= nelt - 1) {
+ continue;
+ }
+ /* Even though dict keys are distinct, indexes need not be */
+ elt[index] = ent->value;
+ }
+
+ /*
+ * Make a list from @elt[], reporting the first missing element,
+ * if any.
+ * If we dropped an index >= nelt in the previous loop, this loop
+ * will run into the sentinel and report index @nelt missing.
+ */
+ list = qlist_new();
+ assert(!elt[nelt-1]); /* need the sentinel to be null */
+ for (i = 0; i < MIN(nelt, max_index + 1); i++) {
+ if (!elt[i]) {
+ key = reassemble_key(key_of_cur);
+ error_setg(errp, "Parameter '%s%d' missing", key, i);
+ g_free(key);
+ g_free(elt);
+ qobject_unref(list);
+ return NULL;
+ }
+ qobject_ref(elt[i]);
+ qlist_append_obj(list, elt[i]);
+ }
+
+ g_free(elt);
+ return QOBJECT(list);
+}
+
+/*
+ * Parse @params in QEMU's traditional KEY=VALUE,... syntax.
+ *
+ * If @implied_key is given, the first KEY= can be omitted; @implied_key
+ * is then assumed, and VALUE can't be empty or contain ',' or '='.
+ *
+ * A parameter "help" or "?" without a value isn't added to the
+ * resulting dictionary, but is instead interpreted as a help request.
+ * All other options are parsed and returned normally so that
+ * context-specific help can be printed.
+ *
+ * If @p_help is not NULL, store whether help is requested there.
+ * If @p_help is NULL and help is requested, fail.
+ *
+ * On success, return @dict, now filled with the parsed keys and values.
+ *
+ * On failure, store an error through @errp and return NULL. Any keys
+ * and values parsed so far will be in @dict nevertheless.
+ */
+QDict *keyval_parse_into(QDict *qdict, const char *params, const char *implied_key,
+ bool *p_help, Error **errp)
+{
+ QObject *listified;
+ const char *s;
+ bool help = false;
+
+ s = params;
+ while (*s) {
+ s = keyval_parse_one(qdict, s, implied_key, &help, errp);
+ if (!s) {
+ return NULL;
+ }
+ implied_key = NULL;
+ }
+
+ if (p_help) {
+ *p_help = help;
+ } else if (help) {
+ error_setg(errp, "Help is not available for this option");
+ return NULL;
+ }
+
+ listified = keyval_listify(qdict, NULL, errp);
+ if (!listified) {
+ return NULL;
+ }
+ assert(listified == QOBJECT(qdict));
+ return qdict;
+}
+
+/*
+ * Parse @params in QEMU's traditional KEY=VALUE,... syntax.
+ *
+ * If @implied_key is given, the first KEY= can be omitted; @implied_key
+ * is then assumed, and VALUE can't be empty or contain ',' or '='.
+ *
+ * A parameter "help" or "?" without a value isn't added to the
+ * resulting dictionary, but is instead interpreted as a help request.
+ * All other options are parsed and returned normally so that
+ * context-specific help can be printed.
+ *
+ * If @p_help is not NULL, store whether help is requested there.
+ * If @p_help is NULL and help is requested, fail.
+ *
+ * On success, return a dictionary of the parsed keys and values.
+ * On failure, store an error through @errp and return NULL.
+ */
+QDict *keyval_parse(const char *params, const char *implied_key,
+ bool *p_help, Error **errp)
+{
+ QDict *qdict = qdict_new();
+ QDict *ret = keyval_parse_into(qdict, params, implied_key, p_help, errp);
+
+ if (!ret) {
+ qobject_unref(qdict);
+ }
+ return ret;
+}
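+
+/*
+ * Illustrative parses (not part of this patch):
+ *
+ *     keyval_parse("a.0=x,a.1=y", NULL, NULL, errp)
+ *         => {"a": ["x", "y"]}   (all-index keys are listified)
+ *     keyval_parse("a.0=x,a.b=y", NULL, NULL, errp)
+ *         => error "Parameters 'a.*' used inconsistently"
+ */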
diff --git a/util/lockcnt.c b/util/lockcnt.c
new file mode 100644
index 000000000..5da36946b
--- /dev/null
+++ b/util/lockcnt.c
@@ -0,0 +1,399 @@
+/*
+ * QemuLockCnt implementation
+ *
+ * Copyright Red Hat, Inc. 2017
+ *
+ * Author:
+ * Paolo Bonzini <pbonzini@redhat.com>
+ */
+#include "qemu/osdep.h"
+#include "qemu/thread.h"
+#include "qemu/atomic.h"
+#include "trace.h"
+
+#ifdef CONFIG_LINUX
+#include "qemu/futex.h"
+
+/* On Linux, bits 0-1 are a futex-based lock, bits 2-31 are the counter.
+ * For the mutex algorithm see Ulrich Drepper's "Futexes Are Tricky" (ok,
+ * this is not the most relaxing citation I could make...). It is similar
+ * to mutex2 in the paper.
+ */
+
+#define QEMU_LOCKCNT_STATE_MASK 3
+#define QEMU_LOCKCNT_STATE_FREE 0 /* free, uncontended */
+#define QEMU_LOCKCNT_STATE_LOCKED 1 /* locked, uncontended */
+#define QEMU_LOCKCNT_STATE_WAITING 2 /* locked, contended */
+
+#define QEMU_LOCKCNT_COUNT_STEP 4
+#define QEMU_LOCKCNT_COUNT_SHIFT 2
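+
+/*
+ * Illustrative encoding (not part of this patch): a counter value of 3
+ * with the lock held and contended is stored as
+ *     (3 << QEMU_LOCKCNT_COUNT_SHIFT) | QEMU_LOCKCNT_STATE_WAITING == 14
+ */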
+
+void qemu_lockcnt_init(QemuLockCnt *lockcnt)
+{
+ lockcnt->count = 0;
+}
+
+void qemu_lockcnt_destroy(QemuLockCnt *lockcnt)
+{
+}
+
+/* *val is the current value of lockcnt->count.
+ *
+ * If the lock is free, try a cmpxchg from *val to new_if_free; return
+ * true and set *val to the old value found by the cmpxchg in
+ * lockcnt->count.
+ *
+ * If the lock is taken, wait for it to be released and return false
+ * *without trying again to take the lock*. Again, set *val to the
+ * new value of lockcnt->count.
+ *
+ * If *waited is true on return, new_if_free's bottom two bits must not
+ * be QEMU_LOCKCNT_STATE_LOCKED on subsequent calls, because the caller
+ * does not know if there are other waiters. Furthermore, after *waited
+ * is set the caller has effectively acquired the lock. If it returns
+ * with the lock not taken, it must wake another futex waiter.
+ */
+static bool qemu_lockcnt_cmpxchg_or_wait(QemuLockCnt *lockcnt, int *val,
+ int new_if_free, bool *waited)
+{
+ /* Fast path for when the lock is free. */
+ if ((*val & QEMU_LOCKCNT_STATE_MASK) == QEMU_LOCKCNT_STATE_FREE) {
+ int expected = *val;
+
+ trace_lockcnt_fast_path_attempt(lockcnt, expected, new_if_free);
+ *val = qatomic_cmpxchg(&lockcnt->count, expected, new_if_free);
+ if (*val == expected) {
+ trace_lockcnt_fast_path_success(lockcnt, expected, new_if_free);
+ *val = new_if_free;
+ return true;
+ }
+ }
+
+ /* The slow path moves from locked to waiting if necessary, then
+ * does a futex wait. Both steps can be repeated ad nauseam,
+ * only getting out of the loop if we can have another shot at the
+ * fast path. Once we can, get out to compute the new destination
+ * value for the fast path.
+ */
+ while ((*val & QEMU_LOCKCNT_STATE_MASK) != QEMU_LOCKCNT_STATE_FREE) {
+ if ((*val & QEMU_LOCKCNT_STATE_MASK) == QEMU_LOCKCNT_STATE_LOCKED) {
+ int expected = *val;
+ int new = expected - QEMU_LOCKCNT_STATE_LOCKED + QEMU_LOCKCNT_STATE_WAITING;
+
+ trace_lockcnt_futex_wait_prepare(lockcnt, expected, new);
+ *val = qatomic_cmpxchg(&lockcnt->count, expected, new);
+ if (*val == expected) {
+ *val = new;
+ }
+ continue;
+ }
+
+ if ((*val & QEMU_LOCKCNT_STATE_MASK) == QEMU_LOCKCNT_STATE_WAITING) {
+ *waited = true;
+ trace_lockcnt_futex_wait(lockcnt, *val);
+ qemu_futex_wait(&lockcnt->count, *val);
+ *val = qatomic_read(&lockcnt->count);
+ trace_lockcnt_futex_wait_resume(lockcnt, *val);
+ continue;
+ }
+
+ abort();
+ }
+ return false;
+}
+
+static void lockcnt_wake(QemuLockCnt *lockcnt)
+{
+ trace_lockcnt_futex_wake(lockcnt);
+ qemu_futex_wake(&lockcnt->count, 1);
+}
+
+void qemu_lockcnt_inc(QemuLockCnt *lockcnt)
+{
+ int val = qatomic_read(&lockcnt->count);
+ bool waited = false;
+
+ for (;;) {
+ if (val >= QEMU_LOCKCNT_COUNT_STEP) {
+ int expected = val;
+ val = qatomic_cmpxchg(&lockcnt->count, val,
+ val + QEMU_LOCKCNT_COUNT_STEP);
+ if (val == expected) {
+ break;
+ }
+ } else {
+ /* The fast path is (0, unlocked)->(1, unlocked). */
+ if (qemu_lockcnt_cmpxchg_or_wait(lockcnt, &val, QEMU_LOCKCNT_COUNT_STEP,
+ &waited)) {
+ break;
+ }
+ }
+ }
+
+ /* If we were woken by another thread, we should also wake one because
+ * we are effectively releasing the lock that was given to us. This is
+ * the case where qemu_lockcnt_lock would leave QEMU_LOCKCNT_STATE_WAITING
+ * in the low bits, and qemu_lockcnt_inc_and_unlock would find it and
+ * wake someone.
+ */
+ if (waited) {
+ lockcnt_wake(lockcnt);
+ }
+}
+
+void qemu_lockcnt_dec(QemuLockCnt *lockcnt)
+{
+ qatomic_sub(&lockcnt->count, QEMU_LOCKCNT_COUNT_STEP);
+}
+
+/* Decrement a counter, and return locked if it is decremented to zero.
+ * If the function returns true, it is impossible for the counter to
+ * become nonzero until the next qemu_lockcnt_unlock.
+ */
+bool qemu_lockcnt_dec_and_lock(QemuLockCnt *lockcnt)
+{
+ int val = qatomic_read(&lockcnt->count);
+ int locked_state = QEMU_LOCKCNT_STATE_LOCKED;
+ bool waited = false;
+
+ for (;;) {
+ if (val >= 2 * QEMU_LOCKCNT_COUNT_STEP) {
+ int expected = val;
+ val = qatomic_cmpxchg(&lockcnt->count, val,
+ val - QEMU_LOCKCNT_COUNT_STEP);
+ if (val == expected) {
+ break;
+ }
+ } else {
+ /* If count is going 1->0, take the lock. The fast path is
+ * (1, unlocked)->(0, locked) or (1, unlocked)->(0, waiting).
+ */
+ if (qemu_lockcnt_cmpxchg_or_wait(lockcnt, &val, locked_state, &waited)) {
+ return true;
+ }
+
+ if (waited) {
+ /* At this point we do not know if there are more waiters. Assume
+ * there are.
+ */
+ locked_state = QEMU_LOCKCNT_STATE_WAITING;
+ }
+ }
+ }
+
+ /* If we were woken by another thread, but we're returning in unlocked
+ * state, we should also wake a thread because we are effectively
+ * releasing the lock that was given to us. This is the case where
+ * qemu_lockcnt_lock would leave QEMU_LOCKCNT_STATE_WAITING in the low
+ * bits, and qemu_lockcnt_unlock would find it and wake someone.
+ */
+ if (waited) {
+ lockcnt_wake(lockcnt);
+ }
+ return false;
+}
+
+/* If the counter is one, decrement it and return locked. Otherwise do
+ * nothing.
+ *
+ * If the function returns true, it is impossible for the counter to
+ * become nonzero until the next qemu_lockcnt_unlock.
+ */
+bool qemu_lockcnt_dec_if_lock(QemuLockCnt *lockcnt)
+{
+ int val = qatomic_read(&lockcnt->count);
+ int locked_state = QEMU_LOCKCNT_STATE_LOCKED;
+ bool waited = false;
+
+ while (val < 2 * QEMU_LOCKCNT_COUNT_STEP) {
+ /* If count is going 1->0, take the lock. The fast path is
+ * (1, unlocked)->(0, locked) or (1, unlocked)->(0, waiting).
+ */
+ if (qemu_lockcnt_cmpxchg_or_wait(lockcnt, &val, locked_state, &waited)) {
+ return true;
+ }
+
+ if (waited) {
+ /* At this point we do not know if there are more waiters. Assume
+ * there are.
+ */
+ locked_state = QEMU_LOCKCNT_STATE_WAITING;
+ }
+ }
+
+ /* If we were woken by another thread, but we're returning in unlocked
+ * state, we should also wake a thread because we are effectively
+ * releasing the lock that was given to us. This is the case where
+ * qemu_lockcnt_lock would leave QEMU_LOCKCNT_STATE_WAITING in the low
+ * bits, and qemu_lockcnt_inc_and_unlock would find it and wake someone.
+ */
+ if (waited) {
+ lockcnt_wake(lockcnt);
+ }
+ return false;
+}
+
+void qemu_lockcnt_lock(QemuLockCnt *lockcnt)
+{
+ int val = qatomic_read(&lockcnt->count);
+ int step = QEMU_LOCKCNT_STATE_LOCKED;
+ bool waited = false;
+
+ /* The third argument is only used if the low bits of val are 0
+ * (QEMU_LOCKCNT_STATE_FREE), so just blindly mix in the desired
+ * state.
+ */
+ while (!qemu_lockcnt_cmpxchg_or_wait(lockcnt, &val, val + step, &waited)) {
+ if (waited) {
+ /* At this point we do not know if there are more waiters. Assume
+ * there are.
+ */
+ step = QEMU_LOCKCNT_STATE_WAITING;
+ }
+ }
+}
+
+void qemu_lockcnt_inc_and_unlock(QemuLockCnt *lockcnt)
+{
+ int expected, new, val;
+
+ val = qatomic_read(&lockcnt->count);
+ do {
+ expected = val;
+ new = (val + QEMU_LOCKCNT_COUNT_STEP) & ~QEMU_LOCKCNT_STATE_MASK;
+ trace_lockcnt_unlock_attempt(lockcnt, val, new);
+ val = qatomic_cmpxchg(&lockcnt->count, val, new);
+ } while (val != expected);
+
+ trace_lockcnt_unlock_success(lockcnt, val, new);
+ if (val & QEMU_LOCKCNT_STATE_WAITING) {
+ lockcnt_wake(lockcnt);
+ }
+}
+
+void qemu_lockcnt_unlock(QemuLockCnt *lockcnt)
+{
+ int expected, new, val;
+
+ val = qatomic_read(&lockcnt->count);
+ do {
+ expected = val;
+ new = val & ~QEMU_LOCKCNT_STATE_MASK;
+ trace_lockcnt_unlock_attempt(lockcnt, val, new);
+ val = qatomic_cmpxchg(&lockcnt->count, val, new);
+ } while (val != expected);
+
+ trace_lockcnt_unlock_success(lockcnt, val, new);
+ if (val & QEMU_LOCKCNT_STATE_WAITING) {
+ lockcnt_wake(lockcnt);
+ }
+}
+
+unsigned qemu_lockcnt_count(QemuLockCnt *lockcnt)
+{
+ return qatomic_read(&lockcnt->count) >> QEMU_LOCKCNT_COUNT_SHIFT;
+}
+#else
+void qemu_lockcnt_init(QemuLockCnt *lockcnt)
+{
+ qemu_mutex_init(&lockcnt->mutex);
+ lockcnt->count = 0;
+}
+
+void qemu_lockcnt_destroy(QemuLockCnt *lockcnt)
+{
+ qemu_mutex_destroy(&lockcnt->mutex);
+}
+
+void qemu_lockcnt_inc(QemuLockCnt *lockcnt)
+{
+ int old;
+ for (;;) {
+ old = qatomic_read(&lockcnt->count);
+ if (old == 0) {
+ qemu_lockcnt_lock(lockcnt);
+ qemu_lockcnt_inc_and_unlock(lockcnt);
+ return;
+ } else {
+ if (qatomic_cmpxchg(&lockcnt->count, old, old + 1) == old) {
+ return;
+ }
+ }
+ }
+}
+
+void qemu_lockcnt_dec(QemuLockCnt *lockcnt)
+{
+ qatomic_dec(&lockcnt->count);
+}
+
+/* Decrement a counter, and return locked if it is decremented to zero.
+ * It is impossible for the counter to become nonzero while the mutex
+ * is taken.
+ */
+bool qemu_lockcnt_dec_and_lock(QemuLockCnt *lockcnt)
+{
+ int val = qatomic_read(&lockcnt->count);
+ while (val > 1) {
+ int old = qatomic_cmpxchg(&lockcnt->count, val, val - 1);
+ if (old != val) {
+ val = old;
+ continue;
+ }
+
+ return false;
+ }
+
+ qemu_lockcnt_lock(lockcnt);
+ if (qatomic_fetch_dec(&lockcnt->count) == 1) {
+ return true;
+ }
+
+ qemu_lockcnt_unlock(lockcnt);
+ return false;
+}
+
+/* Decrement a counter and return locked if it is decremented to zero.
+ * Otherwise do nothing.
+ *
+ * It is impossible for the counter to become nonzero while the mutex
+ * is taken.
+ */
+bool qemu_lockcnt_dec_if_lock(QemuLockCnt *lockcnt)
+{
+ /* No need for acquire semantics if we return false. */
+ int val = qatomic_read(&lockcnt->count);
+ if (val > 1) {
+ return false;
+ }
+
+ qemu_lockcnt_lock(lockcnt);
+ if (qatomic_fetch_dec(&lockcnt->count) == 1) {
+ return true;
+ }
+
+ qemu_lockcnt_inc_and_unlock(lockcnt);
+ return false;
+}
+
+void qemu_lockcnt_lock(QemuLockCnt *lockcnt)
+{
+ qemu_mutex_lock(&lockcnt->mutex);
+}
+
+void qemu_lockcnt_inc_and_unlock(QemuLockCnt *lockcnt)
+{
+ qatomic_inc(&lockcnt->count);
+ qemu_mutex_unlock(&lockcnt->mutex);
+}
+
+void qemu_lockcnt_unlock(QemuLockCnt *lockcnt)
+{
+ qemu_mutex_unlock(&lockcnt->mutex);
+}
+
+unsigned qemu_lockcnt_count(QemuLockCnt *lockcnt)
+{
+ return qatomic_read(&lockcnt->count);
+}
+#endif
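+
+/*
+ * Illustrative usage sketch (not part of this patch), following the
+ * visit/remove pattern this API is meant for; visit() and free_node()
+ * are hypothetical:
+ *
+ *     // reader: keep the structure alive while visiting it
+ *     qemu_lockcnt_inc(&lc);
+ *     visit(node);
+ *     qemu_lockcnt_dec(&lc);
+ *
+ *     // writer: free only once no readers remain
+ *     if (qemu_lockcnt_dec_and_lock(&lc)) {
+ *         free_node(node);
+ *         qemu_lockcnt_unlock(&lc);
+ *     }
+ */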
diff --git a/util/log.c b/util/log.c
new file mode 100644
index 000000000..2ee1500be
--- /dev/null
+++ b/util/log.c
@@ -0,0 +1,389 @@
+/*
+ * Logging support
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qemu/range.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "qemu/cutils.h"
+#include "trace/control.h"
+#include "qemu/thread.h"
+#include "qemu/lockable.h"
+
+static char *logfilename;
+static QemuMutex qemu_logfile_mutex;
+QemuLogFile *qemu_logfile;
+int qemu_loglevel;
+static int log_append = 0;
+static GArray *debug_regions;
+
+/* Return the number of characters emitted. */
+int qemu_log(const char *fmt, ...)
+{
+ int ret = 0;
+ QemuLogFile *logfile;
+
+ rcu_read_lock();
+ logfile = qatomic_rcu_read(&qemu_logfile);
+ if (logfile) {
+ va_list ap;
+ va_start(ap, fmt);
+ ret = vfprintf(logfile->fd, fmt, ap);
+ va_end(ap);
+
+ /* Don't pass back error results. */
+ if (ret < 0) {
+ ret = 0;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static void __attribute__((__constructor__)) qemu_logfile_init(void)
+{
+ qemu_mutex_init(&qemu_logfile_mutex);
+}
+
+static void qemu_logfile_free(QemuLogFile *logfile)
+{
+ g_assert(logfile);
+
+ if (logfile->fd != stderr) {
+ fclose(logfile->fd);
+ }
+ g_free(logfile);
+}
+
+static bool log_uses_own_buffers;
+
+/* Enable or disable low-level logging */
+void qemu_set_log(int log_flags)
+{
+ bool need_to_open_file = false;
+ QemuLogFile *logfile;
+
+ qemu_loglevel = log_flags;
+#ifdef CONFIG_TRACE_LOG
+ qemu_loglevel |= LOG_TRACE;
+#endif
+ /*
+ * In all cases we only log if qemu_loglevel is set.
+ * Also:
+ * If not daemonized we will always log either to stderr
+ * or to a file (if there is a logfilename).
+ * If we are daemonized,
+ * we will only log if there is a logfilename.
+ */
+ if (qemu_loglevel && (!is_daemonized() || logfilename)) {
+ need_to_open_file = true;
+ }
+ QEMU_LOCK_GUARD(&qemu_logfile_mutex);
+ if (qemu_logfile && !need_to_open_file) {
+ logfile = qemu_logfile;
+ qatomic_rcu_set(&qemu_logfile, NULL);
+ call_rcu(logfile, qemu_logfile_free, rcu);
+ } else if (!qemu_logfile && need_to_open_file) {
+ logfile = g_new0(QemuLogFile, 1);
+ if (logfilename) {
+ logfile->fd = fopen(logfilename, log_append ? "a" : "w");
+ if (!logfile->fd) {
+ g_free(logfile);
+ perror(logfilename);
+ _exit(1);
+ }
+ /* In case we are a daemon, redirect stderr to the logfile */
+ if (is_daemonized()) {
+ dup2(fileno(logfile->fd), STDERR_FILENO);
+ fclose(logfile->fd);
+ /* This will skip closing logfile in qemu_log_close() */
+ logfile->fd = stderr;
+ }
+ } else {
+ /* Default to stderr if no log file specified */
+ assert(!is_daemonized());
+ logfile->fd = stderr;
+ }
+ /* We must avoid glibc's mmap()-based stdio buffering, so set a buffer "by hand" */
+ if (log_uses_own_buffers) {
+ static char logfile_buf[4096];
+
+ setvbuf(logfile->fd, logfile_buf, _IOLBF, sizeof(logfile_buf));
+ } else {
+#if defined(_WIN32)
+ /* Win32 doesn't support line-buffering, so use unbuffered output. */
+ setvbuf(logfile->fd, NULL, _IONBF, 0);
+#else
+ setvbuf(logfile->fd, NULL, _IOLBF, 0);
+#endif
+ log_append = 1;
+ }
+ qatomic_rcu_set(&qemu_logfile, logfile);
+ }
+}
+
+void qemu_log_needs_buffers(void)
+{
+ log_uses_own_buffers = true;
+}
+
+/*
+ * Allow the user to include %d in the logfile name; it will be
+ * substituted with the current PID. This is useful for debugging many
+ * nested linux-user tasks but will result in lots of logs.
+ *
+ * @filename may be NULL. In that case, log output is sent to stderr.
+ */
+void qemu_set_log_filename(const char *filename, Error **errp)
+{
+ g_free(logfilename);
+ logfilename = NULL;
+
+ if (filename) {
+ char *pidstr = strstr(filename, "%");
+ if (pidstr) {
+ /* We only accept one %d, no other format strings */
+ if (pidstr[1] != 'd' || strchr(pidstr + 2, '%')) {
+ error_setg(errp, "Bad logfile format: %s", filename);
+ return;
+ } else {
+ logfilename = g_strdup_printf(filename, getpid());
+ }
+ } else {
+ logfilename = g_strdup(filename);
+ }
+ }
+
+ qemu_log_close();
+ qemu_set_log(qemu_loglevel);
+}
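+
+/*
+ * Example (illustrative, not part of this patch):
+ *
+ *     qemu_set_log_filename("/tmp/qemu-%d.log", &error_fatal);
+ *
+ * opens e.g. /tmp/qemu-12345.log when the current PID is 12345.
+ */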
+
+/*
+ * Return true if @addr is within our debug filter, or if no filter
+ * is defined.
+ */
+bool qemu_log_in_addr_range(uint64_t addr)
+{
+ if (debug_regions) {
+ int i = 0;
+ for (i = 0; i < debug_regions->len; i++) {
+ Range *range = &g_array_index(debug_regions, Range, i);
+ if (range_contains(range, addr)) {
+ return true;
+ }
+ }
+ return false;
+ } else {
+ return true;
+ }
+}
+
+
+void qemu_set_dfilter_ranges(const char *filter_spec, Error **errp)
+{
+ gchar **ranges = g_strsplit(filter_spec, ",", 0);
+ int i;
+
+ if (debug_regions) {
+ g_array_unref(debug_regions);
+ debug_regions = NULL;
+ }
+
+ debug_regions = g_array_sized_new(FALSE, FALSE,
+ sizeof(Range), g_strv_length(ranges));
+ for (i = 0; ranges[i]; i++) {
+ const char *r = ranges[i];
+ const char *range_op, *r2, *e;
+ uint64_t r1val, r2val, lob, upb;
+ struct Range range;
+
+ range_op = strstr(r, "-");
+ r2 = range_op ? range_op + 1 : NULL;
+ if (!range_op) {
+ range_op = strstr(r, "+");
+ r2 = range_op ? range_op + 1 : NULL;
+ }
+ if (!range_op) {
+ range_op = strstr(r, "..");
+ r2 = range_op ? range_op + 2 : NULL;
+ }
+ if (!range_op) {
+ error_setg(errp, "Bad range specifier");
+ goto out;
+ }
+
+ if (qemu_strtou64(r, &e, 0, &r1val)
+ || e != range_op) {
+ error_setg(errp, "Invalid number to the left of %.*s",
+ (int)(r2 - range_op), range_op);
+ goto out;
+ }
+ if (qemu_strtou64(r2, NULL, 0, &r2val)) {
+ error_setg(errp, "Invalid number to the right of %.*s",
+ (int)(r2 - range_op), range_op);
+ goto out;
+ }
+
+ switch (*range_op) {
+ case '+':
+ lob = r1val;
+ upb = r1val + r2val - 1;
+ break;
+ case '-':
+ upb = r1val;
+ lob = r1val - (r2val - 1);
+ break;
+ case '.':
+ lob = r1val;
+ upb = r2val;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ if (lob > upb) {
+ error_setg(errp, "Invalid range");
+ goto out;
+ }
+ range_set_bounds(&range, lob, upb);
+ g_array_append_val(debug_regions, range);
+ }
+out:
+ g_strfreev(ranges);
+}
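+
+/*
+ * Illustrative specs accepted above (not part of this patch):
+ *
+ *     "0x1000+0x100"      // addresses 0x1000 through 0x10ff
+ *     "0x2000-0x100"      // addresses 0x1f01 through 0x2000
+ *     "0x3000..0x3fff"    // inclusive bounds
+ */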
+
+/* fflush() the log file */
+void qemu_log_flush(void)
+{
+ QemuLogFile *logfile;
+
+ rcu_read_lock();
+ logfile = qatomic_rcu_read(&qemu_logfile);
+ if (logfile) {
+ fflush(logfile->fd);
+ }
+ rcu_read_unlock();
+}
+
+/* Close the log file */
+void qemu_log_close(void)
+{
+ QemuLogFile *logfile;
+
+ qemu_mutex_lock(&qemu_logfile_mutex);
+ logfile = qemu_logfile;
+
+ if (logfile) {
+ qatomic_rcu_set(&qemu_logfile, NULL);
+ call_rcu(logfile, qemu_logfile_free, rcu);
+ }
+ qemu_mutex_unlock(&qemu_logfile_mutex);
+}
+
+const QEMULogItem qemu_log_items[] = {
+ { CPU_LOG_TB_OUT_ASM, "out_asm",
+ "show generated host assembly code for each compiled TB" },
+ { CPU_LOG_TB_IN_ASM, "in_asm",
+ "show target assembly code for each compiled TB" },
+ { CPU_LOG_TB_OP, "op",
+ "show micro ops for each compiled TB" },
+ { CPU_LOG_TB_OP_OPT, "op_opt",
+ "show micro ops after optimization" },
+ { CPU_LOG_TB_OP_IND, "op_ind",
+ "show micro ops before indirect lowering" },
+ { CPU_LOG_INT, "int",
+ "show interrupts/exceptions in short format" },
+ { CPU_LOG_EXEC, "exec",
+ "show trace before each executed TB (lots of logs)" },
+ { CPU_LOG_TB_CPU, "cpu",
+ "show CPU registers before entering a TB (lots of logs)" },
+ { CPU_LOG_TB_FPU, "fpu",
+ "include FPU registers in the 'cpu' logging" },
+ { CPU_LOG_MMU, "mmu",
+ "log MMU-related activities" },
+ { CPU_LOG_PCALL, "pcall",
+ "x86 only: show protected mode far calls/returns/exceptions" },
+ { CPU_LOG_RESET, "cpu_reset",
+ "show CPU state before CPU resets" },
+ { LOG_UNIMP, "unimp",
+ "log unimplemented functionality" },
+ { LOG_GUEST_ERROR, "guest_errors",
+ "log when the guest OS does something invalid (eg accessing a\n"
+ "non-existent register)" },
+ { CPU_LOG_PAGE, "page",
+ "dump pages at beginning of user mode emulation" },
+ { CPU_LOG_TB_NOCHAIN, "nochain",
+ "do not chain compiled TBs so that \"exec\" and \"cpu\" show\n"
+ "complete traces" },
+#ifdef CONFIG_PLUGIN
+ { CPU_LOG_PLUGIN, "plugin", "output from TCG plugins\n"},
+#endif
+ { LOG_STRACE, "strace",
+ "log every user-mode syscall, its input, and its result" },
+ { 0, NULL, NULL },
+};
+
+/* Take a comma-separated list of log masks. Return 0 on error. */
+int qemu_str_to_log_mask(const char *str)
+{
+ const QEMULogItem *item;
+ int mask = 0;
+ char **parts = g_strsplit(str, ",", 0);
+ char **tmp;
+
+ for (tmp = parts; tmp && *tmp; tmp++) {
+ if (g_str_equal(*tmp, "all")) {
+ for (item = qemu_log_items; item->mask != 0; item++) {
+ mask |= item->mask;
+ }
+#ifdef CONFIG_TRACE_LOG
+ } else if (g_str_has_prefix(*tmp, "trace:") && (*tmp)[6] != '\0') {
+ trace_enable_events((*tmp) + 6);
+ mask |= LOG_TRACE;
+#endif
+ } else {
+ for (item = qemu_log_items; item->mask != 0; item++) {
+ if (g_str_equal(*tmp, item->name)) {
+ goto found;
+ }
+ }
+ goto error;
+ found:
+ mask |= item->mask;
+ }
+ }
+
+ g_strfreev(parts);
+ return mask;
+
+ error:
+ g_strfreev(parts);
+ return 0;
+}
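+
+/*
+ * Examples (illustrative, not part of this patch):
+ *     qemu_str_to_log_mask("int,mmu") == CPU_LOG_INT | CPU_LOG_MMU
+ *     qemu_str_to_log_mask("bogus")   == 0
+ */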
+
+void qemu_print_log_usage(FILE *f)
+{
+ const QEMULogItem *item;
+ fprintf(f, "Log items (comma separated):\n");
+ for (item = qemu_log_items; item->mask != 0; item++) {
+ fprintf(f, "%-15s %s\n", item->name, item->help);
+ }
+#ifdef CONFIG_TRACE_LOG
+ fprintf(f, "trace:PATTERN enable trace events\n");
+ fprintf(f, "\nUse \"-d trace:help\" to get a list of trace events.\n\n");
+#endif
+}
diff --git a/util/main-loop.c b/util/main-loop.c
new file mode 100644
index 000000000..06b18b195
--- /dev/null
+++ b/util/main-loop.c
@@ -0,0 +1,594 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/cutils.h"
+#include "qemu/timer.h"
+#include "sysemu/cpu-timers.h"
+#include "sysemu/replay.h"
+#include "qemu/main-loop.h"
+#include "block/aio.h"
+#include "qemu/error-report.h"
+#include "qemu/queue.h"
+#include "qemu/compiler.h"
+
+#ifndef _WIN32
+#include <sys/wait.h>
+#endif
+
+#ifndef _WIN32
+
+/* If we have signalfd, we mask out the signals we want to handle and then
+ * use signalfd to listen for them. We rely on whatever the current signal
+ * handler is to dispatch the signals when we receive them.
+ */
+/*
+ * Disable CFI checks.
+ * We are going to call a signal handler directly. Such a handler may or may not
+ * have been defined in our binary, so there's no guarantee that the pointer
+ * used to set the handler is a cfi-valid pointer. Since the handlers are
+ * stored in kernel memory, changing the handler to an attacker-defined
+ * function requires being able to call a sigaction() syscall,
+ * which is not as easy as overwriting a pointer in memory.
+ */
+QEMU_DISABLE_CFI
+static void sigfd_handler(void *opaque)
+{
+ int fd = (intptr_t)opaque;
+ struct qemu_signalfd_siginfo info;
+ struct sigaction action;
+ ssize_t len;
+
+ while (1) {
+ do {
+ len = read(fd, &info, sizeof(info));
+ } while (len == -1 && errno == EINTR);
+
+ if (len == -1 && errno == EAGAIN) {
+ break;
+ }
+
+ if (len != sizeof(info)) {
+ error_report("read from sigfd returned %zd: %s", len,
+ g_strerror(errno));
+ return;
+ }
+
+ sigaction(info.ssi_signo, NULL, &action);
+ if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
+ sigaction_invoke(&action, &info);
+ } else if (action.sa_handler) {
+ action.sa_handler(info.ssi_signo);
+ }
+ }
+}
+
+static int qemu_signal_init(Error **errp)
+{
+ int sigfd;
+ sigset_t set;
+
+ /*
+ * SIG_IPI must be blocked in the main thread and must not be caught
+ * by sigwait() in the signal thread. Otherwise, the cpu thread will
+ * not catch it reliably.
+ */
+ sigemptyset(&set);
+ sigaddset(&set, SIG_IPI);
+ sigaddset(&set, SIGIO);
+ sigaddset(&set, SIGALRM);
+ sigaddset(&set, SIGBUS);
+ /* SIGINT is deliberately not handled via signalfd, so that ^C can be
+ * used to interrupt QEMU when it is being run under gdb. SIGHUP and
+ * SIGTERM are also handled asynchronously, even though it is not
+ * strictly necessary, because they use the same handler as SIGINT.
+ */
+ pthread_sigmask(SIG_BLOCK, &set, NULL);
+
+ sigdelset(&set, SIG_IPI);
+ sigfd = qemu_signalfd(&set);
+ if (sigfd == -1) {
+ error_setg_errno(errp, errno, "failed to create signalfd");
+ return -errno;
+ }
+
+ fcntl_setfl(sigfd, O_NONBLOCK);
+
+ qemu_set_fd_handler(sigfd, sigfd_handler, NULL, (void *)(intptr_t)sigfd);
+
+ return 0;
+}
+
+#else /* _WIN32 */
+
+static int qemu_signal_init(Error **errp)
+{
+ return 0;
+}
+#endif
+
+static AioContext *qemu_aio_context;
+static QEMUBH *qemu_notify_bh;
+
+static void notify_event_cb(void *opaque)
+{
+ /* No need to do anything; this bottom half is only used to
+ * kick the main loop out of ppoll()/poll()/WaitForMultipleObjects().
+ */
+}
+
+AioContext *qemu_get_aio_context(void)
+{
+ return qemu_aio_context;
+}
+
+void qemu_notify_event(void)
+{
+ if (!qemu_aio_context) {
+ return;
+ }
+ qemu_bh_schedule(qemu_notify_bh);
+}
+
+static GArray *gpollfds;
+
+int qemu_init_main_loop(Error **errp)
+{
+ int ret;
+ GSource *src;
+
+ init_clocks(qemu_timer_notify_cb);
+
+ ret = qemu_signal_init(errp);
+ if (ret) {
+ return ret;
+ }
+
+ qemu_aio_context = aio_context_new(errp);
+ if (!qemu_aio_context) {
+ return -EMFILE;
+ }
+ qemu_set_current_aio_context(qemu_aio_context);
+ qemu_notify_bh = qemu_bh_new(notify_event_cb, NULL);
+ gpollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD));
+ src = aio_get_g_source(qemu_aio_context);
+ g_source_set_name(src, "aio-context");
+ g_source_attach(src, NULL);
+ g_source_unref(src);
+ src = iohandler_get_g_source();
+ g_source_set_name(src, "io-handler");
+ g_source_attach(src, NULL);
+ g_source_unref(src);
+ return 0;
+}
+
+static int max_priority;
+
+#ifndef _WIN32
+static int glib_pollfds_idx;
+static int glib_n_poll_fds;
+
+void qemu_fd_register(int fd)
+{
+}
+
+static void glib_pollfds_fill(int64_t *cur_timeout)
+{
+ GMainContext *context = g_main_context_default();
+ int timeout = 0;
+ int64_t timeout_ns;
+ int n;
+
+ g_main_context_prepare(context, &max_priority);
+
+ glib_pollfds_idx = gpollfds->len;
+ n = glib_n_poll_fds;
+ do {
+ GPollFD *pfds;
+ glib_n_poll_fds = n;
+ g_array_set_size(gpollfds, glib_pollfds_idx + glib_n_poll_fds);
+ pfds = &g_array_index(gpollfds, GPollFD, glib_pollfds_idx);
+ n = g_main_context_query(context, max_priority, &timeout, pfds,
+ glib_n_poll_fds);
+ } while (n != glib_n_poll_fds);
+
+ if (timeout < 0) {
+ timeout_ns = -1;
+ } else {
+ timeout_ns = (int64_t)timeout * (int64_t)SCALE_MS;
+ }
+
+ *cur_timeout = qemu_soonest_timeout(timeout_ns, *cur_timeout);
+}
+
+static void glib_pollfds_poll(void)
+{
+ GMainContext *context = g_main_context_default();
+ GPollFD *pfds = &g_array_index(gpollfds, GPollFD, glib_pollfds_idx);
+
+ if (g_main_context_check(context, max_priority, pfds, glib_n_poll_fds)) {
+ g_main_context_dispatch(context);
+ }
+}
+
+#define MAX_MAIN_LOOP_SPIN (1000)
+
+static int os_host_main_loop_wait(int64_t timeout)
+{
+ GMainContext *context = g_main_context_default();
+ int ret;
+
+ g_main_context_acquire(context);
+
+ glib_pollfds_fill(&timeout);
+
+ qemu_mutex_unlock_iothread();
+ replay_mutex_unlock();
+
+ ret = qemu_poll_ns((GPollFD *)gpollfds->data, gpollfds->len, timeout);
+
+ replay_mutex_lock();
+ qemu_mutex_lock_iothread();
+
+ glib_pollfds_poll();
+
+ g_main_context_release(context);
+
+ return ret;
+}
+#else
+/***********************************************************/
+/* Polling handling */
+
+typedef struct PollingEntry {
+ PollingFunc *func;
+ void *opaque;
+ struct PollingEntry *next;
+} PollingEntry;
+
+static PollingEntry *first_polling_entry;
+
+int qemu_add_polling_cb(PollingFunc *func, void *opaque)
+{
+ PollingEntry **ppe, *pe;
+ pe = g_malloc0(sizeof(PollingEntry));
+ pe->func = func;
+ pe->opaque = opaque;
+ for (ppe = &first_polling_entry; *ppe != NULL; ppe = &(*ppe)->next);
+ *ppe = pe;
+ return 0;
+}
+
+void qemu_del_polling_cb(PollingFunc *func, void *opaque)
+{
+ PollingEntry **ppe, *pe;
+ for (ppe = &first_polling_entry; *ppe != NULL; ppe = &(*ppe)->next) {
+ pe = *ppe;
+ if (pe->func == func && pe->opaque == opaque) {
+ *ppe = pe->next;
+ g_free(pe);
+ break;
+ }
+ }
+}
+
+/***********************************************************/
+/* Wait objects support */
+typedef struct WaitObjects {
+ int num;
+ int revents[MAXIMUM_WAIT_OBJECTS + 1];
+ HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
+ WaitObjectFunc *func[MAXIMUM_WAIT_OBJECTS + 1];
+ void *opaque[MAXIMUM_WAIT_OBJECTS + 1];
+} WaitObjects;
+
+static WaitObjects wait_objects = {0};
+
+int qemu_add_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
+{
+ WaitObjects *w = &wait_objects;
+ if (w->num >= MAXIMUM_WAIT_OBJECTS) {
+ return -1;
+ }
+ w->events[w->num] = handle;
+ w->func[w->num] = func;
+ w->opaque[w->num] = opaque;
+ w->revents[w->num] = 0;
+ w->num++;
+ return 0;
+}
+
+void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
+{
+ int i, found;
+ WaitObjects *w = &wait_objects;
+
+ found = 0;
+ for (i = 0; i < w->num; i++) {
+ if (w->events[i] == handle) {
+ found = 1;
+ }
+ if (found) {
+ w->events[i] = w->events[i + 1];
+ w->func[i] = w->func[i + 1];
+ w->opaque[i] = w->opaque[i + 1];
+ w->revents[i] = w->revents[i + 1];
+ }
+ }
+ if (found) {
+ w->num--;
+ }
+}
+
+void qemu_fd_register(int fd)
+{
+ WSAEventSelect(fd, event_notifier_get_handle(&qemu_aio_context->notifier),
+ FD_READ | FD_ACCEPT | FD_CLOSE |
+ FD_CONNECT | FD_WRITE | FD_OOB);
+}
+
+static int pollfds_fill(GArray *pollfds, fd_set *rfds, fd_set *wfds,
+ fd_set *xfds)
+{
+ int nfds = -1;
+ int i;
+
+ for (i = 0; i < pollfds->len; i++) {
+ GPollFD *pfd = &g_array_index(pollfds, GPollFD, i);
+ int fd = pfd->fd;
+ int events = pfd->events;
+ if (events & G_IO_IN) {
+ FD_SET(fd, rfds);
+ nfds = MAX(nfds, fd);
+ }
+ if (events & G_IO_OUT) {
+ FD_SET(fd, wfds);
+ nfds = MAX(nfds, fd);
+ }
+ if (events & G_IO_PRI) {
+ FD_SET(fd, xfds);
+ nfds = MAX(nfds, fd);
+ }
+ }
+ return nfds;
+}
+
+static void pollfds_poll(GArray *pollfds, int nfds, fd_set *rfds,
+ fd_set *wfds, fd_set *xfds)
+{
+ int i;
+
+ for (i = 0; i < pollfds->len; i++) {
+ GPollFD *pfd = &g_array_index(pollfds, GPollFD, i);
+ int fd = pfd->fd;
+ int revents = 0;
+
+ if (FD_ISSET(fd, rfds)) {
+ revents |= G_IO_IN;
+ }
+ if (FD_ISSET(fd, wfds)) {
+ revents |= G_IO_OUT;
+ }
+ if (FD_ISSET(fd, xfds)) {
+ revents |= G_IO_PRI;
+ }
+ pfd->revents = revents & pfd->events;
+ }
+}
+
+static int os_host_main_loop_wait(int64_t timeout)
+{
+ GMainContext *context = g_main_context_default();
+ GPollFD poll_fds[1024 * 2]; /* this is probably overkill */
+ int select_ret = 0;
+ int g_poll_ret, ret, i, n_poll_fds;
+ PollingEntry *pe;
+ WaitObjects *w = &wait_objects;
+ gint poll_timeout;
+ int64_t poll_timeout_ns;
+ static struct timeval tv0;
+ fd_set rfds, wfds, xfds;
+ int nfds;
+
+ g_main_context_acquire(context);
+
+ /* XXX: need to suppress polling by better using win32 events */
+ ret = 0;
+ for (pe = first_polling_entry; pe != NULL; pe = pe->next) {
+ ret |= pe->func(pe->opaque);
+ }
+ if (ret != 0) {
+ g_main_context_release(context);
+ return ret;
+ }
+
+ FD_ZERO(&rfds);
+ FD_ZERO(&wfds);
+ FD_ZERO(&xfds);
+ nfds = pollfds_fill(gpollfds, &rfds, &wfds, &xfds);
+ if (nfds >= 0) {
+ select_ret = select(nfds + 1, &rfds, &wfds, &xfds, &tv0);
+ if (select_ret != 0) {
+ timeout = 0;
+ }
+ if (select_ret > 0) {
+ pollfds_poll(gpollfds, nfds, &rfds, &wfds, &xfds);
+ }
+ }
+
+ g_main_context_prepare(context, &max_priority);
+ n_poll_fds = g_main_context_query(context, max_priority, &poll_timeout,
+ poll_fds, ARRAY_SIZE(poll_fds));
+ g_assert(n_poll_fds + w->num <= ARRAY_SIZE(poll_fds));
+
+ for (i = 0; i < w->num; i++) {
+ poll_fds[n_poll_fds + i].fd = (DWORD_PTR)w->events[i];
+ poll_fds[n_poll_fds + i].events = G_IO_IN;
+ }
+
+ if (poll_timeout < 0) {
+ poll_timeout_ns = -1;
+ } else {
+ poll_timeout_ns = (int64_t)poll_timeout * (int64_t)SCALE_MS;
+ }
+
+ poll_timeout_ns = qemu_soonest_timeout(poll_timeout_ns, timeout);
+
+ qemu_mutex_unlock_iothread();
+
+ replay_mutex_unlock();
+
+ g_poll_ret = qemu_poll_ns(poll_fds, n_poll_fds + w->num, poll_timeout_ns);
+
+ replay_mutex_lock();
+
+ qemu_mutex_lock_iothread();
+ if (g_poll_ret > 0) {
+ for (i = 0; i < w->num; i++) {
+ w->revents[i] = poll_fds[n_poll_fds + i].revents;
+ }
+ for (i = 0; i < w->num; i++) {
+ if (w->revents[i] && w->func[i]) {
+ w->func[i](w->opaque[i]);
+ }
+ }
+ }
+
+ if (g_main_context_check(context, max_priority, poll_fds, n_poll_fds)) {
+ g_main_context_dispatch(context);
+ }
+
+ g_main_context_release(context);
+
+ return select_ret || g_poll_ret;
+}
+#endif
+
+static NotifierList main_loop_poll_notifiers =
+ NOTIFIER_LIST_INITIALIZER(main_loop_poll_notifiers);
+
+void main_loop_poll_add_notifier(Notifier *notify)
+{
+ notifier_list_add(&main_loop_poll_notifiers, notify);
+}
+
+void main_loop_poll_remove_notifier(Notifier *notify)
+{
+ notifier_remove(notify);
+}
+
+void main_loop_wait(int nonblocking)
+{
+ MainLoopPoll mlpoll = {
+ .state = MAIN_LOOP_POLL_FILL,
+ .timeout = UINT32_MAX,
+ .pollfds = gpollfds,
+ };
+ int ret;
+ int64_t timeout_ns;
+
+ if (nonblocking) {
+ mlpoll.timeout = 0;
+ }
+
+ /* poll any events */
+ g_array_set_size(gpollfds, 0); /* reset for new iteration */
+ /* XXX: separate device handlers from system ones */
+ notifier_list_notify(&main_loop_poll_notifiers, &mlpoll);
+
+ if (mlpoll.timeout == UINT32_MAX) {
+ timeout_ns = -1;
+ } else {
+ timeout_ns = (uint64_t)mlpoll.timeout * (int64_t)(SCALE_MS);
+ }
+
+ timeout_ns = qemu_soonest_timeout(timeout_ns,
+ timerlistgroup_deadline_ns(
+ &main_loop_tlg));
+
+ ret = os_host_main_loop_wait(timeout_ns);
+ mlpoll.state = ret < 0 ? MAIN_LOOP_POLL_ERR : MAIN_LOOP_POLL_OK;
+ notifier_list_notify(&main_loop_poll_notifiers, &mlpoll);
+
+ if (icount_enabled()) {
+ /*
+ * CPU thread can infinitely wait for event after
+ * missing the warp
+ */
+ icount_start_warp_timer();
+ }
+ qemu_clock_run_all_timers();
+}
+
+/* Functions to operate on the main QEMU AioContext. */
+
+QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name)
+{
+ return aio_bh_new_full(qemu_aio_context, cb, opaque, name);
+}
+
+/*
+ * Functions to operate on the I/O handler AioContext.
+ * This context runs on top of main loop. We can't reuse qemu_aio_context
+ * because iohandlers mustn't be polled by aio_poll(qemu_aio_context).
+ */
+static AioContext *iohandler_ctx;
+
+static void iohandler_init(void)
+{
+ if (!iohandler_ctx) {
+ iohandler_ctx = aio_context_new(&error_abort);
+ }
+}
+
+AioContext *iohandler_get_aio_context(void)
+{
+ iohandler_init();
+ return iohandler_ctx;
+}
+
+GSource *iohandler_get_g_source(void)
+{
+ iohandler_init();
+ return aio_get_g_source(iohandler_ctx);
+}
+
+void qemu_set_fd_handler(int fd,
+ IOHandler *fd_read,
+ IOHandler *fd_write,
+ void *opaque)
+{
+ iohandler_init();
+ aio_set_fd_handler(iohandler_ctx, fd, false,
+ fd_read, fd_write, NULL, opaque);
+}
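+
+/*
+ * Illustrative usage (not part of this patch); my_read_cb, pipe_fd and
+ * my_state are hypothetical:
+ *
+ *     static void my_read_cb(void *opaque)
+ *     {
+ *         // drain the descriptor referenced by opaque
+ *     }
+ *     ...
+ *     qemu_set_fd_handler(pipe_fd, my_read_cb, NULL, &my_state);
+ */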
+
+void event_notifier_set_handler(EventNotifier *e,
+ EventNotifierHandler *handler)
+{
+ iohandler_init();
+ aio_set_event_notifier(iohandler_ctx, e, false,
+ handler, NULL);
+}
diff --git a/util/memfd.c b/util/memfd.c
new file mode 100644
index 000000000..4a3c07e0b
--- /dev/null
+++ b/util/memfd.c
@@ -0,0 +1,206 @@
+/*
+ * memfd.c
+ *
+ * Copyright (c) 2015 Red Hat, Inc.
+ *
+ * QEMU library functions on POSIX which are shared between QEMU and
+ * the QEMU tools.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qapi/error.h"
+#include "qemu/memfd.h"
+#include "qemu/host-utils.h"
+
+#if defined CONFIG_LINUX && !defined CONFIG_MEMFD
+#include <sys/syscall.h>
+#include <asm/unistd.h>
+
+int memfd_create(const char *name, unsigned int flags)
+{
+#ifdef __NR_memfd_create
+ return syscall(__NR_memfd_create, name, flags);
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+}
+#endif
+
+int qemu_memfd_create(const char *name, size_t size, bool hugetlb,
+ uint64_t hugetlbsize, unsigned int seals, Error **errp)
+{
+ int htsize = hugetlbsize ? ctz64(hugetlbsize) : 0;
+
+ if (htsize && 1ULL << htsize != hugetlbsize) {
+ error_setg(errp, "Hugepage size must be a power of 2");
+ return -1;
+ }
+
+ htsize = htsize << MFD_HUGE_SHIFT;
+
+#ifdef CONFIG_LINUX
+ int mfd = -1;
+ unsigned int flags = MFD_CLOEXEC;
+
+ if (seals) {
+ flags |= MFD_ALLOW_SEALING;
+ }
+ if (hugetlb) {
+ flags |= MFD_HUGETLB;
+ flags |= htsize;
+ }
+ mfd = memfd_create(name, flags);
+ if (mfd < 0) {
+ error_setg_errno(errp, errno,
+ "failed to create memfd with flags 0x%x", flags);
+ goto err;
+ }
+
+ if (ftruncate(mfd, size) == -1) {
+ error_setg_errno(errp, errno, "failed to resize memfd to %zu", size);
+ goto err;
+ }
+
+ if (seals && fcntl(mfd, F_ADD_SEALS, seals) == -1) {
+ error_setg_errno(errp, errno, "failed to add seals 0x%x", seals);
+ goto err;
+ }
+
+ return mfd;
+
+err:
+ if (mfd >= 0) {
+ close(mfd);
+ }
+#else
+ error_setg_errno(errp, ENOSYS, "failed to create memfd");
+#endif
+ return -1;
+}
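+
+/*
+ * Illustrative call (not part of this patch): a sealable 16 MiB memfd
+ * backed by 2 MiB huge pages; the name and seal are made up:
+ *
+ *     int fd = qemu_memfd_create("ram", 16 * 1024 * 1024, true,
+ *                                2 * 1024 * 1024, F_SEAL_GROW,
+ *                                &error_fatal);
+ */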
+
+/*
+ * This is a best-effort helper for shared memory allocation, with
+ * optional sealing. The helper will do its best to allocate using
+ * memfd with sealing, but may fall back to other methods without
+ * sealing.
+ */
+void *qemu_memfd_alloc(const char *name, size_t size, unsigned int seals,
+ int *fd, Error **errp)
+{
+ void *ptr;
+ int mfd = qemu_memfd_create(name, size, false, 0, seals, NULL);
+
+ /* some systems have memfd without sealing */
+ if (mfd == -1) {
+ mfd = qemu_memfd_create(name, size, false, 0, 0, NULL);
+ }
+
+ if (mfd == -1) {
+ const char *tmpdir = g_get_tmp_dir();
+ gchar *fname;
+
+ fname = g_strdup_printf("%s/memfd-XXXXXX", tmpdir);
+ mfd = mkstemp(fname);
+ unlink(fname);
+ g_free(fname);
+
+ if (mfd == -1 ||
+ ftruncate(mfd, size) == -1) {
+ goto err;
+ }
+ }
+
+ ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0);
+ if (ptr == MAP_FAILED) {
+ goto err;
+ }
+
+ *fd = mfd;
+ return ptr;
+
+err:
+ error_setg_errno(errp, errno, "failed to allocate shared memory");
+ if (mfd >= 0) {
+ close(mfd);
+ }
+ return NULL;
+}
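+
+/*
+ * Illustrative usage (not part of this patch); the name and seals are
+ * made up:
+ *
+ *     int fd = -1;
+ *     void *p = qemu_memfd_alloc("scratch", 4096,
+ *                                F_SEAL_GROW | F_SEAL_SHRINK,
+ *                                &fd, &error_fatal);
+ *     ...
+ *     qemu_memfd_free(p, 4096, fd);
+ */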
+
+void qemu_memfd_free(void *ptr, size_t size, int fd)
+{
+ if (ptr) {
+ munmap(ptr, size);
+ }
+
+ if (fd != -1) {
+ close(fd);
+ }
+}
+
+enum {
+ MEMFD_KO,
+ MEMFD_OK,
+ MEMFD_TODO
+};
+
+/**
+ * qemu_memfd_alloc_check():
+ *
+ * Check if qemu_memfd_alloc() can allocate, including using a
+ * fallback implementation when the host doesn't support memfd.
+ */
+bool qemu_memfd_alloc_check(void)
+{
+ static int memfd_check = MEMFD_TODO;
+
+ if (memfd_check == MEMFD_TODO) {
+ int fd;
+ void *ptr;
+
+ fd = -1;
+ ptr = qemu_memfd_alloc("test", 4096, 0, &fd, NULL);
+ memfd_check = ptr ? MEMFD_OK : MEMFD_KO;
+ qemu_memfd_free(ptr, 4096, fd);
+ }
+
+ return memfd_check == MEMFD_OK;
+}
+
+/**
+ * qemu_memfd_check():
+ *
+ * Check if host supports memfd.
+ */
+bool qemu_memfd_check(unsigned int flags)
+{
+#ifdef CONFIG_LINUX
+ int mfd = memfd_create("test", flags | MFD_CLOEXEC);
+
+ if (mfd >= 0) {
+ close(mfd);
+ return true;
+ }
+#endif
+
+ return false;
+}
diff --git a/util/meson.build b/util/meson.build
new file mode 100644
index 000000000..05b593055
--- /dev/null
+++ b/util/meson.build
@@ -0,0 +1,89 @@
+util_ss.add(files('osdep.c', 'cutils.c', 'unicode.c', 'qemu-timer-common.c'))
+if not config_host_data.get('CONFIG_ATOMIC64')
+ util_ss.add(files('atomic64.c'))
+endif
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('aio-posix.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('fdmon-poll.c'))
+if config_host_data.get('CONFIG_EPOLL_CREATE1')
+ util_ss.add(files('fdmon-epoll.c'))
+endif
+util_ss.add(when: linux_io_uring, if_true: files('fdmon-io_uring.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('compatfd.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('event_notifier-posix.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('mmap-alloc.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('oslib-posix.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: [files('qemu-openpty.c'), util])
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('qemu-thread-posix.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('memfd.c'))
+util_ss.add(when: 'CONFIG_WIN32', if_true: files('aio-win32.c'))
+util_ss.add(when: 'CONFIG_WIN32', if_true: files('event_notifier-win32.c'))
+util_ss.add(when: 'CONFIG_WIN32', if_true: files('oslib-win32.c'))
+util_ss.add(when: 'CONFIG_WIN32', if_true: files('qemu-thread-win32.c'))
+util_ss.add(when: 'CONFIG_WIN32', if_true: winmm)
+util_ss.add(files('envlist.c', 'path.c', 'module.c'))
+util_ss.add(files('host-utils.c'))
+util_ss.add(files('bitmap.c', 'bitops.c'))
+util_ss.add(files('fifo8.c'))
+util_ss.add(files('cacheinfo.c', 'cacheflush.c'))
+util_ss.add(files('error.c', 'qemu-error.c'))
+util_ss.add(files('qemu-print.c'))
+util_ss.add(files('id.c'))
+util_ss.add(files('qemu-config.c', 'notify.c'))
+util_ss.add(files('qemu-option.c', 'qemu-progress.c'))
+util_ss.add(files('keyval.c'))
+util_ss.add(files('crc32c.c'))
+util_ss.add(files('uuid.c'))
+util_ss.add(files('getauxval.c'))
+util_ss.add(files('rcu.c'))
+util_ss.add(when: 'CONFIG_MEMBARRIER', if_true: files('sys_membarrier.c'))
+util_ss.add(files('log.c'))
+util_ss.add(files('pagesize.c'))
+util_ss.add(files('qdist.c'))
+util_ss.add(files('qht.c'))
+util_ss.add(files('qsp.c'))
+util_ss.add(files('range.c'))
+util_ss.add(files('stats64.c'))
+util_ss.add(files('systemd.c'))
+util_ss.add(files('transactions.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('drm.c'))
+util_ss.add(files('guest-random.c'))
+util_ss.add(files('yank.c'))
+
+if have_user
+ util_ss.add(files('selfmap.c'))
+endif
+
+if have_system
+ util_ss.add(files('crc-ccitt.c'))
+ util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio])
+ util_ss.add(when: 'CONFIG_LINUX', if_true: files('userfaultfd.c'))
+endif
+
+if have_block
+ util_ss.add(files('aiocb.c', 'async.c', 'aio-wait.c'))
+ util_ss.add(files('base64.c'))
+ util_ss.add(files('buffer.c'))
+ util_ss.add(files('bufferiszero.c'))
+ util_ss.add(files('coroutine-@0@.c'.format(config_host['CONFIG_COROUTINE_BACKEND'])))
+ util_ss.add(files('hbitmap.c'))
+ util_ss.add(files('hexdump.c'))
+ util_ss.add(files('iova-tree.c'))
+ util_ss.add(files('iov.c', 'qemu-sockets.c', 'uri.c'))
+ util_ss.add(files('lockcnt.c'))
+ util_ss.add(files('main-loop.c'))
+ util_ss.add(files('nvdimm-utils.c'))
+ util_ss.add(files('qemu-coroutine.c', 'qemu-coroutine-lock.c', 'qemu-coroutine-io.c'))
+ util_ss.add(when: 'CONFIG_LINUX', if_true: [
+ files('vhost-user-server.c'), vhost_user
+ ])
+ util_ss.add(files('block-helpers.c'))
+ util_ss.add(files('qemu-coroutine-sleep.c'))
+ util_ss.add(files('qemu-co-shared-resource.c'))
+ util_ss.add(files('thread-pool.c', 'qemu-timer.c'))
+ util_ss.add(files('readline.c'))
+ util_ss.add(files('throttle.c'))
+ util_ss.add(files('timed-average.c'))
+ util_ss.add(when: 'CONFIG_INOTIFY1', if_true: files('filemonitor-inotify.c'),
+ if_false: files('filemonitor-stub.c'))
+ util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c'))
+endif
diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
new file mode 100644
index 000000000..893d86435
--- /dev/null
+++ b/util/mmap-alloc.c
@@ -0,0 +1,306 @@
+/*
+ * Support for RAM backed by mmaped host memory.
+ *
+ * Copyright (c) 2015 Red Hat, Inc.
+ *
+ * Authors:
+ * Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#ifdef CONFIG_LINUX
+#include <linux/mman.h>
+#else /* !CONFIG_LINUX */
+#define MAP_SYNC 0x0
+#define MAP_SHARED_VALIDATE 0x0
+#endif /* CONFIG_LINUX */
+
+#include "qemu/osdep.h"
+#include "qemu/mmap-alloc.h"
+#include "qemu/host-utils.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+
+#define HUGETLBFS_MAGIC 0x958458f6
+
+#ifdef CONFIG_LINUX
+#include <sys/vfs.h>
+#endif
+
+size_t qemu_fd_getpagesize(int fd)
+{
+#ifdef CONFIG_LINUX
+ struct statfs fs;
+ int ret;
+
+ if (fd != -1) {
+ do {
+ ret = fstatfs(fd, &fs);
+ } while (ret != 0 && errno == EINTR);
+
+ if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) {
+ return fs.f_bsize;
+ }
+ }
+#ifdef __sparc__
+ /* SPARC Linux needs greater alignment than the pagesize */
+ return QEMU_VMALLOC_ALIGN;
+#endif
+#endif
+
+ return qemu_real_host_page_size;
+}
+
+size_t qemu_mempath_getpagesize(const char *mem_path)
+{
+#ifdef CONFIG_LINUX
+ struct statfs fs;
+ int ret;
+
+ if (mem_path) {
+ do {
+ ret = statfs(mem_path, &fs);
+ } while (ret != 0 && errno == EINTR);
+
+ if (ret != 0) {
+ fprintf(stderr, "Couldn't statfs() memory path: %s\n",
+ strerror(errno));
+ exit(1);
+ }
+
+ if (fs.f_type == HUGETLBFS_MAGIC) {
+ /* It's a hugetlbfs mount; return the huge page size */
+ return fs.f_bsize;
+ }
+ }
+#ifdef __sparc__
+ /* SPARC Linux needs greater alignment than the pagesize */
+ return QEMU_VMALLOC_ALIGN;
+#endif
+#endif
+
+ return qemu_real_host_page_size;
+}
+
+#define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
+static bool map_noreserve_effective(int fd, uint32_t qemu_map_flags)
+{
+#if defined(__linux__)
+ const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
+ const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
+ gchar *content = NULL;
+ const char *endptr;
+ unsigned int tmp;
+
+ /*
+ * hugetlb accounting is different from ordinary swap reservation:
+ * a) Hugetlb pages from the pool are reserved for both private and
+ * shared mappings. For shared mappings, all mappers have to specify
+ * MAP_NORESERVE.
+ * b) MAP_NORESERVE is not affected by /proc/sys/vm/overcommit_memory.
+ */
+ if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size) {
+ return true;
+ }
+
+ /*
+ * Accountable mappings in the kernel that can be affected by MAP_NORESERVE
+ * are private writable mappings (see mm/mmap.c:accountable_mapping() in
+ * Linux). For all shared or readonly mappings, MAP_NORESERVE is always
+ * implicitly active -- no reservation; this includes shmem. The only
+ * exception is shared anonymous memory, it is accounted like private
+ * anonymous memory.
+ */
+ if (readonly || (shared && fd >= 0)) {
+ return true;
+ }
+
+ /*
+ * MAP_NORESERVE is globally ignored for applicable !hugetlb mappings when
+ * memory overcommit is set to "never". Sparse memory regions aren't really
+ * possible in this system configuration.
+ *
+ * Bail out now instead of silently committing way more memory than
+ * currently desired by the user.
+ */
+ if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
+ !qemu_strtoui(content, &endptr, 0, &tmp) &&
+ (!endptr || *endptr == '\n')) {
+ if (tmp == 2) {
+ error_report("Skipping reservation of swap space is not supported:"
+ " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\"");
+ return false;
+ }
+ return true;
+ }
+ /* this interface has been around since Linux 2.6 */
+ error_report("Skipping reservation of swap space is not supported:"
+ " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\"");
+ return false;
+#endif
+ /*
+ * E.g., FreeBSD used to define MAP_NORESERVE, never implemented it,
+ * and removed it a while ago.
+ */
+ error_report("Skipping reservation of swap space is not supported");
+ return false;
+}
+
+/*
+ * Reserve a new memory region of the requested size to be used for mapping
+ * from the given fd (if any).
+ */
+static void *mmap_reserve(size_t size, int fd)
+{
+ int flags = MAP_PRIVATE;
+
+#if defined(__powerpc64__) && defined(__linux__)
+ /*
+ * On ppc64 mappings in the same segment (aka slice) must share the same
+ * page size. Since we will be re-allocating part of this segment
+ * from the supplied fd, we should make sure to use the same page size;
+ * to this end we mmap the supplied fd. In this case, set MAP_NORESERVE to
+ * avoid allocating backing store memory.
+ * We do this unless we are using the system page size, in which case
+ * anonymous memory is OK.
+ */
+ if (fd == -1 || qemu_fd_getpagesize(fd) == qemu_real_host_page_size) {
+ fd = -1;
+ flags |= MAP_ANONYMOUS;
+ } else {
+ flags |= MAP_NORESERVE;
+ }
+#else
+ fd = -1;
+ flags |= MAP_ANONYMOUS;
+#endif
+
+ return mmap(0, size, PROT_NONE, flags, fd, 0);
+}
+
+/*
+ * Activate memory in a reserved region from the given fd (if any), to make
+ * it accessible.
+ */
+static void *mmap_activate(void *ptr, size_t size, int fd,
+ uint32_t qemu_map_flags, off_t map_offset)
+{
+ const bool noreserve = qemu_map_flags & QEMU_MAP_NORESERVE;
+ const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
+ const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
+ const bool sync = qemu_map_flags & QEMU_MAP_SYNC;
+ const int prot = PROT_READ | (readonly ? 0 : PROT_WRITE);
+ int map_sync_flags = 0;
+ int flags = MAP_FIXED;
+ void *activated_ptr;
+
+ if (noreserve && !map_noreserve_effective(fd, qemu_map_flags)) {
+ return MAP_FAILED;
+ }
+
+ flags |= fd == -1 ? MAP_ANONYMOUS : 0;
+ flags |= shared ? MAP_SHARED : MAP_PRIVATE;
+ flags |= noreserve ? MAP_NORESERVE : 0;
+ if (shared && sync) {
+ map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE;
+ }
+
+ activated_ptr = mmap(ptr, size, prot, flags | map_sync_flags, fd,
+ map_offset);
+ if (activated_ptr == MAP_FAILED && map_sync_flags) {
+ if (errno == ENOTSUP) {
+ char *proc_link = g_strdup_printf("/proc/self/fd/%d", fd);
+ char *file_name = g_malloc0(PATH_MAX);
+ int len = readlink(proc_link, file_name, PATH_MAX - 1);
+
+ if (len < 0) {
+ len = 0;
+ }
+ file_name[len] = '\0';
+ fprintf(stderr, "Warning: requesting persistence across crashes "
+ "for backend file %s failed. Proceeding without "
+ "persistence, data might become corrupted in case of host "
+ "crash.\n", file_name);
+ g_free(proc_link);
+ g_free(file_name);
+ warn_report("Using non DAX backing file with 'pmem=on' option"
+ " is deprecated");
+ }
+ /*
+ * If mmap failed with MAP_SHARED_VALIDATE | MAP_SYNC, we will try
+ * again without these flags to handle backwards compatibility.
+ */
+ activated_ptr = mmap(ptr, size, prot, flags, fd, map_offset);
+ }
+ return activated_ptr;
+}
+
+static inline size_t mmap_guard_pagesize(int fd)
+{
+#if defined(__powerpc64__) && defined(__linux__)
+ /* Mappings in the same segment must share the same page size */
+ return qemu_fd_getpagesize(fd);
+#else
+ return qemu_real_host_page_size;
+#endif
+}
+
+void *qemu_ram_mmap(int fd,
+ size_t size,
+ size_t align,
+ uint32_t qemu_map_flags,
+ off_t map_offset)
+{
+ const size_t guard_pagesize = mmap_guard_pagesize(fd);
+ size_t offset, total;
+ void *ptr, *guardptr;
+
+ /*
+ * Note: this always allocates at least one extra page of virtual address
+ * space, even if size is already aligned.
+ */
+ total = size + align;
+
+ guardptr = mmap_reserve(total, fd);
+ if (guardptr == MAP_FAILED) {
+ return MAP_FAILED;
+ }
+
+ assert(is_power_of_2(align));
+ /* Always align to host page size */
+ assert(align >= guard_pagesize);
+
+ offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
+
+ ptr = mmap_activate(guardptr + offset, size, fd, qemu_map_flags,
+ map_offset);
+ if (ptr == MAP_FAILED) {
+ munmap(guardptr, total);
+ return MAP_FAILED;
+ }
+
+ if (offset > 0) {
+ munmap(guardptr, offset);
+ }
+
+ /*
+ * Leave a single PROT_NONE page allocated after the RAM block, to serve
+ * as a guard page against potential buffer overflows.
+ */
+ total -= offset;
+ if (total > size + guard_pagesize) {
+ munmap(ptr + size + guard_pagesize, total - size - guard_pagesize);
+ }
+
+ return ptr;
+}
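+
+/*
+ * Usage sketch (illustrative, not part of this patch): reserve and map
+ * anonymous RAM, then tear it down again. The fd of -1 selects anonymous
+ * memory; the size and flags below are assumptions for the example.
+ *
+ *     size_t align = qemu_real_host_page_size;
+ *     void *p = qemu_ram_mmap(-1, 16 * 1024 * 1024, align, 0, 0);
+ *
+ *     if (p != MAP_FAILED) {
+ *         ...use the RAM block...
+ *         qemu_ram_munmap(-1, p, 16 * 1024 * 1024);
+ *     }
+ */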
+
+void qemu_ram_munmap(int fd, void *ptr, size_t size)
+{
+ if (ptr) {
+ /* Unmap both the RAM block and the guard page */
+ munmap(ptr, size + mmap_guard_pagesize(fd));
+ }
+}
diff --git a/util/module.c b/util/module.c
new file mode 100644
index 000000000..6bb4ad915
--- /dev/null
+++ b/util/module.c
@@ -0,0 +1,387 @@
+/*
+ * QEMU Module Infrastructure
+ *
+ * Copyright IBM, Corp. 2009
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#ifdef CONFIG_MODULES
+#include <gmodule.h>
+#endif
+#include "qemu/queue.h"
+#include "qemu/module.h"
+#include "qemu/cutils.h"
+#include "qemu/config-file.h"
+#ifdef CONFIG_MODULE_UPGRADES
+#include "qemu-version.h"
+#endif
+#include "trace.h"
+
+typedef struct ModuleEntry
+{
+ void (*init)(void);
+ QTAILQ_ENTRY(ModuleEntry) node;
+ module_init_type type;
+} ModuleEntry;
+
+typedef QTAILQ_HEAD(, ModuleEntry) ModuleTypeList;
+
+static ModuleTypeList init_type_list[MODULE_INIT_MAX];
+static bool modules_init_done[MODULE_INIT_MAX];
+
+static ModuleTypeList dso_init_list;
+
+static void init_lists(void)
+{
+ static int inited;
+ int i;
+
+ if (inited) {
+ return;
+ }
+
+ for (i = 0; i < MODULE_INIT_MAX; i++) {
+ QTAILQ_INIT(&init_type_list[i]);
+ }
+
+ QTAILQ_INIT(&dso_init_list);
+
+ inited = 1;
+}
+
+
+static ModuleTypeList *find_type(module_init_type type)
+{
+ init_lists();
+
+ return &init_type_list[type];
+}
+
+void register_module_init(void (*fn)(void), module_init_type type)
+{
+ ModuleEntry *e;
+ ModuleTypeList *l;
+
+ e = g_malloc0(sizeof(*e));
+ e->init = fn;
+ e->type = type;
+
+ l = find_type(type);
+
+ QTAILQ_INSERT_TAIL(l, e, node);
+}
+
+void register_dso_module_init(void (*fn)(void), module_init_type type)
+{
+ ModuleEntry *e;
+
+ init_lists();
+
+ e = g_malloc0(sizeof(*e));
+ e->init = fn;
+ e->type = type;
+
+ QTAILQ_INSERT_TAIL(&dso_init_list, e, node);
+}
+
+void module_call_init(module_init_type type)
+{
+ ModuleTypeList *l;
+ ModuleEntry *e;
+
+ if (modules_init_done[type]) {
+ return;
+ }
+
+ l = find_type(type);
+
+ QTAILQ_FOREACH(e, l, node) {
+ e->init();
+ }
+
+ modules_init_done[type] = true;
+}
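+
+/*
+ * Usage sketch (illustrative, not part of this patch): a subsystem
+ * registers an init hook at startup, typically via the module_init()
+ * macro from qemu/module.h, and the hooks are later drained once per
+ * type. The function name below is a placeholder.
+ *
+ *     static void my_types_register(void) { ... }
+ *     register_module_init(my_types_register, MODULE_INIT_QOM);
+ *     ...
+ *     module_call_init(MODULE_INIT_QOM);  // runs every hook exactly once
+ */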
+
+#ifdef CONFIG_MODULES
+
+static const QemuModinfo module_info_stub[] = { {
+ /* end of list */
+} };
+static const QemuModinfo *module_info = module_info_stub;
+static const char *module_arch;
+
+void module_init_info(const QemuModinfo *info)
+{
+ module_info = info;
+}
+
+void module_allow_arch(const char *arch)
+{
+ module_arch = arch;
+}
+
+static bool module_check_arch(const QemuModinfo *modinfo)
+{
+ if (modinfo->arch) {
+ if (!module_arch) {
+ /* no arch set -> ignore all */
+ return false;
+ }
+ if (strcmp(module_arch, modinfo->arch) != 0) {
+ /* mismatch */
+ return false;
+ }
+ }
+ return true;
+}
+
+static int module_load_file(const char *fname, bool mayfail, bool export_symbols)
+{
+ GModule *g_module;
+ void (*sym)(void);
+ const char *dsosuf = CONFIG_HOST_DSOSUF;
+ int len = strlen(fname);
+ int suf_len = strlen(dsosuf);
+ ModuleEntry *e, *next;
+ int ret, flags;
+
+ if (len <= suf_len || strcmp(&fname[len - suf_len], dsosuf)) {
+ /* wrong suffix */
+ ret = -EINVAL;
+ goto out;
+ }
+ if (access(fname, F_OK)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ assert(QTAILQ_EMPTY(&dso_init_list));
+
+ flags = 0;
+ if (!export_symbols) {
+ flags |= G_MODULE_BIND_LOCAL;
+ }
+ g_module = g_module_open(fname, flags);
+ if (!g_module) {
+ if (!mayfail) {
+ fprintf(stderr, "Failed to open module: %s\n",
+ g_module_error());
+ }
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!g_module_symbol(g_module, DSO_STAMP_FUN_STR, (gpointer *)&sym)) {
+ fprintf(stderr, "Failed to initialize module: %s\n",
+ fname);
+ /* Print some info if this is a QEMU module (but from a different
+ * build), to make debugging user problems easier. */
+ if (g_module_symbol(g_module, "qemu_module_dummy", (gpointer *)&sym)) {
+ fprintf(stderr,
+ "Note: only modules from the same build can be loaded.\n");
+ }
+ g_module_close(g_module);
+ ret = -EINVAL;
+ } else {
+ QTAILQ_FOREACH(e, &dso_init_list, node) {
+ e->init();
+ register_module_init(e->init, e->type);
+ }
+ ret = 0;
+ }
+
+ trace_module_load_module(fname);
+ QTAILQ_FOREACH_SAFE(e, &dso_init_list, node, next) {
+ QTAILQ_REMOVE(&dso_init_list, e, node);
+ g_free(e);
+ }
+out:
+ return ret;
+}
+#endif
+
+bool module_load_one(const char *prefix, const char *lib_name, bool mayfail)
+{
+ bool success = false;
+
+#ifdef CONFIG_MODULES
+ char *fname = NULL;
+#ifdef CONFIG_MODULE_UPGRADES
+ char *version_dir;
+#endif
+ const char *search_dir;
+ char *dirs[5];
+ char *module_name;
+ int i = 0, n_dirs = 0;
+ int ret;
+ bool export_symbols = false;
+ static GHashTable *loaded_modules;
+ const QemuModinfo *modinfo;
+ const char **sl;
+
+ if (!g_module_supported()) {
+ fprintf(stderr, "Module is not supported by system.\n");
+ return false;
+ }
+
+ if (!loaded_modules) {
+ loaded_modules = g_hash_table_new(g_str_hash, g_str_equal);
+ }
+
+ module_name = g_strdup_printf("%s%s", prefix, lib_name);
+
+ if (g_hash_table_contains(loaded_modules, module_name)) {
+ g_free(module_name);
+ return true;
+ }
+ g_hash_table_add(loaded_modules, module_name);
+
+ for (modinfo = module_info; modinfo->name != NULL; modinfo++) {
+ if (modinfo->arch) {
+ if (strcmp(modinfo->name, module_name) == 0) {
+ if (!module_check_arch(modinfo)) {
+ return false;
+ }
+ }
+ }
+ if (modinfo->deps) {
+ if (strcmp(modinfo->name, module_name) == 0) {
+ /* we depend on other module(s) */
+ for (sl = modinfo->deps; *sl != NULL; sl++) {
+ module_load_one("", *sl, false);
+ }
+ } else {
+ for (sl = modinfo->deps; *sl != NULL; sl++) {
+ if (strcmp(module_name, *sl) == 0) {
+ /* another module depends on us */
+ export_symbols = true;
+ }
+ }
+ }
+ }
+ }
+
+ search_dir = getenv("QEMU_MODULE_DIR");
+ if (search_dir != NULL) {
+ dirs[n_dirs++] = g_strdup_printf("%s", search_dir);
+ }
+ dirs[n_dirs++] = get_relocated_path(CONFIG_QEMU_MODDIR);
+ dirs[n_dirs++] = g_strdup(qemu_get_exec_dir());
+
+#ifdef CONFIG_MODULE_UPGRADES
+ version_dir = g_strcanon(g_strdup(QEMU_PKGVERSION),
+ G_CSET_A_2_Z G_CSET_a_2_z G_CSET_DIGITS "+-.~",
+ '_');
+ dirs[n_dirs++] = g_strdup_printf("/var/run/qemu/%s", version_dir);
+#endif
+
+ assert(n_dirs <= ARRAY_SIZE(dirs));
+
+ for (i = 0; i < n_dirs; i++) {
+ fname = g_strdup_printf("%s/%s%s",
+ dirs[i], module_name, CONFIG_HOST_DSOSUF);
+ ret = module_load_file(fname, mayfail, export_symbols);
+ g_free(fname);
+ fname = NULL;
+ /* Stop searching once a module file has been loaded */
+ if (!ret) {
+ success = true;
+ break;
+ }
+ }
+
+ if (!success) {
+ g_hash_table_remove(loaded_modules, module_name);
+ g_free(module_name);
+ }
+
+ for (i = 0; i < n_dirs; i++) {
+ g_free(dirs[i]);
+ }
+
+#endif
+ return success;
+}
+
+#ifdef CONFIG_MODULES
+
+static bool module_loaded_qom_all;
+
+void module_load_qom_one(const char *type)
+{
+ const QemuModinfo *modinfo;
+ const char **sl;
+
+ if (!type) {
+ return;
+ }
+
+ trace_module_lookup_object_type(type);
+ for (modinfo = module_info; modinfo->name != NULL; modinfo++) {
+ if (!modinfo->objs) {
+ continue;
+ }
+ if (!module_check_arch(modinfo)) {
+ continue;
+ }
+ for (sl = modinfo->objs; *sl != NULL; sl++) {
+ if (strcmp(type, *sl) == 0) {
+ module_load_one("", modinfo->name, false);
+ }
+ }
+ }
+}
+
+void module_load_qom_all(void)
+{
+ const QemuModinfo *modinfo;
+
+ if (module_loaded_qom_all) {
+ return;
+ }
+
+ for (modinfo = module_info; modinfo->name != NULL; modinfo++) {
+ if (!modinfo->objs) {
+ continue;
+ }
+ if (!module_check_arch(modinfo)) {
+ continue;
+ }
+ module_load_one("", modinfo->name, false);
+ }
+ module_loaded_qom_all = true;
+}
+
+void qemu_load_module_for_opts(const char *group)
+{
+ const QemuModinfo *modinfo;
+ const char **sl;
+
+ for (modinfo = module_info; modinfo->name != NULL; modinfo++) {
+ if (!modinfo->opts) {
+ continue;
+ }
+ for (sl = modinfo->opts; *sl != NULL; sl++) {
+ if (strcmp(group, *sl) == 0) {
+ module_load_one("", modinfo->name, false);
+ }
+ }
+ }
+}
+
+#else
+
+void module_allow_arch(const char *arch) {}
+void qemu_load_module_for_opts(const char *group) {}
+void module_load_qom_one(const char *type) {}
+void module_load_qom_all(void) {}
+
+#endif
diff --git a/util/notify.c b/util/notify.c
new file mode 100644
index 000000000..76bab212a
--- /dev/null
+++ b/util/notify.c
@@ -0,0 +1,76 @@
+/*
+ * Notifier lists
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/notify.h"
+
+void notifier_list_init(NotifierList *list)
+{
+ QLIST_INIT(&list->notifiers);
+}
+
+void notifier_list_add(NotifierList *list, Notifier *notifier)
+{
+ QLIST_INSERT_HEAD(&list->notifiers, notifier, node);
+}
+
+void notifier_remove(Notifier *notifier)
+{
+ QLIST_REMOVE(notifier, node);
+}
+
+void notifier_list_notify(NotifierList *list, void *data)
+{
+ Notifier *notifier, *next;
+
+ QLIST_FOREACH_SAFE(notifier, &list->notifiers, node, next) {
+ notifier->notify(notifier, data);
+ }
+}
+
+bool notifier_list_empty(NotifierList *list)
+{
+ return QLIST_EMPTY(&list->notifiers);
+}
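+
+/*
+ * Usage sketch (illustrative, not part of this patch): embed a Notifier
+ * in caller state, register it, and it fires on every notify until it is
+ * removed. The callback name and data are placeholders.
+ *
+ *     static void on_event(Notifier *n, void *data) { ... }
+ *
+ *     Notifier n = { .notify = on_event };
+ *     notifier_list_add(&list, &n);
+ *     notifier_list_notify(&list, event_data);  // invokes on_event()
+ *     notifier_remove(&n);
+ */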
+
+void notifier_with_return_list_init(NotifierWithReturnList *list)
+{
+ QLIST_INIT(&list->notifiers);
+}
+
+void notifier_with_return_list_add(NotifierWithReturnList *list,
+ NotifierWithReturn *notifier)
+{
+ QLIST_INSERT_HEAD(&list->notifiers, notifier, node);
+}
+
+void notifier_with_return_remove(NotifierWithReturn *notifier)
+{
+ QLIST_REMOVE(notifier, node);
+}
+
+int notifier_with_return_list_notify(NotifierWithReturnList *list, void *data)
+{
+ NotifierWithReturn *notifier, *next;
+ int ret = 0;
+
+ QLIST_FOREACH_SAFE(notifier, &list->notifiers, node, next) {
+ ret = notifier->notify(notifier, data);
+ if (ret != 0) {
+ break;
+ }
+ }
+ return ret;
+}
diff --git a/util/nvdimm-utils.c b/util/nvdimm-utils.c
new file mode 100644
index 000000000..aa3d199f2
--- /dev/null
+++ b/util/nvdimm-utils.c
@@ -0,0 +1,30 @@
+#include "qemu/osdep.h"
+#include "qemu/nvdimm-utils.h"
+#include "hw/mem/nvdimm.h"
+
+static int nvdimm_device_list(Object *obj, void *opaque)
+{
+ GSList **list = opaque;
+
+ if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
+ *list = g_slist_append(*list, DEVICE(obj));
+ }
+
+ object_child_foreach(obj, nvdimm_device_list, opaque);
+ return 0;
+}
+
+/*
+ * Inquire about NVDIMM devices and link them into the list, which is
+ * returned to the caller.
+ *
+ * Note: it is the caller's responsibility to free the list to avoid
+ * a memory leak.
+ */
+GSList *nvdimm_get_device_list(void)
+{
+ GSList *list = NULL;
+
+ object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
+ return list;
+}
diff --git a/util/osdep.c b/util/osdep.c
new file mode 100644
index 000000000..42a0a4986
--- /dev/null
+++ b/util/osdep.c
@@ -0,0 +1,617 @@
+/*
+ * QEMU low level functions
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+
+/* Needed early for CONFIG_BSD etc. */
+
+#ifdef CONFIG_SOLARIS
+#include <sys/statvfs.h>
+/* See MySQL bug #7156 (http://bugs.mysql.com/bug.php?id=7156) for
+ discussion about Solaris header problems */
+extern int madvise(char *, size_t, int);
+#endif
+
+#include "qemu-common.h"
+#include "qemu/cutils.h"
+#include "qemu/sockets.h"
+#include "qemu/error-report.h"
+#include "monitor/monitor.h"
+
+static bool fips_enabled = false;
+
+static const char *hw_version = QEMU_HW_VERSION;
+
+int socket_set_cork(int fd, int v)
+{
+#if defined(SOL_TCP) && defined(TCP_CORK)
+ return qemu_setsockopt(fd, SOL_TCP, TCP_CORK, &v, sizeof(v));
+#else
+ return 0;
+#endif
+}
+
+int socket_set_nodelay(int fd)
+{
+ int v = 1;
+ return qemu_setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &v, sizeof(v));
+}
+
+int qemu_madvise(void *addr, size_t len, int advice)
+{
+ if (advice == QEMU_MADV_INVALID) {
+ errno = EINVAL;
+ return -1;
+ }
+#if defined(CONFIG_MADVISE)
+ return madvise(addr, len, advice);
+#elif defined(CONFIG_POSIX_MADVISE)
+ return posix_madvise(addr, len, advice);
+#else
+ errno = EINVAL;
+ return -1;
+#endif
+}
+
+static int qemu_mprotect__osdep(void *addr, size_t size, int prot)
+{
+ g_assert(!((uintptr_t)addr & ~qemu_real_host_page_mask));
+ g_assert(!(size & ~qemu_real_host_page_mask));
+
+#ifdef _WIN32
+ DWORD old_protect;
+
+ if (!VirtualProtect(addr, size, prot, &old_protect)) {
+ g_autofree gchar *emsg = g_win32_error_message(GetLastError());
+ error_report("%s: VirtualProtect failed: %s", __func__, emsg);
+ return -1;
+ }
+ return 0;
+#else
+ if (mprotect(addr, size, prot)) {
+ error_report("%s: mprotect failed: %s", __func__, strerror(errno));
+ return -1;
+ }
+ return 0;
+#endif
+}
+
+int qemu_mprotect_rw(void *addr, size_t size)
+{
+#ifdef _WIN32
+ return qemu_mprotect__osdep(addr, size, PAGE_READWRITE);
+#else
+ return qemu_mprotect__osdep(addr, size, PROT_READ | PROT_WRITE);
+#endif
+}
+
+int qemu_mprotect_rwx(void *addr, size_t size)
+{
+#ifdef _WIN32
+ return qemu_mprotect__osdep(addr, size, PAGE_EXECUTE_READWRITE);
+#else
+ return qemu_mprotect__osdep(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC);
+#endif
+}
+
+int qemu_mprotect_none(void *addr, size_t size)
+{
+#ifdef _WIN32
+ return qemu_mprotect__osdep(addr, size, PAGE_NOACCESS);
+#else
+ return qemu_mprotect__osdep(addr, size, PROT_NONE);
+#endif
+}
+
+#ifndef _WIN32
+
+static int fcntl_op_setlk = -1;
+static int fcntl_op_getlk = -1;
+
+/*
+ * Dups an fd and sets the flags
+ */
+int qemu_dup_flags(int fd, int flags)
+{
+ int ret;
+ int serrno;
+ int dup_flags;
+
+ ret = qemu_dup(fd);
+ if (ret == -1) {
+ goto fail;
+ }
+
+ dup_flags = fcntl(ret, F_GETFL);
+ if (dup_flags == -1) {
+ goto fail;
+ }
+
+ if ((flags & O_SYNC) != (dup_flags & O_SYNC)) {
+ errno = EINVAL;
+ goto fail;
+ }
+
+ /* Set/unset flags that we can with fcntl */
+ if (fcntl(ret, F_SETFL, flags) == -1) {
+ goto fail;
+ }
+
+ /* Truncate the file in the cases that open() would truncate it */
+ if (flags & O_TRUNC ||
+ ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))) {
+ if (ftruncate(ret, 0) == -1) {
+ goto fail;
+ }
+ }
+
+ return ret;
+
+fail:
+ serrno = errno;
+ if (ret != -1) {
+ close(ret);
+ }
+ errno = serrno;
+ return -1;
+}
+
+int qemu_dup(int fd)
+{
+ int ret;
+#ifdef F_DUPFD_CLOEXEC
+ ret = fcntl(fd, F_DUPFD_CLOEXEC, 0);
+#else
+ ret = dup(fd);
+ if (ret != -1) {
+ qemu_set_cloexec(ret);
+ }
+#endif
+ return ret;
+}
+
+static int qemu_parse_fdset(const char *param)
+{
+ return qemu_parse_fd(param);
+}
+
+static void qemu_probe_lock_ops(void)
+{
+ if (fcntl_op_setlk == -1) {
+#ifdef F_OFD_SETLK
+ int fd;
+ int ret;
+ struct flock fl = {
+ .l_whence = SEEK_SET,
+ .l_start = 0,
+ .l_len = 0,
+ .l_type = F_WRLCK,
+ };
+
+ fd = open("/dev/null", O_RDWR);
+ if (fd < 0) {
+ fprintf(stderr,
+ "Failed to open /dev/null for OFD lock probing: %s\n",
+ strerror(errno));
+ fcntl_op_setlk = F_SETLK;
+ fcntl_op_getlk = F_GETLK;
+ return;
+ }
+ ret = fcntl(fd, F_OFD_GETLK, &fl);
+ close(fd);
+ if (!ret) {
+ fcntl_op_setlk = F_OFD_SETLK;
+ fcntl_op_getlk = F_OFD_GETLK;
+ } else {
+ fcntl_op_setlk = F_SETLK;
+ fcntl_op_getlk = F_GETLK;
+ }
+#else
+ fcntl_op_setlk = F_SETLK;
+ fcntl_op_getlk = F_GETLK;
+#endif
+ }
+}
+
+bool qemu_has_ofd_lock(void)
+{
+ qemu_probe_lock_ops();
+#ifdef F_OFD_SETLK
+ return fcntl_op_setlk == F_OFD_SETLK;
+#else
+ return false;
+#endif
+}
+
+static int qemu_lock_fcntl(int fd, int64_t start, int64_t len, int fl_type)
+{
+ int ret;
+ struct flock fl = {
+ .l_whence = SEEK_SET,
+ .l_start = start,
+ .l_len = len,
+ .l_type = fl_type,
+ };
+ qemu_probe_lock_ops();
+ do {
+ ret = fcntl(fd, fcntl_op_setlk, &fl);
+ } while (ret == -1 && errno == EINTR);
+ return ret == -1 ? -errno : 0;
+}
+
+int qemu_lock_fd(int fd, int64_t start, int64_t len, bool exclusive)
+{
+ return qemu_lock_fcntl(fd, start, len, exclusive ? F_WRLCK : F_RDLCK);
+}
+
+int qemu_unlock_fd(int fd, int64_t start, int64_t len)
+{
+ return qemu_lock_fcntl(fd, start, len, F_UNLCK);
+}
+
+int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive)
+{
+ int ret;
+ struct flock fl = {
+ .l_whence = SEEK_SET,
+ .l_start = start,
+ .l_len = len,
+ .l_type = exclusive ? F_WRLCK : F_RDLCK,
+ };
+ qemu_probe_lock_ops();
+ ret = fcntl(fd, fcntl_op_getlk, &fl);
+ if (ret == -1) {
+ return -errno;
+ } else {
+ return fl.l_type == F_UNLCK ? 0 : -EAGAIN;
+ }
+}
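+
+/*
+ * Usage sketch (illustrative, not part of this patch): take and release a
+ * byte-range lock with the helpers above. The range 0/1 (first byte) is
+ * an assumption for the example; negative returns are -errno values.
+ *
+ *     if (qemu_lock_fd(fd, 0, 1, true) < 0) {
+ *         ...another process holds the lock...
+ *     }
+ *     ...
+ *     qemu_unlock_fd(fd, 0, 1);
+ */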
+#endif
+
+static int qemu_open_cloexec(const char *name, int flags, mode_t mode)
+{
+ int ret;
+#ifdef O_CLOEXEC
+ ret = open(name, flags | O_CLOEXEC, mode);
+#else
+ ret = open(name, flags, mode);
+ if (ret >= 0) {
+ qemu_set_cloexec(ret);
+ }
+#endif
+ return ret;
+}
+
+/*
+ * Opens a file with FD_CLOEXEC set
+ */
+static int
+qemu_open_internal(const char *name, int flags, mode_t mode, Error **errp)
+{
+ int ret;
+
+#ifndef _WIN32
+ const char *fdset_id_str;
+
+ /* Attempt dup of fd from fd set */
+ if (strstart(name, "/dev/fdset/", &fdset_id_str)) {
+ int64_t fdset_id;
+ int dupfd;
+
+ fdset_id = qemu_parse_fdset(fdset_id_str);
+ if (fdset_id == -1) {
+ error_setg(errp, "Could not parse fdset %s", name);
+ errno = EINVAL;
+ return -1;
+ }
+
+ dupfd = monitor_fdset_dup_fd_add(fdset_id, flags);
+ if (dupfd == -1) {
+ error_setg_errno(errp, errno, "Could not dup FD for %s flags %x",
+ name, flags);
+ return -1;
+ }
+
+ return dupfd;
+ }
+#endif
+
+ ret = qemu_open_cloexec(name, flags, mode);
+
+ if (ret == -1) {
+ const char *action = flags & O_CREAT ? "create" : "open";
+#ifdef O_DIRECT
+ /* Give more helpful error message for O_DIRECT */
+ if (errno == EINVAL && (flags & O_DIRECT)) {
+ ret = open(name, flags & ~O_DIRECT, mode);
+ if (ret != -1) {
+ close(ret);
+ error_setg(errp, "Could not %s '%s': "
+ "filesystem does not support O_DIRECT",
+ action, name);
+ errno = EINVAL; /* restore first open()'s errno */
+ return -1;
+ }
+ }
+#endif /* O_DIRECT */
+ error_setg_errno(errp, errno, "Could not %s '%s'",
+ action, name);
+ }
+
+ return ret;
+}
+
+
+int qemu_open(const char *name, int flags, Error **errp)
+{
+ assert(!(flags & O_CREAT));
+
+ return qemu_open_internal(name, flags, 0, errp);
+}
+
+
+int qemu_create(const char *name, int flags, mode_t mode, Error **errp)
+{
+ assert(!(flags & O_CREAT));
+
+ return qemu_open_internal(name, flags | O_CREAT, mode, errp);
+}
+
+
+int qemu_open_old(const char *name, int flags, ...)
+{
+ va_list ap;
+ mode_t mode = 0;
+ int ret;
+
+ va_start(ap, flags);
+ if (flags & O_CREAT) {
+ mode = va_arg(ap, int);
+ }
+ va_end(ap);
+
+ ret = qemu_open_internal(name, flags, mode, NULL);
+
+#ifdef O_DIRECT
+ if (ret == -1 && errno == EINVAL && (flags & O_DIRECT)) {
+ error_report("file system may not support O_DIRECT");
+ errno = EINVAL; /* in case it was clobbered */
+ }
+#endif /* O_DIRECT */
+
+ return ret;
+}
+
+int qemu_close(int fd)
+{
+ int64_t fdset_id;
+
+ /* Close fd that was dup'd from an fdset */
+ fdset_id = monitor_fdset_dup_fd_find(fd);
+ if (fdset_id != -1) {
+ int ret;
+
+ ret = close(fd);
+ if (ret == 0) {
+ monitor_fdset_dup_fd_remove(fd);
+ }
+
+ return ret;
+ }
+
+ return close(fd);
+}
+
+/*
+ * Delete a file from the filesystem, unless the filename is /dev/fdset/...
+ *
+ * Returns: On success, zero is returned. On error, -1 is returned,
+ * and errno is set appropriately.
+ */
+int qemu_unlink(const char *name)
+{
+ if (g_str_has_prefix(name, "/dev/fdset/")) {
+ return 0;
+ }
+
+ return unlink(name);
+}
+
+/*
+ * A variant of write(2) which handles partial writes.
+ *
+ * Returns the number of bytes transferred.
+ * Sets errno if fewer than `count' bytes are written.
+ *
+ * This function does not work with non-blocking fds.
+ * Every way of handling a non-blocking fd here would be bad:
+ * - returning a short write (then the function name would be wrong)
+ * - busy-waiting by adding (errno == EAGAIN) to the retry loop
+ */
+ssize_t qemu_write_full(int fd, const void *buf, size_t count)
+{
+ ssize_t ret = 0;
+ ssize_t total = 0;
+
+ while (count) {
+ ret = write(fd, buf, count);
+ if (ret < 0) {
+ if (errno == EINTR) {
+ continue;
+ }
+ break;
+ }
+
+ count -= ret;
+ buf += ret;
+ total += ret;
+ }
+
+ return total;
+}
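+
+/*
+ * Example (illustrative): callers detect a short write by comparing the
+ * return value against the requested count; errno is only meaningful in
+ * that case.
+ *
+ *     ssize_t done = qemu_write_full(fd, buf, len);
+ *     if (done < len) {
+ *         error_report("write failed: %s", strerror(errno));
+ *     }
+ */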
+
+/*
+ * Opens a socket with FD_CLOEXEC set
+ */
+int qemu_socket(int domain, int type, int protocol)
+{
+ int ret;
+
+#ifdef SOCK_CLOEXEC
+ ret = socket(domain, type | SOCK_CLOEXEC, protocol);
+ if (ret != -1 || errno != EINVAL) {
+ return ret;
+ }
+#endif
+ ret = socket(domain, type, protocol);
+ if (ret >= 0) {
+ qemu_set_cloexec(ret);
+ }
+
+ return ret;
+}
+
+/*
+ * Accept a connection and set FD_CLOEXEC
+ */
+int qemu_accept(int s, struct sockaddr *addr, socklen_t *addrlen)
+{
+ int ret;
+
+#ifdef CONFIG_ACCEPT4
+ ret = accept4(s, addr, addrlen, SOCK_CLOEXEC);
+ if (ret != -1 || errno != ENOSYS) {
+ return ret;
+ }
+#endif
+ ret = accept(s, addr, addrlen);
+ if (ret >= 0) {
+ qemu_set_cloexec(ret);
+ }
+
+ return ret;
+}
+
+void qemu_set_hw_version(const char *version)
+{
+ hw_version = version;
+}
+
+const char *qemu_hw_version(void)
+{
+ return hw_version;
+}
+
+void fips_set_state(bool requested)
+{
+#ifdef __linux__
+ if (requested) {
+ FILE *fds = fopen("/proc/sys/crypto/fips_enabled", "r");
+ if (fds != NULL) {
+ fips_enabled = (fgetc(fds) == '1');
+ fclose(fds);
+ }
+ }
+#else
+ fips_enabled = false;
+#endif /* __linux__ */
+
+#ifdef _FIPS_DEBUG
+ fprintf(stderr, "FIPS mode %s (requested %s)\n",
+ (fips_enabled ? "enabled" : "disabled"),
+ (requested ? "enabled" : "disabled"));
+#endif
+}
+
+bool fips_get_state(void)
+{
+ return fips_enabled;
+}
+
+#ifdef _WIN32
+static void socket_cleanup(void)
+{
+ WSACleanup();
+}
+#endif
+
+int socket_init(void)
+{
+#ifdef _WIN32
+ WSADATA Data;
+ int ret, err;
+
+ ret = WSAStartup(MAKEWORD(2, 2), &Data);
+ if (ret != 0) {
+ err = WSAGetLastError();
+ fprintf(stderr, "WSAStartup: %d\n", err);
+ return -1;
+ }
+ atexit(socket_cleanup);
+#endif
+ return 0;
+}
+
+
+#ifndef CONFIG_IOVEC
+/* helper function for iov_send_recv() */
+static ssize_t
+readv_writev(int fd, const struct iovec *iov, int iov_cnt, bool do_write)
+{
+ unsigned i = 0;
+ ssize_t ret = 0;
+ while (i < iov_cnt) {
+ ssize_t r = do_write
+ ? write(fd, iov[i].iov_base, iov[i].iov_len)
+ : read(fd, iov[i].iov_base, iov[i].iov_len);
+ if (r > 0) {
+ ret += r;
+ } else if (!r) {
+ break;
+ } else if (errno == EINTR) {
+ continue;
+ } else {
+ /* Some other error: report failure only if no data has been
+ * processed yet; otherwise return the partial byte count. */
+ if (ret == 0) {
+ ret = -1;
+ }
+ break;
+ }
+ i++;
+ }
+ return ret;
+}
+
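+/*
+ * Note: unlike native readv()/writev(), these fallbacks issue one
+ * read()/write() call per vector element, so the transfer is not atomic
+ * with respect to other users of the same fd.
+ */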
+ssize_t
+readv(int fd, const struct iovec *iov, int iov_cnt)
+{
+ return readv_writev(fd, iov, iov_cnt, false);
+}
+
+ssize_t
+writev(int fd, const struct iovec *iov, int iov_cnt)
+{
+ return readv_writev(fd, iov, iov_cnt, true);
+}
+#endif
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
new file mode 100644
index 000000000..e8bdb02e1
--- /dev/null
+++ b/util/oslib-posix.c
@@ -0,0 +1,860 @@
+/*
+ * os-posix-lib.c
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2010 Red Hat, Inc.
+ *
+ * QEMU library functions on POSIX which are shared between QEMU and
+ * the QEMU tools.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include <termios.h>
+
+#include <glib/gprintf.h>
+
+#include "qemu-common.h"
+#include "sysemu/sysemu.h"
+#include "trace.h"
+#include "qapi/error.h"
+#include "qemu/sockets.h"
+#include "qemu/thread.h"
+#include <libgen.h>
+#include "qemu/cutils.h"
+#include "qemu/compiler.h"
+
+#ifdef CONFIG_LINUX
+#include <sys/syscall.h>
+#endif
+
+#ifdef __FreeBSD__
+#include <sys/sysctl.h>
+#include <sys/user.h>
+#include <sys/thr.h>
+#include <libutil.h>
+#endif
+
+#ifdef __NetBSD__
+#include <sys/sysctl.h>
+#include <lwp.h>
+#endif
+
+#ifdef __APPLE__
+#include <mach-o/dyld.h>
+#endif
+
+#ifdef __HAIKU__
+#include <kernel/image.h>
+#endif
+
+#include "qemu/mmap-alloc.h"
+
+#ifdef CONFIG_DEBUG_STACK_USAGE
+#include "qemu/error-report.h"
+#endif
+
+#define MAX_MEM_PREALLOC_THREAD_COUNT 16
+
+struct MemsetThread {
+ char *addr;
+ size_t numpages;
+ size_t hpagesize;
+ QemuThread pgthread;
+ sigjmp_buf env;
+};
+typedef struct MemsetThread MemsetThread;
+
+static MemsetThread *memset_thread;
+static int memset_num_threads;
+static bool memset_thread_failed;
+
+static QemuMutex page_mutex;
+static QemuCond page_cond;
+static bool threads_created_flag;
+
+int qemu_get_thread_id(void)
+{
+#if defined(__linux__)
+ return syscall(SYS_gettid);
+#elif defined(__FreeBSD__)
+ /* thread id is up to INT_MAX */
+ long tid;
+ thr_self(&tid);
+ return (int)tid;
+#elif defined(__NetBSD__)
+ return _lwp_self();
+#elif defined(__OpenBSD__)
+ return getthrid();
+#else
+ return getpid();
+#endif
+}
+
+int qemu_daemon(int nochdir, int noclose)
+{
+ return daemon(nochdir, noclose);
+}
+
+bool qemu_write_pidfile(const char *path, Error **errp)
+{
+ int fd;
+ char pidstr[32];
+
+ while (1) {
+ struct stat a, b;
+ struct flock lock = {
+ .l_type = F_WRLCK,
+ .l_whence = SEEK_SET,
+ .l_len = 0,
+ };
+
+ fd = qemu_open_old(path, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR);
+ if (fd == -1) {
+ error_setg_errno(errp, errno, "Cannot open pid file");
+ return false;
+ }
+
+ if (fstat(fd, &b) < 0) {
+ error_setg_errno(errp, errno, "Cannot stat file");
+ goto fail_close;
+ }
+
+ if (fcntl(fd, F_SETLK, &lock)) {
+ error_setg_errno(errp, errno, "Cannot lock pid file");
+ goto fail_close;
+ }
+
+ /*
+ * Now make sure the path we locked is the same one that now
+ * exists on the filesystem.
+ */
+ if (stat(path, &a) < 0) {
+ /*
+ * PID file disappeared, someone else must be racing with
+ * us, so try again.
+ */
+ close(fd);
+ continue;
+ }
+
+ if (a.st_ino == b.st_ino) {
+ break;
+ }
+
+ /*
+ * PID file was recreated, someone else must be racing with
+ * us, so try again.
+ */
+ close(fd);
+ }
+
+ if (ftruncate(fd, 0) < 0) {
+ error_setg_errno(errp, errno, "Failed to truncate pid file");
+ goto fail_unlink;
+ }
+
+ snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid());
+ if (write(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) {
+ error_setg(errp, "Failed to write pid file");
+ goto fail_unlink;
+ }
+
+ return true;
+
+fail_unlink:
+ unlink(path);
+fail_close:
+ close(fd);
+ return false;
+}
+
+void *qemu_oom_check(void *ptr)
+{
+ if (ptr == NULL) {
+ fprintf(stderr, "Failed to allocate memory: %s\n", strerror(errno));
+ abort();
+ }
+ return ptr;
+}
+
+void *qemu_try_memalign(size_t alignment, size_t size)
+{
+ void *ptr;
+
+ if (alignment < sizeof(void*)) {
+ alignment = sizeof(void*);
+ } else {
+ g_assert(is_power_of_2(alignment));
+ }
+
+#if defined(CONFIG_POSIX_MEMALIGN)
+ int ret;
+ ret = posix_memalign(&ptr, alignment, size);
+ if (ret != 0) {
+ errno = ret;
+ ptr = NULL;
+ }
+#elif defined(CONFIG_BSD)
+ ptr = valloc(size);
+#else
+ ptr = memalign(alignment, size);
+#endif
+ trace_qemu_memalign(alignment, size, ptr);
+ return ptr;
+}
+
+void *qemu_memalign(size_t alignment, size_t size)
+{
+ return qemu_oom_check(qemu_try_memalign(alignment, size));
+}
+
+/* alloc shared memory pages */
+void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared,
+ bool noreserve)
+{
+ const uint32_t qemu_map_flags = (shared ? QEMU_MAP_SHARED : 0) |
+ (noreserve ? QEMU_MAP_NORESERVE : 0);
+ size_t align = QEMU_VMALLOC_ALIGN;
+ void *ptr = qemu_ram_mmap(-1, size, align, qemu_map_flags, 0);
+
+ if (ptr == MAP_FAILED) {
+ return NULL;
+ }
+
+ if (alignment) {
+ *alignment = align;
+ }
+
+ trace_qemu_anon_ram_alloc(size, ptr);
+ return ptr;
+}
+
+void qemu_vfree(void *ptr)
+{
+ trace_qemu_vfree(ptr);
+ free(ptr);
+}
+
+void qemu_anon_ram_free(void *ptr, size_t size)
+{
+ trace_qemu_anon_ram_free(ptr, size);
+ qemu_ram_munmap(-1, ptr, size);
+}
+
+void qemu_set_block(int fd)
+{
+ int f;
+ f = fcntl(fd, F_GETFL);
+ assert(f != -1);
+ f = fcntl(fd, F_SETFL, f & ~O_NONBLOCK);
+ assert(f != -1);
+}
+
+int qemu_try_set_nonblock(int fd)
+{
+ int f;
+ f = fcntl(fd, F_GETFL);
+ if (f == -1) {
+ return -errno;
+ }
+ if (fcntl(fd, F_SETFL, f | O_NONBLOCK) == -1) {
+ return -errno;
+ }
+ return 0;
+}
+
+void qemu_set_nonblock(int fd)
+{
+ int f;
+ f = qemu_try_set_nonblock(fd);
+ assert(f == 0);
+}
+
+int socket_set_fast_reuse(int fd)
+{
+ int val = 1, ret;
+
+ ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
+ (const char *)&val, sizeof(val));
+
+ assert(ret == 0);
+
+ return ret;
+}
+
+void qemu_set_cloexec(int fd)
+{
+ int f;
+ f = fcntl(fd, F_GETFD);
+ assert(f != -1);
+ f = fcntl(fd, F_SETFD, f | FD_CLOEXEC);
+ assert(f != -1);
+}
+
+/*
+ * Creates a pipe with FD_CLOEXEC set on both file descriptors
+ */
+int qemu_pipe(int pipefd[2])
+{
+ int ret;
+
+#ifdef CONFIG_PIPE2
+ ret = pipe2(pipefd, O_CLOEXEC);
+ if (ret != -1 || errno != ENOSYS) {
+ return ret;
+ }
+#endif
+ ret = pipe(pipefd);
+ if (ret == 0) {
+ qemu_set_cloexec(pipefd[0]);
+ qemu_set_cloexec(pipefd[1]);
+ }
+
+ return ret;
+}
+
+char *
+qemu_get_local_state_pathname(const char *relative_pathname)
+{
+ g_autofree char *dir = g_strdup_printf("%s/%s",
+ CONFIG_QEMU_LOCALSTATEDIR,
+ relative_pathname);
+ return get_relocated_path(dir);
+}
+
+void qemu_set_tty_echo(int fd, bool echo)
+{
+ struct termios tty;
+
+ tcgetattr(fd, &tty);
+
+ if (echo) {
+ tty.c_lflag |= ECHO | ECHONL | ICANON | IEXTEN;
+ } else {
+ tty.c_lflag &= ~(ECHO | ECHONL | ICANON | IEXTEN);
+ }
+
+ tcsetattr(fd, TCSANOW, &tty);
+}
+
+static const char *exec_dir;
+
+void qemu_init_exec_dir(const char *argv0)
+{
+ char *p = NULL;
+ char buf[PATH_MAX];
+
+ if (exec_dir) {
+ return;
+ }
+
+#if defined(__linux__)
+ {
+ int len;
+ len = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
+ if (len > 0) {
+ buf[len] = 0;
+ p = buf;
+ }
+ }
+#elif defined(__FreeBSD__) \
+ || (defined(__NetBSD__) && defined(KERN_PROC_PATHNAME))
+ {
+#if defined(__FreeBSD__)
+ static int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
+#else
+ static int mib[4] = {CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME};
+#endif
+ size_t len = sizeof(buf) - 1;
+
+ *buf = '\0';
+ if (!sysctl(mib, ARRAY_SIZE(mib), buf, &len, NULL, 0) &&
+ *buf) {
+ buf[sizeof(buf) - 1] = '\0';
+ p = buf;
+ }
+ }
+#elif defined(__APPLE__)
+ {
+ char fpath[PATH_MAX];
+ uint32_t len = sizeof(fpath);
+ if (_NSGetExecutablePath(fpath, &len) == 0) {
+ p = realpath(fpath, buf);
+ if (!p) {
+ return;
+ }
+ }
+ }
+#elif defined(__HAIKU__)
+ {
+ image_info ii;
+ int32_t c = 0;
+
+ *buf = '\0';
+ while (get_next_image_info(0, &c, &ii) == B_OK) {
+ if (ii.type == B_APP_IMAGE) {
+ strncpy(buf, ii.name, sizeof(buf));
+ buf[sizeof(buf) - 1] = 0;
+ p = buf;
+ break;
+ }
+ }
+ }
+#endif
+ /* If we don't have any way of figuring out the actual executable
+ location then try argv[0]. */
+ if (!p && argv0) {
+ p = realpath(argv0, buf);
+ }
+ if (p) {
+ exec_dir = g_path_get_dirname(p);
+ } else {
+ exec_dir = CONFIG_BINDIR;
+ }
+}
+
+const char *qemu_get_exec_dir(void)
+{
+ return exec_dir;
+}
+
+static void sigbus_handler(int signal)
+{
+ int i;
+ if (memset_thread) {
+ for (i = 0; i < memset_num_threads; i++) {
+ if (qemu_thread_is_self(&memset_thread[i].pgthread)) {
+ siglongjmp(memset_thread[i].env, 1);
+ }
+ }
+ }
+}
+
+static void *do_touch_pages(void *arg)
+{
+ MemsetThread *memset_args = (MemsetThread *)arg;
+ sigset_t set, oldset;
+
+ /*
+ * On Linux, the page faults from the loop below can cause mmap_sem
+ * contention with allocation of the thread stacks. Do not start
+ * clearing until all threads have been created.
+ */
+ qemu_mutex_lock(&page_mutex);
+ while (!threads_created_flag) {
+ qemu_cond_wait(&page_cond, &page_mutex);
+ }
+ qemu_mutex_unlock(&page_mutex);
+
+ /* unblock SIGBUS */
+ sigemptyset(&set);
+ sigaddset(&set, SIGBUS);
+ pthread_sigmask(SIG_UNBLOCK, &set, &oldset);
+
+ if (sigsetjmp(memset_args->env, 1)) {
+ memset_thread_failed = true;
+ } else {
+ char *addr = memset_args->addr;
+ size_t numpages = memset_args->numpages;
+ size_t hpagesize = memset_args->hpagesize;
+ size_t i;
+ for (i = 0; i < numpages; i++) {
+ /*
+ * Read & write back the same value, so we don't
+ * corrupt existing user/app data that might be
+ * stored.
+ *
+ * 'volatile' to stop compiler optimizing this away
+ * to a no-op
+ *
+ * TODO: get a better solution from kernel so we
+ * don't need to write at all so we don't cause
+ * wear on the storage backing the region...
+ */
+ *(volatile char *)addr = *addr;
+ addr += hpagesize;
+ }
+ }
+ pthread_sigmask(SIG_SETMASK, &oldset, NULL);
+ return NULL;
+}
+
+static inline int get_memset_num_threads(int smp_cpus)
+{
+ long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
+ int ret = 1;
+
+ if (host_procs > 0) {
+ ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), smp_cpus);
+ }
+ /* In case sysconf() fails, we fall back to single threaded */
+ return ret;
+}
+
+static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
+ int smp_cpus)
+{
+ static gsize initialized = 0;
+ size_t numpages_per_thread, leftover;
+ char *addr = area;
+ int i = 0;
+
+ if (g_once_init_enter(&initialized)) {
+ qemu_mutex_init(&page_mutex);
+ qemu_cond_init(&page_cond);
+ g_once_init_leave(&initialized, 1);
+ }
+
+ memset_thread_failed = false;
+ threads_created_flag = false;
+ memset_num_threads = get_memset_num_threads(smp_cpus);
+ memset_thread = g_new0(MemsetThread, memset_num_threads);
+ numpages_per_thread = numpages / memset_num_threads;
+ leftover = numpages % memset_num_threads;
+ for (i = 0; i < memset_num_threads; i++) {
+ memset_thread[i].addr = addr;
+ memset_thread[i].numpages = numpages_per_thread + (i < leftover);
+ memset_thread[i].hpagesize = hpagesize;
+ qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
+ do_touch_pages, &memset_thread[i],
+ QEMU_THREAD_JOINABLE);
+ addr += memset_thread[i].numpages * hpagesize;
+ }
+
+ qemu_mutex_lock(&page_mutex);
+ threads_created_flag = true;
+ qemu_cond_broadcast(&page_cond);
+ qemu_mutex_unlock(&page_mutex);
+
+ for (i = 0; i < memset_num_threads; i++) {
+ qemu_thread_join(&memset_thread[i].pgthread);
+ }
+ g_free(memset_thread);
+ memset_thread = NULL;
+
+ return memset_thread_failed;
+}
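+
+/*
+ * Worked example (illustrative): with numpages = 10 and 4 memset threads,
+ * numpages_per_thread = 2 and leftover = 2, so the threads touch 3, 3, 2
+ * and 2 pages respectively -- the (i < leftover) term hands one extra
+ * page to each of the first `leftover' threads.
+ */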
+
+void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
+ Error **errp)
+{
+ int ret;
+ struct sigaction act, oldact;
+ size_t hpagesize = qemu_fd_getpagesize(fd);
+ size_t numpages = DIV_ROUND_UP(memory, hpagesize);
+
+ memset(&act, 0, sizeof(act));
+ act.sa_handler = &sigbus_handler;
+ act.sa_flags = 0;
+
+ ret = sigaction(SIGBUS, &act, &oldact);
+ if (ret) {
+ error_setg_errno(errp, errno,
+ "os_mem_prealloc: failed to install signal handler");
+ return;
+ }
+
+ /* touch pages simultaneously */
+ if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) {
+ error_setg(errp, "os_mem_prealloc: Insufficient free host memory "
+ "pages available to allocate guest RAM");
+ }
+
+ ret = sigaction(SIGBUS, &oldact, NULL);
+ if (ret) {
+ /* Terminate QEMU since it can't recover from this error */
+ perror("os_mem_prealloc: failed to reinstall signal handler");
+ exit(1);
+ }
+}
+
+char *qemu_get_pid_name(pid_t pid)
+{
+ char *name = NULL;
+
+#if defined(__FreeBSD__)
+ /* BSDs don't have /proc, but they provide a nice substitute */
+ struct kinfo_proc *proc = kinfo_getproc(pid);
+
+ if (proc) {
+ name = g_strdup(proc->ki_comm);
+ free(proc);
+ }
+#else
+ /* Assume a system with reasonable procfs */
+ char *pid_path;
+ size_t len;
+
+ pid_path = g_strdup_printf("/proc/%d/cmdline", pid);
+ g_file_get_contents(pid_path, &name, &len, NULL);
+ g_free(pid_path);
+#endif
+
+ return name;
+}
+
+
+pid_t qemu_fork(Error **errp)
+{
+ sigset_t oldmask, newmask;
+ struct sigaction sig_action;
+ int saved_errno;
+ pid_t pid;
+
+ /*
+ * Need to block signals now, so that child process can safely
+ * kill off caller's signal handlers without a race.
+ */
+ sigfillset(&newmask);
+ if (pthread_sigmask(SIG_SETMASK, &newmask, &oldmask) != 0) {
+ error_setg_errno(errp, errno,
+ "cannot block signals");
+ return -1;
+ }
+
+ pid = fork();
+ saved_errno = errno;
+
+ if (pid < 0) {
+ /* attempt to restore signal mask, but ignore failure, to
+ * avoid obscuring the fork failure */
+ (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
+ error_setg_errno(errp, saved_errno,
+ "cannot fork child process");
+ errno = saved_errno;
+ return -1;
+ } else if (pid) {
+ /* parent process */
+
+ /* Restore our original signal mask now that the child is
+ * safely running. Only documented failures are EFAULT (not
+ * possible, since we are using just-grabbed mask) or EINVAL
+ * (not possible, since we are using correct arguments). */
+ (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
+ } else {
+ /* child process */
+ size_t i;
+
+ /* Clear out all signal handlers from parent so nothing
+ * unexpected can happen in our child once we unblock
+ * signals */
+ sig_action.sa_handler = SIG_DFL;
+ sig_action.sa_flags = 0;
+ sigemptyset(&sig_action.sa_mask);
+
+ for (i = 1; i < NSIG; i++) {
+ /* The only possible errors are EFAULT or EINVAL. The former
+ * won't happen, the latter we expect, so there is no need to
+ * check the return value. */
+ (void)sigaction(i, &sig_action, NULL);
+ }
+
+ /* Unmask all signals in child, since we've no idea what the
+ * caller's done with their signal mask and don't want to
+ * propagate that to children */
+ sigemptyset(&newmask);
+ if (pthread_sigmask(SIG_SETMASK, &newmask, NULL) != 0) {
+ Error *local_err = NULL;
+ error_setg_errno(&local_err, errno,
+ "cannot unblock signals");
+ error_report_err(local_err);
+ _exit(1);
+ }
+ }
+ return pid;
+}
+
+void *qemu_alloc_stack(size_t *sz)
+{
+ void *ptr, *guardpage;
+ int flags;
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ void *ptr2;
+#endif
+ size_t pagesz = qemu_real_host_page_size;
+#ifdef _SC_THREAD_STACK_MIN
+ /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
+ long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
+ *sz = MAX(MAX(min_stack_sz, 0), *sz);
+#endif
+ /* adjust stack size to a multiple of the page size */
+ *sz = ROUND_UP(*sz, pagesz);
+ /* allocate one extra page for the guard page */
+ *sz += pagesz;
+
+ flags = MAP_PRIVATE | MAP_ANONYMOUS;
+#if defined(MAP_STACK) && defined(__OpenBSD__)
+ /* Only enable MAP_STACK on OpenBSD. Other OSes such as
+ * Linux/FreeBSD/NetBSD have a flag with the same name but
+ * differing functionality. OpenBSD will SEGV if it spots
+ * execution with a stack pointer pointing at memory that
+ * was not allocated with MAP_STACK.
+ */
+ flags |= MAP_STACK;
+#endif
+
+ ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0);
+ if (ptr == MAP_FAILED) {
+ perror("failed to allocate memory for stack");
+ abort();
+ }
+
+#if defined(HOST_IA64)
+ /* separate register stack */
+ guardpage = ptr + (((*sz - pagesz) / 2) & ~pagesz);
+#elif defined(HOST_HPPA)
+ /* stack grows up */
+ guardpage = ptr + *sz - pagesz;
+#else
+ /* stack grows down */
+ guardpage = ptr;
+#endif
+ if (mprotect(guardpage, pagesz, PROT_NONE) != 0) {
+ perror("failed to set up stack guard page");
+ abort();
+ }
+
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
+ *(uint32_t *)ptr2 = 0xdeadbeaf;
+ }
+#endif
+
+ return ptr;
+}
+
+#ifdef CONFIG_DEBUG_STACK_USAGE
+static __thread unsigned int max_stack_usage;
+#endif
+
+void qemu_free_stack(void *stack, size_t sz)
+{
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ unsigned int usage;
+ void *ptr;
+
+ for (ptr = stack + qemu_real_host_page_size; ptr < stack + sz;
+ ptr += sizeof(uint32_t)) {
+ if (*(uint32_t *)ptr != 0xdeadbeaf) {
+ break;
+ }
+ }
+ usage = sz - (uintptr_t) (ptr - stack);
+ if (usage > max_stack_usage) {
+ error_report("thread %d max stack usage increased from %u to %u",
+ qemu_get_thread_id(), max_stack_usage, usage);
+ max_stack_usage = usage;
+ }
+#endif
+
+ munmap(stack, sz);
+}
+
+/*
+ * Disable CFI checks.
+ * We are going to call a signal handler directly. Such a handler may or may not
+ * have been defined in our binary, so there's no guarantee that the pointer
+ * used to set the handler is a cfi-valid pointer. Since the handlers are
+ * stored in kernel memory, changing the handler to an attacker-defined
+ * function requires being able to call a sigaction() syscall,
+ * which is not as easy as overwriting a pointer in memory.
+ */
+QEMU_DISABLE_CFI
+void sigaction_invoke(struct sigaction *action,
+ struct qemu_signalfd_siginfo *info)
+{
+ siginfo_t si = {};
+ si.si_signo = info->ssi_signo;
+ si.si_errno = info->ssi_errno;
+ si.si_code = info->ssi_code;
+
+ /* Convert the minimal set of fields defined by POSIX.
+ * Positive si_code values are reserved for kernel-generated
+ * signals, where the valid siginfo fields are determined by
+ * the signal number. But according to POSIX, it is unspecified
+ * whether SI_USER and SI_QUEUE have values less than or equal to
+ * zero.
+ */
+ if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
+ info->ssi_code <= 0) {
+ /* SIGTERM, etc. */
+ si.si_pid = info->ssi_pid;
+ si.si_uid = info->ssi_uid;
+ } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
+ info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
+ si.si_addr = (void *)(uintptr_t)info->ssi_addr;
+ } else if (info->ssi_signo == SIGCHLD) {
+ si.si_pid = info->ssi_pid;
+ si.si_status = info->ssi_status;
+ si.si_uid = info->ssi_uid;
+ }
+ action->sa_sigaction(info->ssi_signo, &si, NULL);
+}
+
+#ifndef HOST_NAME_MAX
+# ifdef _POSIX_HOST_NAME_MAX
+# define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
+# else
+# define HOST_NAME_MAX 255
+# endif
+#endif
+
+char *qemu_get_host_name(Error **errp)
+{
+ long len = -1;
+ g_autofree char *hostname = NULL;
+
+#ifdef _SC_HOST_NAME_MAX
+ len = sysconf(_SC_HOST_NAME_MAX);
+#endif /* _SC_HOST_NAME_MAX */
+
+ if (len < 0) {
+ len = HOST_NAME_MAX;
+ }
+
+ /* Unfortunately, gethostname() below does not guarantee a
+ * NUL-terminated string. Therefore, allocate one byte more
+ * to be sure. */
+ hostname = g_new0(char, len + 1);
+
+ if (gethostname(hostname, len) < 0) {
+ error_setg_errno(errp, errno,
+ "cannot get hostname");
+ return NULL;
+ }
+
+ return g_steal_pointer(&hostname);
+}
+
+size_t qemu_get_host_physmem(void)
+{
+#ifdef _SC_PHYS_PAGES
+ long pages = sysconf(_SC_PHYS_PAGES);
+ if (pages > 0) {
+ if (pages > SIZE_MAX / qemu_real_host_page_size) {
+ return SIZE_MAX;
+ } else {
+ return pages * qemu_real_host_page_size;
+ }
+ }
+#endif
+ return 0;
+}
diff --git a/util/oslib-win32.c b/util/oslib-win32.c
new file mode 100644
index 000000000..af559ef33
--- /dev/null
+++ b/util/oslib-win32.c
@@ -0,0 +1,654 @@
+/*
+ * os-win32.c
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2010-2016 Red Hat, Inc.
+ *
+ * QEMU library functions for win32 which are shared between QEMU and
+ * the QEMU tools.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * The implementation of g_poll (functions poll_rest, g_poll) at the end of
+ * this file are based on code from GNOME glib-2 and use a different license,
+ * see the license comment there.
+ */
+
+#include "qemu/osdep.h"
+#include <windows.h>
+#include "qemu-common.h"
+#include "qapi/error.h"
+#include "qemu/main-loop.h"
+#include "trace.h"
+#include "qemu/sockets.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+#include <malloc.h>
+
+/* this must come after including "trace.h" */
+#include <shlobj.h>
+
+void *qemu_oom_check(void *ptr)
+{
+ if (ptr == NULL) {
+ fprintf(stderr, "Failed to allocate memory: %lu\n", GetLastError());
+ abort();
+ }
+ return ptr;
+}
+
+void *qemu_try_memalign(size_t alignment, size_t size)
+{
+ void *ptr;
+
+ g_assert(size != 0);
+ if (alignment < sizeof(void *)) {
+ alignment = sizeof(void *);
+ } else {
+ g_assert(is_power_of_2(alignment));
+ }
+ ptr = _aligned_malloc(size, alignment);
+ trace_qemu_memalign(alignment, size, ptr);
+ return ptr;
+}
+
+void *qemu_memalign(size_t alignment, size_t size)
+{
+ return qemu_oom_check(qemu_try_memalign(alignment, size));
+}
+
+static int get_allocation_granularity(void)
+{
+ SYSTEM_INFO system_info;
+
+ GetSystemInfo(&system_info);
+ return system_info.dwAllocationGranularity;
+}
+
+void *qemu_anon_ram_alloc(size_t size, uint64_t *align, bool shared,
+ bool noreserve)
+{
+ void *ptr;
+
+ if (noreserve) {
+ /*
+ * We need a MEM_COMMIT before accessing any memory in a MEM_RESERVE
+ * area; we cannot easily mimic POSIX MAP_NORESERVE semantics.
+ */
+ error_report("Skipping reservation of swap space is not supported.");
+ return NULL;
+ }
+
+ ptr = VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE);
+ trace_qemu_anon_ram_alloc(size, ptr);
+
+ if (ptr && align) {
+ *align = MAX(get_allocation_granularity(), getpagesize());
+ }
+ return ptr;
+}
+
+void qemu_vfree(void *ptr)
+{
+ trace_qemu_vfree(ptr);
+ _aligned_free(ptr);
+}
+
+void qemu_anon_ram_free(void *ptr, size_t size)
+{
+ trace_qemu_anon_ram_free(ptr, size);
+ if (ptr) {
+ VirtualFree(ptr, 0, MEM_RELEASE);
+ }
+}
+
+#ifndef _POSIX_THREAD_SAFE_FUNCTIONS
+/* FIXME: add proper locking */
+struct tm *gmtime_r(const time_t *timep, struct tm *result)
+{
+ struct tm *p = gmtime(timep);
+ memset(result, 0, sizeof(*result));
+ if (p) {
+ *result = *p;
+ p = result;
+ }
+ return p;
+}
+
+/* FIXME: add proper locking */
+struct tm *localtime_r(const time_t *timep, struct tm *result)
+{
+ struct tm *p = localtime(timep);
+ memset(result, 0, sizeof(*result));
+ if (p) {
+ *result = *p;
+ p = result;
+ }
+ return p;
+}
+#endif /* _POSIX_THREAD_SAFE_FUNCTIONS */
+
+static int socket_error(void)
+{
+ switch (WSAGetLastError()) {
+ case 0:
+ return 0;
+ case WSAEINTR:
+ return EINTR;
+ case WSAEINVAL:
+ return EINVAL;
+ case WSA_INVALID_HANDLE:
+ return EBADF;
+ case WSA_NOT_ENOUGH_MEMORY:
+ return ENOMEM;
+ case WSA_INVALID_PARAMETER:
+ return EINVAL;
+ case WSAENAMETOOLONG:
+ return ENAMETOOLONG;
+ case WSAENOTEMPTY:
+ return ENOTEMPTY;
+ case WSAEWOULDBLOCK:
+ /* not using EWOULDBLOCK as we don't want code to have
+ * to check both EWOULDBLOCK and EAGAIN */
+ return EAGAIN;
+ case WSAEINPROGRESS:
+ return EINPROGRESS;
+ case WSAEALREADY:
+ return EALREADY;
+ case WSAENOTSOCK:
+ return ENOTSOCK;
+ case WSAEDESTADDRREQ:
+ return EDESTADDRREQ;
+ case WSAEMSGSIZE:
+ return EMSGSIZE;
+ case WSAEPROTOTYPE:
+ return EPROTOTYPE;
+ case WSAENOPROTOOPT:
+ return ENOPROTOOPT;
+ case WSAEPROTONOSUPPORT:
+ return EPROTONOSUPPORT;
+ case WSAEOPNOTSUPP:
+ return EOPNOTSUPP;
+ case WSAEAFNOSUPPORT:
+ return EAFNOSUPPORT;
+ case WSAEADDRINUSE:
+ return EADDRINUSE;
+ case WSAEADDRNOTAVAIL:
+ return EADDRNOTAVAIL;
+ case WSAENETDOWN:
+ return ENETDOWN;
+ case WSAENETUNREACH:
+ return ENETUNREACH;
+ case WSAENETRESET:
+ return ENETRESET;
+ case WSAECONNABORTED:
+ return ECONNABORTED;
+ case WSAECONNRESET:
+ return ECONNRESET;
+ case WSAENOBUFS:
+ return ENOBUFS;
+ case WSAEISCONN:
+ return EISCONN;
+ case WSAENOTCONN:
+ return ENOTCONN;
+ case WSAETIMEDOUT:
+ return ETIMEDOUT;
+ case WSAECONNREFUSED:
+ return ECONNREFUSED;
+ case WSAELOOP:
+ return ELOOP;
+ case WSAEHOSTUNREACH:
+ return EHOSTUNREACH;
+ default:
+ return EIO;
+ }
+}
+
+void qemu_set_block(int fd)
+{
+ unsigned long opt = 0;
+ WSAEventSelect(fd, NULL, 0);
+ ioctlsocket(fd, FIONBIO, &opt);
+}
+
+int qemu_try_set_nonblock(int fd)
+{
+ unsigned long opt = 1;
+ if (ioctlsocket(fd, FIONBIO, &opt) != NO_ERROR) {
+ return -socket_error();
+ }
+ return 0;
+}
+
+void qemu_set_nonblock(int fd)
+{
+ (void)qemu_try_set_nonblock(fd);
+}
+
+int socket_set_fast_reuse(int fd)
+{
+ /* Enabling the reuse of an endpoint that was used by a socket still in
+ * TIME_WAIT state is usually performed by setting SO_REUSEADDR. On Windows
+ * fast reuse is the default and SO_REUSEADDR does strange things. So we
+ * don't have to do anything here. More info can be found at:
+ * http://msdn.microsoft.com/en-us/library/windows/desktop/ms740621.aspx */
+ return 0;
+}
+
+int inet_aton(const char *cp, struct in_addr *ia)
+{
+ uint32_t addr = inet_addr(cp);
+ if (addr == 0xffffffff) {
+ return 0;
+ }
+ ia->s_addr = addr;
+ return 1;
+}
+
+void qemu_set_cloexec(int fd)
+{
+}
+
+/* Offset between 1/1/1601 and 1/1/1970 in 100 nanosec units */
+#define _W32_FT_OFFSET (116444736000000000ULL)
+
+int qemu_gettimeofday(qemu_timeval *tp)
+{
+ union {
+ unsigned long long ns100; /* time since 1 Jan 1601 in 100ns units */
+ FILETIME ft;
+ } _now;
+
+ if (tp) {
+ GetSystemTimeAsFileTime(&_now.ft);
+ tp->tv_usec = (long)((_now.ns100 / 10ULL) % 1000000ULL);
+ tp->tv_sec = (long)((_now.ns100 - _W32_FT_OFFSET) / 10000000ULL);
+ }
+ /* Always return 0 as per Open Group Base Specifications Issue 6.
+ Do not set errno on error. */
+ return 0;
+}
+
+int qemu_get_thread_id(void)
+{
+ return GetCurrentThreadId();
+}
+
+char *
+qemu_get_local_state_pathname(const char *relative_pathname)
+{
+ HRESULT result;
+ char base_path[MAX_PATH+1] = "";
+
+ result = SHGetFolderPath(NULL, CSIDL_COMMON_APPDATA, NULL,
+ /* SHGFP_TYPE_CURRENT */ 0, base_path);
+ if (result != S_OK) {
+ /* misconfigured environment */
+ g_critical("CSIDL_COMMON_APPDATA unavailable: %ld", (long)result);
+ abort();
+ }
+ return g_strdup_printf("%s" G_DIR_SEPARATOR_S "%s", base_path,
+ relative_pathname);
+}
+
+void qemu_set_tty_echo(int fd, bool echo)
+{
+ HANDLE handle = (HANDLE)_get_osfhandle(fd);
+ DWORD dwMode = 0;
+
+ if (handle == INVALID_HANDLE_VALUE) {
+ return;
+ }
+
+ GetConsoleMode(handle, &dwMode);
+
+ if (echo) {
+ SetConsoleMode(handle, dwMode | ENABLE_ECHO_INPUT | ENABLE_LINE_INPUT);
+ } else {
+ SetConsoleMode(handle,
+ dwMode & ~(ENABLE_ECHO_INPUT | ENABLE_LINE_INPUT));
+ }
+}
+
+static const char *exec_dir;
+
+void qemu_init_exec_dir(const char *argv0)
+{
+ char *p;
+ char buf[MAX_PATH];
+ DWORD len;
+
+ if (exec_dir) {
+ return;
+ }
+
+ len = GetModuleFileName(NULL, buf, sizeof(buf) - 1);
+ if (len == 0) {
+ return;
+ }
+
+ buf[len] = 0;
+ p = buf + len - 1;
+ while (p != buf && *p != '\\') {
+ p--;
+ }
+ *p = 0;
+ if (access(buf, R_OK) == 0) {
+ exec_dir = g_strdup(buf);
+ } else {
+ exec_dir = CONFIG_BINDIR;
+ }
+}
+
+const char *qemu_get_exec_dir(void)
+{
+ return exec_dir;
+}
+
+int getpagesize(void)
+{
+ SYSTEM_INFO system_info;
+
+ GetSystemInfo(&system_info);
+ return system_info.dwPageSize;
+}
+
+void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
+ Error **errp)
+{
+ int i;
+ size_t pagesize = qemu_real_host_page_size;
+
+ memory = (memory + pagesize - 1) & -pagesize;
+ for (i = 0; i < memory / pagesize; i++) {
+ memset(area + pagesize * i, 0, 1);
+ }
+}
+
+char *qemu_get_pid_name(pid_t pid)
+{
+ /* XXX Implement me */
+ abort();
+}
+
+
+pid_t qemu_fork(Error **errp)
+{
+ errno = ENOSYS;
+ error_setg_errno(errp, errno,
+ "cannot fork child process");
+ return -1;
+}
+
+
+#undef connect
+int qemu_connect_wrap(int sockfd, const struct sockaddr *addr,
+ socklen_t addrlen)
+{
+ int ret;
+ ret = connect(sockfd, addr, addrlen);
+ if (ret < 0) {
+ if (WSAGetLastError() == WSAEWOULDBLOCK) {
+ errno = EINPROGRESS;
+ } else {
+ errno = socket_error();
+ }
+ }
+ return ret;
+}
+
+
+#undef listen
+int qemu_listen_wrap(int sockfd, int backlog)
+{
+ int ret;
+ ret = listen(sockfd, backlog);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef bind
+int qemu_bind_wrap(int sockfd, const struct sockaddr *addr,
+ socklen_t addrlen)
+{
+ int ret;
+ ret = bind(sockfd, addr, addrlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef socket
+int qemu_socket_wrap(int domain, int type, int protocol)
+{
+ int ret;
+ ret = socket(domain, type, protocol);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef accept
+int qemu_accept_wrap(int sockfd, struct sockaddr *addr,
+ socklen_t *addrlen)
+{
+ int ret;
+ ret = accept(sockfd, addr, addrlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef shutdown
+int qemu_shutdown_wrap(int sockfd, int how)
+{
+ int ret;
+ ret = shutdown(sockfd, how);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef ioctlsocket
+int qemu_ioctlsocket_wrap(int fd, int req, void *val)
+{
+ int ret;
+ ret = ioctlsocket(fd, req, val);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef closesocket
+int qemu_closesocket_wrap(int fd)
+{
+ int ret;
+ ret = closesocket(fd);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef getsockopt
+int qemu_getsockopt_wrap(int sockfd, int level, int optname,
+ void *optval, socklen_t *optlen)
+{
+ int ret;
+ ret = getsockopt(sockfd, level, optname, optval, optlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef setsockopt
+int qemu_setsockopt_wrap(int sockfd, int level, int optname,
+ const void *optval, socklen_t optlen)
+{
+ int ret;
+ ret = setsockopt(sockfd, level, optname, optval, optlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef getpeername
+int qemu_getpeername_wrap(int sockfd, struct sockaddr *addr,
+ socklen_t *addrlen)
+{
+ int ret;
+ ret = getpeername(sockfd, addr, addrlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef getsockname
+int qemu_getsockname_wrap(int sockfd, struct sockaddr *addr,
+ socklen_t *addrlen)
+{
+ int ret;
+ ret = getsockname(sockfd, addr, addrlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef send
+ssize_t qemu_send_wrap(int sockfd, const void *buf, size_t len, int flags)
+{
+ int ret;
+ ret = send(sockfd, buf, len, flags);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef sendto
+ssize_t qemu_sendto_wrap(int sockfd, const void *buf, size_t len, int flags,
+ const struct sockaddr *addr, socklen_t addrlen)
+{
+ int ret;
+ ret = sendto(sockfd, buf, len, flags, addr, addrlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef recv
+ssize_t qemu_recv_wrap(int sockfd, void *buf, size_t len, int flags)
+{
+ int ret;
+ ret = recv(sockfd, buf, len, flags);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef recvfrom
+ssize_t qemu_recvfrom_wrap(int sockfd, void *buf, size_t len, int flags,
+ struct sockaddr *addr, socklen_t *addrlen)
+{
+ int ret;
+ ret = recvfrom(sockfd, buf, len, flags, addr, addrlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+bool qemu_write_pidfile(const char *filename, Error **errp)
+{
+ char buffer[128];
+ int len;
+ HANDLE file;
+ OVERLAPPED overlap;
+ BOOL ret;
+ memset(&overlap, 0, sizeof(overlap));
+
+ file = CreateFile(filename, GENERIC_WRITE, FILE_SHARE_READ, NULL,
+ OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
+
+ if (file == INVALID_HANDLE_VALUE) {
+ error_setg(errp, "Failed to create PID file");
+ return false;
+ }
+ len = snprintf(buffer, sizeof(buffer), FMT_pid "\n", (pid_t)getpid());
+ ret = WriteFile(file, (LPCVOID)buffer, (DWORD)len,
+ NULL, &overlap);
+ CloseHandle(file);
+ if (ret == 0) {
+ error_setg(errp, "Failed to write PID file");
+ return false;
+ }
+ return true;
+}
+
+char *qemu_get_host_name(Error **errp)
+{
+ wchar_t tmp[MAX_COMPUTERNAME_LENGTH + 1];
+ DWORD size = G_N_ELEMENTS(tmp);
+
+ if (GetComputerNameW(tmp, &size) == 0) {
+ error_setg_win32(errp, GetLastError(), "failed to get computer name");
+ return NULL;
+ }
+
+ return g_utf16_to_utf8(tmp, size, NULL, NULL, NULL);
+}
+
+size_t qemu_get_host_physmem(void)
+{
+ MEMORYSTATUSEX statex;
+ statex.dwLength = sizeof(statex);
+
+ if (GlobalMemoryStatusEx(&statex)) {
+ return statex.ullTotalPhys;
+ }
+ return 0;
+}
diff --git a/util/pagesize.c b/util/pagesize.c
new file mode 100644
index 000000000..998632cf6
--- /dev/null
+++ b/util/pagesize.c
@@ -0,0 +1,18 @@
+/*
+ * pagesize.c - query the host about its page size
+ *
+ * Copyright (C) 2017, Emilio G. Cota <cota@braap.org>
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+uintptr_t qemu_real_host_page_size;
+intptr_t qemu_real_host_page_mask;
+
+static void __attribute__((constructor)) init_real_host_page_size(void)
+{
+ qemu_real_host_page_size = getpagesize();
+ qemu_real_host_page_mask = -(intptr_t)qemu_real_host_page_size;
+}
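+
+/*
+ * Worked example of the mask arithmetic above, assuming a (typical)
+ * 4 KiB host page size:
+ *
+ *   qemu_real_host_page_size = 4096            = 0x1000
+ *   qemu_real_host_page_mask = -(intptr_t)4096 = ~(intptr_t)0xfff
+ *
+ *   addr & qemu_real_host_page_mask  rounds addr down to a page boundary;
+ *   (addr + size - 1) & mask         yields the page holding the last byte.
+ */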
diff --git a/util/path.c b/util/path.c
new file mode 100644
index 000000000..8e174eb43
--- /dev/null
+++ b/util/path.c
@@ -0,0 +1,70 @@
+/* Code to mangle pathnames into those matching a given prefix.
+ eg. open("/lib/foo.so") => open("/usr/gnemul/i386-linux/lib/foo.so");
+
+ The assumption is that this area does not change.
+*/
+#include "qemu/osdep.h"
+#include <sys/param.h>
+#include <dirent.h>
+#include "qemu/cutils.h"
+#include "qemu/path.h"
+#include "qemu/thread.h"
+
+static const char *base;
+static GHashTable *hash;
+static QemuMutex lock;
+
+void init_paths(const char *prefix)
+{
+ if (prefix[0] == '\0' || !strcmp(prefix, "/")) {
+ return;
+ }
+
+ if (prefix[0] == '/') {
+ base = g_strdup(prefix);
+ } else {
+ char *cwd = g_get_current_dir();
+ base = g_build_filename(cwd, prefix, NULL);
+ g_free(cwd);
+ }
+
+ hash = g_hash_table_new(g_str_hash, g_str_equal);
+ qemu_mutex_init(&lock);
+}
+
+/* Look for path in emulation dir, otherwise return name. */
+const char *path(const char *name)
+{
+ gpointer key, value;
+ const char *ret;
+
+ /* Only do absolute paths: quick and dirty, but should mostly be OK. */
+ if (!base || !name || name[0] != '/') {
+ return name;
+ }
+
+ qemu_mutex_lock(&lock);
+
+ /* Have we looked up this file before? */
+ if (g_hash_table_lookup_extended(hash, name, &key, &value)) {
+ ret = value ? value : name;
+ } else {
+ char *save = g_strdup(name);
+ char *full = g_build_filename(base, name, NULL);
+
+ /* Look for the path; record the result, pass or fail. */
+ if (access(full, F_OK) == 0) {
+ /* Exists. */
+ g_hash_table_insert(hash, save, full);
+ ret = full;
+ } else {
+ /* Does not exist. */
+ g_free(full);
+ g_hash_table_insert(hash, save, NULL);
+ ret = name;
+ }
+ }
+
+ qemu_mutex_unlock(&lock);
+ return ret;
+}
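+
+/*
+ * Usage sketch (hypothetical prefix and file names):
+ *
+ *   init_paths("/usr/gnemul/i386-linux");
+ *   path("/lib/foo.so");    // "/usr/gnemul/i386-linux/lib/foo.so" if it exists
+ *   path("/lib/foo.so");    // second call is answered from the hash table
+ *   path("relative/name");  // returned unchanged: only absolute paths mangle
+ */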
diff --git a/util/qdist.c b/util/qdist.c
new file mode 100644
index 000000000..5f75e24c2
--- /dev/null
+++ b/util/qdist.c
@@ -0,0 +1,397 @@
+/*
+ * qdist.c - QEMU helpers for handling frequency distributions of data.
+ *
+ * Copyright (C) 2016, Emilio G. Cota <cota@braap.org>
+ *
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qemu/qdist.h"
+
+#include <math.h>
+#ifndef NAN
+#define NAN (0.0 / 0.0)
+#endif
+
+#define QDIST_EMPTY_STR "(empty)"
+
+void qdist_init(struct qdist *dist)
+{
+ dist->entries = g_new(struct qdist_entry, 1);
+ dist->size = 1;
+ dist->n = 0;
+}
+
+void qdist_destroy(struct qdist *dist)
+{
+ g_free(dist->entries);
+}
+
+static inline int qdist_cmp_double(double a, double b)
+{
+ if (a > b) {
+ return 1;
+ } else if (a < b) {
+ return -1;
+ }
+ return 0;
+}
+
+static int qdist_cmp(const void *ap, const void *bp)
+{
+ const struct qdist_entry *a = ap;
+ const struct qdist_entry *b = bp;
+
+ return qdist_cmp_double(a->x, b->x);
+}
+
+void qdist_add(struct qdist *dist, double x, long count)
+{
+ struct qdist_entry *entry = NULL;
+
+ if (dist->n) {
+ struct qdist_entry e;
+
+ e.x = x;
+ entry = bsearch(&e, dist->entries, dist->n, sizeof(e), qdist_cmp);
+ }
+
+ if (entry) {
+ entry->count += count;
+ return;
+ }
+
+ if (unlikely(dist->n == dist->size)) {
+ dist->size *= 2;
+ dist->entries = g_renew(struct qdist_entry, dist->entries, dist->size);
+ }
+ dist->n++;
+ entry = &dist->entries[dist->n - 1];
+ entry->x = x;
+ entry->count = count;
+ qsort(dist->entries, dist->n, sizeof(*entry), qdist_cmp);
+}
+
+void qdist_inc(struct qdist *dist, double x)
+{
+ qdist_add(dist, x, 1);
+}
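+
+/*
+ * Minimal usage sketch (hypothetical values):
+ *
+ *   struct qdist d;
+ *
+ *   qdist_init(&d);
+ *   qdist_inc(&d, 0.0);     // one sample at x == 0.0
+ *   qdist_add(&d, 1.0, 3);  // three samples at x == 1.0
+ *   // qdist_avg(&d) == (0.0 * 1 + 1.0 * 3) / 4 == 0.75
+ *   qdist_destroy(&d);
+ */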
+
+/*
+ * Unicode for block elements. See:
+ * https://en.wikipedia.org/wiki/Block_Elements
+ */
+static const gunichar qdist_blocks[] = {
+ 0x2581,
+ 0x2582,
+ 0x2583,
+ 0x2584,
+ 0x2585,
+ 0x2586,
+ 0x2587,
+ 0x2588
+};
+
+#define QDIST_NR_BLOCK_CODES ARRAY_SIZE(qdist_blocks)
+
+/*
+ * Print a distribution into a string.
+ *
+ * This function assumes that appropriate binning has been done on the input;
+ * see qdist_bin__internal() and qdist_pr_plain().
+ *
+ * Callers must free the returned string with g_free().
+ */
+static char *qdist_pr_internal(const struct qdist *dist)
+{
+ double min, max;
+ GString *s = g_string_new("");
+ size_t i;
+
+ /* if only one entry, its printout will be either full or empty */
+ if (dist->n == 1) {
+ if (dist->entries[0].count) {
+ g_string_append_unichar(s, qdist_blocks[QDIST_NR_BLOCK_CODES - 1]);
+ } else {
+ g_string_append_c(s, ' ');
+ }
+ goto out;
+ }
+
+ /* get min and max counts */
+ min = dist->entries[0].count;
+ max = min;
+ for (i = 0; i < dist->n; i++) {
+ struct qdist_entry *e = &dist->entries[i];
+
+ if (e->count < min) {
+ min = e->count;
+ }
+ if (e->count > max) {
+ max = e->count;
+ }
+ }
+
+ for (i = 0; i < dist->n; i++) {
+ struct qdist_entry *e = &dist->entries[i];
+ int index;
+
+ /* make an exception with 0; instead of using block[0], print a space */
+ if (e->count) {
+ /* divide first to avoid loss of precision when e->count == max */
+ index = (e->count - min) / (max - min) * (QDIST_NR_BLOCK_CODES - 1);
+ g_string_append_unichar(s, qdist_blocks[index]);
+ } else {
+ g_string_append_c(s, ' ');
+ }
+ }
+ out:
+ return g_string_free(s, FALSE);
+}
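+
+/*
+ * Example of the scaling above: with min == 0, max == 70 and
+ * QDIST_NR_BLOCK_CODES == 8, a count of 35 maps to
+ * index = 35 / 70 * 7 == 3 after truncation, i.e. the fourth block glyph,
+ * while a count of 70 maps to index 7, the full block.
+ */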
+
+/*
+ * Bin the distribution in @from into @n bins of consecutive, non-overlapping
+ * intervals, copying the result to @to.
+ *
+ * This function is internal to qdist: only this file and test code should
+ * ever call it.
+ *
+ * Note: calling this function on an already-binned qdist is a bug.
+ *
+ * If @n == 0 or @from->n == 1, use @from->n.
+ */
+void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n)
+{
+ double xmin, xmax;
+ double step;
+ size_t i, j;
+
+ qdist_init(to);
+
+ if (from->n == 0) {
+ return;
+ }
+ if (n == 0 || from->n == 1) {
+ n = from->n;
+ }
+
+ /* set equally-sized bins between @from's left and right */
+ xmin = qdist_xmin(from);
+ xmax = qdist_xmax(from);
+ step = (xmax - xmin) / n;
+
+ if (n == from->n) {
+ /* if @from's entries are equally spaced, no need to re-bin */
+ for (i = 0; i < from->n; i++) {
+ if (from->entries[i].x != xmin + i * step) {
+ goto rebin;
+ }
+ }
+ /* they're equally spaced, so copy the dist and bail out */
+ to->entries = g_renew(struct qdist_entry, to->entries, n);
+ to->n = from->n;
+ memcpy(to->entries, from->entries, sizeof(*to->entries) * to->n);
+ return;
+ }
+
+ rebin:
+ j = 0;
+ for (i = 0; i < n; i++) {
+ double x;
+ double left, right;
+
+ left = xmin + i * step;
+ right = xmin + (i + 1) * step;
+
+ /* Add x, even if it might not get any counts later */
+ x = left;
+ qdist_add(to, x, 0);
+
+ /*
+ * To avoid double-counting we capture [left, right) ranges, except for
+ * the rightmost bin, which captures a [left, right] range.
+ */
+ while (j < from->n && (from->entries[j].x < right || i == n - 1)) {
+ struct qdist_entry *o = &from->entries[j];
+
+ qdist_add(to, x, o->count);
+ j++;
+ }
+ }
+}
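+
+/*
+ * Binning example (hypothetical input): entries at x == {0, 1, 2, 3} with
+ * counts {5, 5, 1, 1}, re-binned with n == 2, give step == 1.5 and bins
+ * [0, 1.5) -> count 10 and [1.5, 3] -> count 2, recorded at x == 0 and
+ * x == 1.5 respectively.
+ */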
+
+/*
+ * Print @dist into a string, after re-binning it into @n bins of consecutive,
+ * non-overlapping intervals.
+ *
+ * If @n == 0, use @orig->n.
+ *
+ * Callers must free the returned string with g_free().
+ */
+char *qdist_pr_plain(const struct qdist *dist, size_t n)
+{
+ struct qdist binned;
+ char *ret;
+
+ if (dist->n == 0) {
+ return g_strdup(QDIST_EMPTY_STR);
+ }
+ qdist_bin__internal(&binned, dist, n);
+ ret = qdist_pr_internal(&binned);
+ qdist_destroy(&binned);
+ return ret;
+}
+
+static char *qdist_pr_label(const struct qdist *dist, size_t n_bins,
+ uint32_t opt, bool is_left)
+{
+ const char *percent;
+ const char *lparen;
+ const char *rparen;
+ GString *s;
+ double x1, x2, step;
+ double x;
+ double n;
+ int dec;
+
+ s = g_string_new("");
+ if (!(opt & QDIST_PR_LABELS)) {
+ goto out;
+ }
+
+ dec = opt & QDIST_PR_NODECIMAL ? 0 : 1;
+ percent = opt & QDIST_PR_PERCENT ? "%" : "";
+
+ n = n_bins ? n_bins : dist->n;
+ x = is_left ? qdist_xmin(dist) : qdist_xmax(dist);
+ step = (qdist_xmax(dist) - qdist_xmin(dist)) / n;
+
+ if (opt & QDIST_PR_100X) {
+ x *= 100.0;
+ step *= 100.0;
+ }
+ if (opt & QDIST_PR_NOBINRANGE) {
+ lparen = rparen = "";
+ x1 = x;
+ x2 = x; /* unnecessary, but a dumb compiler might not figure it out */
+ } else {
+ lparen = "[";
+ rparen = is_left ? ")" : "]";
+ if (is_left) {
+ x1 = x;
+ x2 = x + step;
+ } else {
+ x1 = x - step;
+ x2 = x;
+ }
+ }
+ g_string_append_printf(s, "%s%.*f", lparen, dec, x1);
+ if (!(opt & QDIST_PR_NOBINRANGE)) {
+ g_string_append_printf(s, ",%.*f%s", dec, x2, rparen);
+ }
+ g_string_append(s, percent);
+ out:
+ return g_string_free(s, FALSE);
+}
+
+/*
+ * Print the distribution's histogram into a string.
+ *
+ * See also: qdist_pr_plain().
+ *
+ * Callers must free the returned string with g_free().
+ */
+char *qdist_pr(const struct qdist *dist, size_t n_bins, uint32_t opt)
+{
+ const char *border = opt & QDIST_PR_BORDER ? "|" : "";
+ char *llabel, *rlabel;
+ char *hgram;
+ GString *s;
+
+ if (dist->n == 0) {
+ return g_strdup(QDIST_EMPTY_STR);
+ }
+
+ s = g_string_new("");
+
+ llabel = qdist_pr_label(dist, n_bins, opt, true);
+ rlabel = qdist_pr_label(dist, n_bins, opt, false);
+ hgram = qdist_pr_plain(dist, n_bins);
+ g_string_append_printf(s, "%s%s%s%s%s",
+ llabel, border, hgram, border, rlabel);
+ g_free(llabel);
+ g_free(rlabel);
+ g_free(hgram);
+
+ return g_string_free(s, FALSE);
+}
+
+static inline double qdist_x(const struct qdist *dist, int index)
+{
+ if (dist->n == 0) {
+ return NAN;
+ }
+ return dist->entries[index].x;
+}
+
+double qdist_xmin(const struct qdist *dist)
+{
+ return qdist_x(dist, 0);
+}
+
+double qdist_xmax(const struct qdist *dist)
+{
+ return qdist_x(dist, dist->n - 1);
+}
+
+size_t qdist_unique_entries(const struct qdist *dist)
+{
+ return dist->n;
+}
+
+unsigned long qdist_sample_count(const struct qdist *dist)
+{
+ unsigned long count = 0;
+ size_t i;
+
+ for (i = 0; i < dist->n; i++) {
+ struct qdist_entry *e = &dist->entries[i];
+
+ count += e->count;
+ }
+ return count;
+}
+
+static double qdist_pairwise_avg(const struct qdist *dist, size_t index,
+ size_t n, unsigned long count)
+{
+ /* amortize the recursion by using a base case > 2 */
+ if (n <= 8) {
+ size_t i;
+ double ret = 0;
+
+ for (i = 0; i < n; i++) {
+ struct qdist_entry *e = &dist->entries[index + i];
+
+ ret += e->x * e->count / count;
+ }
+ return ret;
+ } else {
+ size_t n2 = n / 2;
+
+ return qdist_pairwise_avg(dist, index, n2, count) +
+ qdist_pairwise_avg(dist, index + n2, n - n2, count);
+ }
+}
+
+double qdist_avg(const struct qdist *dist)
+{
+ unsigned long count;
+
+ count = qdist_sample_count(dist);
+ if (!count) {
+ return NAN;
+ }
+ return qdist_pairwise_avg(dist, 0, dist->n, count);
+}
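+
+/*
+ * Note on qdist_pairwise_avg(): pairwise (cascade) summation keeps the
+ * accumulated floating-point rounding error around O(log n) instead of the
+ * O(n) of a naive left-to-right sum, which matters for distributions with
+ * many entries of widely varying magnitude.
+ */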
diff --git a/util/qemu-co-shared-resource.c b/util/qemu-co-shared-resource.c
new file mode 100644
index 000000000..a66cc07e7
--- /dev/null
+++ b/util/qemu-co-shared-resource.c
@@ -0,0 +1,90 @@
+/*
+ * Helper functionality for distributing a fixed total amount of
+ * an abstract resource among multiple coroutines.
+ *
+ * Copyright (c) 2019 Virtuozzo International GmbH
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine.h"
+#include "qemu/co-shared-resource.h"
+
+struct SharedResource {
+ uint64_t total; /* Set in shres_create() and not changed anymore */
+
+ /* State fields protected by lock */
+ uint64_t available;
+ CoQueue queue;
+
+ QemuMutex lock;
+};
+
+SharedResource *shres_create(uint64_t total)
+{
+ SharedResource *s = g_new0(SharedResource, 1);
+
+ s->total = s->available = total;
+ qemu_co_queue_init(&s->queue);
+ qemu_mutex_init(&s->lock);
+
+ return s;
+}
+
+void shres_destroy(SharedResource *s)
+{
+ assert(s->available == s->total);
+ qemu_mutex_destroy(&s->lock);
+ g_free(s);
+}
+
+/* Called with lock held. */
+static bool co_try_get_from_shres_locked(SharedResource *s, uint64_t n)
+{
+ if (s->available >= n) {
+ s->available -= n;
+ return true;
+ }
+
+ return false;
+}
+
+bool co_try_get_from_shres(SharedResource *s, uint64_t n)
+{
+ QEMU_LOCK_GUARD(&s->lock);
+ return co_try_get_from_shres_locked(s, n);
+}
+
+void coroutine_fn co_get_from_shres(SharedResource *s, uint64_t n)
+{
+ assert(n <= s->total);
+ QEMU_LOCK_GUARD(&s->lock);
+ while (!co_try_get_from_shres_locked(s, n)) {
+ qemu_co_queue_wait(&s->queue, &s->lock);
+ }
+}
+
+void coroutine_fn co_put_to_shres(SharedResource *s, uint64_t n)
+{
+ QEMU_LOCK_GUARD(&s->lock);
+ assert(s->total - s->available >= n);
+ s->available += n;
+ qemu_co_queue_restart_all(&s->queue);
+}
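+
+/*
+ * Usage sketch (hypothetical sizes), from coroutine context:
+ *
+ *   SharedResource *mem = shres_create(64 * 1024 * 1024);
+ *
+ *   co_get_from_shres(mem, bytes);  // may yield until enough is available
+ *   // ...do the work that consumes the resource...
+ *   co_put_to_shres(mem, bytes);    // wakes coroutines queued in get
+ *
+ *   shres_destroy(mem);             // all of it must have been returned
+ */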
diff --git a/util/qemu-config.c b/util/qemu-config.c
new file mode 100644
index 000000000..436ab63b1
--- /dev/null
+++ b/util/qemu-config.c
@@ -0,0 +1,565 @@
+#include "qemu/osdep.h"
+#include "block/qdict.h" /* for qdict_extract_subqdict() */
+#include "qapi/error.h"
+#include "qapi/qapi-commands-misc.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qlist.h"
+#include "qemu/error-report.h"
+#include "qemu/option.h"
+#include "qemu/config-file.h"
+
+static QemuOptsList *vm_config_groups[48];
+static QemuOptsList *drive_config_groups[5];
+
+static QemuOptsList *find_list(QemuOptsList **lists, const char *group,
+ Error **errp)
+{
+ int i;
+
+ qemu_load_module_for_opts(group);
+ for (i = 0; lists[i] != NULL; i++) {
+ if (strcmp(lists[i]->name, group) == 0)
+ break;
+ }
+ if (lists[i] == NULL) {
+ error_setg(errp, "There is no option group '%s'", group);
+ }
+ return lists[i];
+}
+
+QemuOptsList *qemu_find_opts(const char *group)
+{
+ QemuOptsList *ret;
+ Error *local_err = NULL;
+
+ ret = find_list(vm_config_groups, group, &local_err);
+ if (local_err) {
+ error_report_err(local_err);
+ }
+
+ return ret;
+}
+
+QemuOpts *qemu_find_opts_singleton(const char *group)
+{
+ QemuOptsList *list;
+ QemuOpts *opts;
+
+ list = qemu_find_opts(group);
+ assert(list);
+ opts = qemu_opts_find(list, NULL);
+ if (!opts) {
+ opts = qemu_opts_create(list, NULL, 0, &error_abort);
+ }
+ return opts;
+}
+
+static CommandLineParameterInfoList *query_option_descs(const QemuOptDesc *desc)
+{
+ CommandLineParameterInfoList *param_list = NULL;
+ CommandLineParameterInfo *info;
+ int i;
+
+ for (i = 0; desc[i].name != NULL; i++) {
+ info = g_malloc0(sizeof(*info));
+ info->name = g_strdup(desc[i].name);
+
+ switch (desc[i].type) {
+ case QEMU_OPT_STRING:
+ info->type = COMMAND_LINE_PARAMETER_TYPE_STRING;
+ break;
+ case QEMU_OPT_BOOL:
+ info->type = COMMAND_LINE_PARAMETER_TYPE_BOOLEAN;
+ break;
+ case QEMU_OPT_NUMBER:
+ info->type = COMMAND_LINE_PARAMETER_TYPE_NUMBER;
+ break;
+ case QEMU_OPT_SIZE:
+ info->type = COMMAND_LINE_PARAMETER_TYPE_SIZE;
+ break;
+ }
+
+ if (desc[i].help) {
+ info->has_help = true;
+ info->help = g_strdup(desc[i].help);
+ }
+ if (desc[i].def_value_str) {
+ info->has_q_default = true;
+ info->q_default = g_strdup(desc[i].def_value_str);
+ }
+
+ QAPI_LIST_PREPEND(param_list, info);
+ }
+
+ return param_list;
+}
+
+/* remove repeated entries from the info list */
+static void cleanup_infolist(CommandLineParameterInfoList *head)
+{
+ CommandLineParameterInfoList *pre_entry, *cur, *del_entry;
+
+ cur = head;
+ while (cur->next) {
+ pre_entry = head;
+ while (pre_entry != cur->next) {
+ if (!strcmp(pre_entry->value->name, cur->next->value->name)) {
+ del_entry = cur->next;
+ cur->next = cur->next->next;
+ del_entry->next = NULL;
+ qapi_free_CommandLineParameterInfoList(del_entry);
+ break;
+ }
+ pre_entry = pre_entry->next;
+ }
+ cur = cur->next;
+ }
+}
+
+/* merge the description items of two parameter infolists */
+static void connect_infolist(CommandLineParameterInfoList *head,
+ CommandLineParameterInfoList *new)
+{
+ CommandLineParameterInfoList *cur;
+
+ cur = head;
+ while (cur->next) {
+ cur = cur->next;
+ }
+ cur->next = new;
+}
+
+/* access all the local QemuOptsLists for drive option */
+static CommandLineParameterInfoList *get_drive_infolist(void)
+{
+ CommandLineParameterInfoList *head = NULL, *cur;
+ int i;
+
+ for (i = 0; drive_config_groups[i] != NULL; i++) {
+ if (!head) {
+ head = query_option_descs(drive_config_groups[i]->desc);
+ } else {
+ cur = query_option_descs(drive_config_groups[i]->desc);
+ connect_infolist(head, cur);
+ }
+ }
+ cleanup_infolist(head);
+
+ return head;
+}
+
+/* restore machine options that are now machine's properties */
+static QemuOptsList machine_opts = {
+ .merge_lists = true,
+ .head = QTAILQ_HEAD_INITIALIZER(machine_opts.head),
+ .desc = {
+ {
+ .name = "type",
+ .type = QEMU_OPT_STRING,
+ .help = "emulated machine"
+ },{
+ .name = "accel",
+ .type = QEMU_OPT_STRING,
+ .help = "accelerator list",
+ },{
+ .name = "kernel_irqchip",
+ .type = QEMU_OPT_BOOL,
+ .help = "use KVM in-kernel irqchip",
+ },{
+ .name = "kvm_shadow_mem",
+ .type = QEMU_OPT_SIZE,
+ .help = "KVM shadow MMU size",
+ },{
+ .name = "kernel",
+ .type = QEMU_OPT_STRING,
+ .help = "Linux kernel image file",
+ },{
+ .name = "initrd",
+ .type = QEMU_OPT_STRING,
+ .help = "Linux initial ramdisk file",
+ },{
+ .name = "append",
+ .type = QEMU_OPT_STRING,
+ .help = "Linux kernel command line",
+ },{
+ .name = "dtb",
+ .type = QEMU_OPT_STRING,
+ .help = "Linux kernel device tree file",
+ },{
+ .name = "dumpdtb",
+ .type = QEMU_OPT_STRING,
+ .help = "Dump current dtb to a file and quit",
+ },{
+ .name = "phandle_start",
+ .type = QEMU_OPT_NUMBER,
+ .help = "The first phandle ID we may generate dynamically",
+ },{
+ .name = "dt_compatible",
+ .type = QEMU_OPT_STRING,
+ .help = "Overrides the \"compatible\" property of the dt root node",
+ },{
+ .name = "dump-guest-core",
+ .type = QEMU_OPT_BOOL,
+ .help = "Include guest memory in a core dump",
+ },{
+ .name = "mem-merge",
+ .type = QEMU_OPT_BOOL,
+ .help = "enable/disable memory merge support",
+ },{
+ .name = "usb",
+ .type = QEMU_OPT_BOOL,
+ .help = "Set on/off to enable/disable usb",
+ },{
+ .name = "firmware",
+ .type = QEMU_OPT_STRING,
+ .help = "firmware image",
+ },{
+ .name = "iommu",
+ .type = QEMU_OPT_BOOL,
+ .help = "Set on/off to enable/disable Intel IOMMU (VT-d)",
+ },{
+ .name = "suppress-vmdesc",
+ .type = QEMU_OPT_BOOL,
+ .help = "Set on to disable self-describing migration",
+ },{
+ .name = "aes-key-wrap",
+ .type = QEMU_OPT_BOOL,
+ .help = "enable/disable AES key wrapping using the CPACF wrapping key",
+ },{
+ .name = "dea-key-wrap",
+ .type = QEMU_OPT_BOOL,
+ .help = "enable/disable DEA key wrapping using the CPACF wrapping key",
+ },{
+ .name = "loadparm",
+ .type = QEMU_OPT_STRING,
+ .help = "Up to 8 chars in set of [A-Za-z0-9. ](lower case chars"
+ " converted to upper case) to pass to machine"
+ " loader, boot manager, and guest kernel",
+ },
+ { /* End of list */ }
+ }
+};
+
+CommandLineOptionInfoList *qmp_query_command_line_options(bool has_option,
+ const char *option,
+ Error **errp)
+{
+ CommandLineOptionInfoList *conf_list = NULL;
+ CommandLineOptionInfo *info;
+ int i;
+
+ for (i = 0; vm_config_groups[i] != NULL; i++) {
+ if (!has_option || !strcmp(option, vm_config_groups[i]->name)) {
+ info = g_malloc0(sizeof(*info));
+ info->option = g_strdup(vm_config_groups[i]->name);
+ if (!strcmp("drive", vm_config_groups[i]->name)) {
+ info->parameters = get_drive_infolist();
+ } else {
+ info->parameters =
+ query_option_descs(vm_config_groups[i]->desc);
+ }
+ QAPI_LIST_PREPEND(conf_list, info);
+ }
+ }
+
+ if (!has_option || !strcmp(option, "machine")) {
+ info = g_malloc0(sizeof(*info));
+ info->option = g_strdup("machine");
+ info->parameters = query_option_descs(machine_opts.desc);
+ QAPI_LIST_PREPEND(conf_list, info);
+ }
+
+ if (conf_list == NULL) {
+ error_setg(errp, "invalid option name: %s", option);
+ }
+
+ return conf_list;
+}
+
+QemuOptsList *qemu_find_opts_err(const char *group, Error **errp)
+{
+ return find_list(vm_config_groups, group, errp);
+}
+
+void qemu_add_drive_opts(QemuOptsList *list)
+{
+ int entries, i;
+
+ entries = ARRAY_SIZE(drive_config_groups);
+ entries--; /* keep list NULL terminated */
+ for (i = 0; i < entries; i++) {
+ if (drive_config_groups[i] == NULL) {
+ drive_config_groups[i] = list;
+ return;
+ }
+ }
+ fprintf(stderr, "ran out of space in drive_config_groups\n");
+ abort();
+}
+
+void qemu_add_opts(QemuOptsList *list)
+{
+ int entries, i;
+
+ entries = ARRAY_SIZE(vm_config_groups);
+ entries--; /* keep list NULL terminated */
+ for (i = 0; i < entries; i++) {
+ if (vm_config_groups[i] == NULL) {
+ vm_config_groups[i] = list;
+ return;
+ }
+ }
+ fprintf(stderr, "ran out of space in vm_config_groups\n");
+ abort();
+}
+
+struct ConfigWriteData {
+ QemuOptsList *list;
+ FILE *fp;
+};
+
+static int config_write_opt(void *opaque, const char *name, const char *value,
+ Error **errp)
+{
+ struct ConfigWriteData *data = opaque;
+
+ fprintf(data->fp, " %s = \"%s\"\n", name, value);
+ return 0;
+}
+
+static int config_write_opts(void *opaque, QemuOpts *opts, Error **errp)
+{
+ struct ConfigWriteData *data = opaque;
+ const char *id = qemu_opts_id(opts);
+
+ if (id) {
+ fprintf(data->fp, "[%s \"%s\"]\n", data->list->name, id);
+ } else {
+ fprintf(data->fp, "[%s]\n", data->list->name);
+ }
+ qemu_opt_foreach(opts, config_write_opt, data, NULL);
+ fprintf(data->fp, "\n");
+ return 0;
+}
+
+void qemu_config_write(FILE *fp)
+{
+ struct ConfigWriteData data = { .fp = fp };
+ QemuOptsList **lists = vm_config_groups;
+ int i;
+
+ fprintf(fp, "# qemu config file\n\n");
+ for (i = 0; lists[i] != NULL; i++) {
+ data.list = lists[i];
+ qemu_opts_foreach(data.list, config_write_opts, &data, NULL);
+ }
+}
+
+/* Returns number of config groups on success, -errno on error */
+static int qemu_config_foreach(FILE *fp, QEMUConfigCB *cb, void *opaque,
+ const char *fname, Error **errp)
+{
+ char line[1024], prev_group[64], group[64], arg[64], value[1024];
+ Location loc;
+ Error *local_err = NULL;
+ QDict *qdict = NULL;
+ int res = -EINVAL, lno = 0;
+ int count = 0;
+
+ loc_push_none(&loc);
+ while (fgets(line, sizeof(line), fp) != NULL) {
+ ++lno;
+ if (line[0] == '\n') {
+ /* skip empty lines */
+ continue;
+ }
+ if (line[0] == '#') {
+ /* comment */
+ continue;
+ }
+ if (line[0] == '[') {
+ QDict *prev = qdict;
+ if (sscanf(line, "[%63s \"%63[^\"]\"]", group, value) == 2) {
+ qdict = qdict_new();
+ qdict_put_str(qdict, "id", value);
+ count++;
+ } else if (sscanf(line, "[%63[^]]]", group) == 1) {
+ qdict = qdict_new();
+ count++;
+ }
+ if (qdict != prev) {
+ if (prev) {
+ cb(prev_group, prev, opaque, &local_err);
+ qobject_unref(prev);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ goto out;
+ }
+ }
+ strcpy(prev_group, group);
+ continue;
+ }
+ }
+ loc_set_file(fname, lno);
+ value[0] = '\0';
+ if (sscanf(line, " %63s = \"%1023[^\"]\"", arg, value) == 2 ||
+ sscanf(line, " %63s = \"\"", arg) == 1) {
+ /* arg = value */
+ if (qdict == NULL) {
+ error_setg(errp, "no group defined");
+ goto out;
+ }
+ qdict_put_str(qdict, arg, value);
+ continue;
+ }
+ error_setg(errp, "parse error");
+ goto out;
+ }
+ if (ferror(fp)) {
+ loc_pop(&loc);
+ error_setg_errno(errp, errno, "Cannot read config file");
+ goto out_no_loc;
+ }
+ res = count;
+ if (qdict) {
+ cb(group, qdict, opaque, errp);
+ }
+out:
+ loc_pop(&loc);
+out_no_loc:
+ qobject_unref(qdict);
+ return res;
+}
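+
+/*
+ * The accepted file format, for reference (hypothetical groups and values):
+ *
+ *   # comment
+ *   [chardev "monitor"]
+ *     backend = "socket"
+ *     path = "/tmp/mon.sock"
+ *
+ *   [machine]
+ *     accel = "kvm"
+ *
+ * A '[group "id"]' or '[group]' line opens a section; 'key = "value"' lines
+ * fill it; each completed section is handed to the callback as a QDict,
+ * with the optional id stored under the "id" key.
+ */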
+
+void qemu_config_do_parse(const char *group, QDict *qdict, void *opaque, Error **errp)
+{
+ QemuOptsList **lists = opaque;
+ QemuOptsList *list;
+
+ list = find_list(lists, group, errp);
+ if (!list) {
+ return;
+ }
+
+ qemu_opts_from_qdict(list, qdict, errp);
+}
+
+int qemu_config_parse(FILE *fp, QemuOptsList **lists, const char *fname, Error **errp)
+{
+ return qemu_config_foreach(fp, qemu_config_do_parse, lists, fname, errp);
+}
+
+int qemu_read_config_file(const char *filename, QEMUConfigCB *cb, Error **errp)
+{
+ FILE *f = fopen(filename, "r");
+ int ret;
+
+ if (f == NULL) {
+ error_setg_file_open(errp, errno, filename);
+ return -errno;
+ }
+
+ ret = qemu_config_foreach(f, cb, vm_config_groups, filename, errp);
+ fclose(f);
+ return ret;
+}
+
+static void config_parse_qdict_section(QDict *options, QemuOptsList *opts,
+ Error **errp)
+{
+ QemuOpts *subopts;
+ QDict *subqdict;
+ QList *list = NULL;
+ size_t orig_size, enum_size;
+ char *prefix;
+
+ prefix = g_strdup_printf("%s.", opts->name);
+ qdict_extract_subqdict(options, &subqdict, prefix);
+ g_free(prefix);
+ orig_size = qdict_size(subqdict);
+ if (!orig_size) {
+ goto out;
+ }
+
+ subopts = qemu_opts_create(opts, NULL, 0, errp);
+ if (!subopts) {
+ goto out;
+ }
+
+ if (!qemu_opts_absorb_qdict(subopts, subqdict, errp)) {
+ goto out;
+ }
+
+ enum_size = qdict_size(subqdict);
+ if (enum_size < orig_size && enum_size) {
+ error_setg(errp, "Unknown option '%s' for [%s]",
+ qdict_first(subqdict)->key, opts->name);
+ goto out;
+ }
+
+ if (enum_size) {
+ /* Multiple, enumerated sections */
+ QListEntry *list_entry;
+ unsigned i = 0;
+
+ /* Not required anymore */
+ qemu_opts_del(subopts);
+
+ qdict_array_split(subqdict, &list);
+ if (qdict_size(subqdict)) {
+ error_setg(errp, "Unused option '%s' for [%s]",
+ qdict_first(subqdict)->key, opts->name);
+ goto out;
+ }
+
+ QLIST_FOREACH_ENTRY(list, list_entry) {
+ QDict *section = qobject_to(QDict, qlist_entry_obj(list_entry));
+ char *opt_name;
+
+ if (!section) {
+ error_setg(errp, "[%s] section (index %u) does not consist of "
+ "keys", opts->name, i);
+ goto out;
+ }
+
+ opt_name = g_strdup_printf("%s.%u", opts->name, i++);
+ subopts = qemu_opts_create(opts, opt_name, 1, errp);
+ g_free(opt_name);
+ if (!subopts) {
+ goto out;
+ }
+
+ if (!qemu_opts_absorb_qdict(subopts, section, errp)) {
+ qemu_opts_del(subopts);
+ goto out;
+ }
+
+ if (qdict_size(section)) {
+ error_setg(errp, "[%s] section doesn't support the option '%s'",
+ opts->name, qdict_first(section)->key);
+ qemu_opts_del(subopts);
+ goto out;
+ }
+ }
+ }
+
+out:
+ qobject_unref(subqdict);
+ qobject_unref(list);
+}
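+
+/*
+ * Example of the two shapes handled above (hypothetical keys), for a
+ * QemuOptsList named "foo":
+ *
+ *   foo.bar=x                  -> one [foo] section with bar=x
+ *   foo.0.bar=x, foo.1.bar=y   -> two enumerated [foo] sections
+ *
+ * Mixing both shapes for the same group is rejected as an unknown option.
+ */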
+
+void qemu_config_parse_qdict(QDict *options, QemuOptsList **lists,
+ Error **errp)
+{
+ int i;
+ Error *local_err = NULL;
+
+ for (i = 0; lists[i]; i++) {
+ config_parse_qdict_section(options, lists[i], &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ }
+}
diff --git a/util/qemu-coroutine-io.c b/util/qemu-coroutine-io.c
new file mode 100644
index 000000000..5b80bb416
--- /dev/null
+++ b/util/qemu-coroutine-io.c
@@ -0,0 +1,93 @@
+/*
+ * Coroutine-aware I/O functions
+ *
+ * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
+ * Copyright (c) 2011, Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/sockets.h"
+#include "qemu/coroutine.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+
+ssize_t coroutine_fn
+qemu_co_sendv_recvv(int sockfd, struct iovec *iov, unsigned iov_cnt,
+ size_t offset, size_t bytes, bool do_send)
+{
+ size_t done = 0;
+ ssize_t ret;
+ while (done < bytes) {
+ ret = iov_send_recv(sockfd, iov, iov_cnt,
+ offset + done, bytes - done, do_send);
+ if (ret > 0) {
+ done += ret;
+ } else if (ret < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK) {
+ qemu_coroutine_yield();
+ } else if (done == 0) {
+ return -errno;
+ } else {
+ break;
+ }
+ } else if (ret == 0 && !do_send) {
+ /* send should never return 0; recv returns 0 at end-of-file.
+ * Retrying a recv that returned 0 is pointless, so stop here;
+ * a send that returned 0 is retried anyway, just in case. */
+ break;
+ }
+ }
+ return done;
+}
+
+ssize_t coroutine_fn
+qemu_co_send_recv(int sockfd, void *buf, size_t bytes, bool do_send)
+{
+ struct iovec iov = { .iov_base = buf, .iov_len = bytes };
+ return qemu_co_sendv_recvv(sockfd, &iov, 1, 0, bytes, do_send);
+}
+
+typedef struct {
+ AioContext *ctx;
+ Coroutine *co;
+ int fd;
+} FDYieldUntilData;
+
+static void fd_coroutine_enter(void *opaque)
+{
+ FDYieldUntilData *data = opaque;
+ aio_set_fd_handler(data->ctx, data->fd, false, NULL, NULL, NULL, NULL);
+ qemu_coroutine_enter(data->co);
+}
+
+void coroutine_fn yield_until_fd_readable(int fd)
+{
+ FDYieldUntilData data;
+
+ assert(qemu_in_coroutine());
+ data.ctx = qemu_get_current_aio_context();
+ data.co = qemu_coroutine_self();
+ data.fd = fd;
+ aio_set_fd_handler(
+ data.ctx, fd, false, fd_coroutine_enter, NULL, NULL, &data);
+ qemu_coroutine_yield();
+}
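+
+/*
+ * Usage sketch, from coroutine context (hypothetical non-blocking fd):
+ *
+ *   ssize_t n;
+ *   do {
+ *       n = read(fd, buf, sizeof(buf));
+ *       if (n < 0 && errno == EAGAIN) {
+ *           yield_until_fd_readable(fd);  // resumed by the fd handler
+ *       }
+ *   } while (n < 0 && (errno == EAGAIN || errno == EINTR));
+ */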
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
new file mode 100644
index 000000000..266940383
--- /dev/null
+++ b/util/qemu-coroutine-lock.c
@@ -0,0 +1,467 @@
+/*
+ * coroutine queues and locks
+ *
+ * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * The lock-free mutex implementation is based on OSv
+ * (core/lfmutex.cc, include/lockfree/mutex.hh).
+ * Copyright (C) 2013 Cloudius Systems, Ltd.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine.h"
+#include "qemu/coroutine_int.h"
+#include "qemu/processor.h"
+#include "qemu/queue.h"
+#include "block/aio.h"
+#include "trace.h"
+
+void qemu_co_queue_init(CoQueue *queue)
+{
+ QSIMPLEQ_INIT(&queue->entries);
+}
+
+void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock)
+{
+ Coroutine *self = qemu_coroutine_self();
+ QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
+
+ if (lock) {
+ qemu_lockable_unlock(lock);
+ }
+
+ /* There is no race condition here. Other threads will call
+ * aio_co_schedule on our AioContext, which can reenter this
+ * coroutine but only after this yield and after the main loop
+ * has gone through the next iteration.
+ */
+ qemu_coroutine_yield();
+ assert(qemu_in_coroutine());
+
+ /* TODO: OSv implements wait morphing here, where the wakeup
+ * primitive automatically places the woken coroutine on the
+ * mutex's queue. This avoids the thundering herd effect.
+ * This could be implemented for CoMutexes, but not really for
+ * other cases of QemuLockable.
+ */
+ if (lock) {
+ qemu_lockable_lock(lock);
+ }
+}
+
+static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)
+{
+ Coroutine *next;
+
+ if (QSIMPLEQ_EMPTY(&queue->entries)) {
+ return false;
+ }
+
+ while ((next = QSIMPLEQ_FIRST(&queue->entries)) != NULL) {
+ QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
+ aio_co_wake(next);
+ if (single) {
+ break;
+ }
+ }
+ return true;
+}
+
+bool qemu_co_queue_next(CoQueue *queue)
+{
+ return qemu_co_queue_do_restart(queue, true);
+}
+
+void qemu_co_queue_restart_all(CoQueue *queue)
+{
+ qemu_co_queue_do_restart(queue, false);
+}
+
+bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable *lock)
+{
+ Coroutine *next;
+
+ next = QSIMPLEQ_FIRST(&queue->entries);
+ if (!next) {
+ return false;
+ }
+
+ QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
+ if (lock) {
+ qemu_lockable_unlock(lock);
+ }
+ aio_co_wake(next);
+ if (lock) {
+ qemu_lockable_lock(lock);
+ }
+ return true;
+}
+
+bool qemu_co_queue_empty(CoQueue *queue)
+{
+ return QSIMPLEQ_FIRST(&queue->entries) == NULL;
+}
+
+/* The wait records are handled with a multiple-producer, single-consumer
+ * lock-free queue. There cannot be two concurrent pop_waiter() calls
+ * because pop_waiter() can only be called while mutex->handoff is zero.
+ * This can happen in three cases:
+ * - in qemu_co_mutex_unlock, before the hand-off protocol has started.
+ * In this case, qemu_co_mutex_lock will see mutex->handoff == 0 and
+ * not take part in the handoff.
+ * - in qemu_co_mutex_lock, if it steals the hand-off responsibility from
+ * qemu_co_mutex_unlock. In this case, qemu_co_mutex_unlock will fail
+ * the cmpxchg (it will see either 0 or the next sequence value) and
+ * exit. The next hand-off cannot begin until qemu_co_mutex_lock has
+ * woken up someone.
+ * - in qemu_co_mutex_unlock, if it takes the hand-off token itself.
+ * In this case another iteration starts with mutex->handoff == 0;
+ * a concurrent qemu_co_mutex_lock will fail the cmpxchg, and
+ * qemu_co_mutex_unlock will go back to case (1).
+ *
+ * The following functions manage this queue.
+ */
+typedef struct CoWaitRecord {
+ Coroutine *co;
+ QSLIST_ENTRY(CoWaitRecord) next;
+} CoWaitRecord;
+
+static void push_waiter(CoMutex *mutex, CoWaitRecord *w)
+{
+ w->co = qemu_coroutine_self();
+ QSLIST_INSERT_HEAD_ATOMIC(&mutex->from_push, w, next);
+}
+
+static void move_waiters(CoMutex *mutex)
+{
+ QSLIST_HEAD(, CoWaitRecord) reversed;
+ QSLIST_MOVE_ATOMIC(&reversed, &mutex->from_push);
+ while (!QSLIST_EMPTY(&reversed)) {
+ CoWaitRecord *w = QSLIST_FIRST(&reversed);
+ QSLIST_REMOVE_HEAD(&reversed, next);
+ QSLIST_INSERT_HEAD(&mutex->to_pop, w, next);
+ }
+}
+
+static CoWaitRecord *pop_waiter(CoMutex *mutex)
+{
+ CoWaitRecord *w;
+
+ if (QSLIST_EMPTY(&mutex->to_pop)) {
+ move_waiters(mutex);
+ if (QSLIST_EMPTY(&mutex->to_pop)) {
+ return NULL;
+ }
+ }
+ w = QSLIST_FIRST(&mutex->to_pop);
+ QSLIST_REMOVE_HEAD(&mutex->to_pop, next);
+ return w;
+}
+
+static bool has_waiters(CoMutex *mutex)
+{
+ return !QSLIST_EMPTY(&mutex->to_pop) || !QSLIST_EMPTY(&mutex->from_push);
+}
+
+void qemu_co_mutex_init(CoMutex *mutex)
+{
+ memset(mutex, 0, sizeof(*mutex));
+}
+
+static void coroutine_fn qemu_co_mutex_wake(CoMutex *mutex, Coroutine *co)
+{
+ /* Read co before co->ctx; pairs with smp_wmb() in
+ * qemu_coroutine_enter().
+ */
+ smp_read_barrier_depends();
+ mutex->ctx = co->ctx;
+ aio_co_wake(co);
+}
+
+static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
+ CoMutex *mutex)
+{
+ Coroutine *self = qemu_coroutine_self();
+ CoWaitRecord w;
+ unsigned old_handoff;
+
+ trace_qemu_co_mutex_lock_entry(mutex, self);
+ push_waiter(mutex, &w);
+
+ /* This is the "Responsibility Hand-Off" protocol: a lock() can take
+ * over from a concurrent unlock() the responsibility of waking
+ * somebody up.
+ */
+ old_handoff = qatomic_mb_read(&mutex->handoff);
+ if (old_handoff &&
+ has_waiters(mutex) &&
+ qatomic_cmpxchg(&mutex->handoff, old_handoff, 0) == old_handoff) {
+ /* There can be no concurrent pops, because there can be only
+ * one active handoff at a time.
+ */
+ CoWaitRecord *to_wake = pop_waiter(mutex);
+ Coroutine *co = to_wake->co;
+ if (co == self) {
+ /* We got the lock ourselves! */
+ assert(to_wake == &w);
+ mutex->ctx = ctx;
+ return;
+ }
+
+ qemu_co_mutex_wake(mutex, co);
+ }
+
+ qemu_coroutine_yield();
+ trace_qemu_co_mutex_lock_return(mutex, self);
+}
+
+void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
+{
+ AioContext *ctx = qemu_get_current_aio_context();
+ Coroutine *self = qemu_coroutine_self();
+ int waiters, i;
+
+ /* Running a very small critical section on pthread_mutex_t and CoMutex
+ * shows that pthread_mutex_t is much faster because it doesn't actually
+ * go to sleep. What happens is that the critical section is shorter
+ * than the latency of entering the kernel and thus FUTEX_WAIT always
+ * fails. With CoMutex there is no such latency but you still want to
+ * avoid wait and wakeup. So introduce it artificially.
+ */
+ i = 0;
+retry_fast_path:
+ waiters = qatomic_cmpxchg(&mutex->locked, 0, 1);
+ if (waiters != 0) {
+ while (waiters == 1 && ++i < 1000) {
+ if (qatomic_read(&mutex->ctx) == ctx) {
+ break;
+ }
+ if (qatomic_read(&mutex->locked) == 0) {
+ goto retry_fast_path;
+ }
+ cpu_relax();
+ }
+ waiters = qatomic_fetch_inc(&mutex->locked);
+ }
+
+ if (waiters == 0) {
+ /* Uncontended. */
+ trace_qemu_co_mutex_lock_uncontended(mutex, self);
+ mutex->ctx = ctx;
+ } else {
+ qemu_co_mutex_lock_slowpath(ctx, mutex);
+ }
+ mutex->holder = self;
+ self->locks_held++;
+}
+
+void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
+{
+ Coroutine *self = qemu_coroutine_self();
+
+ trace_qemu_co_mutex_unlock_entry(mutex, self);
+
+ assert(mutex->locked);
+ assert(mutex->holder == self);
+ assert(qemu_in_coroutine());
+
+ mutex->ctx = NULL;
+ mutex->holder = NULL;
+ self->locks_held--;
+ if (qatomic_fetch_dec(&mutex->locked) == 1) {
+ /* No waiting qemu_co_mutex_lock(). Pfew, that was easy! */
+ return;
+ }
+
+ for (;;) {
+ CoWaitRecord *to_wake = pop_waiter(mutex);
+ unsigned our_handoff;
+
+ if (to_wake) {
+ qemu_co_mutex_wake(mutex, to_wake->co);
+ break;
+ }
+
+ /* Some concurrent lock() is in progress (we know this because
+ * mutex->locked was >1) but it hasn't yet put itself on the wait
+ * queue. Pick a sequence number for the handoff protocol (not 0).
+ */
+ if (++mutex->sequence == 0) {
+ mutex->sequence = 1;
+ }
+
+ our_handoff = mutex->sequence;
+ qatomic_mb_set(&mutex->handoff, our_handoff);
+ if (!has_waiters(mutex)) {
+ /* The concurrent lock has not added itself yet, so it
+ * will be able to pick our handoff.
+ */
+ break;
+ }
+
+ /* Try to do the handoff protocol ourselves; if somebody else has
+ * already taken it, however, we're done and they're responsible.
+ */
+ if (qatomic_cmpxchg(&mutex->handoff, our_handoff, 0) != our_handoff) {
+ break;
+ }
+ }
+
+ trace_qemu_co_mutex_unlock_return(mutex, self);
+}
+
+struct CoRwTicket {
+ bool read;
+ Coroutine *co;
+ QSIMPLEQ_ENTRY(CoRwTicket) next;
+};
+
+void qemu_co_rwlock_init(CoRwlock *lock)
+{
+ qemu_co_mutex_init(&lock->mutex);
+ lock->owners = 0;
+ QSIMPLEQ_INIT(&lock->tickets);
+}
+
+/* Releases the internal CoMutex. */
+static void qemu_co_rwlock_maybe_wake_one(CoRwlock *lock)
+{
+ CoRwTicket *tkt = QSIMPLEQ_FIRST(&lock->tickets);
+ Coroutine *co = NULL;
+
+ /*
+ * Setting lock->owners here prevents rdlock and wrlock from
+ * sneaking in between unlock and wake.
+ */
+
+ if (tkt) {
+ if (tkt->read) {
+ if (lock->owners >= 0) {
+ lock->owners++;
+ co = tkt->co;
+ }
+ } else {
+ if (lock->owners == 0) {
+ lock->owners = -1;
+ co = tkt->co;
+ }
+ }
+ }
+
+ if (co) {
+ QSIMPLEQ_REMOVE_HEAD(&lock->tickets, next);
+ qemu_co_mutex_unlock(&lock->mutex);
+ aio_co_wake(co);
+ } else {
+ qemu_co_mutex_unlock(&lock->mutex);
+ }
+}
+
+void qemu_co_rwlock_rdlock(CoRwlock *lock)
+{
+ Coroutine *self = qemu_coroutine_self();
+
+ qemu_co_mutex_lock(&lock->mutex);
+ /* For fairness, wait if a writer is in line. */
+ if (lock->owners == 0 || (lock->owners > 0 && QSIMPLEQ_EMPTY(&lock->tickets))) {
+ lock->owners++;
+ qemu_co_mutex_unlock(&lock->mutex);
+ } else {
+ CoRwTicket my_ticket = { true, self };
+
+ QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
+ qemu_co_mutex_unlock(&lock->mutex);
+ qemu_coroutine_yield();
+ assert(lock->owners >= 1);
+
+ /* Possibly wake another reader, which will wake the next in line. */
+ qemu_co_mutex_lock(&lock->mutex);
+ qemu_co_rwlock_maybe_wake_one(lock);
+ }
+
+ self->locks_held++;
+}
+
+void qemu_co_rwlock_unlock(CoRwlock *lock)
+{
+ Coroutine *self = qemu_coroutine_self();
+
+ assert(qemu_in_coroutine());
+ self->locks_held--;
+
+ qemu_co_mutex_lock(&lock->mutex);
+ if (lock->owners > 0) {
+ lock->owners--;
+ } else {
+ assert(lock->owners == -1);
+ lock->owners = 0;
+ }
+
+ qemu_co_rwlock_maybe_wake_one(lock);
+}
+
+void qemu_co_rwlock_downgrade(CoRwlock *lock)
+{
+ qemu_co_mutex_lock(&lock->mutex);
+ assert(lock->owners == -1);
+ lock->owners = 1;
+
+ /* Possibly wake another reader, which will wake the next in line. */
+ qemu_co_rwlock_maybe_wake_one(lock);
+}
+
+void qemu_co_rwlock_wrlock(CoRwlock *lock)
+{
+ Coroutine *self = qemu_coroutine_self();
+
+ qemu_co_mutex_lock(&lock->mutex);
+ if (lock->owners == 0) {
+ lock->owners = -1;
+ qemu_co_mutex_unlock(&lock->mutex);
+ } else {
+ CoRwTicket my_ticket = { false, qemu_coroutine_self() };
+
+ QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
+ qemu_co_mutex_unlock(&lock->mutex);
+ qemu_coroutine_yield();
+ assert(lock->owners == -1);
+ }
+
+ self->locks_held++;
+}
+
+void qemu_co_rwlock_upgrade(CoRwlock *lock)
+{
+ qemu_co_mutex_lock(&lock->mutex);
+ assert(lock->owners > 0);
+ /* For fairness, wait if a writer is in line. */
+ if (lock->owners == 1 && QSIMPLEQ_EMPTY(&lock->tickets)) {
+ lock->owners = -1;
+ qemu_co_mutex_unlock(&lock->mutex);
+ } else {
+ CoRwTicket my_ticket = { false, qemu_coroutine_self() };
+
+ lock->owners--;
+ QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
+ qemu_co_rwlock_maybe_wake_one(lock);
+ qemu_coroutine_yield();
+ assert(lock->owners == -1);
+ }
+}
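+
+/*
+ * Usage sketch, from coroutine context:
+ *
+ *   CoRwlock lock;
+ *   qemu_co_rwlock_init(&lock);
+ *
+ *   qemu_co_rwlock_rdlock(&lock);     // shared: owners > 0
+ *   // ...read...
+ *   qemu_co_rwlock_unlock(&lock);
+ *
+ *   qemu_co_rwlock_wrlock(&lock);     // exclusive: owners == -1
+ *   // ...write...
+ *   qemu_co_rwlock_downgrade(&lock);  // keep it as a read lock
+ *   qemu_co_rwlock_unlock(&lock);
+ */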
diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
new file mode 100644
index 000000000..571ab521f
--- /dev/null
+++ b/util/qemu-coroutine-sleep.c
@@ -0,0 +1,80 @@
+/*
+ * QEMU coroutine sleep
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine.h"
+#include "qemu/coroutine_int.h"
+#include "qemu/timer.h"
+#include "block/aio.h"
+
+static const char *qemu_co_sleep_ns__scheduled = "qemu_co_sleep_ns";
+
+void qemu_co_sleep_wake(QemuCoSleep *w)
+{
+ Coroutine *co;
+
+ co = w->to_wake;
+ w->to_wake = NULL;
+ if (co) {
+ /* Write of schedule protected by barrier write in aio_co_schedule */
+ const char *scheduled = qatomic_cmpxchg(&co->scheduled,
+ qemu_co_sleep_ns__scheduled, NULL);
+
+ assert(scheduled == qemu_co_sleep_ns__scheduled);
+ aio_co_wake(co);
+ }
+}
+
+static void co_sleep_cb(void *opaque)
+{
+ QemuCoSleep *w = opaque;
+ qemu_co_sleep_wake(w);
+}
+
+void coroutine_fn qemu_co_sleep(QemuCoSleep *w)
+{
+ Coroutine *co = qemu_coroutine_self();
+
+ const char *scheduled = qatomic_cmpxchg(&co->scheduled, NULL,
+ qemu_co_sleep_ns__scheduled);
+ if (scheduled) {
+ fprintf(stderr,
+ "%s: Co-routine was already scheduled in '%s'\n",
+ __func__, scheduled);
+ abort();
+ }
+
+ w->to_wake = co;
+ qemu_coroutine_yield();
+
+ /* w->to_wake is cleared before resuming this coroutine. */
+ assert(w->to_wake == NULL);
+}
+
+void coroutine_fn qemu_co_sleep_ns_wakeable(QemuCoSleep *w,
+ QEMUClockType type, int64_t ns)
+{
+ AioContext *ctx = qemu_get_current_aio_context();
+ QEMUTimer ts;
+
+ aio_timer_init(ctx, &ts, type, SCALE_NS, co_sleep_cb, w);
+ timer_mod(&ts, qemu_clock_get_ns(type) + ns);
+
+ /*
+ * The timer will fire in the current AioContext, so the callback
+ * must happen after qemu_co_sleep yields and there is no race
+ * between timer_mod and qemu_co_sleep.
+ */
+ qemu_co_sleep(w);
+ timer_del(&ts);
+}
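+
+/*
+ * Usage sketch: an interruptible one-second delay (hypothetical wait state):
+ *
+ *   QemuCoSleep w = { .to_wake = NULL };
+ *
+ *   // In the sleeping coroutine:
+ *   qemu_co_sleep_ns_wakeable(&w, QEMU_CLOCK_REALTIME, 1000000000);
+ *
+ *   // From elsewhere, to cut the sleep short:
+ *   qemu_co_sleep_wake(&w);
+ */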
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
new file mode 100644
index 000000000..38fb6d308
--- /dev/null
+++ b/util/qemu-coroutine.c
@@ -0,0 +1,204 @@
+/*
+ * QEMU coroutines
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Kevin Wolf <kwolf@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "trace.h"
+#include "qemu/thread.h"
+#include "qemu/atomic.h"
+#include "qemu/coroutine.h"
+#include "qemu/coroutine_int.h"
+#include "block/aio.h"
+
+enum {
+ POOL_BATCH_SIZE = 64,
+};
+
+/** Free list to speed up creation */
+static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
+static unsigned int release_pool_size;
+static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
+static __thread unsigned int alloc_pool_size;
+static __thread Notifier coroutine_pool_cleanup_notifier;
+
+static void coroutine_pool_cleanup(Notifier *n, void *value)
+{
+ Coroutine *co;
+ Coroutine *tmp;
+
+ QSLIST_FOREACH_SAFE(co, &alloc_pool, pool_next, tmp) {
+ QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
+ qemu_coroutine_delete(co);
+ }
+}
+
+Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
+{
+ Coroutine *co = NULL;
+
+ if (CONFIG_COROUTINE_POOL) {
+ co = QSLIST_FIRST(&alloc_pool);
+ if (!co) {
+ if (release_pool_size > POOL_BATCH_SIZE) {
+ /* Slow path; a good place to register the destructor, too. */
+ if (!coroutine_pool_cleanup_notifier.notify) {
+ coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
+ qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
+ }
+
+ /* This is not exact; there could be a little skew between
+ * release_pool_size and the actual size of release_pool. But
+ * it is just a heuristic, it does not need to be perfect.
+ */
+ alloc_pool_size = qatomic_xchg(&release_pool_size, 0);
+ QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
+ co = QSLIST_FIRST(&alloc_pool);
+ }
+ }
+ if (co) {
+ QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
+ alloc_pool_size--;
+ }
+ }
+
+ if (!co) {
+ co = qemu_coroutine_new();
+ }
+
+ co->entry = entry;
+ co->entry_arg = opaque;
+ QSIMPLEQ_INIT(&co->co_queue_wakeup);
+ return co;
+}
+
+static void coroutine_delete(Coroutine *co)
+{
+ co->caller = NULL;
+
+ if (CONFIG_COROUTINE_POOL) {
+ if (release_pool_size < POOL_BATCH_SIZE * 2) {
+ QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
+ qatomic_inc(&release_pool_size);
+ return;
+ }
+ if (alloc_pool_size < POOL_BATCH_SIZE) {
+ QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
+ alloc_pool_size++;
+ return;
+ }
+ }
+
+ qemu_coroutine_delete(co);
+}
+
+void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
+{
+ QSIMPLEQ_HEAD(, Coroutine) pending = QSIMPLEQ_HEAD_INITIALIZER(pending);
+ Coroutine *from = qemu_coroutine_self();
+
+ QSIMPLEQ_INSERT_TAIL(&pending, co, co_queue_next);
+
+ /* Run co and any queued coroutines */
+ while (!QSIMPLEQ_EMPTY(&pending)) {
+ Coroutine *to = QSIMPLEQ_FIRST(&pending);
+ CoroutineAction ret;
+
+ /* Cannot rely on the read barrier for to in aio_co_wake(), as there are
+ * callers outside of aio_co_wake() */
+ const char *scheduled = qatomic_mb_read(&to->scheduled);
+
+ QSIMPLEQ_REMOVE_HEAD(&pending, co_queue_next);
+
+ trace_qemu_aio_coroutine_enter(ctx, from, to, to->entry_arg);
+
+ /* if the Coroutine has already been scheduled, entering it again will
+ * cause us to enter it twice, potentially even after the coroutine has
+ * been deleted */
+ if (scheduled) {
+ fprintf(stderr,
+ "%s: Co-routine was already scheduled in '%s'\n",
+ __func__, scheduled);
+ abort();
+ }
+
+ if (to->caller) {
+ fprintf(stderr, "Co-routine re-entered recursively\n");
+ abort();
+ }
+
+ to->caller = from;
+ to->ctx = ctx;
+
+ /* Store to->ctx before anything that stores to. Matches
+ * barrier in aio_co_wake and qemu_co_mutex_wake.
+ */
+ smp_wmb();
+
+ ret = qemu_coroutine_switch(from, to, COROUTINE_ENTER);
+
+ /* Queued coroutines are run depth-first; previously pending coroutines
+ * run after those queued more recently.
+ */
+ QSIMPLEQ_PREPEND(&pending, &to->co_queue_wakeup);
+
+ switch (ret) {
+ case COROUTINE_YIELD:
+ break;
+ case COROUTINE_TERMINATE:
+ assert(!to->locks_held);
+ trace_qemu_coroutine_terminate(to);
+ coroutine_delete(to);
+ break;
+ default:
+ abort();
+ }
+ }
+}
+
+void qemu_coroutine_enter(Coroutine *co)
+{
+ qemu_aio_coroutine_enter(qemu_get_current_aio_context(), co);
+}
+
+void qemu_coroutine_enter_if_inactive(Coroutine *co)
+{
+ if (!qemu_coroutine_entered(co)) {
+ qemu_coroutine_enter(co);
+ }
+}
+
+void coroutine_fn qemu_coroutine_yield(void)
+{
+ Coroutine *self = qemu_coroutine_self();
+ Coroutine *to = self->caller;
+
+ trace_qemu_coroutine_yield(self, to);
+
+ if (!to) {
+ fprintf(stderr, "Co-routine is yielding to no one\n");
+ abort();
+ }
+
+ self->caller = NULL;
+ qemu_coroutine_switch(self, to, COROUTINE_YIELD);
+}
+
+bool qemu_coroutine_entered(Coroutine *co)
+{
+ return co->caller;
+}
+
+AioContext *coroutine_fn qemu_coroutine_get_aio_context(Coroutine *co)
+{
+ return co->ctx;
+}
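+
+/*
+ * Usage sketch (hypothetical entry point):
+ *
+ *   static void coroutine_fn my_co(void *opaque)
+ *   {
+ *       // ...
+ *       qemu_coroutine_yield();  // back to the caller/main loop
+ *       // ...resumed by a later enter/wake...
+ *   }
+ *
+ *   Coroutine *co = qemu_coroutine_create(my_co, opaque);
+ *   qemu_coroutine_enter(co);    // runs my_co until it yields or returns
+ */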
diff --git a/util/qemu-error.c b/util/qemu-error.c
new file mode 100644
index 000000000..52a9e013c
--- /dev/null
+++ b/util/qemu-error.c
@@ -0,0 +1,413 @@
+/*
+ * Error reporting
+ *
+ * Copyright (C) 2010 Red Hat Inc.
+ *
+ * Authors:
+ * Markus Armbruster <armbru@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "monitor/monitor.h"
+#include "qemu/error-report.h"
+
+/*
+ * @report_type is the type of message: error, warning or
+ * informational.
+ */
+typedef enum {
+ REPORT_TYPE_ERROR,
+ REPORT_TYPE_WARNING,
+ REPORT_TYPE_INFO,
+} report_type;
+
+/* Prepend timestamp to messages */
+bool message_with_timestamp;
+bool error_with_guestname;
+const char *error_guest_name;
+
+int error_printf(const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = error_vprintf(fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+int error_printf_unless_qmp(const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = error_vprintf_unless_qmp(fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+static Location std_loc = {
+ .kind = LOC_NONE
+};
+static Location *cur_loc = &std_loc;
+
+/*
+ * Push location saved in LOC onto the location stack, return it.
+ * The top of that stack is the current location.
+ * Needs a matching loc_pop().
+ */
+Location *loc_push_restore(Location *loc)
+{
+ assert(!loc->prev);
+ loc->prev = cur_loc;
+ cur_loc = loc;
+ return loc;
+}
+
+/*
+ * Initialize *LOC to "nowhere", push it onto the location stack.
+ * The top of that stack is the current location.
+ * Needs a matching loc_pop().
+ * Return LOC.
+ */
+Location *loc_push_none(Location *loc)
+{
+ loc->kind = LOC_NONE;
+ loc->prev = NULL;
+ return loc_push_restore(loc);
+}
+
+/*
+ * Pop the location stack.
+ * LOC must be the current location, i.e. the top of the stack.
+ */
+Location *loc_pop(Location *loc)
+{
+ assert(cur_loc == loc && loc->prev);
+ cur_loc = loc->prev;
+ loc->prev = NULL;
+ return loc;
+}
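+
+/*
+ * Typical pairing (hypothetical caller):
+ *
+ *   Location loc;
+ *
+ *   loc_push_none(&loc);
+ *   loc_set_file("vm.cfg", 12);  // reports now say "vm.cfg:12: ..."
+ *   // ...parse...
+ *   loc_pop(&loc);               // restore the previous location
+ */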
+
+/*
+ * Save the current location in LOC, return LOC.
+ */
+Location *loc_save(Location *loc)
+{
+ *loc = *cur_loc;
+ loc->prev = NULL;
+ return loc;
+}
+
+/*
+ * Change the current location to the one saved in LOC.
+ */
+void loc_restore(Location *loc)
+{
+ Location *prev = cur_loc->prev;
+ assert(!loc->prev);
+ *cur_loc = *loc;
+ cur_loc->prev = prev;
+}
+
+/*
+ * Change the current location to "nowhere in particular".
+ */
+void loc_set_none(void)
+{
+ cur_loc->kind = LOC_NONE;
+}
+
+/*
+ * Change the current location to argument ARGV[IDX..IDX+CNT-1].
+ */
+void loc_set_cmdline(char **argv, int idx, int cnt)
+{
+ cur_loc->kind = LOC_CMDLINE;
+ cur_loc->num = cnt;
+ cur_loc->ptr = argv + idx;
+}
+
+/*
+ * Change the current location to file FNAME, line LNO.
+ */
+void loc_set_file(const char *fname, int lno)
+{
+ assert(fname || cur_loc->kind == LOC_FILE);
+ cur_loc->kind = LOC_FILE;
+ cur_loc->num = lno;
+ if (fname) {
+ cur_loc->ptr = fname;
+ }
+}
+
+static const char *progname;
+
+/*
+ * Set the program name for error_print_loc().
+ */
+static void error_set_progname(const char *argv0)
+{
+ const char *p = strrchr(argv0, '/');
+ progname = p ? p + 1 : argv0;
+}
+
+const char *error_get_progname(void)
+{
+ return progname;
+}
+
+/*
+ * Print current location to current monitor if we have one, else to stderr.
+ */
+static void print_loc(void)
+{
+ const char *sep = "";
+ int i;
+ const char *const *argp;
+
+ if (!monitor_cur() && progname) {
+ fprintf(stderr, "%s:", progname);
+ sep = " ";
+ }
+ switch (cur_loc->kind) {
+ case LOC_CMDLINE:
+ argp = cur_loc->ptr;
+ for (i = 0; i < cur_loc->num; i++) {
+ error_printf("%s%s", sep, argp[i]);
+ sep = " ";
+ }
+ error_printf(": ");
+ break;
+ case LOC_FILE:
+ error_printf("%s:", (const char *)cur_loc->ptr);
+ if (cur_loc->num) {
+ error_printf("%d:", cur_loc->num);
+ }
+ error_printf(" ");
+ break;
+ default:
+ error_printf("%s", sep);
+ }
+}
+
+/*
+ * Print a message to current monitor if we have one, else to stderr.
+ * @report_type is the type of message: error, warning or informational.
+ * Format arguments like vsprintf(). The resulting message should be
+ * a single phrase, with no newline or trailing punctuation.
+ * Prepend the current location and append a newline.
+ */
+static void vreport(report_type type, const char *fmt, va_list ap)
+{
+ GTimeVal tv;
+ gchar *timestr;
+
+ if (message_with_timestamp && !monitor_cur()) {
+ g_get_current_time(&tv);
+ timestr = g_time_val_to_iso8601(&tv);
+ error_printf("%s ", timestr);
+ g_free(timestr);
+ }
+
+ /* Only prepend guest name if -msg guest-name and -name guest=... are set */
+ if (error_with_guestname && error_guest_name && !monitor_cur()) {
+ error_printf("%s ", error_guest_name);
+ }
+
+ print_loc();
+
+ switch (type) {
+ case REPORT_TYPE_ERROR:
+ break;
+ case REPORT_TYPE_WARNING:
+ error_printf("warning: ");
+ break;
+ case REPORT_TYPE_INFO:
+ error_printf("info: ");
+ break;
+ }
+
+ error_vprintf(fmt, ap);
+ error_printf("\n");
+}
+
+/*
+ * Print an error message to current monitor if we have one, else to stderr.
+ * Format arguments like vsprintf(). The resulting message should be
+ * a single phrase, with no newline or trailing punctuation.
+ * Prepend the current location and append a newline.
+ * It's wrong to call this in a QMP monitor. Use error_setg() there.
+ */
+void error_vreport(const char *fmt, va_list ap)
+{
+ vreport(REPORT_TYPE_ERROR, fmt, ap);
+}
+
+/*
+ * Print a warning message to current monitor if we have one, else to stderr.
+ * Format arguments like vsprintf(). The resulting message should be
+ * a single phrase, with no newline or trailing punctuation.
+ * Prepend the current location and append a newline.
+ */
+void warn_vreport(const char *fmt, va_list ap)
+{
+ vreport(REPORT_TYPE_WARNING, fmt, ap);
+}
+
+/*
+ * Print an information message to current monitor if we have one, else to
+ * stderr.
+ * Format arguments like vsprintf(). The resulting message should be
+ * a single phrase, with no newline or trailing punctuation.
+ * Prepend the current location and append a newline.
+ */
+void info_vreport(const char *fmt, va_list ap)
+{
+ vreport(REPORT_TYPE_INFO, fmt, ap);
+}
+
+/*
+ * Print an error message to current monitor if we have one, else to stderr.
+ * Format arguments like sprintf(). The resulting message should be
+ * a single phrase, with no newline or trailing punctuation.
+ * Prepend the current location and append a newline.
+ * It's wrong to call this in a QMP monitor. Use error_setg() there.
+ */
+void error_report(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vreport(REPORT_TYPE_ERROR, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Print a warning message to current monitor if we have one, else to stderr.
+ * Format arguments like sprintf(). The resulting message should be a
+ * single phrase, with no newline or trailing punctuation.
+ * Prepend the current location and append a newline.
+ */
+void warn_report(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vreport(REPORT_TYPE_WARNING, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Print an information message to current monitor if we have one, else to
+ * stderr.
+ * Format arguments like sprintf(). The resulting message should be a
+ * single phrase, with no newline or trailing punctuation.
+ * Prepend the current location and append a newline.
+ */
+void info_report(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vreport(REPORT_TYPE_INFO, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Like error_report(), except print just once.
+ * If *printed is false, print the message, and flip *printed to true.
+ * Return whether the message was printed.
+ */
+bool error_report_once_cond(bool *printed, const char *fmt, ...)
+{
+ va_list ap;
+
+ assert(printed);
+ if (*printed) {
+ return false;
+ }
+ *printed = true;
+ va_start(ap, fmt);
+ vreport(REPORT_TYPE_ERROR, fmt, ap);
+ va_end(ap);
+ return true;
+}
+
+/*
+ * Like warn_report(), except print just once.
+ * If *printed is false, print the message, and flip *printed to true.
+ * Return whether the message was printed.
+ */
+bool warn_report_once_cond(bool *printed, const char *fmt, ...)
+{
+ va_list ap;
+
+ assert(printed);
+ if (*printed) {
+ return false;
+ }
+ *printed = true;
+ va_start(ap, fmt);
+ vreport(REPORT_TYPE_WARNING, fmt, ap);
+ va_end(ap);
+ return true;
+}
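
The *_once_cond() helpers support a print-once pattern where the flag
lives at the call site, one per message; a sketch with a hypothetical
handler:

    static void handle_unaligned_access(uint64_t addr)
    {
        static bool warned;

        /* Printed on the first unaligned access only; later calls
         * return false without printing anything. */
        warn_report_once_cond(&warned,
                              "unaligned access at 0x%" PRIx64 ", fixing up",
                              addr);
    }
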
+
+static char *qemu_glog_domains;
+
+static void qemu_log_func(const gchar *log_domain,
+ GLogLevelFlags log_level,
+ const gchar *message,
+ gpointer user_data)
+{
+ switch (log_level & G_LOG_LEVEL_MASK) {
+ case G_LOG_LEVEL_DEBUG:
+ case G_LOG_LEVEL_INFO:
+ /*
+ * Use same G_MESSAGES_DEBUG logic as glib to enable/disable debug
+ * messages
+ */
+ if (qemu_glog_domains == NULL) {
+ break;
+ }
+ if (strcmp(qemu_glog_domains, "all") != 0 &&
+ (log_domain == NULL || !strstr(qemu_glog_domains, log_domain))) {
+ break;
+ }
+ /* Fall through */
+ case G_LOG_LEVEL_MESSAGE:
+ info_report("%s%s%s",
+ log_domain ?: "", log_domain ? ": " : "", message);
+
+ break;
+ case G_LOG_LEVEL_WARNING:
+ warn_report("%s%s%s",
+ log_domain ?: "", log_domain ? ": " : "", message);
+ break;
+ case G_LOG_LEVEL_CRITICAL:
+ case G_LOG_LEVEL_ERROR:
+ error_report("%s%s%s",
+ log_domain ?: "", log_domain ? ": " : "", message);
+ break;
+ }
+}
+
+void error_init(const char *argv0)
+{
+ /* Set the program name for error_print_loc(). */
+ error_set_progname(argv0);
+
+ /*
+ * This sets up glib logging so libraries using it also print their logs
+ * through error_report(), warn_report(), info_report().
+ */
+ g_log_set_default_handler(qemu_log_func, NULL);
+ g_warn_if_fail(qemu_glog_domains == NULL);
+ qemu_glog_domains = g_strdup(g_getenv("G_MESSAGES_DEBUG"));
+}
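
After error_init(), glib logging from any linked library is funneled
through the report functions above; a sketch, with a hypothetical log
domain:

    int main(int argc, char **argv)
    {
        error_init(argv[0]);

        /* Routed to warn_report(): "<progname>: warning: mydomain: link down" */
        g_log("mydomain", G_LOG_LEVEL_WARNING, "link down");

        /* Printed only when G_MESSAGES_DEBUG=mydomain (or =all) is set,
         * mirroring glib's own filter. */
        g_log("mydomain", G_LOG_LEVEL_DEBUG, "probe took %d ms", 3);
        return 0;
    }
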
diff --git a/util/qemu-openpty.c b/util/qemu-openpty.c
new file mode 100644
index 000000000..427f43a76
--- /dev/null
+++ b/util/qemu-openpty.c
@@ -0,0 +1,139 @@
+/*
+ * qemu-openpty.c
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2010 Red Hat, Inc.
+ *
+ * Wrapper function qemu_openpty() implementation.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/*
+ * This is not part of oslib-posix.c because this function
+ * uses openpty(), which is often in -lutil, and if we add this
+ * dependency to oslib-posix.o, every app will have to be
+ * linked with -lutil.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#if defined HAVE_PTY_H
+# include <pty.h>
+#elif defined CONFIG_BSD
+# include <termios.h>
+# if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__DragonFly__)
+# include <libutil.h>
+# else
+# include <util.h>
+# endif
+#elif defined CONFIG_SOLARIS
+# include <termios.h>
+# include <stropts.h>
+#else
+# include <termios.h>
+#endif
+
+#ifdef __sun__
+
+#if !defined(HAVE_OPENPTY)
+/* Once illumos has openpty(), this is going to be removed. */
+static int openpty(int *amaster, int *aslave, char *name,
+ struct termios *termp, struct winsize *winp)
+{
+ const char *slave;
+ int mfd = -1, sfd = -1;
+
+ *amaster = *aslave = -1;
+
+ mfd = open("/dev/ptmx", O_RDWR | O_NOCTTY);
+ if (mfd < 0)
+ goto err;
+
+ if (grantpt(mfd) == -1 || unlockpt(mfd) == -1)
+ goto err;
+
+ if ((slave = ptsname(mfd)) == NULL)
+ goto err;
+
+ if ((sfd = open(slave, O_RDONLY | O_NOCTTY)) == -1)
+ goto err;
+
+ if (ioctl(sfd, I_PUSH, "ptem") == -1 ||
+ (termp != NULL && tcgetattr(sfd, termp) < 0))
+ goto err;
+
+ *amaster = mfd;
+ *aslave = sfd;
+
+ if (winp)
+ ioctl(sfd, TIOCSWINSZ, winp);
+
+ return 0;
+
+err:
+ if (sfd != -1)
+ close(sfd);
+ close(mfd);
+ return -1;
+}
+#endif
+
+static void cfmakeraw(struct termios *termios_p)
+{
+ termios_p->c_iflag &=
+ ~(IGNBRK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR|ICRNL|IXON);
+ termios_p->c_oflag &= ~OPOST;
+ termios_p->c_lflag &= ~(ECHO|ECHONL|ICANON|ISIG|IEXTEN);
+ termios_p->c_cflag &= ~(CSIZE|PARENB);
+ termios_p->c_cflag |= CS8;
+
+ termios_p->c_cc[VMIN] = 0;
+ termios_p->c_cc[VTIME] = 0;
+}
+#endif
+
+int qemu_openpty_raw(int *aslave, char *pty_name)
+{
+ int amaster;
+ struct termios tty;
+#if defined(__OpenBSD__) || defined(__DragonFly__)
+ char pty_buf[PATH_MAX];
+#define q_ptsname(x) pty_buf
+#else
+ char *pty_buf = NULL;
+#define q_ptsname(x) ptsname(x)
+#endif
+
+ if (openpty(&amaster, aslave, pty_buf, NULL, NULL) < 0) {
+ return -1;
+ }
+
+ /* Set raw attributes on the pty. */
+ tcgetattr(*aslave, &tty);
+ cfmakeraw(&tty);
+ tcsetattr(*aslave, TCSAFLUSH, &tty);
+
+ if (pty_name) {
+ strcpy(pty_name, q_ptsname(amaster));
+ }
+
+ return amaster;
+}
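
A usage sketch for qemu_openpty_raw() on a POSIX host; the helper and
the buffer size are assumptions (the caller must supply room for the
platform's pty path):

    static void open_serial_pty(void)
    {
        char name[128];    /* assumed large enough for the pty path */
        int slave_fd;
        int master_fd = qemu_openpty_raw(&slave_fd, name);

        if (master_fd < 0) {
            perror("qemu_openpty_raw");
            return;
        }
        printf("char device redirected to %s\n", name);
        /* master_fd carries the data stream; the slave end is what a
         * terminal program would open. */
    }
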
diff --git a/util/qemu-option.c b/util/qemu-option.c
new file mode 100644
index 000000000..eedd08929
--- /dev/null
+++ b/util/qemu-option.c
@@ -0,0 +1,1226 @@
+/*
+ * Commandline option parsing functions
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2009 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qapi/qmp/qbool.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qnum.h"
+#include "qapi/qmp/qstring.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/option_int.h"
+#include "qemu/cutils.h"
+#include "qemu/id.h"
+#include "qemu/help_option.h"
+
+/*
+ * Extracts the name of an option from the parameter string (@p points at the
+ * first byte of the option name)
+ *
+ * The option name is @len characters long and is copied into @option. The
+ * caller is responsible for freeing @option when no longer required.
+ *
+ * The return value is the position of the delimiter/zero byte after the option
+ * name in @p.
+ */
+static const char *get_opt_name(const char *p, char **option, size_t len)
+{
+ *option = g_strndup(p, len);
+ return p + len;
+}
+
+/*
+ * Extracts the value of an option from the parameter string p (p points at the
+ * first byte of the option value)
+ *
+ * This function is comparable to get_opt_name with the difference that the
+ * delimiter is fixed to be a comma, which starts a new option. To specify an
+ * option value that contains commas, double each comma.
+ */
+const char *get_opt_value(const char *p, char **value)
+{
+ size_t capacity = 0, length;
+ const char *offset;
+
+ *value = NULL;
+ while (1) {
+ offset = qemu_strchrnul(p, ',');
+ length = offset - p;
+ if (*offset != '\0' && *(offset + 1) == ',') {
+ length++;
+ }
+ *value = g_renew(char, *value, capacity + length + 1);
+ strncpy(*value + capacity, p, length);
+ (*value)[capacity + length] = '\0';
+ capacity += length;
+ if (*offset == '\0' ||
+ *(offset + 1) != ',') {
+ break;
+ }
+
+ p += (offset - p) + 2;
+ }
+
+ return offset;
+}
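
Since a doubled comma escapes a literal comma, the value "/tmp/a,b" is
written "/tmp/a,,b"; a sketch of what get_opt_value() yields for an
illustrative input:

    char *value;
    const char *rest = get_opt_value("/tmp/a,,b,cache=none", &value);

    /* value -> "/tmp/a,b" (caller frees with g_free());
     * rest  -> ",cache=none", the delimiter before the next option */
    g_free(value);
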
+
+static bool parse_option_number(const char *name, const char *value,
+ uint64_t *ret, Error **errp)
+{
+ uint64_t number;
+ int err;
+
+ err = qemu_strtou64(value, NULL, 0, &number);
+ if (err == -ERANGE) {
+ error_setg(errp, "Value '%s' is too large for parameter '%s'",
+ value, name);
+ return false;
+ }
+ if (err) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, name, "a number");
+ return false;
+ }
+ *ret = number;
+ return true;
+}
+
+static const QemuOptDesc *find_desc_by_name(const QemuOptDesc *desc,
+ const char *name)
+{
+ int i;
+
+ for (i = 0; desc[i].name != NULL; i++) {
+ if (strcmp(desc[i].name, name) == 0) {
+ return &desc[i];
+ }
+ }
+
+ return NULL;
+}
+
+static const char *find_default_by_name(QemuOpts *opts, const char *name)
+{
+ const QemuOptDesc *desc = find_desc_by_name(opts->list->desc, name);
+
+ return desc ? desc->def_value_str : NULL;
+}
+
+bool parse_option_size(const char *name, const char *value,
+ uint64_t *ret, Error **errp)
+{
+ uint64_t size;
+ int err;
+
+ err = qemu_strtosz(value, NULL, &size);
+ if (err == -ERANGE) {
+ error_setg(errp, "Value '%s' is out of range for parameter '%s'",
+ value, name);
+ return false;
+ }
+ if (err) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, name,
+ "a non-negative number below 2^64");
+ error_append_hint(errp, "Optional suffix k, M, G, T, P or E means"
+ " kilo-, mega-, giga-, tera-, peta-\n"
+ "and exabytes, respectively.\n");
+ return false;
+ }
+ *ret = size;
+ return true;
+}
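
A sketch of the size parser above, which accepts the qemu_strtosz()
suffixes named in the hint; error_report_err() is the usual consumer on
the failure path:

    uint64_t len;
    Error *err = NULL;

    if (parse_option_size("len", "4G", &len, &err)) {
        /* len == 4294967296 */
    }
    if (!parse_option_size("len", "lots", &len, &err)) {
        error_report_err(err);    /* message plus the suffix hint */
        err = NULL;
    }
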
+
+static const char *opt_type_to_string(enum QemuOptType type)
+{
+ switch (type) {
+ case QEMU_OPT_STRING:
+ return "str";
+ case QEMU_OPT_BOOL:
+ return "bool (on/off)";
+ case QEMU_OPT_NUMBER:
+ return "num";
+ case QEMU_OPT_SIZE:
+ return "size";
+ }
+
+ g_assert_not_reached();
+}
+
+/**
+ * Print the list of options available in the given list. If
+ * @print_caption is true, a caption (including the list name, if it
+ * exists) is printed. The options themselves will be indented, so
+ * @print_caption should only be set to false if the caller prints its
+ * own custom caption (so that the indentation makes sense).
+ */
+void qemu_opts_print_help(QemuOptsList *list, bool print_caption)
+{
+ QemuOptDesc *desc;
+ int i;
+ GPtrArray *array = g_ptr_array_new();
+
+ assert(list);
+ desc = list->desc;
+ while (desc && desc->name) {
+ GString *str = g_string_new(NULL);
+ g_string_append_printf(str, " %s=<%s>", desc->name,
+ opt_type_to_string(desc->type));
+ if (desc->help) {
+ if (str->len < 24) {
+ g_string_append_printf(str, "%*s", 24 - (int)str->len, "");
+ }
+ g_string_append_printf(str, " - %s", desc->help);
+ }
+ g_ptr_array_add(array, g_string_free(str, false));
+ desc++;
+ }
+
+ g_ptr_array_sort(array, (GCompareFunc)qemu_pstrcmp0);
+ if (print_caption && array->len > 0) {
+ if (list->name) {
+ printf("%s options:\n", list->name);
+ } else {
+ printf("Options:\n");
+ }
+ } else if (array->len == 0) {
+ if (list->name) {
+ printf("There are no options for %s.\n", list->name);
+ } else {
+ printf("No options available.\n");
+ }
+ }
+ for (i = 0; i < array->len; i++) {
+ printf("%s\n", (char *)array->pdata[i]);
+ }
+ g_ptr_array_set_free_func(array, g_free);
+    g_ptr_array_free(array, true);
+}
+
+/* ------------------------------------------------------------------ */
+
+QemuOpt *qemu_opt_find(QemuOpts *opts, const char *name)
+{
+ QemuOpt *opt;
+
+ QTAILQ_FOREACH_REVERSE(opt, &opts->head, next) {
+        if (strcmp(opt->name, name) != 0) {
+            continue;
+        }
+ return opt;
+ }
+ return NULL;
+}
+
+static void qemu_opt_del(QemuOpt *opt)
+{
+ QTAILQ_REMOVE(&opt->opts->head, opt, next);
+ g_free(opt->name);
+ g_free(opt->str);
+ g_free(opt);
+}
+
+/* qemu_opt_set allows many settings for the same option.
+ * This function deletes all settings for an option.
+ */
+static void qemu_opt_del_all(QemuOpts *opts, const char *name)
+{
+ QemuOpt *opt, *next_opt;
+
+ QTAILQ_FOREACH_SAFE(opt, &opts->head, next, next_opt) {
+ if (!strcmp(opt->name, name)) {
+ qemu_opt_del(opt);
+ }
+ }
+}
+
+const char *qemu_opt_get(QemuOpts *opts, const char *name)
+{
+ QemuOpt *opt;
+
+ if (opts == NULL) {
+ return NULL;
+ }
+
+ opt = qemu_opt_find(opts, name);
+ if (!opt) {
+ return find_default_by_name(opts, name);
+ }
+
+ return opt->str;
+}
+
+void qemu_opt_iter_init(QemuOptsIter *iter, QemuOpts *opts, const char *name)
+{
+ iter->opts = opts;
+ iter->opt = QTAILQ_FIRST(&opts->head);
+ iter->name = name;
+}
+
+const char *qemu_opt_iter_next(QemuOptsIter *iter)
+{
+ QemuOpt *ret = iter->opt;
+ if (iter->name) {
+ while (ret && !g_str_equal(iter->name, ret->name)) {
+ ret = QTAILQ_NEXT(ret, next);
+ }
+ }
+ iter->opt = ret ? QTAILQ_NEXT(ret, next) : NULL;
+ return ret ? ret->str : NULL;
+}
+
+/* Get a known option (or its default) and remove it from the list
+ * all in one action. Return a malloced string of the option value.
+ * Result must be freed by caller with g_free().
+ */
+char *qemu_opt_get_del(QemuOpts *opts, const char *name)
+{
+ QemuOpt *opt;
+ char *str;
+
+ if (opts == NULL) {
+ return NULL;
+ }
+
+ opt = qemu_opt_find(opts, name);
+ if (!opt) {
+ return g_strdup(find_default_by_name(opts, name));
+ }
+ str = opt->str;
+ opt->str = NULL;
+ qemu_opt_del_all(opts, name);
+ return str;
+}
+
+bool qemu_opt_has_help_opt(QemuOpts *opts)
+{
+ QemuOpt *opt;
+
+ QTAILQ_FOREACH_REVERSE(opt, &opts->head, next) {
+ if (is_help_option(opt->name)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool qemu_opt_get_bool_helper(QemuOpts *opts, const char *name,
+ bool defval, bool del)
+{
+ QemuOpt *opt;
+ const char *def_val;
+ bool ret = defval;
+
+ if (opts == NULL) {
+ return ret;
+ }
+
+ opt = qemu_opt_find(opts, name);
+ if (opt == NULL) {
+ def_val = find_default_by_name(opts, name);
+ if (def_val) {
+ qapi_bool_parse(name, def_val, &ret, &error_abort);
+ }
+ return ret;
+ }
+ assert(opt->desc && opt->desc->type == QEMU_OPT_BOOL);
+ ret = opt->value.boolean;
+ if (del) {
+ qemu_opt_del_all(opts, name);
+ }
+ return ret;
+}
+
+bool qemu_opt_get_bool(QemuOpts *opts, const char *name, bool defval)
+{
+ return qemu_opt_get_bool_helper(opts, name, defval, false);
+}
+
+bool qemu_opt_get_bool_del(QemuOpts *opts, const char *name, bool defval)
+{
+ return qemu_opt_get_bool_helper(opts, name, defval, true);
+}
+
+static uint64_t qemu_opt_get_number_helper(QemuOpts *opts, const char *name,
+ uint64_t defval, bool del)
+{
+ QemuOpt *opt;
+ const char *def_val;
+ uint64_t ret = defval;
+
+ if (opts == NULL) {
+ return ret;
+ }
+
+ opt = qemu_opt_find(opts, name);
+ if (opt == NULL) {
+ def_val = find_default_by_name(opts, name);
+ if (def_val) {
+ parse_option_number(name, def_val, &ret, &error_abort);
+ }
+ return ret;
+ }
+ assert(opt->desc && opt->desc->type == QEMU_OPT_NUMBER);
+ ret = opt->value.uint;
+ if (del) {
+ qemu_opt_del_all(opts, name);
+ }
+ return ret;
+}
+
+uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval)
+{
+ return qemu_opt_get_number_helper(opts, name, defval, false);
+}
+
+uint64_t qemu_opt_get_number_del(QemuOpts *opts, const char *name,
+ uint64_t defval)
+{
+ return qemu_opt_get_number_helper(opts, name, defval, true);
+}
+
+static uint64_t qemu_opt_get_size_helper(QemuOpts *opts, const char *name,
+ uint64_t defval, bool del)
+{
+ QemuOpt *opt;
+ const char *def_val;
+ uint64_t ret = defval;
+
+ if (opts == NULL) {
+ return ret;
+ }
+
+ opt = qemu_opt_find(opts, name);
+ if (opt == NULL) {
+ def_val = find_default_by_name(opts, name);
+ if (def_val) {
+ parse_option_size(name, def_val, &ret, &error_abort);
+ }
+ return ret;
+ }
+ assert(opt->desc && opt->desc->type == QEMU_OPT_SIZE);
+ ret = opt->value.uint;
+ if (del) {
+ qemu_opt_del_all(opts, name);
+ }
+ return ret;
+}
+
+uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval)
+{
+ return qemu_opt_get_size_helper(opts, name, defval, false);
+}
+
+uint64_t qemu_opt_get_size_del(QemuOpts *opts, const char *name,
+ uint64_t defval)
+{
+ return qemu_opt_get_size_helper(opts, name, defval, true);
+}
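
A sketch of the typed getter family, given an existing QemuOpts *opts;
the option names are illustrative. Defaults apply only when neither a
set option nor a desc def_value_str exists:

    uint64_t cache = qemu_opt_get_size(opts, "cache-size", 16 * 1024 * 1024);
    uint64_t queues = qemu_opt_get_number(opts, "queues", 1);
    bool ro = qemu_opt_get_bool(opts, "read-only", false);

    /* The *_del variants also delete every setting of the name, which
     * is useful when opts is later scanned for unconsumed options. */
    uint64_t bps = qemu_opt_get_number_del(opts, "bps", 0);
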
+
+static bool qemu_opt_parse(QemuOpt *opt, Error **errp)
+{
+    if (opt->desc == NULL) {
+        return true;
+    }
+
+ switch (opt->desc->type) {
+ case QEMU_OPT_STRING:
+ /* nothing */
+ return true;
+ case QEMU_OPT_BOOL:
+ return qapi_bool_parse(opt->name, opt->str, &opt->value.boolean, errp);
+ case QEMU_OPT_NUMBER:
+ return parse_option_number(opt->name, opt->str, &opt->value.uint,
+ errp);
+ case QEMU_OPT_SIZE:
+ return parse_option_size(opt->name, opt->str, &opt->value.uint,
+ errp);
+ default:
+ abort();
+ }
+}
+
+static bool opts_accepts_any(const QemuOptsList *list)
+{
+ return list->desc[0].name == NULL;
+}
+
+int qemu_opt_unset(QemuOpts *opts, const char *name)
+{
+ QemuOpt *opt = qemu_opt_find(opts, name);
+
+ assert(opts_accepts_any(opts->list));
+
+ if (opt == NULL) {
+ return -1;
+ } else {
+ qemu_opt_del(opt);
+ return 0;
+ }
+}
+
+static QemuOpt *opt_create(QemuOpts *opts, const char *name, char *value)
+{
+ QemuOpt *opt = g_malloc0(sizeof(*opt));
+
+ opt->name = g_strdup(name);
+ opt->str = value;
+ opt->opts = opts;
+ QTAILQ_INSERT_TAIL(&opts->head, opt, next);
+
+ return opt;
+}
+
+static bool opt_validate(QemuOpt *opt, Error **errp)
+{
+ const QemuOptDesc *desc;
+ const QemuOptsList *list = opt->opts->list;
+
+ desc = find_desc_by_name(list->desc, opt->name);
+ if (!desc && !opts_accepts_any(list)) {
+ error_setg(errp, QERR_INVALID_PARAMETER, opt->name);
+ return false;
+ }
+
+ opt->desc = desc;
+ if (!qemu_opt_parse(opt, errp)) {
+ return false;
+ }
+
+ return true;
+}
+
+bool qemu_opt_set(QemuOpts *opts, const char *name, const char *value,
+ Error **errp)
+{
+ QemuOpt *opt = opt_create(opts, name, g_strdup(value));
+
+ if (!opt_validate(opt, errp)) {
+ qemu_opt_del(opt);
+ return false;
+ }
+ return true;
+}
+
+bool qemu_opt_set_bool(QemuOpts *opts, const char *name, bool val,
+ Error **errp)
+{
+ QemuOpt *opt;
+ const QemuOptDesc *desc;
+ const QemuOptsList *list = opts->list;
+
+ desc = find_desc_by_name(list->desc, name);
+ if (!desc && !opts_accepts_any(list)) {
+ error_setg(errp, QERR_INVALID_PARAMETER, name);
+ return false;
+ }
+
+ opt = g_malloc0(sizeof(*opt));
+ opt->name = g_strdup(name);
+ opt->opts = opts;
+ opt->desc = desc;
+ opt->value.boolean = !!val;
+ opt->str = g_strdup(val ? "on" : "off");
+ QTAILQ_INSERT_TAIL(&opts->head, opt, next);
+ return true;
+}
+
+bool qemu_opt_set_number(QemuOpts *opts, const char *name, int64_t val,
+ Error **errp)
+{
+ QemuOpt *opt;
+ const QemuOptDesc *desc;
+ const QemuOptsList *list = opts->list;
+
+ desc = find_desc_by_name(list->desc, name);
+ if (!desc && !opts_accepts_any(list)) {
+ error_setg(errp, QERR_INVALID_PARAMETER, name);
+ return false;
+ }
+
+ opt = g_malloc0(sizeof(*opt));
+ opt->name = g_strdup(name);
+ opt->opts = opts;
+ opt->desc = desc;
+ opt->value.uint = val;
+ opt->str = g_strdup_printf("%" PRId64, val);
+ QTAILQ_INSERT_TAIL(&opts->head, opt, next);
+ return true;
+}
+
+/**
+ * For each member of @opts, call @func(@opaque, name, value, @errp).
+ * @func() may store an Error through @errp, but must return non-zero then.
+ * When @func() returns non-zero, break the loop and return that value.
+ * Return zero when the loop completes.
+ */
+int qemu_opt_foreach(QemuOpts *opts, qemu_opt_loopfunc func, void *opaque,
+ Error **errp)
+{
+ QemuOpt *opt;
+ int rc;
+
+ QTAILQ_FOREACH(opt, &opts->head, next) {
+ rc = func(opaque, opt->name, opt->str, errp);
+ if (rc) {
+ return rc;
+ }
+ assert(!errp || !*errp);
+ }
+ return 0;
+}
+
+QemuOpts *qemu_opts_find(QemuOptsList *list, const char *id)
+{
+ QemuOpts *opts;
+
+ QTAILQ_FOREACH(opts, &list->head, next) {
+ if (!opts->id && !id) {
+ return opts;
+ }
+ if (opts->id && id && !strcmp(opts->id, id)) {
+ return opts;
+ }
+ }
+ return NULL;
+}
+
+QemuOpts *qemu_opts_create(QemuOptsList *list, const char *id,
+ int fail_if_exists, Error **errp)
+{
+ QemuOpts *opts = NULL;
+
+ if (list->merge_lists) {
+ if (id) {
+ error_setg(errp, QERR_INVALID_PARAMETER, "id");
+ return NULL;
+ }
+ opts = qemu_opts_find(list, NULL);
+ if (opts) {
+ return opts;
+ }
+ } else if (id) {
+ assert(fail_if_exists);
+ if (!id_wellformed(id)) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "id",
+ "an identifier");
+ error_append_hint(errp, "Identifiers consist of letters, digits, "
+ "'-', '.', '_', starting with a letter.\n");
+ return NULL;
+ }
+ opts = qemu_opts_find(list, id);
+ if (opts != NULL) {
+ error_setg(errp, "Duplicate ID '%s' for %s", id, list->name);
+ return NULL;
+ }
+ }
+ opts = g_malloc0(sizeof(*opts));
+ opts->id = g_strdup(id);
+ opts->list = list;
+ loc_save(&opts->loc);
+ QTAILQ_INIT(&opts->head);
+ QTAILQ_INSERT_TAIL(&list->head, opts, next);
+ return opts;
+}
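
A sketch of a typical list definition plus a create/set/get round trip;
the list, id and option names are illustrative:

    static QemuOptsList demo_opts = {
        .name = "demo",
        .head = QTAILQ_HEAD_INITIALIZER(demo_opts.head),
        .desc = {
            { .name = "path", .type = QEMU_OPT_STRING },
            { .name = "size", .type = QEMU_OPT_SIZE, .def_value_str = "1M" },
            { /* end of list */ }
        },
    };

    static void demo(Error **errp)
    {
        QemuOpts *opts = qemu_opts_create(&demo_opts, "demo0", 1, errp);

        if (!opts) {
            return;
        }
        if (!qemu_opt_set(opts, "path", "/tmp/x", errp)) {
            qemu_opts_del(opts);
            return;
        }
        /* "size" was never set, so it falls back to def_value_str */
        g_assert(qemu_opt_get_size(opts, "size", 0) == 1024 * 1024);
        qemu_opts_del(opts);
    }
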
+
+void qemu_opts_reset(QemuOptsList *list)
+{
+ QemuOpts *opts, *next_opts;
+
+ QTAILQ_FOREACH_SAFE(opts, &list->head, next, next_opts) {
+ qemu_opts_del(opts);
+ }
+}
+
+void qemu_opts_loc_restore(QemuOpts *opts)
+{
+ loc_restore(&opts->loc);
+}
+
+const char *qemu_opts_id(QemuOpts *opts)
+{
+ return opts->id;
+}
+
+/* The id string will be g_free()d by qemu_opts_del */
+void qemu_opts_set_id(QemuOpts *opts, char *id)
+{
+ opts->id = id;
+}
+
+void qemu_opts_del(QemuOpts *opts)
+{
+ QemuOpt *opt;
+
+ if (opts == NULL) {
+ return;
+ }
+
+ for (;;) {
+ opt = QTAILQ_FIRST(&opts->head);
+        if (opt == NULL) {
+            break;
+        }
+ qemu_opt_del(opt);
+ }
+ QTAILQ_REMOVE(&opts->list->head, opts, next);
+ g_free(opts->id);
+ g_free(opts);
+}
+
+/* print value, escaping any commas in value */
+static void escaped_print(const char *value)
+{
+ const char *ptr;
+
+ for (ptr = value; *ptr; ++ptr) {
+ if (*ptr == ',') {
+ putchar(',');
+ }
+ putchar(*ptr);
+ }
+}
+
+void qemu_opts_print(QemuOpts *opts, const char *separator)
+{
+ QemuOpt *opt;
+ QemuOptDesc *desc = opts->list->desc;
+ const char *sep = "";
+
+ if (opts->id) {
+ printf("id=%s", opts->id); /* passed id_wellformed -> no commas */
+ sep = separator;
+ }
+
+ if (desc[0].name == NULL) {
+ QTAILQ_FOREACH(opt, &opts->head, next) {
+ printf("%s%s=", sep, opt->name);
+ escaped_print(opt->str);
+ sep = separator;
+ }
+ return;
+ }
+ for (; desc && desc->name; desc++) {
+ const char *value;
+ opt = qemu_opt_find(opts, desc->name);
+
+ value = opt ? opt->str : desc->def_value_str;
+ if (!value) {
+ continue;
+ }
+ if (desc->type == QEMU_OPT_STRING) {
+ printf("%s%s=", sep, desc->name);
+ escaped_print(value);
+ } else if ((desc->type == QEMU_OPT_SIZE ||
+ desc->type == QEMU_OPT_NUMBER) && opt) {
+ printf("%s%s=%" PRId64, sep, desc->name, opt->value.uint);
+ } else {
+ printf("%s%s=%s", sep, desc->name, value);
+ }
+ sep = separator;
+ }
+}
+
+static const char *get_opt_name_value(const char *params,
+ const char *firstname,
+ bool warn_on_flag,
+ bool *help_wanted,
+ char **name, char **value)
+{
+ const char *p;
+ const char *prefix = "";
+ size_t len;
+ bool is_help = false;
+
+ len = strcspn(params, "=,");
+ if (params[len] != '=') {
+ /* found "foo,more" */
+ if (firstname) {
+ /* implicitly named first option */
+ *name = g_strdup(firstname);
+ p = get_opt_value(params, value);
+ } else {
+ /* option without value, must be a flag */
+ p = get_opt_name(params, name, len);
+ if (strncmp(*name, "no", 2) == 0) {
+ memmove(*name, *name + 2, strlen(*name + 2) + 1);
+ *value = g_strdup("off");
+ prefix = "no";
+ } else {
+ *value = g_strdup("on");
+ is_help = is_help_option(*name);
+ }
+ if (!is_help && warn_on_flag) {
+ warn_report("short-form boolean option '%s%s' deprecated", prefix, *name);
+ if (g_str_equal(*name, "delay")) {
+ error_printf("Please use nodelay=%s instead\n", prefix[0] ? "on" : "off");
+ } else {
+ error_printf("Please use %s=%s instead\n", *name, *value);
+ }
+ }
+ }
+ } else {
+ /* found "foo=bar,more" */
+ p = get_opt_name(params, name, len);
+ assert(*p == '=');
+ p++;
+ p = get_opt_value(p, value);
+ }
+
+ assert(!*p || *p == ',');
+ if (help_wanted && is_help) {
+ *help_wanted = true;
+ }
+ if (*p == ',') {
+ p++;
+ }
+ return p;
+}
+
+static bool opts_do_parse(QemuOpts *opts, const char *params,
+ const char *firstname,
+ bool warn_on_flag, bool *help_wanted, Error **errp)
+{
+ char *option, *value;
+ const char *p;
+ QemuOpt *opt;
+
+ for (p = params; *p;) {
+ p = get_opt_name_value(p, firstname, warn_on_flag, help_wanted, &option, &value);
+ if (help_wanted && *help_wanted) {
+ g_free(option);
+ g_free(value);
+ return false;
+ }
+ firstname = NULL;
+
+ if (!strcmp(option, "id")) {
+ g_free(option);
+ g_free(value);
+ continue;
+ }
+
+ opt = opt_create(opts, option, value);
+ g_free(option);
+ if (!opt_validate(opt, errp)) {
+ qemu_opt_del(opt);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static char *opts_parse_id(const char *params)
+{
+ const char *p;
+ char *name, *value;
+
+ for (p = params; *p;) {
+ p = get_opt_name_value(p, NULL, false, NULL, &name, &value);
+ if (!strcmp(name, "id")) {
+ g_free(name);
+ return value;
+ }
+ g_free(name);
+ g_free(value);
+ }
+
+ return NULL;
+}
+
+bool has_help_option(const char *params)
+{
+ const char *p;
+ char *name, *value;
+ bool ret = false;
+
+ for (p = params; *p;) {
+ p = get_opt_name_value(p, NULL, false, &ret, &name, &value);
+ g_free(name);
+ g_free(value);
+ if (ret) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/**
+ * Store options parsed from @params into @opts.
+ * If @firstname is non-null, the first key=value in @params may omit
+ * key=, and is treated as if key was @firstname.
+ * On error, store an error object through @errp if non-null.
+ */
+bool qemu_opts_do_parse(QemuOpts *opts, const char *params,
+ const char *firstname, Error **errp)
+{
+ return opts_do_parse(opts, params, firstname, false, NULL, errp);
+}
+
+static QemuOpts *opts_parse(QemuOptsList *list, const char *params,
+ bool permit_abbrev,
+ bool warn_on_flag, bool *help_wanted, Error **errp)
+{
+ const char *firstname;
+ char *id = opts_parse_id(params);
+ QemuOpts *opts;
+
+ assert(!permit_abbrev || list->implied_opt_name);
+ firstname = permit_abbrev ? list->implied_opt_name : NULL;
+
+ opts = qemu_opts_create(list, id, !list->merge_lists, errp);
+ g_free(id);
+ if (opts == NULL) {
+ return NULL;
+ }
+
+ if (!opts_do_parse(opts, params, firstname,
+ warn_on_flag, help_wanted, errp)) {
+ qemu_opts_del(opts);
+ return NULL;
+ }
+
+ return opts;
+}
+
+/**
+ * Create a QemuOpts in @list and with options parsed from @params.
+ * If @permit_abbrev, the first key=value in @params may omit key=,
+ * and is treated as if key was @list->implied_opt_name.
+ * On error, store an error object through @errp if non-null.
+ * Return the new QemuOpts on success, null pointer on error.
+ */
+QemuOpts *qemu_opts_parse(QemuOptsList *list, const char *params,
+ bool permit_abbrev, Error **errp)
+{
+ return opts_parse(list, params, permit_abbrev, false, NULL, errp);
+}
+
+/**
+ * Create a QemuOpts in @list and with options parsed from @params.
+ * If @permit_abbrev, the first key=value in @params may omit key=,
+ * and is treated as if key was @list->implied_opt_name.
+ * Report errors with error_report_err(). This is inappropriate in
+ * QMP context. Do not use this function there!
+ * Return the new QemuOpts on success, null pointer on error.
+ */
+QemuOpts *qemu_opts_parse_noisily(QemuOptsList *list, const char *params,
+ bool permit_abbrev)
+{
+ Error *err = NULL;
+ QemuOpts *opts;
+ bool help_wanted = false;
+
+ opts = opts_parse(list, params, permit_abbrev, true,
+ opts_accepts_any(list) ? NULL : &help_wanted,
+ &err);
+ if (!opts) {
+ assert(!!err + !!help_wanted == 1);
+ if (help_wanted) {
+ qemu_opts_print_help(list, true);
+ } else {
+ error_report_err(err);
+ }
+ }
+ return opts;
+}
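
A parsing sketch with an implied first option; demo_opts is the
illustrative list from the earlier sketch, here assumed to also set
.implied_opt_name = "path":

    QemuOpts *opts = qemu_opts_parse_noisily(&demo_opts,
                                             "/tmp/img,size=4G,id=demo1",
                                             true /* permit_abbrev */);
    if (opts) {
        /* "/tmp/img" was stored under the implied name "path"; a
         * short-form flag like "noflag" would have become flag=off,
         * with a deprecation warning. */
    }
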
+
+static bool qemu_opts_from_qdict_entry(QemuOpts *opts,
+ const QDictEntry *entry,
+ Error **errp)
+{
+ const char *key = qdict_entry_key(entry);
+ QObject *obj = qdict_entry_value(entry);
+ char buf[32];
+ g_autofree char *tmp = NULL;
+ const char *value;
+
+ if (!strcmp(key, "id")) {
+ return true;
+ }
+
+ switch (qobject_type(obj)) {
+ case QTYPE_QSTRING:
+ value = qstring_get_str(qobject_to(QString, obj));
+ break;
+ case QTYPE_QNUM:
+ tmp = qnum_to_string(qobject_to(QNum, obj));
+ value = tmp;
+ break;
+ case QTYPE_QBOOL:
+ pstrcpy(buf, sizeof(buf),
+ qbool_get_bool(qobject_to(QBool, obj)) ? "on" : "off");
+ value = buf;
+ break;
+ default:
+ return true;
+ }
+
+ return qemu_opt_set(opts, key, value, errp);
+}
+
+/*
+ * Create QemuOpts from a QDict.
+ * Use value of key "id" as ID if it exists and is a QString. Only
+ * QStrings, QNums and QBools are copied. Entries with other types
+ * are silently ignored.
+ */
+QemuOpts *qemu_opts_from_qdict(QemuOptsList *list, const QDict *qdict,
+ Error **errp)
+{
+ QemuOpts *opts;
+ const QDictEntry *entry;
+
+ opts = qemu_opts_create(list, qdict_get_try_str(qdict, "id"), 1, errp);
+ if (!opts) {
+ return NULL;
+ }
+
+ for (entry = qdict_first(qdict);
+ entry;
+ entry = qdict_next(qdict, entry)) {
+ if (!qemu_opts_from_qdict_entry(opts, entry, errp)) {
+ qemu_opts_del(opts);
+ return NULL;
+ }
+ }
+
+ return opts;
+}
+
+/*
+ * Adds all QDict entries to the QemuOpts that can be added and removes them
+ * from the QDict. When this function returns, the QDict contains only those
+ * entries that couldn't be added to the QemuOpts.
+ */
+bool qemu_opts_absorb_qdict(QemuOpts *opts, QDict *qdict, Error **errp)
+{
+ const QDictEntry *entry, *next;
+
+ entry = qdict_first(qdict);
+
+ while (entry != NULL) {
+ next = qdict_next(qdict, entry);
+
+ if (opts_accepts_any(opts->list) ||
+ find_desc_by_name(opts->list->desc, entry->key)) {
+ if (!qemu_opts_from_qdict_entry(opts, entry, errp)) {
+ return false;
+ }
+ qdict_del(qdict, entry->key);
+ }
+
+ entry = next;
+ }
+
+ return true;
+}
+
+/*
+ * Convert from QemuOpts to QDict. The QDict values are of type QString.
+ *
+ * If @list is given, only add those options to the QDict that are contained in
+ * the list. If @del is true, any options added to the QDict are removed from
+ * the QemuOpts, otherwise they remain there.
+ *
+ * If two options in @opts have the same name, they are processed in order
+ * so that the last one wins (consistent with the reverse iteration in
+ * qemu_opt_find()), but all of them are deleted if @del is true.
+ *
+ * TODO We'll want to use types appropriate for opt->desc->type, but
+ * this is enough for now.
+ */
+QDict *qemu_opts_to_qdict_filtered(QemuOpts *opts, QDict *qdict,
+ QemuOptsList *list, bool del)
+{
+ QemuOpt *opt, *next;
+
+ if (!qdict) {
+ qdict = qdict_new();
+ }
+ if (opts->id) {
+ qdict_put_str(qdict, "id", opts->id);
+ }
+ QTAILQ_FOREACH_SAFE(opt, &opts->head, next, next) {
+ if (list) {
+ QemuOptDesc *desc;
+ bool found = false;
+ for (desc = list->desc; desc->name; desc++) {
+ if (!strcmp(desc->name, opt->name)) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ continue;
+ }
+ }
+ qdict_put_str(qdict, opt->name, opt->str);
+ if (del) {
+ qemu_opt_del(opt);
+ }
+ }
+ return qdict;
+}
+
+/* Copy all options in a QemuOpts to the given QDict. See
+ * qemu_opts_to_qdict_filtered() for details. */
+QDict *qemu_opts_to_qdict(QemuOpts *opts, QDict *qdict)
+{
+ return qemu_opts_to_qdict_filtered(opts, qdict, NULL, false);
+}
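
A round-trip sketch for the two conversion helpers above; opts is
assumed to exist and demo_opts is the illustrative list from the
earlier sketch:

    QDict *all = qemu_opts_to_qdict(opts, NULL);    /* copy everything */
    QDict *taken = qemu_opts_to_qdict_filtered(opts, NULL,
                                               &demo_opts, true);

    /* "taken" holds only options named in demo_opts->desc, and those
     * were deleted from opts; values are QStrings in both dicts. */
    qobject_unref(all);
    qobject_unref(taken);
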
+
+/* Validate parsed opts against @desc when the opts' own QemuOptsList
+ * provided no descriptions of its own (i.e. it accepts any option).
+ */
+bool qemu_opts_validate(QemuOpts *opts, const QemuOptDesc *desc, Error **errp)
+{
+ QemuOpt *opt;
+
+ assert(opts_accepts_any(opts->list));
+
+ QTAILQ_FOREACH(opt, &opts->head, next) {
+ opt->desc = find_desc_by_name(desc, opt->name);
+ if (!opt->desc) {
+ error_setg(errp, QERR_INVALID_PARAMETER, opt->name);
+ return false;
+ }
+
+ if (!qemu_opt_parse(opt, errp)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * For each member of @list, call @func(@opaque, member, @errp).
+ * Call it with the current location temporarily set to the member's.
+ * @func() may store an Error through @errp, but must return non-zero then.
+ * When @func() returns non-zero, break the loop and return that value.
+ * Return zero when the loop completes.
+ */
+int qemu_opts_foreach(QemuOptsList *list, qemu_opts_loopfunc func,
+ void *opaque, Error **errp)
+{
+ Location loc;
+ QemuOpts *opts, *next;
+ int rc = 0;
+
+ loc_push_none(&loc);
+ QTAILQ_FOREACH_SAFE(opts, &list->head, next, next) {
+ loc_restore(&opts->loc);
+ rc = func(opaque, opts, errp);
+ if (rc) {
+ break;
+ }
+ assert(!errp || !*errp);
+ }
+ loc_pop(&loc);
+ return rc;
+}
+
+static size_t count_opts_list(QemuOptsList *list)
+{
+ QemuOptDesc *desc = NULL;
+ size_t num_opts = 0;
+
+ if (!list) {
+ return 0;
+ }
+
+ desc = list->desc;
+ while (desc && desc->name) {
+ num_opts++;
+ desc++;
+ }
+
+ return num_opts;
+}
+
+void qemu_opts_free(QemuOptsList *list)
+{
+ g_free(list);
+}
+
+/* Realloc the @dst option list and append the option descriptors from
+ * @list to it. @dst may be NULL or a previously malloc'ed list.
+ * The lifetime of dst must be shorter than the input list because the
+ * QemuOptDesc->name, ->help, and ->def_value_str strings are shared.
+ */
+QemuOptsList *qemu_opts_append(QemuOptsList *dst,
+ QemuOptsList *list)
+{
+ size_t num_opts, num_dst_opts;
+ QemuOptDesc *desc;
+ bool need_init = false;
+ bool need_head_update;
+
+ if (!list) {
+ return dst;
+ }
+
+ /* If dst is NULL, after realloc, some area of dst should be initialized
+ * before adding options to it.
+ */
+ if (!dst) {
+ need_init = true;
+ need_head_update = true;
+ } else {
+ /* Moreover, even if dst is not NULL, the realloc may move it to a
+ * different address in which case we may get a stale tail pointer
+ * in dst->head. */
+ need_head_update = QTAILQ_EMPTY(&dst->head);
+ }
+
+ num_opts = count_opts_list(dst);
+ num_dst_opts = num_opts;
+ num_opts += count_opts_list(list);
+ dst = g_realloc(dst, sizeof(QemuOptsList) +
+ (num_opts + 1) * sizeof(QemuOptDesc));
+ if (need_init) {
+ dst->name = NULL;
+ dst->implied_opt_name = NULL;
+ dst->merge_lists = false;
+ }
+ if (need_head_update) {
+ QTAILQ_INIT(&dst->head);
+ }
+ dst->desc[num_dst_opts].name = NULL;
+
+ /* append list->desc to dst->desc */
+ if (list) {
+ desc = list->desc;
+ while (desc && desc->name) {
+ if (find_desc_by_name(dst->desc, desc->name) == NULL) {
+ dst->desc[num_dst_opts++] = *desc;
+ dst->desc[num_dst_opts].name = NULL;
+ }
+ desc++;
+ }
+ }
+
+ return dst;
+}
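
A merge sketch for qemu_opts_append(); generic_opts and driver_opts are
hypothetical descriptor lists:

    QemuOptsList *merged = qemu_opts_append(NULL, &generic_opts);

    merged = qemu_opts_append(merged, &driver_opts);
    /* duplicate names in driver_opts were skipped; merged->desc stays
     * NULL-terminated and must not outlive either input list */
    qemu_opts_free(merged);
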
diff --git a/util/qemu-print.c b/util/qemu-print.c
new file mode 100644
index 000000000..69ba612f5
--- /dev/null
+++ b/util/qemu-print.c
@@ -0,0 +1,70 @@
+/*
+ * Print to stream or current monitor
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ *
+ * Authors:
+ * Markus Armbruster <armbru@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "monitor/monitor.h"
+#include "qemu/qemu-print.h"
+
+/*
+ * Print like vprintf().
+ * Print to current monitor if we have one, else to stdout.
+ */
+int qemu_vprintf(const char *fmt, va_list ap)
+{
+ Monitor *cur_mon = monitor_cur();
+ if (cur_mon) {
+ return monitor_vprintf(cur_mon, fmt, ap);
+ }
+ return vprintf(fmt, ap);
+}
+
+/*
+ * Print like printf().
+ * Print to current monitor if we have one, else to stdout.
+ */
+int qemu_printf(const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = qemu_vprintf(fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+/*
+ * Print like vfprintf().
+ * Print to @stream if non-null, else to current monitor.
+ */
+int qemu_vfprintf(FILE *stream, const char *fmt, va_list ap)
+{
+ if (!stream) {
+ return monitor_vprintf(monitor_cur(), fmt, ap);
+ }
+ return vfprintf(stream, fmt, ap);
+}
+
+/*
+ * Print like fprintf().
+ * Print to @stream if non-null, else to current monitor.
+ */
+int qemu_fprintf(FILE *stream, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = qemu_vfprintf(stream, fmt, ap);
+ va_end(ap);
+ return ret;
+}
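
The intended use is command handlers that must work both under a
monitor and in a plain stdio tool; a sketch with a hypothetical
HMP-style handler:

    static void hmp_info_demo(Monitor *mon, const QDict *qdict)
    {
        /* goes to the current monitor when one exists, else stdout */
        qemu_printf("demo state: %s\n", "running");
    }
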
diff --git a/util/qemu-progress.c b/util/qemu-progress.c
new file mode 100644
index 000000000..20d51f8c1
--- /dev/null
+++ b/util/qemu-progress.c
@@ -0,0 +1,162 @@
+/*
+ * QEMU progress printing utility functions
+ *
+ * Copyright (C) 2011 Jes Sorensen <Jes.Sorensen@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+struct progress_state {
+ float current;
+ float last_print;
+ float min_skip;
+ void (*print)(void);
+ void (*end)(void);
+};
+
+static struct progress_state state;
+static volatile sig_atomic_t print_pending;
+
+/*
+ * Simple progress print function.
+ * Prints the overall completion percentage kept in state.current.
+ */
+static void progress_simple_print(void)
+{
+ printf(" (%3.2f/100%%)\r", state.current);
+ fflush(stdout);
+}
+
+static void progress_simple_end(void)
+{
+ printf("\n");
+}
+
+static void progress_simple_init(void)
+{
+ state.print = progress_simple_print;
+ state.end = progress_simple_end;
+}
+
+#ifdef CONFIG_POSIX
+static void sigusr_print(int signal)
+{
+ print_pending = 1;
+}
+#endif
+
+static void progress_dummy_print(void)
+{
+ if (print_pending) {
+ fprintf(stderr, " (%3.2f/100%%)\n", state.current);
+ print_pending = 0;
+ }
+}
+
+static void progress_dummy_end(void)
+{
+}
+
+static void progress_dummy_init(void)
+{
+#ifdef CONFIG_POSIX
+ struct sigaction action;
+ sigset_t set;
+
+ memset(&action, 0, sizeof(action));
+ sigfillset(&action.sa_mask);
+ action.sa_handler = sigusr_print;
+ action.sa_flags = 0;
+ sigaction(SIGUSR1, &action, NULL);
+#ifdef SIGINFO
+ sigaction(SIGINFO, &action, NULL);
+#endif
+
+ /*
+     * SIGUSR1 is SIG_IPI and gets blocked in qemu_init_main_loop(). In the
+     * tools that use the progress report, SIGUSR1 has no such meaning and
+     * should instead print the progress, so re-enable it.
+ */
+ sigemptyset(&set);
+ sigaddset(&set, SIGUSR1);
+ pthread_sigmask(SIG_UNBLOCK, &set, NULL);
+#endif
+
+ state.print = progress_dummy_print;
+ state.end = progress_dummy_end;
+}
+
+/*
+ * Initialize progress reporting.
+ * If @enabled is false, actual reporting is suppressed. The user can
+ * still trigger a report by sending a SIGUSR1.
+ * Reports are also suppressed unless we've had at least @min_skip
+ * percent progress since the last report.
+ */
+void qemu_progress_init(int enabled, float min_skip)
+{
+ state.min_skip = min_skip;
+ if (enabled) {
+ progress_simple_init();
+ } else {
+ progress_dummy_init();
+ }
+}
+
+void qemu_progress_end(void)
+{
+ state.end();
+}
+
+/*
+ * Report progress.
+ * @delta is how much progress we made.
+ * If @max is zero, @delta is an absolute value of the total job done.
+ * Else, @delta is a progress delta since the last call, as a fraction
+ * of @max. I.e. the delta is @delta * @max / 100. This allows
+ * relative accounting of functions which may be a different fraction of
+ * the full job, depending on the context they are called in. I.e.
+ * a function might be considered 40% of the full job if used from
+ * bdrv_img_create() but only 20% if called from img_convert().
+ */
+void qemu_progress_print(float delta, int max)
+{
+ float current;
+
+ if (max == 0) {
+ current = delta;
+ } else {
+ current = state.current + delta / 100 * max;
+ }
+ if (current > 100) {
+ current = 100;
+ }
+ state.current = current;
+
+ if (current > (state.last_print + state.min_skip) ||
+ current < (state.last_print - state.min_skip) ||
+ current == 100 || current == 0) {
+ state.last_print = state.current;
+ state.print();
+ }
+}
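
A reporting sketch tying the pieces together; convert_one_block() is a
hypothetical work item and the loop is assumed to be the whole job:

    static void convert_all(int blocks)
    {
        qemu_progress_init(true, 2.0);     /* print at most every 2% */
        qemu_progress_print(0, 100);       /* force the initial 0% line */

        for (int i = 0; i < blocks; i++) {
            convert_one_block(i);
            /* each call adds (100.0 / blocks) percent of the total job */
            qemu_progress_print(100.0 / blocks, 100);
        }
        qemu_progress_end();
    }
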
diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
new file mode 100644
index 000000000..0585e7a62
--- /dev/null
+++ b/util/qemu-sockets.c
@@ -0,0 +1,1482 @@
+/*
+ * inet and unix socket functions for qemu
+ *
+ * (c) 2008 Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+#include "qemu/osdep.h"
+
+#ifdef CONFIG_AF_VSOCK
+#include <linux/vm_sockets.h>
+#endif /* CONFIG_AF_VSOCK */
+
+#include "qemu-common.h"
+#include "monitor/monitor.h"
+#include "qapi/clone-visitor.h"
+#include "qapi/error.h"
+#include "qapi/qapi-visit-sockets.h"
+#include "qemu/sockets.h"
+#include "qemu/main-loop.h"
+#include "qapi/qobject-input-visitor.h"
+#include "qapi/qobject-output-visitor.h"
+#include "qemu/cutils.h"
+#include "trace.h"
+
+#ifndef AI_ADDRCONFIG
+# define AI_ADDRCONFIG 0
+#endif
+
+#ifndef AI_V4MAPPED
+# define AI_V4MAPPED 0
+#endif
+
+#ifndef AI_NUMERICSERV
+# define AI_NUMERICSERV 0
+#endif
+
+
+static int inet_getport(struct addrinfo *e)
+{
+ struct sockaddr_in *i4;
+ struct sockaddr_in6 *i6;
+
+ switch (e->ai_family) {
+    case PF_INET6:
+        i6 = (void *)e->ai_addr;
+        return ntohs(i6->sin6_port);
+    case PF_INET:
+        i4 = (void *)e->ai_addr;
+        return ntohs(i4->sin_port);
+ default:
+ return 0;
+ }
+}
+
+static void inet_setport(struct addrinfo *e, int port)
+{
+ struct sockaddr_in *i4;
+ struct sockaddr_in6 *i6;
+
+ switch (e->ai_family) {
+    case PF_INET6:
+        i6 = (void *)e->ai_addr;
+        i6->sin6_port = htons(port);
+        break;
+    case PF_INET:
+        i4 = (void *)e->ai_addr;
+        i4->sin_port = htons(port);
+        break;
+ }
+}
+
+NetworkAddressFamily inet_netfamily(int family)
+{
+ switch (family) {
+ case PF_INET6: return NETWORK_ADDRESS_FAMILY_IPV6;
+ case PF_INET: return NETWORK_ADDRESS_FAMILY_IPV4;
+ case PF_UNIX: return NETWORK_ADDRESS_FAMILY_UNIX;
+#ifdef CONFIG_AF_VSOCK
+ case PF_VSOCK: return NETWORK_ADDRESS_FAMILY_VSOCK;
+#endif /* CONFIG_AF_VSOCK */
+ }
+ return NETWORK_ADDRESS_FAMILY_UNKNOWN;
+}
+
+bool fd_is_socket(int fd)
+{
+ int optval;
+ socklen_t optlen = sizeof(optval);
+ return !qemu_getsockopt(fd, SOL_SOCKET, SO_TYPE, &optval, &optlen);
+}
+
+
+/*
+ * Matrix we're trying to apply
+ *
+ * ipv4 ipv6 family
+ * - - PF_UNSPEC
+ * - f PF_INET
+ * - t PF_INET6
+ * f - PF_INET6
+ * f f <error>
+ * f t PF_INET6
+ * t - PF_INET
+ * t f PF_INET
+ * t t PF_INET6/PF_UNSPEC
+ *
+ * NB, this matrix is only about getting the necessary results
+ * from getaddrinfo(). Some of the cases require further work
+ * after reading results from getaddrinfo in order to fully
+ * apply the logic the end user wants.
+ *
+ * In the first and last cases, we must set IPV6_V6ONLY=0
+ * when binding, to allow a single listener to potentially
+ * accept both IPv4+6 addresses.
+ */
+int inet_ai_family_from_address(InetSocketAddress *addr,
+ Error **errp)
+{
+ if (addr->has_ipv6 && addr->has_ipv4 &&
+ !addr->ipv6 && !addr->ipv4) {
+ error_setg(errp, "Cannot disable IPv4 and IPv6 at same time");
+ return PF_UNSPEC;
+ }
+ if ((addr->has_ipv6 && addr->ipv6) && (addr->has_ipv4 && addr->ipv4)) {
+ /*
+ * Some backends can only do a single listener. In that case
+ * we want empty hostname to resolve to "::" and then use the
+ * flag IPV6_V6ONLY==0 to get both protocols on 1 socket. This
+ * doesn't work for addresses other than "", so they're just
+ * inevitably broken until multiple listeners can be used,
+         * and thus we honour getaddrinfo's automatic protocol detection.
+ * Once all backends do multi-listener, remove the PF_INET6
+ * branch entirely.
+ */
+ if (!addr->host || g_str_equal(addr->host, "")) {
+ return PF_INET6;
+ } else {
+ return PF_UNSPEC;
+ }
+ }
+ if ((addr->has_ipv6 && addr->ipv6) || (addr->has_ipv4 && !addr->ipv4)) {
+ return PF_INET6;
+ }
+ if ((addr->has_ipv4 && addr->ipv4) || (addr->has_ipv6 && !addr->ipv6)) {
+ return PF_INET;
+ }
+ return PF_UNSPEC;
+}
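
How the matrix above resolves for two of its rows; a sketch (passing
NULL as errp is legal in QEMU's Error API and simply discards the
error):

    InetSocketAddress a = { .host = (char *)"", .port = (char *)"5900" };

    /* neither ipv4 nor ipv6 given -> PF_UNSPEC; try_bind() will then
     * clear IPV6_V6ONLY so one listener serves both protocols */
    assert(inet_ai_family_from_address(&a, NULL) == PF_UNSPEC);

    a.has_ipv4 = true;    /* ipv4=off: IPv6 only */
    assert(inet_ai_family_from_address(&a, NULL) == PF_INET6);
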
+
+static int create_fast_reuse_socket(struct addrinfo *e)
+{
+ int slisten = qemu_socket(e->ai_family, e->ai_socktype, e->ai_protocol);
+ if (slisten < 0) {
+ return -1;
+ }
+ socket_set_fast_reuse(slisten);
+ return slisten;
+}
+
+static int try_bind(int socket, InetSocketAddress *saddr, struct addrinfo *e)
+{
+#ifndef IPV6_V6ONLY
+ return bind(socket, e->ai_addr, e->ai_addrlen);
+#else
+ /*
+ * Deals with first & last cases in matrix in comment
+ * for inet_ai_family_from_address().
+ */
+ int v6only =
+ ((!saddr->has_ipv4 && !saddr->has_ipv6) ||
+ (saddr->has_ipv4 && saddr->ipv4 &&
+ saddr->has_ipv6 && saddr->ipv6)) ? 0 : 1;
+ int stat;
+
+ rebind:
+ if (e->ai_family == PF_INET6) {
+ qemu_setsockopt(socket, IPPROTO_IPV6, IPV6_V6ONLY, &v6only,
+ sizeof(v6only));
+ }
+
+ stat = bind(socket, e->ai_addr, e->ai_addrlen);
+ if (!stat) {
+ return 0;
+ }
+
+ /* If we got EADDRINUSE from an IPv6 bind & v6only is unset,
+ * it could be that the IPv4 port is already claimed, so retry
+     * with v6only set.
+ */
+ if (e->ai_family == PF_INET6 && errno == EADDRINUSE && !v6only) {
+ v6only = 1;
+ goto rebind;
+ }
+ return stat;
+#endif
+}
+
+static int inet_listen_saddr(InetSocketAddress *saddr,
+ int port_offset,
+ int num,
+ Error **errp)
+{
+    struct addrinfo ai, *res, *e;
+ char port[33];
+ char uaddr[INET6_ADDRSTRLEN+1];
+ char uport[33];
+ int rc, port_min, port_max, p;
+ int slisten = -1;
+ int saved_errno = 0;
+ bool socket_created = false;
+ Error *err = NULL;
+
+ if (saddr->keep_alive) {
+ error_setg(errp, "keep-alive option is not supported for passive "
+ "sockets");
+ return -1;
+ }
+
+    memset(&ai, 0, sizeof(ai));
+ ai.ai_flags = AI_PASSIVE;
+ if (saddr->has_numeric && saddr->numeric) {
+ ai.ai_flags |= AI_NUMERICHOST | AI_NUMERICSERV;
+ }
+ ai.ai_family = inet_ai_family_from_address(saddr, &err);
+ ai.ai_socktype = SOCK_STREAM;
+
+ if (err) {
+ error_propagate(errp, err);
+ return -1;
+ }
+
+ if (saddr->host == NULL) {
+ error_setg(errp, "host not specified");
+ return -1;
+ }
+ if (saddr->port != NULL) {
+ pstrcpy(port, sizeof(port), saddr->port);
+ } else {
+ port[0] = '\0';
+ }
+
+ /* lookup */
+ if (port_offset) {
+ unsigned long long baseport;
+ if (strlen(port) == 0) {
+ error_setg(errp, "port not specified");
+ return -1;
+ }
+ if (parse_uint_full(port, &baseport, 10) < 0) {
+ error_setg(errp, "can't convert to a number: %s", port);
+ return -1;
+ }
+ if (baseport > 65535 ||
+ baseport + port_offset > 65535) {
+ error_setg(errp, "port %s out of range", port);
+ return -1;
+ }
+ snprintf(port, sizeof(port), "%d", (int)baseport + port_offset);
+ }
+ rc = getaddrinfo(strlen(saddr->host) ? saddr->host : NULL,
+ strlen(port) ? port : NULL, &ai, &res);
+ if (rc != 0) {
+ error_setg(errp, "address resolution failed for %s:%s: %s",
+ saddr->host, port, gai_strerror(rc));
+ return -1;
+ }
+
+ /* create socket + bind/listen */
+ for (e = res; e != NULL; e = e->ai_next) {
+#ifdef HAVE_IPPROTO_MPTCP
+ if (saddr->has_mptcp && saddr->mptcp) {
+ e->ai_protocol = IPPROTO_MPTCP;
+ }
+#endif
+        getnameinfo((struct sockaddr *)e->ai_addr, e->ai_addrlen,
+                    uaddr, INET6_ADDRSTRLEN, uport, 32,
+                    NI_NUMERICHOST | NI_NUMERICSERV);
+
+ port_min = inet_getport(e);
+ port_max = saddr->has_to ? saddr->to + port_offset : port_min;
+ for (p = port_min; p <= port_max; p++) {
+ inet_setport(e, p);
+
+ slisten = create_fast_reuse_socket(e);
+ if (slisten < 0) {
+                /* The first iteration may fail to create the socket,
+                 * e.g. if 'e' has AF_INET6 but the ipv6 kmod is not
+                 * loaded. Later iterations should always succeed if
+                 * the first one worked, though, so treat that as fatal.
+ */
+ if (p == port_min) {
+ continue;
+ } else {
+ error_setg_errno(errp, errno,
+ "Failed to recreate failed listening socket");
+ goto listen_failed;
+ }
+ }
+ socket_created = true;
+
+ rc = try_bind(slisten, saddr, e);
+ if (rc < 0) {
+ if (errno != EADDRINUSE) {
+ error_setg_errno(errp, errno, "Failed to bind socket");
+ goto listen_failed;
+ }
+ } else {
+ if (!listen(slisten, num)) {
+ goto listen_ok;
+ }
+ if (errno != EADDRINUSE) {
+ error_setg_errno(errp, errno, "Failed to listen on socket");
+ goto listen_failed;
+ }
+ }
+ /* Someone else managed to bind to the same port and beat us
+             * to listen on it! Socket semantics do not allow us to
+ * recover from this situation, so we need to recreate the
+ * socket to allow bind attempts for subsequent ports:
+ */
+ closesocket(slisten);
+ slisten = -1;
+ }
+ }
+ error_setg_errno(errp, errno,
+ socket_created ?
+ "Failed to find an available port" :
+ "Failed to create a socket");
+listen_failed:
+ saved_errno = errno;
+ if (slisten >= 0) {
+ closesocket(slisten);
+ }
+ freeaddrinfo(res);
+ errno = saved_errno;
+ return -1;
+
+listen_ok:
+ freeaddrinfo(res);
+ return slisten;
+}
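
A listening sketch using the port-range scan above; inet_listen_saddr()
is static here and is normally reached through the public
socket_listen() path, so the direct call and the values are purely
illustrative:

    InetSocketAddress addr = {
        .host = (char *)"0.0.0.0",
        .port = (char *)"5900",
        .has_to = true,
        .to = 5910,    /* scan ports 5900..5910 for a free one */
    };
    Error *err = NULL;
    int fd = inet_listen_saddr(&addr, 0 /* port_offset */,
                               1 /* backlog */, &err);

    if (fd < 0) {
        error_report_err(err);
    }
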
+
+#ifdef _WIN32
+#define QEMU_SOCKET_RC_INPROGRESS(rc) \
+ ((rc) == -EINPROGRESS || (rc) == -EWOULDBLOCK || (rc) == -WSAEALREADY)
+#else
+#define QEMU_SOCKET_RC_INPROGRESS(rc) \
+ ((rc) == -EINPROGRESS)
+#endif
+
+static int inet_connect_addr(const InetSocketAddress *saddr,
+ struct addrinfo *addr, Error **errp)
+{
+ int sock, rc;
+
+ sock = qemu_socket(addr->ai_family, addr->ai_socktype, addr->ai_protocol);
+ if (sock < 0) {
+ error_setg_errno(errp, errno, "Failed to create socket family %d",
+ addr->ai_family);
+ return -1;
+ }
+ socket_set_fast_reuse(sock);
+
+ /* connect to peer */
+ do {
+ rc = 0;
+ if (connect(sock, addr->ai_addr, addr->ai_addrlen) < 0) {
+ rc = -errno;
+ }
+ } while (rc == -EINTR);
+
+ if (rc < 0) {
+ error_setg_errno(errp, errno, "Failed to connect to '%s:%s'",
+ saddr->host, saddr->port);
+ closesocket(sock);
+ return -1;
+ }
+
+ return sock;
+}
+
+static struct addrinfo *inet_parse_connect_saddr(InetSocketAddress *saddr,
+ Error **errp)
+{
+ struct addrinfo ai, *res;
+ int rc;
+ Error *err = NULL;
+ static int useV4Mapped = 1;
+
+ memset(&ai, 0, sizeof(ai));
+
+ ai.ai_flags = AI_CANONNAME | AI_ADDRCONFIG;
+ if (qatomic_read(&useV4Mapped)) {
+ ai.ai_flags |= AI_V4MAPPED;
+ }
+ ai.ai_family = inet_ai_family_from_address(saddr, &err);
+ ai.ai_socktype = SOCK_STREAM;
+
+ if (err) {
+ error_propagate(errp, err);
+ return NULL;
+ }
+
+ if (saddr->host == NULL || saddr->port == NULL) {
+ error_setg(errp, "host and/or port not specified");
+ return NULL;
+ }
+
+ /* lookup */
+ rc = getaddrinfo(saddr->host, saddr->port, &ai, &res);
+
+ /* At least FreeBSD and OS-X 10.6 declare AI_V4MAPPED but
+ * then don't implement it in their getaddrinfo(). Detect
+ * this and retry without the flag since that's preferable
+ * to a fatal error
+     * to a fatal error.
+ if (rc == EAI_BADFLAGS &&
+ (ai.ai_flags & AI_V4MAPPED)) {
+ qatomic_set(&useV4Mapped, 0);
+ ai.ai_flags &= ~AI_V4MAPPED;
+ rc = getaddrinfo(saddr->host, saddr->port, &ai, &res);
+ }
+ if (rc != 0) {
+ error_setg(errp, "address resolution failed for %s:%s: %s",
+ saddr->host, saddr->port, gai_strerror(rc));
+ return NULL;
+ }
+ return res;
+}
+
+/**
+ * Create a socket and connect it to an address.
+ *
+ * @saddr: Inet socket address specification
+ * @errp: set on error
+ *
+ * Returns: -1 on error, file descriptor on success.
+ */
+int inet_connect_saddr(InetSocketAddress *saddr, Error **errp)
+{
+ Error *local_err = NULL;
+ struct addrinfo *res, *e;
+ int sock = -1;
+
+ res = inet_parse_connect_saddr(saddr, errp);
+ if (!res) {
+ return -1;
+ }
+
+ for (e = res; e != NULL; e = e->ai_next) {
+ error_free(local_err);
+ local_err = NULL;
+
+#ifdef HAVE_IPPROTO_MPTCP
+ if (saddr->has_mptcp && saddr->mptcp) {
+ e->ai_protocol = IPPROTO_MPTCP;
+ }
+#endif
+
+ sock = inet_connect_addr(saddr, e, &local_err);
+ if (sock >= 0) {
+ break;
+ }
+ }
+
+ freeaddrinfo(res);
+
+ if (sock < 0) {
+ error_propagate(errp, local_err);
+ return sock;
+ }
+
+ if (saddr->keep_alive) {
+ int val = 1;
+ int ret = qemu_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+ &val, sizeof(val));
+
+ if (ret < 0) {
+ error_setg_errno(errp, errno, "Unable to set KEEPALIVE");
+ close(sock);
+ return -1;
+ }
+ }
+
+ return sock;
+}
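+
+/*
+ * Example (hypothetical caller): connect to a TCP endpoint described
+ * by an InetSocketAddress. A minimal sketch; the host and port values
+ * are illustrative only:
+ *
+ *     InetSocketAddress addr = { .host = (char *)"127.0.0.1",
+ *                                .port = (char *)"5900" };
+ *     Error *err = NULL;
+ *     int fd = inet_connect_saddr(&addr, &err);
+ *     if (fd < 0) {
+ *         error_report_err(err);
+ *     }
+ */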
+
+static int inet_dgram_saddr(InetSocketAddress *sraddr,
+ InetSocketAddress *sladdr,
+ Error **errp)
+{
+ struct addrinfo ai, *peer = NULL, *local = NULL;
+ const char *addr;
+ const char *port;
+ int sock = -1, rc;
+ Error *err = NULL;
+
+ /* lookup peer addr */
+    memset(&ai, 0, sizeof(ai));
+ ai.ai_flags = AI_CANONNAME | AI_V4MAPPED | AI_ADDRCONFIG;
+ ai.ai_family = inet_ai_family_from_address(sraddr, &err);
+ ai.ai_socktype = SOCK_DGRAM;
+
+ if (err) {
+ error_propagate(errp, err);
+ goto err;
+ }
+
+ addr = sraddr->host;
+ port = sraddr->port;
+ if (addr == NULL || strlen(addr) == 0) {
+ addr = "localhost";
+ }
+ if (port == NULL || strlen(port) == 0) {
+ error_setg(errp, "remote port not specified");
+ goto err;
+ }
+
+ if ((rc = getaddrinfo(addr, port, &ai, &peer)) != 0) {
+ error_setg(errp, "address resolution failed for %s:%s: %s", addr, port,
+ gai_strerror(rc));
+ goto err;
+ }
+
+ /* lookup local addr */
+    memset(&ai, 0, sizeof(ai));
+ ai.ai_flags = AI_PASSIVE;
+ ai.ai_family = peer->ai_family;
+ ai.ai_socktype = SOCK_DGRAM;
+
+ if (sladdr) {
+ addr = sladdr->host;
+ port = sladdr->port;
+ if (addr == NULL || strlen(addr) == 0) {
+ addr = NULL;
+ }
+ if (!port || strlen(port) == 0) {
+ port = "0";
+ }
+ } else {
+ addr = NULL;
+ port = "0";
+ }
+
+ if ((rc = getaddrinfo(addr, port, &ai, &local)) != 0) {
+ error_setg(errp, "address resolution failed for %s:%s: %s", addr, port,
+ gai_strerror(rc));
+ goto err;
+ }
+
+ /* create socket */
+ sock = qemu_socket(peer->ai_family, peer->ai_socktype, peer->ai_protocol);
+ if (sock < 0) {
+ error_setg_errno(errp, errno, "Failed to create socket family %d",
+ peer->ai_family);
+ goto err;
+ }
+ socket_set_fast_reuse(sock);
+
+ /* bind socket */
+ if (bind(sock, local->ai_addr, local->ai_addrlen) < 0) {
+ error_setg_errno(errp, errno, "Failed to bind socket");
+ goto err;
+ }
+
+    /* connect the datagram socket to the peer: this fixes the default
+     * destination so plain send()/recv() work without an address */
+    if (connect(sock, peer->ai_addr, peer->ai_addrlen) < 0) {
+ error_setg_errno(errp, errno, "Failed to connect to '%s:%s'",
+ addr, port);
+ goto err;
+ }
+
+ freeaddrinfo(local);
+ freeaddrinfo(peer);
+ return sock;
+
+err:
+ if (sock != -1) {
+ closesocket(sock);
+ }
+ if (local) {
+ freeaddrinfo(local);
+ }
+ if (peer) {
+ freeaddrinfo(peer);
+ }
+
+ return -1;
+}
+
+/* parse a "flag=on|off" style option within an option string */
+static int inet_parse_flag(const char *flagname, const char *optstr, bool *val,
+ Error **errp)
+{
+ char *end;
+ size_t len;
+
+ end = strstr(optstr, ",");
+ if (end) {
+ if (end[1] == ',') { /* Reject 'ipv6=on,,foo' */
+ error_setg(errp, "error parsing '%s' flag '%s'", flagname, optstr);
+ return -1;
+ }
+ len = end - optstr;
+ } else {
+ len = strlen(optstr);
+ }
+ if (len == 0 || (len == 3 && strncmp(optstr, "=on", len) == 0)) {
+ *val = true;
+ } else if (len == 4 && strncmp(optstr, "=off", len) == 0) {
+ *val = false;
+ } else {
+ error_setg(errp, "error parsing '%s' flag '%s'", flagname, optstr);
+ return -1;
+ }
+ return 0;
+}
+
+int inet_parse(InetSocketAddress *addr, const char *str, Error **errp)
+{
+ const char *optstr, *h;
+ char host[65];
+ char port[33];
+ int to;
+ int pos;
+ char *begin;
+
+ memset(addr, 0, sizeof(*addr));
+
+ /* parse address */
+ if (str[0] == ':') {
+ /* no host given */
+ host[0] = '\0';
+ if (sscanf(str, ":%32[^,]%n", port, &pos) != 1) {
+ error_setg(errp, "error parsing port in address '%s'", str);
+ return -1;
+ }
+ } else if (str[0] == '[') {
+ /* IPv6 addr */
+ if (sscanf(str, "[%64[^]]]:%32[^,]%n", host, port, &pos) != 2) {
+ error_setg(errp, "error parsing IPv6 address '%s'", str);
+ return -1;
+ }
+ } else {
+ /* hostname or IPv4 addr */
+ if (sscanf(str, "%64[^:]:%32[^,]%n", host, port, &pos) != 2) {
+ error_setg(errp, "error parsing address '%s'", str);
+ return -1;
+ }
+ }
+
+ addr->host = g_strdup(host);
+ addr->port = g_strdup(port);
+
+ /* parse options */
+ optstr = str + pos;
+ h = strstr(optstr, ",to=");
+ if (h) {
+ h += 4;
+ if (sscanf(h, "%d%n", &to, &pos) != 1 ||
+ (h[pos] != '\0' && h[pos] != ',')) {
+ error_setg(errp, "error parsing to= argument");
+ return -1;
+ }
+ addr->has_to = true;
+ addr->to = to;
+ }
+ begin = strstr(optstr, ",ipv4");
+ if (begin) {
+ if (inet_parse_flag("ipv4", begin + 5, &addr->ipv4, errp) < 0) {
+ return -1;
+ }
+ addr->has_ipv4 = true;
+ }
+ begin = strstr(optstr, ",ipv6");
+ if (begin) {
+ if (inet_parse_flag("ipv6", begin + 5, &addr->ipv6, errp) < 0) {
+ return -1;
+ }
+ addr->has_ipv6 = true;
+ }
+ begin = strstr(optstr, ",keep-alive");
+ if (begin) {
+ if (inet_parse_flag("keep-alive", begin + strlen(",keep-alive"),
+ &addr->keep_alive, errp) < 0)
+ {
+ return -1;
+ }
+ addr->has_keep_alive = true;
+ }
+#ifdef HAVE_IPPROTO_MPTCP
+ begin = strstr(optstr, ",mptcp");
+ if (begin) {
+ if (inet_parse_flag("mptcp", begin + strlen(",mptcp"),
+ &addr->mptcp, errp) < 0)
+ {
+ return -1;
+ }
+ addr->has_mptcp = true;
+ }
+#endif
+ return 0;
+}
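+
+/*
+ * Example strings accepted by inet_parse() (hypothetical values):
+ *
+ *     "localhost:4444"              host and port
+ *     ":4444,to=4448"               any host; to= sets an upper port
+ *                                   bound used when listening
+ *     "[2001:db8::1]:443,ipv6=on"   IPv6 literal plus an option flag
+ *
+ * Flags such as ipv4, ipv6 and keep-alive accept the bare form
+ * (",ipv6"), ",ipv6=on" or ",ipv6=off", as handled by
+ * inet_parse_flag() above.
+ */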
+
+
+/**
+ * Create a blocking socket and connect it to an address.
+ *
+ * @str: address string
+ * @errp: set in case of an error
+ *
+ * Returns -1 in case of error, file descriptor on success
+ **/
+int inet_connect(const char *str, Error **errp)
+{
+ int sock = -1;
+ InetSocketAddress *addr = g_new(InetSocketAddress, 1);
+
+ if (!inet_parse(addr, str, errp)) {
+ sock = inet_connect_saddr(addr, errp);
+ }
+ qapi_free_InetSocketAddress(addr);
+ return sock;
+}
+
+#ifdef CONFIG_AF_VSOCK
+static bool vsock_parse_vaddr_to_sockaddr(const VsockSocketAddress *vaddr,
+ struct sockaddr_vm *svm,
+ Error **errp)
+{
+ unsigned long long val;
+
+ memset(svm, 0, sizeof(*svm));
+ svm->svm_family = AF_VSOCK;
+
+ if (parse_uint_full(vaddr->cid, &val, 10) < 0 ||
+ val > UINT32_MAX) {
+ error_setg(errp, "Failed to parse cid '%s'", vaddr->cid);
+ return false;
+ }
+ svm->svm_cid = val;
+
+ if (parse_uint_full(vaddr->port, &val, 10) < 0 ||
+ val > UINT32_MAX) {
+ error_setg(errp, "Failed to parse port '%s'", vaddr->port);
+ return false;
+ }
+ svm->svm_port = val;
+
+ return true;
+}
+
+static int vsock_connect_addr(const VsockSocketAddress *vaddr,
+ const struct sockaddr_vm *svm, Error **errp)
+{
+ int sock, rc;
+
+ sock = qemu_socket(AF_VSOCK, SOCK_STREAM, 0);
+ if (sock < 0) {
+ error_setg_errno(errp, errno, "Failed to create socket family %d",
+ AF_VSOCK);
+ return -1;
+ }
+
+ /* connect to peer */
+ do {
+ rc = 0;
+ if (connect(sock, (const struct sockaddr *)svm, sizeof(*svm)) < 0) {
+ rc = -errno;
+ }
+ } while (rc == -EINTR);
+
+ if (rc < 0) {
+ error_setg_errno(errp, errno, "Failed to connect to '%s:%s'",
+ vaddr->cid, vaddr->port);
+ closesocket(sock);
+ return -1;
+ }
+
+ return sock;
+}
+
+static int vsock_connect_saddr(VsockSocketAddress *vaddr, Error **errp)
+{
+ struct sockaddr_vm svm;
+
+ if (!vsock_parse_vaddr_to_sockaddr(vaddr, &svm, errp)) {
+ return -1;
+ }
+
+ return vsock_connect_addr(vaddr, &svm, errp);
+}
+
+static int vsock_listen_saddr(VsockSocketAddress *vaddr,
+ int num,
+ Error **errp)
+{
+ struct sockaddr_vm svm;
+ int slisten;
+
+ if (!vsock_parse_vaddr_to_sockaddr(vaddr, &svm, errp)) {
+ return -1;
+ }
+
+ slisten = qemu_socket(AF_VSOCK, SOCK_STREAM, 0);
+ if (slisten < 0) {
+ error_setg_errno(errp, errno, "Failed to create socket");
+ return -1;
+ }
+
+ if (bind(slisten, (const struct sockaddr *)&svm, sizeof(svm)) != 0) {
+ error_setg_errno(errp, errno, "Failed to bind socket");
+ closesocket(slisten);
+ return -1;
+ }
+
+ if (listen(slisten, num) != 0) {
+ error_setg_errno(errp, errno, "Failed to listen on socket");
+ closesocket(slisten);
+ return -1;
+ }
+ return slisten;
+}
+
+static int vsock_parse(VsockSocketAddress *addr, const char *str,
+ Error **errp)
+{
+ char cid[33];
+ char port[33];
+ int n;
+
+ if (sscanf(str, "%32[^:]:%32[^,]%n", cid, port, &n) != 2) {
+ error_setg(errp, "error parsing address '%s'", str);
+ return -1;
+ }
+ if (str[n] != '\0') {
+ error_setg(errp, "trailing characters in address '%s'", str);
+ return -1;
+ }
+
+ addr->cid = g_strdup(cid);
+ addr->port = g_strdup(port);
+ return 0;
+}
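+
+/*
+ * Example (hypothetical values): "3:1234" parses to cid "3" (the
+ * peer's context ID) and port "1234".
+ */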
+#else
+static void vsock_unsupported(Error **errp)
+{
+ error_setg(errp, "socket family AF_VSOCK unsupported");
+}
+
+static int vsock_connect_saddr(VsockSocketAddress *vaddr, Error **errp)
+{
+ vsock_unsupported(errp);
+ return -1;
+}
+
+static int vsock_listen_saddr(VsockSocketAddress *vaddr,
+ int num,
+ Error **errp)
+{
+ vsock_unsupported(errp);
+ return -1;
+}
+
+static int vsock_parse(VsockSocketAddress *addr, const char *str,
+ Error **errp)
+{
+ vsock_unsupported(errp);
+ return -1;
+}
+#endif /* CONFIG_AF_VSOCK */
+
+#ifndef _WIN32
+
+static bool saddr_is_abstract(UnixSocketAddress *saddr)
+{
+#ifdef CONFIG_LINUX
+ return saddr->abstract;
+#else
+ return false;
+#endif
+}
+
+static bool saddr_is_tight(UnixSocketAddress *saddr)
+{
+#ifdef CONFIG_LINUX
+ return !saddr->has_tight || saddr->tight;
+#else
+ return false;
+#endif
+}
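+
+/*
+ * Note: for Linux abstract sockets the kernel takes the address length
+ * literally, so padding NUL bytes become part of the name. A "tight"
+ * sockaddr length (leading NUL plus the name only) and a full
+ * sizeof(sun_path) length therefore denote different addresses.
+ */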
+
+static int unix_listen_saddr(UnixSocketAddress *saddr,
+ int num,
+ Error **errp)
+{
+ bool abstract = saddr_is_abstract(saddr);
+ struct sockaddr_un un;
+ int sock, fd;
+ char *pathbuf = NULL;
+ const char *path;
+ size_t pathlen;
+ size_t addrlen;
+
+ sock = qemu_socket(PF_UNIX, SOCK_STREAM, 0);
+ if (sock < 0) {
+ error_setg_errno(errp, errno, "Failed to create Unix socket");
+ return -1;
+ }
+
+ if (saddr->path[0] || abstract) {
+ path = saddr->path;
+ } else {
+ const char *tmpdir = getenv("TMPDIR");
+ tmpdir = tmpdir ? tmpdir : "/tmp";
+ path = pathbuf = g_strdup_printf("%s/qemu-socket-XXXXXX", tmpdir);
+ }
+
+ pathlen = strlen(path);
+ if (pathlen > sizeof(un.sun_path) ||
+ (abstract && pathlen > (sizeof(un.sun_path) - 1))) {
+ error_setg(errp, "UNIX socket path '%s' is too long", path);
+ error_append_hint(errp, "Path must be less than %zu bytes\n",
+ abstract ? sizeof(un.sun_path) - 1 :
+ sizeof(un.sun_path));
+ goto err;
+ }
+
+ if (pathbuf != NULL) {
+ /*
+         * This dummy fd usage silences the mktemp() insecure warning.
+         * Using mkstemp() doesn't make things more secure here
+         * though. bind() complains about existing files, so we have
+         * to unlink first and thus re-open the race window. The
+         * worst possible outcome is bind() failing, i.e. a DoS attack.
+ */
+ fd = mkstemp(pathbuf);
+ if (fd < 0) {
+ error_setg_errno(errp, errno,
+ "Failed to make a temporary socket %s", pathbuf);
+ goto err;
+ }
+ close(fd);
+ }
+
+ if (!abstract && unlink(path) < 0 && errno != ENOENT) {
+ error_setg_errno(errp, errno,
+ "Failed to unlink socket %s", path);
+ goto err;
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ addrlen = sizeof(un);
+
+ if (abstract) {
+ un.sun_path[0] = '\0';
+ memcpy(&un.sun_path[1], path, pathlen);
+ if (saddr_is_tight(saddr)) {
+ addrlen = offsetof(struct sockaddr_un, sun_path) + 1 + pathlen;
+ }
+ } else {
+ memcpy(un.sun_path, path, pathlen);
+ }
+
+ if (bind(sock, (struct sockaddr *) &un, addrlen) < 0) {
+ error_setg_errno(errp, errno, "Failed to bind socket to %s", path);
+ goto err;
+ }
+ if (listen(sock, num) < 0) {
+ error_setg_errno(errp, errno, "Failed to listen on socket");
+ goto err;
+ }
+
+ g_free(pathbuf);
+ return sock;
+
+err:
+ g_free(pathbuf);
+ closesocket(sock);
+ return -1;
+}
+
+static int unix_connect_saddr(UnixSocketAddress *saddr, Error **errp)
+{
+ bool abstract = saddr_is_abstract(saddr);
+ struct sockaddr_un un;
+ int sock, rc;
+ size_t pathlen;
+ size_t addrlen;
+
+ if (saddr->path == NULL) {
+ error_setg(errp, "unix connect: no path specified");
+ return -1;
+ }
+
+ sock = qemu_socket(PF_UNIX, SOCK_STREAM, 0);
+ if (sock < 0) {
+ error_setg_errno(errp, errno, "Failed to create socket");
+ return -1;
+ }
+
+ pathlen = strlen(saddr->path);
+ if (pathlen > sizeof(un.sun_path) ||
+ (abstract && pathlen > (sizeof(un.sun_path) - 1))) {
+ error_setg(errp, "UNIX socket path '%s' is too long", saddr->path);
+ error_append_hint(errp, "Path must be less than %zu bytes\n",
+ abstract ? sizeof(un.sun_path) - 1 :
+ sizeof(un.sun_path));
+ goto err;
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ addrlen = sizeof(un);
+
+ if (abstract) {
+ un.sun_path[0] = '\0';
+ memcpy(&un.sun_path[1], saddr->path, pathlen);
+ if (saddr_is_tight(saddr)) {
+ addrlen = offsetof(struct sockaddr_un, sun_path) + 1 + pathlen;
+ }
+ } else {
+ memcpy(un.sun_path, saddr->path, pathlen);
+ }
+ /* connect to peer */
+ do {
+ rc = 0;
+ if (connect(sock, (struct sockaddr *) &un, addrlen) < 0) {
+ rc = -errno;
+ }
+ } while (rc == -EINTR);
+
+ if (rc < 0) {
+ error_setg_errno(errp, -rc, "Failed to connect to '%s'",
+ saddr->path);
+ goto err;
+ }
+
+ return sock;
+
+ err:
+ close(sock);
+ return -1;
+}
+
+#else
+
+static int unix_listen_saddr(UnixSocketAddress *saddr,
+ int num,
+ Error **errp)
+{
+ error_setg(errp, "unix sockets are not available on windows");
+ errno = ENOTSUP;
+ return -1;
+}
+
+static int unix_connect_saddr(UnixSocketAddress *saddr, Error **errp)
+{
+ error_setg(errp, "unix sockets are not available on windows");
+ errno = ENOTSUP;
+ return -1;
+}
+#endif
+
+/* compatibility wrapper */
+int unix_listen(const char *str, Error **errp)
+{
+ UnixSocketAddress *saddr;
+ int sock;
+
+ saddr = g_new0(UnixSocketAddress, 1);
+ saddr->path = g_strdup(str);
+ sock = unix_listen_saddr(saddr, 1, errp);
+ qapi_free_UnixSocketAddress(saddr);
+ return sock;
+}
+
+int unix_connect(const char *path, Error **errp)
+{
+ UnixSocketAddress *saddr;
+ int sock;
+
+ saddr = g_new0(UnixSocketAddress, 1);
+ saddr->path = g_strdup(path);
+ sock = unix_connect_saddr(saddr, errp);
+ qapi_free_UnixSocketAddress(saddr);
+ return sock;
+}
+
+
+SocketAddress *socket_parse(const char *str, Error **errp)
+{
+ SocketAddress *addr;
+
+ addr = g_new0(SocketAddress, 1);
+ if (strstart(str, "unix:", NULL)) {
+ if (str[5] == '\0') {
+ error_setg(errp, "invalid Unix socket address");
+ goto fail;
+ } else {
+ addr->type = SOCKET_ADDRESS_TYPE_UNIX;
+ addr->u.q_unix.path = g_strdup(str + 5);
+ }
+ } else if (strstart(str, "fd:", NULL)) {
+ if (str[3] == '\0') {
+ error_setg(errp, "invalid file descriptor address");
+ goto fail;
+ } else {
+ addr->type = SOCKET_ADDRESS_TYPE_FD;
+ addr->u.fd.str = g_strdup(str + 3);
+ }
+ } else if (strstart(str, "vsock:", NULL)) {
+ addr->type = SOCKET_ADDRESS_TYPE_VSOCK;
+ if (vsock_parse(&addr->u.vsock, str + strlen("vsock:"), errp)) {
+ goto fail;
+ }
+ } else {
+ addr->type = SOCKET_ADDRESS_TYPE_INET;
+ if (inet_parse(&addr->u.inet, str, errp)) {
+ goto fail;
+ }
+ }
+ return addr;
+
+fail:
+ qapi_free_SocketAddress(addr);
+ return NULL;
+}
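+
+/*
+ * Examples (hypothetical values): "unix:/tmp/qemu.sock" selects a Unix
+ * socket, "fd:mysock" a pre-opened descriptor, "vsock:3:1234" a vsock
+ * endpoint; anything else, e.g. "localhost:4444", is parsed as an inet
+ * address by inet_parse() above.
+ */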
+
+static int socket_get_fd(const char *fdstr, Error **errp)
+{
+ Monitor *cur_mon = monitor_cur();
+ int fd;
+ if (cur_mon) {
+ fd = monitor_get_fd(cur_mon, fdstr, errp);
+ if (fd < 0) {
+ return -1;
+ }
+ } else {
+ if (qemu_strtoi(fdstr, NULL, 10, &fd) < 0) {
+ error_setg_errno(errp, errno,
+ "Unable to parse FD number %s",
+ fdstr);
+ return -1;
+ }
+ }
+ if (!fd_is_socket(fd)) {
+ error_setg(errp, "File descriptor '%s' is not a socket", fdstr);
+ close(fd);
+ return -1;
+ }
+ return fd;
+}
+
+int socket_address_parse_named_fd(SocketAddress *addr, Error **errp)
+{
+ int fd;
+
+ if (addr->type != SOCKET_ADDRESS_TYPE_FD) {
+ return 0;
+ }
+
+ fd = socket_get_fd(addr->u.fd.str, errp);
+ if (fd < 0) {
+ return fd;
+ }
+
+ g_free(addr->u.fd.str);
+ addr->u.fd.str = g_strdup_printf("%d", fd);
+
+ return 0;
+}
+
+int socket_connect(SocketAddress *addr, Error **errp)
+{
+ int fd;
+
+ switch (addr->type) {
+ case SOCKET_ADDRESS_TYPE_INET:
+ fd = inet_connect_saddr(&addr->u.inet, errp);
+ break;
+
+ case SOCKET_ADDRESS_TYPE_UNIX:
+ fd = unix_connect_saddr(&addr->u.q_unix, errp);
+ break;
+
+ case SOCKET_ADDRESS_TYPE_FD:
+ fd = socket_get_fd(addr->u.fd.str, errp);
+ break;
+
+ case SOCKET_ADDRESS_TYPE_VSOCK:
+ fd = vsock_connect_saddr(&addr->u.vsock, errp);
+ break;
+
+ default:
+ abort();
+ }
+ return fd;
+}
+
+int socket_listen(SocketAddress *addr, int num, Error **errp)
+{
+ int fd;
+
+ trace_socket_listen(num);
+ switch (addr->type) {
+ case SOCKET_ADDRESS_TYPE_INET:
+ fd = inet_listen_saddr(&addr->u.inet, 0, num, errp);
+ break;
+
+ case SOCKET_ADDRESS_TYPE_UNIX:
+ fd = unix_listen_saddr(&addr->u.q_unix, num, errp);
+ break;
+
+ case SOCKET_ADDRESS_TYPE_FD:
+ fd = socket_get_fd(addr->u.fd.str, errp);
+ if (fd < 0) {
+ return -1;
+ }
+
+ /*
+ * If the socket is not yet in the listen state, then transition it to
+ * the listen state now.
+ *
+ * If it's already listening then this updates the backlog value as
+ * requested.
+ *
+ * If this socket cannot listen because it's already in another state
+ * (e.g. unbound or connected) then we'll catch the error here.
+ */
+ if (listen(fd, num) != 0) {
+ error_setg_errno(errp, errno, "Failed to listen on fd socket");
+ closesocket(fd);
+ return -1;
+ }
+ break;
+
+ case SOCKET_ADDRESS_TYPE_VSOCK:
+ fd = vsock_listen_saddr(&addr->u.vsock, num, errp);
+ break;
+
+ default:
+ abort();
+ }
+ return fd;
+}
+
+void socket_listen_cleanup(int fd, Error **errp)
+{
+ SocketAddress *addr;
+
+ addr = socket_local_address(fd, errp);
+ if (!addr) {
+ return;
+ }
+
+ if (addr->type == SOCKET_ADDRESS_TYPE_UNIX
+ && addr->u.q_unix.path) {
+ if (unlink(addr->u.q_unix.path) < 0 && errno != ENOENT) {
+ error_setg_errno(errp, errno,
+ "Failed to unlink socket %s",
+ addr->u.q_unix.path);
+ }
+ }
+
+ qapi_free_SocketAddress(addr);
+}
+
+int socket_dgram(SocketAddress *remote, SocketAddress *local, Error **errp)
+{
+ int fd;
+
+ /*
+ * TODO SOCKET_ADDRESS_TYPE_FD when fd is AF_INET or AF_INET6
+ * (although other address families can do SOCK_DGRAM, too)
+ */
+ switch (remote->type) {
+ case SOCKET_ADDRESS_TYPE_INET:
+ fd = inet_dgram_saddr(&remote->u.inet,
+ local ? &local->u.inet : NULL, errp);
+ break;
+
+ default:
+ error_setg(errp, "socket type unsupported for datagram");
+ fd = -1;
+ }
+ return fd;
+}
+
+
+static SocketAddress *
+socket_sockaddr_to_address_inet(struct sockaddr_storage *sa,
+ socklen_t salen,
+ Error **errp)
+{
+ char host[NI_MAXHOST];
+ char serv[NI_MAXSERV];
+ SocketAddress *addr;
+ InetSocketAddress *inet;
+ int ret;
+
+ ret = getnameinfo((struct sockaddr *)sa, salen,
+ host, sizeof(host),
+ serv, sizeof(serv),
+ NI_NUMERICHOST | NI_NUMERICSERV);
+ if (ret != 0) {
+ error_setg(errp, "Cannot format numeric socket address: %s",
+ gai_strerror(ret));
+ return NULL;
+ }
+
+ addr = g_new0(SocketAddress, 1);
+ addr->type = SOCKET_ADDRESS_TYPE_INET;
+ inet = &addr->u.inet;
+ inet->host = g_strdup(host);
+ inet->port = g_strdup(serv);
+ if (sa->ss_family == AF_INET) {
+ inet->has_ipv4 = inet->ipv4 = true;
+ } else {
+ inet->has_ipv6 = inet->ipv6 = true;
+ }
+
+ return addr;
+}
+
+
+#ifndef WIN32
+static SocketAddress *
+socket_sockaddr_to_address_unix(struct sockaddr_storage *sa,
+ socklen_t salen,
+ Error **errp)
+{
+ SocketAddress *addr;
+ struct sockaddr_un *su = (struct sockaddr_un *)sa;
+
+ addr = g_new0(SocketAddress, 1);
+ addr->type = SOCKET_ADDRESS_TYPE_UNIX;
+ salen -= offsetof(struct sockaddr_un, sun_path);
+#ifdef CONFIG_LINUX
+ if (salen > 0 && !su->sun_path[0]) {
+ /* Linux abstract socket */
+ addr->u.q_unix.path = g_strndup(su->sun_path + 1, salen - 1);
+ addr->u.q_unix.has_abstract = true;
+ addr->u.q_unix.abstract = true;
+ addr->u.q_unix.has_tight = true;
+ addr->u.q_unix.tight = salen < sizeof(su->sun_path);
+ return addr;
+ }
+#endif
+
+ addr->u.q_unix.path = g_strndup(su->sun_path, salen);
+ return addr;
+}
+#endif /* WIN32 */
+
+#ifdef CONFIG_AF_VSOCK
+static SocketAddress *
+socket_sockaddr_to_address_vsock(struct sockaddr_storage *sa,
+ socklen_t salen,
+ Error **errp)
+{
+ SocketAddress *addr;
+ VsockSocketAddress *vaddr;
+ struct sockaddr_vm *svm = (struct sockaddr_vm *)sa;
+
+ addr = g_new0(SocketAddress, 1);
+ addr->type = SOCKET_ADDRESS_TYPE_VSOCK;
+ vaddr = &addr->u.vsock;
+ vaddr->cid = g_strdup_printf("%u", svm->svm_cid);
+ vaddr->port = g_strdup_printf("%u", svm->svm_port);
+
+ return addr;
+}
+#endif /* CONFIG_AF_VSOCK */
+
+SocketAddress *
+socket_sockaddr_to_address(struct sockaddr_storage *sa,
+ socklen_t salen,
+ Error **errp)
+{
+ switch (sa->ss_family) {
+ case AF_INET:
+ case AF_INET6:
+ return socket_sockaddr_to_address_inet(sa, salen, errp);
+
+#ifndef WIN32
+ case AF_UNIX:
+ return socket_sockaddr_to_address_unix(sa, salen, errp);
+#endif /* WIN32 */
+
+#ifdef CONFIG_AF_VSOCK
+ case AF_VSOCK:
+ return socket_sockaddr_to_address_vsock(sa, salen, errp);
+#endif
+
+ default:
+ error_setg(errp, "socket family %d unsupported",
+ sa->ss_family);
+ return NULL;
+ }
+    return NULL;
+}
+
+
+SocketAddress *socket_local_address(int fd, Error **errp)
+{
+ struct sockaddr_storage ss;
+ socklen_t sslen = sizeof(ss);
+
+ if (getsockname(fd, (struct sockaddr *)&ss, &sslen) < 0) {
+ error_setg_errno(errp, errno, "%s",
+ "Unable to query local socket address");
+ return NULL;
+ }
+
+ return socket_sockaddr_to_address(&ss, sslen, errp);
+}
+
+
+SocketAddress *socket_remote_address(int fd, Error **errp)
+{
+ struct sockaddr_storage ss;
+ socklen_t sslen = sizeof(ss);
+
+ if (getpeername(fd, (struct sockaddr *)&ss, &sslen) < 0) {
+ error_setg_errno(errp, errno, "%s",
+ "Unable to query remote socket address");
+ return NULL;
+ }
+
+ return socket_sockaddr_to_address(&ss, sslen, errp);
+}
+
+
+SocketAddress *socket_address_flatten(SocketAddressLegacy *addr_legacy)
+{
+ SocketAddress *addr;
+
+ if (!addr_legacy) {
+ return NULL;
+ }
+
+ addr = g_new(SocketAddress, 1);
+
+ switch (addr_legacy->type) {
+ case SOCKET_ADDRESS_TYPE_INET:
+ addr->type = SOCKET_ADDRESS_TYPE_INET;
+ QAPI_CLONE_MEMBERS(InetSocketAddress, &addr->u.inet,
+ addr_legacy->u.inet.data);
+ break;
+ case SOCKET_ADDRESS_TYPE_UNIX:
+ addr->type = SOCKET_ADDRESS_TYPE_UNIX;
+ QAPI_CLONE_MEMBERS(UnixSocketAddress, &addr->u.q_unix,
+ addr_legacy->u.q_unix.data);
+ break;
+ case SOCKET_ADDRESS_TYPE_VSOCK:
+ addr->type = SOCKET_ADDRESS_TYPE_VSOCK;
+ QAPI_CLONE_MEMBERS(VsockSocketAddress, &addr->u.vsock,
+ addr_legacy->u.vsock.data);
+ break;
+ case SOCKET_ADDRESS_TYPE_FD:
+ addr->type = SOCKET_ADDRESS_TYPE_FD;
+ QAPI_CLONE_MEMBERS(String, &addr->u.fd, addr_legacy->u.fd.data);
+ break;
+ default:
+ abort();
+ }
+
+ return addr;
+}
diff --git a/util/qemu-thread-common.h b/util/qemu-thread-common.h
new file mode 100644
index 000000000..2af6b1208
--- /dev/null
+++ b/util/qemu-thread-common.h
@@ -0,0 +1,54 @@
+/*
+ * Common qemu-thread implementation header file.
+ *
+ * Copyright Red Hat, Inc. 2018
+ *
+ * Authors:
+ * Peter Xu <peterx@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_THREAD_COMMON_H
+#define QEMU_THREAD_COMMON_H
+
+#include "qemu/thread.h"
+#include "trace.h"
+
+static inline void qemu_mutex_post_init(QemuMutex *mutex)
+{
+#ifdef CONFIG_DEBUG_MUTEX
+ mutex->file = NULL;
+ mutex->line = 0;
+#endif
+ mutex->initialized = true;
+}
+
+static inline void qemu_mutex_pre_lock(QemuMutex *mutex,
+ const char *file, int line)
+{
+ trace_qemu_mutex_lock(mutex, file, line);
+}
+
+static inline void qemu_mutex_post_lock(QemuMutex *mutex,
+ const char *file, int line)
+{
+#ifdef CONFIG_DEBUG_MUTEX
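+    /* Record the acquisition site so a stuck mutex can be attributed
+     * to the file/line that last locked it. */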
+ mutex->file = file;
+ mutex->line = line;
+#endif
+ trace_qemu_mutex_locked(mutex, file, line);
+}
+
+static inline void qemu_mutex_pre_unlock(QemuMutex *mutex,
+ const char *file, int line)
+{
+#ifdef CONFIG_DEBUG_MUTEX
+ mutex->file = NULL;
+ mutex->line = 0;
+#endif
+ trace_qemu_mutex_unlock(mutex, file, line);
+}
+
+#endif
diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c
new file mode 100644
index 000000000..e1225b63b
--- /dev/null
+++ b/util/qemu-thread-posix.c
@@ -0,0 +1,632 @@
+/*
+ * Wrappers around mutex/cond/thread functions
+ *
+ * Copyright Red Hat, Inc. 2009
+ *
+ * Author:
+ * Marcelo Tosatti <mtosatti@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+#include "qemu/osdep.h"
+#include "qemu/thread.h"
+#include "qemu/atomic.h"
+#include "qemu/notify.h"
+#include "qemu-thread-common.h"
+#include "qemu/tsan.h"
+
+static bool name_threads;
+
+void qemu_thread_naming(bool enable)
+{
+ name_threads = enable;
+
+#if !defined CONFIG_PTHREAD_SETNAME_NP_W_TID && \
+ !defined CONFIG_PTHREAD_SETNAME_NP_WO_TID
+ /* This is a debugging option, not fatal */
+ if (enable) {
+ fprintf(stderr, "qemu: thread naming not supported on this host\n");
+ }
+#endif
+}
+
+static void error_exit(int err, const char *msg)
+{
+ fprintf(stderr, "qemu: %s: %s\n", msg, strerror(err));
+ abort();
+}
+
+static void compute_abs_deadline(struct timespec *ts, int ms)
+{
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+ ts->tv_nsec = tv.tv_usec * 1000 + (ms % 1000) * 1000000;
+ ts->tv_sec = tv.tv_sec + ms / 1000;
+ if (ts->tv_nsec >= 1000000000) {
+ ts->tv_sec++;
+ ts->tv_nsec -= 1000000000;
+ }
+}
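+
+/*
+ * Worked example (hypothetical numbers): with tv = 10s + 900000us and
+ * ms = 2500, tv_nsec = 900000 * 1000 + 500 * 1000000 = 1400000000 and
+ * tv_sec = 10 + 2 = 12; the carry then yields 13s + 400000000ns,
+ * i.e. exactly 10.9s + 2.5s.
+ */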
+
+void qemu_mutex_init(QemuMutex *mutex)
+{
+ int err;
+
+ err = pthread_mutex_init(&mutex->lock, NULL);
+ if (err)
+ error_exit(err, __func__);
+ qemu_mutex_post_init(mutex);
+}
+
+void qemu_mutex_destroy(QemuMutex *mutex)
+{
+ int err;
+
+ assert(mutex->initialized);
+ mutex->initialized = false;
+ err = pthread_mutex_destroy(&mutex->lock);
+ if (err)
+ error_exit(err, __func__);
+}
+
+void qemu_mutex_lock_impl(QemuMutex *mutex, const char *file, const int line)
+{
+ int err;
+
+ assert(mutex->initialized);
+ qemu_mutex_pre_lock(mutex, file, line);
+ err = pthread_mutex_lock(&mutex->lock);
+ if (err)
+ error_exit(err, __func__);
+ qemu_mutex_post_lock(mutex, file, line);
+}
+
+int qemu_mutex_trylock_impl(QemuMutex *mutex, const char *file, const int line)
+{
+ int err;
+
+ assert(mutex->initialized);
+ err = pthread_mutex_trylock(&mutex->lock);
+ if (err == 0) {
+ qemu_mutex_post_lock(mutex, file, line);
+ return 0;
+ }
+ if (err != EBUSY) {
+ error_exit(err, __func__);
+ }
+ return -EBUSY;
+}
+
+void qemu_mutex_unlock_impl(QemuMutex *mutex, const char *file, const int line)
+{
+ int err;
+
+ assert(mutex->initialized);
+ qemu_mutex_pre_unlock(mutex, file, line);
+ err = pthread_mutex_unlock(&mutex->lock);
+ if (err)
+ error_exit(err, __func__);
+}
+
+void qemu_rec_mutex_init(QemuRecMutex *mutex)
+{
+ int err;
+ pthread_mutexattr_t attr;
+
+ pthread_mutexattr_init(&attr);
+ pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+ err = pthread_mutex_init(&mutex->m.lock, &attr);
+ pthread_mutexattr_destroy(&attr);
+ if (err) {
+ error_exit(err, __func__);
+ }
+ mutex->m.initialized = true;
+}
+
+void qemu_rec_mutex_destroy(QemuRecMutex *mutex)
+{
+ qemu_mutex_destroy(&mutex->m);
+}
+
+void qemu_rec_mutex_lock_impl(QemuRecMutex *mutex, const char *file, int line)
+{
+ qemu_mutex_lock_impl(&mutex->m, file, line);
+}
+
+int qemu_rec_mutex_trylock_impl(QemuRecMutex *mutex, const char *file, int line)
+{
+ return qemu_mutex_trylock_impl(&mutex->m, file, line);
+}
+
+void qemu_rec_mutex_unlock_impl(QemuRecMutex *mutex, const char *file, int line)
+{
+ qemu_mutex_unlock_impl(&mutex->m, file, line);
+}
+
+void qemu_cond_init(QemuCond *cond)
+{
+ int err;
+
+ err = pthread_cond_init(&cond->cond, NULL);
+ if (err)
+ error_exit(err, __func__);
+ cond->initialized = true;
+}
+
+void qemu_cond_destroy(QemuCond *cond)
+{
+ int err;
+
+ assert(cond->initialized);
+ cond->initialized = false;
+ err = pthread_cond_destroy(&cond->cond);
+ if (err)
+ error_exit(err, __func__);
+}
+
+void qemu_cond_signal(QemuCond *cond)
+{
+ int err;
+
+ assert(cond->initialized);
+ err = pthread_cond_signal(&cond->cond);
+ if (err)
+ error_exit(err, __func__);
+}
+
+void qemu_cond_broadcast(QemuCond *cond)
+{
+ int err;
+
+ assert(cond->initialized);
+ err = pthread_cond_broadcast(&cond->cond);
+ if (err)
+ error_exit(err, __func__);
+}
+
+void qemu_cond_wait_impl(QemuCond *cond, QemuMutex *mutex, const char *file, const int line)
+{
+ int err;
+
+ assert(cond->initialized);
+ qemu_mutex_pre_unlock(mutex, file, line);
+ err = pthread_cond_wait(&cond->cond, &mutex->lock);
+ qemu_mutex_post_lock(mutex, file, line);
+ if (err)
+ error_exit(err, __func__);
+}
+
+bool qemu_cond_timedwait_impl(QemuCond *cond, QemuMutex *mutex, int ms,
+ const char *file, const int line)
+{
+ int err;
+ struct timespec ts;
+
+ assert(cond->initialized);
+ trace_qemu_mutex_unlock(mutex, file, line);
+ compute_abs_deadline(&ts, ms);
+ err = pthread_cond_timedwait(&cond->cond, &mutex->lock, &ts);
+ trace_qemu_mutex_locked(mutex, file, line);
+ if (err && err != ETIMEDOUT) {
+ error_exit(err, __func__);
+ }
+ return err != ETIMEDOUT;
+}
+
+void qemu_sem_init(QemuSemaphore *sem, int init)
+{
+ int rc;
+
+#ifndef CONFIG_SEM_TIMEDWAIT
+ rc = pthread_mutex_init(&sem->lock, NULL);
+ if (rc != 0) {
+ error_exit(rc, __func__);
+ }
+ rc = pthread_cond_init(&sem->cond, NULL);
+ if (rc != 0) {
+ error_exit(rc, __func__);
+ }
+ if (init < 0) {
+ error_exit(EINVAL, __func__);
+ }
+ sem->count = init;
+#else
+ rc = sem_init(&sem->sem, 0, init);
+ if (rc < 0) {
+ error_exit(errno, __func__);
+ }
+#endif
+ sem->initialized = true;
+}
+
+void qemu_sem_destroy(QemuSemaphore *sem)
+{
+ int rc;
+
+ assert(sem->initialized);
+ sem->initialized = false;
+#ifndef CONFIG_SEM_TIMEDWAIT
+ rc = pthread_cond_destroy(&sem->cond);
+ if (rc < 0) {
+ error_exit(rc, __func__);
+ }
+ rc = pthread_mutex_destroy(&sem->lock);
+ if (rc < 0) {
+ error_exit(rc, __func__);
+ }
+#else
+ rc = sem_destroy(&sem->sem);
+ if (rc < 0) {
+ error_exit(errno, __func__);
+ }
+#endif
+}
+
+void qemu_sem_post(QemuSemaphore *sem)
+{
+ int rc;
+
+ assert(sem->initialized);
+#ifndef CONFIG_SEM_TIMEDWAIT
+ pthread_mutex_lock(&sem->lock);
+ if (sem->count == UINT_MAX) {
+ rc = EINVAL;
+ } else {
+ sem->count++;
+ rc = pthread_cond_signal(&sem->cond);
+ }
+ pthread_mutex_unlock(&sem->lock);
+ if (rc != 0) {
+ error_exit(rc, __func__);
+ }
+#else
+ rc = sem_post(&sem->sem);
+ if (rc < 0) {
+ error_exit(errno, __func__);
+ }
+#endif
+}
+
+int qemu_sem_timedwait(QemuSemaphore *sem, int ms)
+{
+ int rc;
+ struct timespec ts;
+
+ assert(sem->initialized);
+#ifndef CONFIG_SEM_TIMEDWAIT
+ rc = 0;
+ compute_abs_deadline(&ts, ms);
+ pthread_mutex_lock(&sem->lock);
+ while (sem->count == 0) {
+ rc = pthread_cond_timedwait(&sem->cond, &sem->lock, &ts);
+ if (rc == ETIMEDOUT) {
+ break;
+ }
+ if (rc != 0) {
+ error_exit(rc, __func__);
+ }
+ }
+ if (rc != ETIMEDOUT) {
+ --sem->count;
+ }
+ pthread_mutex_unlock(&sem->lock);
+ return (rc == ETIMEDOUT ? -1 : 0);
+#else
+ if (ms <= 0) {
+ /* This is cheaper than sem_timedwait. */
+ do {
+ rc = sem_trywait(&sem->sem);
+ } while (rc == -1 && errno == EINTR);
+ if (rc == -1 && errno == EAGAIN) {
+ return -1;
+ }
+ } else {
+ compute_abs_deadline(&ts, ms);
+ do {
+ rc = sem_timedwait(&sem->sem, &ts);
+ } while (rc == -1 && errno == EINTR);
+ if (rc == -1 && errno == ETIMEDOUT) {
+ return -1;
+ }
+ }
+ if (rc < 0) {
+ error_exit(errno, __func__);
+ }
+ return 0;
+#endif
+}
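+
+/*
+ * Example (hypothetical): wait up to 100 ms for a worker to post the
+ * semaphore; a negative return value means the wait timed out.
+ *
+ *     if (qemu_sem_timedwait(&sem, 100) < 0) {
+ *         handle_timeout();    (hypothetical handler)
+ *     }
+ */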
+
+void qemu_sem_wait(QemuSemaphore *sem)
+{
+ int rc;
+
+ assert(sem->initialized);
+#ifndef CONFIG_SEM_TIMEDWAIT
+ pthread_mutex_lock(&sem->lock);
+ while (sem->count == 0) {
+ rc = pthread_cond_wait(&sem->cond, &sem->lock);
+ if (rc != 0) {
+ error_exit(rc, __func__);
+ }
+ }
+ --sem->count;
+ pthread_mutex_unlock(&sem->lock);
+#else
+ do {
+ rc = sem_wait(&sem->sem);
+ } while (rc == -1 && errno == EINTR);
+ if (rc < 0) {
+ error_exit(errno, __func__);
+ }
+#endif
+}
+
+#ifdef __linux__
+#include "qemu/futex.h"
+#else
+static inline void qemu_futex_wake(QemuEvent *ev, int n)
+{
+ assert(ev->initialized);
+ pthread_mutex_lock(&ev->lock);
+ if (n == 1) {
+ pthread_cond_signal(&ev->cond);
+ } else {
+ pthread_cond_broadcast(&ev->cond);
+ }
+ pthread_mutex_unlock(&ev->lock);
+}
+
+static inline void qemu_futex_wait(QemuEvent *ev, unsigned val)
+{
+ assert(ev->initialized);
+ pthread_mutex_lock(&ev->lock);
+ if (ev->value == val) {
+ pthread_cond_wait(&ev->cond, &ev->lock);
+ }
+ pthread_mutex_unlock(&ev->lock);
+}
+#endif
+
+/* Valid transitions:
+ * - free->set, when setting the event
+ * - busy->set, when setting the event, followed by qemu_futex_wake
+ * - set->free, when resetting the event
+ * - free->busy, when waiting
+ *
+ * set->busy does not happen (it can be observed from the outside but
+ * it really is set->free->busy).
+ *
+ * busy->free provably cannot happen; to enforce it, the set->free transition
+ * is done with an OR, which becomes a no-op if the event has concurrently
+ * transitioned to free or busy.
+ */
+
+#define EV_SET 0
+#define EV_FREE 1
+#define EV_BUSY -1
+
+void qemu_event_init(QemuEvent *ev, bool init)
+{
+#ifndef __linux__
+ pthread_mutex_init(&ev->lock, NULL);
+ pthread_cond_init(&ev->cond, NULL);
+#endif
+
+ ev->value = (init ? EV_SET : EV_FREE);
+ ev->initialized = true;
+}
+
+void qemu_event_destroy(QemuEvent *ev)
+{
+ assert(ev->initialized);
+ ev->initialized = false;
+#ifndef __linux__
+ pthread_mutex_destroy(&ev->lock);
+ pthread_cond_destroy(&ev->cond);
+#endif
+}
+
+void qemu_event_set(QemuEvent *ev)
+{
+ /* qemu_event_set has release semantics, but because it *loads*
+ * ev->value we need a full memory barrier here.
+ */
+ assert(ev->initialized);
+ smp_mb();
+ if (qatomic_read(&ev->value) != EV_SET) {
+ if (qatomic_xchg(&ev->value, EV_SET) == EV_BUSY) {
+ /* There were waiters, wake them up. */
+ qemu_futex_wake(ev, INT_MAX);
+ }
+ }
+}
+
+void qemu_event_reset(QemuEvent *ev)
+{
+ unsigned value;
+
+ assert(ev->initialized);
+ value = qatomic_read(&ev->value);
+ smp_mb_acquire();
+ if (value == EV_SET) {
+ /*
+ * If there was a concurrent reset (or even reset+wait),
+ * do nothing. Otherwise change EV_SET->EV_FREE.
+ */
+ qatomic_or(&ev->value, EV_FREE);
+ }
+}
+
+void qemu_event_wait(QemuEvent *ev)
+{
+ unsigned value;
+
+ assert(ev->initialized);
+ value = qatomic_read(&ev->value);
+ smp_mb_acquire();
+ if (value != EV_SET) {
+ if (value == EV_FREE) {
+ /*
+ * Leave the event reset and tell qemu_event_set that there
+ * are waiters. No need to retry, because there cannot be
+ * a concurrent busy->free transition. After the CAS, the
+ * event will be either set or busy.
+ */
+ if (qatomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) {
+ return;
+ }
+ }
+ qemu_futex_wait(ev, EV_BUSY);
+ }
+}
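+
+/*
+ * Sketch of the intended test-reset-test-wait pattern ("ev" and
+ * resource_ready() are hypothetical):
+ *
+ *     for (;;) {
+ *         if (resource_ready()) {
+ *             break;
+ *         }
+ *         qemu_event_reset(&ev);
+ *         if (resource_ready()) {
+ *             break;
+ *         }
+ *         qemu_event_wait(&ev);
+ *     }
+ *
+ * The producer makes the resource ready and then calls
+ * qemu_event_set(&ev).
+ */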
+
+static __thread NotifierList thread_exit;
+
+/*
+ * Note that in this implementation you can register a thread-exit
+ * notifier for the main thread, but it will never be called.
+ * This is OK because main thread exit can only happen when the
+ * entire process is exiting, and the API allows notifiers to not
+ * be called on process exit.
+ */
+void qemu_thread_atexit_add(Notifier *notifier)
+{
+ notifier_list_add(&thread_exit, notifier);
+}
+
+void qemu_thread_atexit_remove(Notifier *notifier)
+{
+ notifier_remove(notifier);
+}
+
+static void qemu_thread_atexit_notify(void *arg)
+{
+ /*
+     * Called when a non-main thread exits (via qemu_thread_exit()
+     * or by returning from its start routine).
+ */
+ notifier_list_notify(&thread_exit, NULL);
+}
+
+typedef struct {
+ void *(*start_routine)(void *);
+ void *arg;
+ char *name;
+} QemuThreadArgs;
+
+static void *qemu_thread_start(void *args)
+{
+ QemuThreadArgs *qemu_thread_args = args;
+ void *(*start_routine)(void *) = qemu_thread_args->start_routine;
+ void *arg = qemu_thread_args->arg;
+ void *r;
+
+    /* Attempt to set the thread's name; note that this is for debug, so
+ * we're not going to fail if we can't set it.
+ */
+ if (name_threads && qemu_thread_args->name) {
+# if defined(CONFIG_PTHREAD_SETNAME_NP_W_TID)
+ pthread_setname_np(pthread_self(), qemu_thread_args->name);
+# elif defined(CONFIG_PTHREAD_SETNAME_NP_WO_TID)
+ pthread_setname_np(qemu_thread_args->name);
+# endif
+ }
+ QEMU_TSAN_ANNOTATE_THREAD_NAME(qemu_thread_args->name);
+ g_free(qemu_thread_args->name);
+ g_free(qemu_thread_args);
+
+ /*
+ * GCC 11 with glibc 2.17 on PowerPC reports
+ *
+ * qemu-thread-posix.c:540:5: error: ‘__sigsetjmp’ accessing 656 bytes
+ * in a region of size 528 [-Werror=stringop-overflow=]
+ * 540 | pthread_cleanup_push(qemu_thread_atexit_notify, NULL);
+ * | ^~~~~~~~~~~~~~~~~~~~
+ *
+ * which is clearly nonsense.
+ */
+#pragma GCC diagnostic push
+#ifndef __clang__
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
+
+ pthread_cleanup_push(qemu_thread_atexit_notify, NULL);
+ r = start_routine(arg);
+ pthread_cleanup_pop(1);
+
+#pragma GCC diagnostic pop
+
+ return r;
+}
+
+void qemu_thread_create(QemuThread *thread, const char *name,
+ void *(*start_routine)(void*),
+ void *arg, int mode)
+{
+ sigset_t set, oldset;
+ int err;
+ pthread_attr_t attr;
+ QemuThreadArgs *qemu_thread_args;
+
+ err = pthread_attr_init(&attr);
+ if (err) {
+ error_exit(err, __func__);
+ }
+
+ if (mode == QEMU_THREAD_DETACHED) {
+ pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+ }
+
+ /* Leave signal handling to the iothread. */
+ sigfillset(&set);
+ /* Blocking the signals can result in undefined behaviour. */
+ sigdelset(&set, SIGSEGV);
+ sigdelset(&set, SIGFPE);
+ sigdelset(&set, SIGILL);
+ /* TODO avoid SIGBUS loss on macOS */
+ pthread_sigmask(SIG_SETMASK, &set, &oldset);
+
+ qemu_thread_args = g_new0(QemuThreadArgs, 1);
+ qemu_thread_args->name = g_strdup(name);
+ qemu_thread_args->start_routine = start_routine;
+ qemu_thread_args->arg = arg;
+
+ err = pthread_create(&thread->thread, &attr,
+ qemu_thread_start, qemu_thread_args);
+
+ if (err)
+ error_exit(err, __func__);
+
+ pthread_sigmask(SIG_SETMASK, &oldset, NULL);
+
+ pthread_attr_destroy(&attr);
+}
+
+void qemu_thread_get_self(QemuThread *thread)
+{
+ thread->thread = pthread_self();
+}
+
+bool qemu_thread_is_self(QemuThread *thread)
+{
+ return pthread_equal(pthread_self(), thread->thread);
+}
+
+void qemu_thread_exit(void *retval)
+{
+ pthread_exit(retval);
+}
+
+void *qemu_thread_join(QemuThread *thread)
+{
+ int err;
+ void *ret;
+
+ err = pthread_join(thread->thread, &ret);
+ if (err) {
+ error_exit(err, __func__);
+ }
+ return ret;
+}
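+
+/*
+ * Example (hypothetical worker): create a joinable thread and collect
+ * its result.
+ *
+ *     static void *worker(void *opaque) { return opaque; }
+ *     ...
+ *     QemuThread t;
+ *     qemu_thread_create(&t, "worker", worker, arg, QEMU_THREAD_JOINABLE);
+ *     void *ret = qemu_thread_join(&t);
+ */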
diff --git a/util/qemu-thread-win32.c b/util/qemu-thread-win32.c
new file mode 100644
index 000000000..52eb19f35
--- /dev/null
+++ b/util/qemu-thread-win32.c
@@ -0,0 +1,461 @@
+/*
+ * Win32 implementation for mutex/cond/thread functions
+ *
+ * Copyright Red Hat, Inc. 2010
+ *
+ * Author:
+ * Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/thread.h"
+#include "qemu/notify.h"
+#include "qemu-thread-common.h"
+#include <process.h>
+
+static bool name_threads;
+
+void qemu_thread_naming(bool enable)
+{
+ /* But note we don't actually name them on Windows yet */
+ name_threads = enable;
+
+ fprintf(stderr, "qemu: thread naming not supported on this host\n");
+}
+
+static void error_exit(int err, const char *msg)
+{
+ char *pstr;
+
+ FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_ALLOCATE_BUFFER,
+ NULL, err, 0, (LPTSTR)&pstr, 2, NULL);
+ fprintf(stderr, "qemu: %s: %s\n", msg, pstr);
+ LocalFree(pstr);
+ abort();
+}
+
+void qemu_mutex_init(QemuMutex *mutex)
+{
+ InitializeSRWLock(&mutex->lock);
+ qemu_mutex_post_init(mutex);
+}
+
+void qemu_mutex_destroy(QemuMutex *mutex)
+{
+ assert(mutex->initialized);
+ mutex->initialized = false;
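+    /* SRW locks have no destroy function; re-initializing the lock is
+     * all that is needed, and helps catch use after destroy. */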
+ InitializeSRWLock(&mutex->lock);
+}
+
+void qemu_mutex_lock_impl(QemuMutex *mutex, const char *file, const int line)
+{
+ assert(mutex->initialized);
+ qemu_mutex_pre_lock(mutex, file, line);
+ AcquireSRWLockExclusive(&mutex->lock);
+ qemu_mutex_post_lock(mutex, file, line);
+}
+
+int qemu_mutex_trylock_impl(QemuMutex *mutex, const char *file, const int line)
+{
+ int owned;
+
+ assert(mutex->initialized);
+ owned = TryAcquireSRWLockExclusive(&mutex->lock);
+ if (owned) {
+ qemu_mutex_post_lock(mutex, file, line);
+ return 0;
+ }
+ return -EBUSY;
+}
+
+void qemu_mutex_unlock_impl(QemuMutex *mutex, const char *file, const int line)
+{
+ assert(mutex->initialized);
+ qemu_mutex_pre_unlock(mutex, file, line);
+ ReleaseSRWLockExclusive(&mutex->lock);
+}
+
+void qemu_rec_mutex_init(QemuRecMutex *mutex)
+{
+ InitializeCriticalSection(&mutex->lock);
+ mutex->initialized = true;
+}
+
+void qemu_rec_mutex_destroy(QemuRecMutex *mutex)
+{
+ assert(mutex->initialized);
+ mutex->initialized = false;
+ DeleteCriticalSection(&mutex->lock);
+}
+
+void qemu_rec_mutex_lock_impl(QemuRecMutex *mutex, const char *file, int line)
+{
+ assert(mutex->initialized);
+ EnterCriticalSection(&mutex->lock);
+}
+
+int qemu_rec_mutex_trylock_impl(QemuRecMutex *mutex, const char *file, int line)
+{
+ assert(mutex->initialized);
+ return !TryEnterCriticalSection(&mutex->lock);
+}
+
+void qemu_rec_mutex_unlock_impl(QemuRecMutex *mutex, const char *file, int line)
+{
+ assert(mutex->initialized);
+ LeaveCriticalSection(&mutex->lock);
+}
+
+void qemu_cond_init(QemuCond *cond)
+{
+ memset(cond, 0, sizeof(*cond));
+ InitializeConditionVariable(&cond->var);
+ cond->initialized = true;
+}
+
+void qemu_cond_destroy(QemuCond *cond)
+{
+ assert(cond->initialized);
+ cond->initialized = false;
+ InitializeConditionVariable(&cond->var);
+}
+
+void qemu_cond_signal(QemuCond *cond)
+{
+ assert(cond->initialized);
+ WakeConditionVariable(&cond->var);
+}
+
+void qemu_cond_broadcast(QemuCond *cond)
+{
+ assert(cond->initialized);
+ WakeAllConditionVariable(&cond->var);
+}
+
+void qemu_cond_wait_impl(QemuCond *cond, QemuMutex *mutex, const char *file, const int line)
+{
+ assert(cond->initialized);
+ qemu_mutex_pre_unlock(mutex, file, line);
+ SleepConditionVariableSRW(&cond->var, &mutex->lock, INFINITE, 0);
+ qemu_mutex_post_lock(mutex, file, line);
+}
+
+bool qemu_cond_timedwait_impl(QemuCond *cond, QemuMutex *mutex, int ms,
+ const char *file, const int line)
+{
+ int rc = 0;
+
+ assert(cond->initialized);
+ trace_qemu_mutex_unlock(mutex, file, line);
+ if (!SleepConditionVariableSRW(&cond->var, &mutex->lock, ms, 0)) {
+ rc = GetLastError();
+ }
+ trace_qemu_mutex_locked(mutex, file, line);
+ if (rc && rc != ERROR_TIMEOUT) {
+ error_exit(rc, __func__);
+ }
+ return rc != ERROR_TIMEOUT;
+}
+
+void qemu_sem_init(QemuSemaphore *sem, int init)
+{
+    /* Counting semaphore; LONG_MAX means effectively no upper bound. */
+ sem->sema = CreateSemaphore(NULL, init, LONG_MAX, NULL);
+ sem->initialized = true;
+}
+
+void qemu_sem_destroy(QemuSemaphore *sem)
+{
+ assert(sem->initialized);
+ sem->initialized = false;
+ CloseHandle(sem->sema);
+}
+
+void qemu_sem_post(QemuSemaphore *sem)
+{
+ assert(sem->initialized);
+ ReleaseSemaphore(sem->sema, 1, NULL);
+}
+
+int qemu_sem_timedwait(QemuSemaphore *sem, int ms)
+{
+ int rc;
+
+ assert(sem->initialized);
+ rc = WaitForSingleObject(sem->sema, ms);
+ if (rc == WAIT_OBJECT_0) {
+ return 0;
+ }
+ if (rc != WAIT_TIMEOUT) {
+ error_exit(GetLastError(), __func__);
+ }
+ return -1;
+}
+
+void qemu_sem_wait(QemuSemaphore *sem)
+{
+ assert(sem->initialized);
+ if (WaitForSingleObject(sem->sema, INFINITE) != WAIT_OBJECT_0) {
+ error_exit(GetLastError(), __func__);
+ }
+}
+
+/* Wrap a Win32 manual-reset event with a fast userspace path. The idea
+ * is to reset the Win32 event lazily, as part of a test-reset-test-wait
+ * sequence. Such a sequence is, indeed, how QemuEvents are used by
+ * RCU and other subsystems!
+ *
+ * Valid transitions:
+ * - free->set, when setting the event
+ * - busy->set, when setting the event, followed by SetEvent
+ * - set->free, when resetting the event
+ * - free->busy, when waiting
+ *
+ * set->busy does not happen (it can be observed from the outside but
+ * it really is set->free->busy).
+ *
+ * busy->free provably cannot happen; to enforce it, the set->free transition
+ * is done with an OR, which becomes a no-op if the event has concurrently
+ * transitioned to free or busy (and is faster than cmpxchg).
+ */
+
+#define EV_SET 0
+#define EV_FREE 1
+#define EV_BUSY -1
+
+void qemu_event_init(QemuEvent *ev, bool init)
+{
+ /* Manual reset. */
+ ev->event = CreateEvent(NULL, TRUE, TRUE, NULL);
+ ev->value = (init ? EV_SET : EV_FREE);
+ ev->initialized = true;
+}
+
+void qemu_event_destroy(QemuEvent *ev)
+{
+ assert(ev->initialized);
+ ev->initialized = false;
+ CloseHandle(ev->event);
+}
+
+void qemu_event_set(QemuEvent *ev)
+{
+ assert(ev->initialized);
+ /* qemu_event_set has release semantics, but because it *loads*
+ * ev->value we need a full memory barrier here.
+ */
+ smp_mb();
+ if (qatomic_read(&ev->value) != EV_SET) {
+ if (qatomic_xchg(&ev->value, EV_SET) == EV_BUSY) {
+ /* There were waiters, wake them up. */
+ SetEvent(ev->event);
+ }
+ }
+}
+
+void qemu_event_reset(QemuEvent *ev)
+{
+ unsigned value;
+
+ assert(ev->initialized);
+ value = qatomic_read(&ev->value);
+ smp_mb_acquire();
+ if (value == EV_SET) {
+ /* If there was a concurrent reset (or even reset+wait),
+ * do nothing. Otherwise change EV_SET->EV_FREE.
+ */
+ qatomic_or(&ev->value, EV_FREE);
+ }
+}
+
+void qemu_event_wait(QemuEvent *ev)
+{
+ unsigned value;
+
+ assert(ev->initialized);
+ value = qatomic_read(&ev->value);
+ smp_mb_acquire();
+ if (value != EV_SET) {
+ if (value == EV_FREE) {
+ /* qemu_event_set is not yet going to call SetEvent, but we are
+ * going to do another check for EV_SET below when setting EV_BUSY.
+ * At that point it is safe to call WaitForSingleObject.
+ */
+ ResetEvent(ev->event);
+
+ /* Tell qemu_event_set that there are waiters. No need to retry
+ * because there cannot be a concurrent busy->free transition.
+ * After the CAS, the event will be either set or busy.
+ */
+ if (qatomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) {
+ value = EV_SET;
+ } else {
+ value = EV_BUSY;
+ }
+ }
+ if (value == EV_BUSY) {
+ WaitForSingleObject(ev->event, INFINITE);
+ }
+ }
+}
+
+struct QemuThreadData {
+ /* Passed to win32_start_routine. */
+ void *(*start_routine)(void *);
+ void *arg;
+ short mode;
+ NotifierList exit;
+
+ /* Only used for joinable threads. */
+ bool exited;
+ void *ret;
+ CRITICAL_SECTION cs;
+};
+
+static bool atexit_registered;
+static NotifierList main_thread_exit;
+
+static __thread QemuThreadData *qemu_thread_data;
+
+static void run_main_thread_exit(void)
+{
+ notifier_list_notify(&main_thread_exit, NULL);
+}
+
+void qemu_thread_atexit_add(Notifier *notifier)
+{
+ if (!qemu_thread_data) {
+ if (!atexit_registered) {
+ atexit_registered = true;
+ atexit(run_main_thread_exit);
+ }
+ notifier_list_add(&main_thread_exit, notifier);
+ } else {
+ notifier_list_add(&qemu_thread_data->exit, notifier);
+ }
+}
+
+void qemu_thread_atexit_remove(Notifier *notifier)
+{
+ notifier_remove(notifier);
+}
+
+static unsigned __stdcall win32_start_routine(void *arg)
+{
+ QemuThreadData *data = (QemuThreadData *) arg;
+ void *(*start_routine)(void *) = data->start_routine;
+ void *thread_arg = data->arg;
+
+ qemu_thread_data = data;
+ qemu_thread_exit(start_routine(thread_arg));
+ abort();
+}
+
+void qemu_thread_exit(void *arg)
+{
+ QemuThreadData *data = qemu_thread_data;
+
+ notifier_list_notify(&data->exit, NULL);
+ if (data->mode == QEMU_THREAD_JOINABLE) {
+ data->ret = arg;
+ EnterCriticalSection(&data->cs);
+ data->exited = true;
+ LeaveCriticalSection(&data->cs);
+ } else {
+ g_free(data);
+ }
+ _endthreadex(0);
+}
+
+void *qemu_thread_join(QemuThread *thread)
+{
+ QemuThreadData *data;
+ void *ret;
+ HANDLE handle;
+
+ data = thread->data;
+ if (data->mode == QEMU_THREAD_DETACHED) {
+ return NULL;
+ }
+
+ /*
+ * Because multiple copies of the QemuThread can exist via
+ * qemu_thread_get_self, we need to store a value that cannot
+ * leak there. The simplest, non racy way is to store the TID,
+ * discard the handle that _beginthreadex gives back, and
+ * get another copy of the handle here.
+ */
+ handle = qemu_thread_get_handle(thread);
+ if (handle) {
+ WaitForSingleObject(handle, INFINITE);
+ CloseHandle(handle);
+ }
+ ret = data->ret;
+ DeleteCriticalSection(&data->cs);
+ g_free(data);
+ return ret;
+}
+
+void qemu_thread_create(QemuThread *thread, const char *name,
+ void *(*start_routine)(void *),
+ void *arg, int mode)
+{
+ HANDLE hThread;
+ struct QemuThreadData *data;
+
+ data = g_malloc(sizeof *data);
+ data->start_routine = start_routine;
+ data->arg = arg;
+ data->mode = mode;
+ data->exited = false;
+ notifier_list_init(&data->exit);
+
+ if (data->mode != QEMU_THREAD_DETACHED) {
+ InitializeCriticalSection(&data->cs);
+ }
+
+ hThread = (HANDLE) _beginthreadex(NULL, 0, win32_start_routine,
+ data, 0, &thread->tid);
+ if (!hThread) {
+ error_exit(GetLastError(), __func__);
+ }
+ CloseHandle(hThread);
+ thread->data = data;
+}
+
+void qemu_thread_get_self(QemuThread *thread)
+{
+ thread->data = qemu_thread_data;
+ thread->tid = GetCurrentThreadId();
+}
+
+HANDLE qemu_thread_get_handle(QemuThread *thread)
+{
+ QemuThreadData *data;
+ HANDLE handle;
+
+ data = thread->data;
+ if (data->mode == QEMU_THREAD_DETACHED) {
+ return NULL;
+ }
+
+ EnterCriticalSection(&data->cs);
+ if (!data->exited) {
+ handle = OpenThread(SYNCHRONIZE | THREAD_SUSPEND_RESUME |
+ THREAD_SET_CONTEXT, FALSE, thread->tid);
+ } else {
+ handle = NULL;
+ }
+ LeaveCriticalSection(&data->cs);
+ return handle;
+}
+
+bool qemu_thread_is_self(QemuThread *thread)
+{
+ return GetCurrentThreadId() == thread->tid;
+}
diff --git a/util/qemu-timer-common.c b/util/qemu-timer-common.c
new file mode 100644
index 000000000..cc1326f72
--- /dev/null
+++ b/util/qemu-timer-common.c
@@ -0,0 +1,63 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qemu/timer.h"
+
+/***********************************************************/
+/* real time host monotonic timer */
+
+int64_t clock_start;
+
+#ifdef _WIN32
+
+int64_t clock_freq;
+
+static void __attribute__((constructor)) init_get_clock(void)
+{
+ LARGE_INTEGER freq;
+ int ret;
+ ret = QueryPerformanceFrequency(&freq);
+ if (ret == 0) {
+ fprintf(stderr, "Could not calibrate ticks\n");
+ exit(1);
+ }
+ clock_freq = freq.QuadPart;
+ clock_start = get_clock();
+}
+
+#else
+
+int use_rt_clock;
+
+static void __attribute__((constructor)) init_get_clock(void)
+{
+ struct timespec ts;
+
+ use_rt_clock = 0;
+ if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
+ use_rt_clock = 1;
+ }
+ clock_start = get_clock();
+}
+#endif
diff --git a/util/qemu-timer.c b/util/qemu-timer.c
new file mode 100644
index 000000000..f36c75e59
--- /dev/null
+++ b/util/qemu-timer.c
@@ -0,0 +1,674 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "qemu/timer.h"
+#include "qemu/lockable.h"
+#include "sysemu/cpu-timers.h"
+#include "sysemu/replay.h"
+#include "sysemu/cpus.h"
+
+#ifdef CONFIG_POSIX
+#include <pthread.h>
+#endif
+
+#ifdef CONFIG_PPOLL
+#include <poll.h>
+#endif
+
+#ifdef CONFIG_PRCTL_PR_SET_TIMERSLACK
+#include <sys/prctl.h>
+#endif
+
+/***********************************************************/
+/* timers */
+
+typedef struct QEMUClock {
+ /* We rely on BQL to protect the timerlists */
+ QLIST_HEAD(, QEMUTimerList) timerlists;
+
+ QEMUClockType type;
+ bool enabled;
+} QEMUClock;
+
+QEMUTimerListGroup main_loop_tlg;
+static QEMUClock qemu_clocks[QEMU_CLOCK_MAX];
+
+/* A QEMUTimerList is a list of timers attached to a clock. More
+ * than one QEMUTimerList can be attached to each clock, for instance
+ * used by different AioContexts / threads. Each clock also has
+ * a list of the QEMUTimerLists associated with it, in order that
+ * reenabling the clock can call all the notifiers.
+ */
+
+struct QEMUTimerList {
+ QEMUClock *clock;
+ QemuMutex active_timers_lock;
+ QEMUTimer *active_timers;
+ QLIST_ENTRY(QEMUTimerList) list;
+ QEMUTimerListNotifyCB *notify_cb;
+ void *notify_opaque;
+
+    /* lightweight way to mark the end of this timerlist's run */
+ QemuEvent timers_done_ev;
+};
+
+/**
+ * qemu_clock_ptr:
+ * @type: type of clock
+ *
+ * Translate a clock type into a pointer to QEMUClock object.
+ *
+ * Returns: a pointer to the QEMUClock object
+ */
+static inline QEMUClock *qemu_clock_ptr(QEMUClockType type)
+{
+ return &qemu_clocks[type];
+}
+
+static bool timer_expired_ns(QEMUTimer *timer_head, int64_t current_time)
+{
+ return timer_head && (timer_head->expire_time <= current_time);
+}
+
+QEMUTimerList *timerlist_new(QEMUClockType type,
+ QEMUTimerListNotifyCB *cb,
+ void *opaque)
+{
+ QEMUTimerList *timer_list;
+ QEMUClock *clock = qemu_clock_ptr(type);
+
+ timer_list = g_malloc0(sizeof(QEMUTimerList));
+ qemu_event_init(&timer_list->timers_done_ev, true);
+ timer_list->clock = clock;
+ timer_list->notify_cb = cb;
+ timer_list->notify_opaque = opaque;
+ qemu_mutex_init(&timer_list->active_timers_lock);
+ QLIST_INSERT_HEAD(&clock->timerlists, timer_list, list);
+ return timer_list;
+}
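+
+/*
+ * Example (hypothetical): a subsystem with its own event loop creates
+ * a private timer list on the realtime clock and is kicked via cb:
+ *
+ *     QEMUTimerList *tl = timerlist_new(QEMU_CLOCK_REALTIME, cb, opaque);
+ *     ...
+ *     timerlist_free(tl);
+ */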
+
+void timerlist_free(QEMUTimerList *timer_list)
+{
+ assert(!timerlist_has_timers(timer_list));
+ if (timer_list->clock) {
+ QLIST_REMOVE(timer_list, list);
+ }
+ qemu_mutex_destroy(&timer_list->active_timers_lock);
+ g_free(timer_list);
+}
+
+static void qemu_clock_init(QEMUClockType type, QEMUTimerListNotifyCB *notify_cb)
+{
+ QEMUClock *clock = qemu_clock_ptr(type);
+
+ /* Assert that the clock of type TYPE has not been initialized yet. */
+ assert(main_loop_tlg.tl[type] == NULL);
+
+ clock->type = type;
+    clock->enabled = (type != QEMU_CLOCK_VIRTUAL);
+ QLIST_INIT(&clock->timerlists);
+ main_loop_tlg.tl[type] = timerlist_new(type, notify_cb, NULL);
+}
+
+bool qemu_clock_use_for_deadline(QEMUClockType type)
+{
+ return !(icount_enabled() && (type == QEMU_CLOCK_VIRTUAL));
+}
+
+void qemu_clock_notify(QEMUClockType type)
+{
+ QEMUTimerList *timer_list;
+ QEMUClock *clock = qemu_clock_ptr(type);
+ QLIST_FOREACH(timer_list, &clock->timerlists, list) {
+ timerlist_notify(timer_list);
+ }
+}
+
+/* Disabling the clock will wait for related timerlists to stop
+ * executing qemu_run_timers. Thus, this function should not
+ * be used from the callback of a timer that is based on @clock.
+ * Doing so would cause a deadlock.
+ *
+ * Caller should hold BQL.
+ */
+void qemu_clock_enable(QEMUClockType type, bool enabled)
+{
+ QEMUClock *clock = qemu_clock_ptr(type);
+ QEMUTimerList *tl;
+ bool old = clock->enabled;
+ clock->enabled = enabled;
+ if (enabled && !old) {
+ qemu_clock_notify(type);
+ } else if (!enabled && old) {
+ QLIST_FOREACH(tl, &clock->timerlists, list) {
+ qemu_event_wait(&tl->timers_done_ev);
+ }
+ }
+}
+
+bool timerlist_has_timers(QEMUTimerList *timer_list)
+{
+ return !!qatomic_read(&timer_list->active_timers);
+}
+
+bool qemu_clock_has_timers(QEMUClockType type)
+{
+ return timerlist_has_timers(
+ main_loop_tlg.tl[type]);
+}
+
+bool timerlist_expired(QEMUTimerList *timer_list)
+{
+ int64_t expire_time;
+
+ if (!qatomic_read(&timer_list->active_timers)) {
+ return false;
+ }
+
+ WITH_QEMU_LOCK_GUARD(&timer_list->active_timers_lock) {
+ if (!timer_list->active_timers) {
+ return false;
+ }
+ expire_time = timer_list->active_timers->expire_time;
+ }
+
+ return expire_time <= qemu_clock_get_ns(timer_list->clock->type);
+}
+
+bool qemu_clock_expired(QEMUClockType type)
+{
+ return timerlist_expired(
+ main_loop_tlg.tl[type]);
+}
+
+/*
+ * Compute the deadline of a timer list: return -1 for no deadline, and
+ * do not cap the result to 2^32, as we know it is always non-negative.
+ */
+
+int64_t timerlist_deadline_ns(QEMUTimerList *timer_list)
+{
+ int64_t delta;
+ int64_t expire_time;
+
+ if (!qatomic_read(&timer_list->active_timers)) {
+ return -1;
+ }
+
+ if (!timer_list->clock->enabled) {
+ return -1;
+ }
+
+ /* The active timers list may be modified before the caller uses our
+ * return value, but ->notify_cb() is called whenever the deadline
+ * changes, so the caller will notice the change; there is no race
+ * condition.
+ */
+ WITH_QEMU_LOCK_GUARD(&timer_list->active_timers_lock) {
+ if (!timer_list->active_timers) {
+ return -1;
+ }
+ expire_time = timer_list->active_timers->expire_time;
+ }
+
+ delta = expire_time - qemu_clock_get_ns(timer_list->clock->type);
+
+ if (delta <= 0) {
+ return 0;
+ }
+
+ return delta;
+}
+
+/* Calculate the soonest deadline across all timerlists attached
+ * to the clock. This is used for the icount timeout so we
+ * ignore whether or not the clock should be used in deadline
+ * calculations.
+ */
+int64_t qemu_clock_deadline_ns_all(QEMUClockType type, int attr_mask)
+{
+ int64_t deadline = -1;
+ int64_t delta;
+ int64_t expire_time;
+ QEMUTimer *ts;
+ QEMUTimerList *timer_list;
+ QEMUClock *clock = qemu_clock_ptr(type);
+
+ if (!clock->enabled) {
+ return -1;
+ }
+
+ QLIST_FOREACH(timer_list, &clock->timerlists, list) {
+ qemu_mutex_lock(&timer_list->active_timers_lock);
+ ts = timer_list->active_timers;
+ /* Skip all external timers */
+ while (ts && (ts->attributes & ~attr_mask)) {
+ ts = ts->next;
+ }
+ if (!ts) {
+ qemu_mutex_unlock(&timer_list->active_timers_lock);
+ continue;
+ }
+ expire_time = ts->expire_time;
+ qemu_mutex_unlock(&timer_list->active_timers_lock);
+
+ delta = expire_time - qemu_clock_get_ns(type);
+ if (delta <= 0) {
+ delta = 0;
+ }
+ deadline = qemu_soonest_timeout(deadline, delta);
+ }
+ return deadline;
+}
+
+QEMUClockType timerlist_get_clock(QEMUTimerList *timer_list)
+{
+ return timer_list->clock->type;
+}
+
+QEMUTimerList *qemu_clock_get_main_loop_timerlist(QEMUClockType type)
+{
+ return main_loop_tlg.tl[type];
+}
+
+void timerlist_notify(QEMUTimerList *timer_list)
+{
+ if (timer_list->notify_cb) {
+ timer_list->notify_cb(timer_list->notify_opaque, timer_list->clock->type);
+ } else {
+ qemu_notify_event();
+ }
+}
+
+/* Convert a nanosecond timeout to milliseconds.
+ * This is used where a system does not support ppoll.
+ */
+int qemu_timeout_ns_to_ms(int64_t ns)
+{
+ int64_t ms;
+ if (ns < 0) {
+ return -1;
+ }
+
+ if (!ns) {
+ return 0;
+ }
+
+ /* Always round up, because it's better to wait too long than to wait too
+ * little and effectively busy-wait
+ */
+ ms = DIV_ROUND_UP(ns, SCALE_MS);
+
+ /* To avoid overflow problems, limit this to 2^31, i.e. approx 25 days */
+ return MIN(ms, INT32_MAX);
+}
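+
+/*
+ * For example: qemu_timeout_ns_to_ms(-1) == -1 (block forever),
+ * qemu_timeout_ns_to_ms(1) == 1 (a 1 ns timeout rounds up to a full
+ * millisecond rather than busy-waiting), and anything that converts to
+ * more than INT32_MAX ms is clamped to INT32_MAX.
+ */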
+
+
+/* qemu implementation of g_poll which uses a nanosecond timeout but is
+ * otherwise identical to g_poll
+ */
+int qemu_poll_ns(GPollFD *fds, guint nfds, int64_t timeout)
+{
+#ifdef CONFIG_PPOLL
+ if (timeout < 0) {
+ return ppoll((struct pollfd *)fds, nfds, NULL, NULL);
+ } else {
+ struct timespec ts;
+ int64_t tvsec = timeout / 1000000000LL;
+ /* Avoid possibly overflowing and specifying a negative number of
+ * seconds, which would turn a very long timeout into a busy-wait.
+ */
+ if (tvsec > (int64_t)INT32_MAX) {
+ tvsec = INT32_MAX;
+ }
+ ts.tv_sec = tvsec;
+ ts.tv_nsec = timeout % 1000000000LL;
+ return ppoll((struct pollfd *)fds, nfds, &ts, NULL);
+ }
+#else
+ return g_poll(fds, nfds, qemu_timeout_ns_to_ms(timeout));
+#endif
+}
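+
+/*
+ * As with ppoll()/g_poll(), a negative @timeout blocks indefinitely and a
+ * zero @timeout polls the fds without blocking.
+ */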
+
+
+void timer_init_full(QEMUTimer *ts,
+ QEMUTimerListGroup *timer_list_group, QEMUClockType type,
+ int scale, int attributes,
+ QEMUTimerCB *cb, void *opaque)
+{
+ if (!timer_list_group) {
+ timer_list_group = &main_loop_tlg;
+ }
+ ts->timer_list = timer_list_group->tl[type];
+ ts->cb = cb;
+ ts->opaque = opaque;
+ ts->scale = scale;
+ ts->attributes = attributes;
+ ts->expire_time = -1;
+}
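+
+/*
+ * Example (illustrative sketch; "my_timer" and "my_cb" are hypothetical):
+ * initialize a nanosecond-scale timer on the main-loop timer list, then
+ * arm it 100 ms of virtual time into the future.  my_cb() runs once the
+ * virtual clock passes the deadline:
+ *
+ *     static QEMUTimer my_timer;
+ *
+ *     static void my_cb(void *opaque)
+ *     {
+ *     }
+ *
+ *     timer_init_full(&my_timer, NULL, QEMU_CLOCK_VIRTUAL, SCALE_NS, 0,
+ *                     my_cb, NULL);
+ *     timer_mod(&my_timer,
+ *               qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 100 * SCALE_MS);
+ */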
+
+void timer_deinit(QEMUTimer *ts)
+{
+ assert(ts->expire_time == -1);
+ ts->timer_list = NULL;
+}
+
+static void timer_del_locked(QEMUTimerList *timer_list, QEMUTimer *ts)
+{
+ QEMUTimer **pt, *t;
+
+ ts->expire_time = -1;
+ pt = &timer_list->active_timers;
+ for (;;) {
+ t = *pt;
+ if (!t) {
+ break;
+ }
+ if (t == ts) {
+ qatomic_set(pt, t->next);
+ break;
+ }
+ pt = &t->next;
+ }
+}
+
+static bool timer_mod_ns_locked(QEMUTimerList *timer_list,
+ QEMUTimer *ts, int64_t expire_time)
+{
+ QEMUTimer **pt, *t;
+
+ /* add the timer in the sorted list */
+ pt = &timer_list->active_timers;
+ for (;;) {
+ t = *pt;
+ if (!timer_expired_ns(t, expire_time)) {
+ break;
+ }
+ pt = &t->next;
+ }
+ ts->expire_time = MAX(expire_time, 0);
+ ts->next = *pt;
+ qatomic_set(pt, ts);
+
+ return pt == &timer_list->active_timers;
+}
+
+static void timerlist_rearm(QEMUTimerList *timer_list)
+{
+ /* Interrupt execution to force deadline recalculation. */
+ if (icount_enabled() && timer_list->clock->type == QEMU_CLOCK_VIRTUAL) {
+ icount_start_warp_timer();
+ }
+ timerlist_notify(timer_list);
+}
+
+/* stop a timer, but do not dealloc it */
+void timer_del(QEMUTimer *ts)
+{
+ QEMUTimerList *timer_list = ts->timer_list;
+
+ if (timer_list) {
+ qemu_mutex_lock(&timer_list->active_timers_lock);
+ timer_del_locked(timer_list, ts);
+ qemu_mutex_unlock(&timer_list->active_timers_lock);
+ }
+}
+
+/* modify the current timer so that it will be fired when current_time
+ >= expire_time. The corresponding callback will be called. */
+void timer_mod_ns(QEMUTimer *ts, int64_t expire_time)
+{
+ QEMUTimerList *timer_list = ts->timer_list;
+ bool rearm;
+
+ qemu_mutex_lock(&timer_list->active_timers_lock);
+ timer_del_locked(timer_list, ts);
+ rearm = timer_mod_ns_locked(timer_list, ts, expire_time);
+ qemu_mutex_unlock(&timer_list->active_timers_lock);
+
+ if (rearm) {
+ timerlist_rearm(timer_list);
+ }
+}
+
+/* modify the current timer so that it will be fired when current_time
+ >= expire_time or the current deadline, whichever comes earlier.
+ The corresponding callback will be called. */
+void timer_mod_anticipate_ns(QEMUTimer *ts, int64_t expire_time)
+{
+ QEMUTimerList *timer_list = ts->timer_list;
+ bool rearm;
+
+ WITH_QEMU_LOCK_GUARD(&timer_list->active_timers_lock) {
+ if (ts->expire_time == -1 || ts->expire_time > expire_time) {
+ if (ts->expire_time != -1) {
+ timer_del_locked(timer_list, ts);
+ }
+ rearm = timer_mod_ns_locked(timer_list, ts, expire_time);
+ } else {
+ rearm = false;
+ }
+ }
+ if (rearm) {
+ timerlist_rearm(timer_list);
+ }
+}
+
+void timer_mod(QEMUTimer *ts, int64_t expire_time)
+{
+ timer_mod_ns(ts, expire_time * ts->scale);
+}
+
+void timer_mod_anticipate(QEMUTimer *ts, int64_t expire_time)
+{
+ timer_mod_anticipate_ns(ts, expire_time * ts->scale);
+}
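+
+/*
+ * For example, with a timer pending at t=500 (in its scale units),
+ * timer_mod(ts, 800) moves the deadline out to 800, whereas
+ * timer_mod_anticipate(ts, 800) leaves it at 500: anticipate can only
+ * move a deadline earlier, never later.
+ */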
+
+bool timer_pending(QEMUTimer *ts)
+{
+ return ts->expire_time >= 0;
+}
+
+bool timer_expired(QEMUTimer *timer_head, int64_t current_time)
+{
+ return timer_expired_ns(timer_head, current_time * timer_head->scale);
+}
+
+bool timerlist_run_timers(QEMUTimerList *timer_list)
+{
+ QEMUTimer *ts;
+ int64_t current_time;
+ bool progress = false;
+ QEMUTimerCB *cb;
+ void *opaque;
+
+ if (!qatomic_read(&timer_list->active_timers)) {
+ return false;
+ }
+
+ qemu_event_reset(&timer_list->timers_done_ev);
+ if (!timer_list->clock->enabled) {
+ goto out;
+ }
+
+ switch (timer_list->clock->type) {
+ case QEMU_CLOCK_REALTIME:
+ break;
+ default:
+ case QEMU_CLOCK_VIRTUAL:
+ break;
+ case QEMU_CLOCK_HOST:
+ if (!replay_checkpoint(CHECKPOINT_CLOCK_HOST)) {
+ goto out;
+ }
+ break;
+ case QEMU_CLOCK_VIRTUAL_RT:
+ if (!replay_checkpoint(CHECKPOINT_CLOCK_VIRTUAL_RT)) {
+ goto out;
+ }
+ break;
+ }
+
+ /*
+ * Extract expired timers from the active timers list and process them.
+ *
+ * In rr mode we need "filtered" checkpointing for the virtual clock: the
+ * checkpoint must be recorded/replayed before processing any non-EXTERNAL
+ * timer, and only once, since the clock value stays the same. Because
+ * non-EXTERNAL timers may appear in the list while it is being processed,
+ * the checkpoint cannot be issued up front; it is issued from within the
+ * loop, just before the first non-EXTERNAL timer runs.
+ */
+ current_time = qemu_clock_get_ns(timer_list->clock->type);
+ qemu_mutex_lock(&timer_list->active_timers_lock);
+ while ((ts = timer_list->active_timers)) {
+ if (!timer_expired_ns(ts, current_time)) {
+ /* No expired timers left. The checkpoint can be skipped
+ * if no timers fired or they were all external.
+ */
+ break;
+ }
+ /* A checkpoint for the virtual clock is needed only when a non-EXTERNAL
+ * timer runs; if only EXTERNAL timers fire, the checkpoint is redundant,
+ * because those timers do not change guest state directly.
+ */
+ if (replay_mode != REPLAY_MODE_NONE
+ && timer_list->clock->type == QEMU_CLOCK_VIRTUAL
+ && !(ts->attributes & QEMU_TIMER_ATTR_EXTERNAL)
+ && !replay_checkpoint(CHECKPOINT_CLOCK_VIRTUAL)) {
+ qemu_mutex_unlock(&timer_list->active_timers_lock);
+ goto out;
+ }
+
+ /* remove timer from the list before calling the callback */
+ timer_list->active_timers = ts->next;
+ ts->next = NULL;
+ ts->expire_time = -1;
+ cb = ts->cb;
+ opaque = ts->opaque;
+
+ /* run the callback (the timer list can be modified) */
+ qemu_mutex_unlock(&timer_list->active_timers_lock);
+ cb(opaque);
+ qemu_mutex_lock(&timer_list->active_timers_lock);
+
+ progress = true;
+ }
+ qemu_mutex_unlock(&timer_list->active_timers_lock);
+
+out:
+ qemu_event_set(&timer_list->timers_done_ev);
+ return progress;
+}
+
+bool qemu_clock_run_timers(QEMUClockType type)
+{
+ return timerlist_run_timers(main_loop_tlg.tl[type]);
+}
+
+void timerlistgroup_init(QEMUTimerListGroup *tlg,
+ QEMUTimerListNotifyCB *cb, void *opaque)
+{
+ QEMUClockType type;
+ for (type = 0; type < QEMU_CLOCK_MAX; type++) {
+ tlg->tl[type] = timerlist_new(type, cb, opaque);
+ }
+}
+
+void timerlistgroup_deinit(QEMUTimerListGroup *tlg)
+{
+ QEMUClockType type;
+ for (type = 0; type < QEMU_CLOCK_MAX; type++) {
+ timerlist_free(tlg->tl[type]);
+ }
+}
+
+bool timerlistgroup_run_timers(QEMUTimerListGroup *tlg)
+{
+ QEMUClockType type;
+ bool progress = false;
+ for (type = 0; type < QEMU_CLOCK_MAX; type++) {
+ progress |= timerlist_run_timers(tlg->tl[type]);
+ }
+ return progress;
+}
+
+int64_t timerlistgroup_deadline_ns(QEMUTimerListGroup *tlg)
+{
+ int64_t deadline = -1;
+ QEMUClockType type;
+ for (type = 0; type < QEMU_CLOCK_MAX; type++) {
+ if (qemu_clock_use_for_deadline(type)) {
+ deadline = qemu_soonest_timeout(deadline,
+ timerlist_deadline_ns(tlg->tl[type]));
+ }
+ }
+ return deadline;
+}
+
+int64_t qemu_clock_get_ns(QEMUClockType type)
+{
+ switch (type) {
+ case QEMU_CLOCK_REALTIME:
+ return get_clock();
+ default:
+ case QEMU_CLOCK_VIRTUAL:
+ return cpus_get_virtual_clock();
+ case QEMU_CLOCK_HOST:
+ return REPLAY_CLOCK(REPLAY_CLOCK_HOST, get_clock_realtime());
+ case QEMU_CLOCK_VIRTUAL_RT:
+ return REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT, cpu_get_clock());
+ }
+}
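+
+/*
+ * Note the naming: QEMU_CLOCK_REALTIME is backed by get_clock(), the
+ * host's monotonic clock, whereas QEMU_CLOCK_HOST follows the host's wall
+ * clock and can therefore jump backwards or forwards when the host time
+ * is changed (e.g. by NTP).
+ */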
+
+void init_clocks(QEMUTimerListNotifyCB *notify_cb)
+{
+ QEMUClockType type;
+ for (type = 0; type < QEMU_CLOCK_MAX; type++) {
+ qemu_clock_init(type, notify_cb);
+ }
+
+#ifdef CONFIG_PRCTL_PR_SET_TIMERSLACK
+ prctl(PR_SET_TIMERSLACK, 1, 0, 0, 0);
+#endif
+}
+
+uint64_t timer_expire_time_ns(QEMUTimer *ts)
+{
+ return timer_pending(ts) ? ts->expire_time : -1;
+}
+
+bool qemu_clock_run_all_timers(void)
+{
+ bool progress = false;
+ QEMUClockType type;
+
+ for (type = 0; type < QEMU_CLOCK_MAX; type++) {
+ if (qemu_clock_use_for_deadline(type)) {
+ progress |= qemu_clock_run_timers(type);
+ }
+ }
+
+ return progress;
+}
diff --git a/util/qht.c b/util/qht.c
new file mode 100644
index 000000000..079605121
--- /dev/null
+++ b/util/qht.c
@@ -0,0 +1,963 @@
+/*
+ * qht.c - QEMU Hash Table, designed to scale for read-mostly workloads.
+ *
+ * Copyright (C) 2016, Emilio G. Cota <cota@braap.org>
+ *
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ * Assumptions:
+ * - NULL cannot be inserted/removed as a pointer value.
+ * - Trying to insert an already-existing hash-pointer pair is OK. However,
+ * it is not OK to insert into the same hash table different hash-pointer
+ *   pairs that share the same pointer value but have different hashes.
+ * - Lookups are performed under an RCU read-critical section; removals
+ * must wait for a grace period to elapse before freeing removed objects.
+ *
+ * Features:
+ * - Reads (i.e. lookups and iterators) can be concurrent with other reads.
+ * Lookups that are concurrent with writes to the same bucket will retry
+ * via a seqlock; iterators acquire all bucket locks and therefore can be
+ * concurrent with lookups and are serialized wrt writers.
+ * - Writes (i.e. insertions/removals) can be concurrent with writes to
+ * different buckets; writes to the same bucket are serialized through a lock.
+ * - Optional auto-resizing: the hash table resizes up if the load surpasses
+ * a certain threshold. Resizing is done concurrently with readers; writes
+ * are serialized with the resize operation.
+ *
+ * The key structure is the bucket, which is cacheline-sized. Buckets
+ * contain a few hash values and pointers; the u32 hash values are stored in
+ * full so that resizing is fast. Having this structure instead of directly
+ * chaining items has two advantages:
+ * - Failed lookups fail fast, and touch a minimum number of cache lines.
+ * - Resizing the hash table with concurrent lookups is easy.
+ *
+ * There are two types of buckets:
+ * 1. "head" buckets are the ones allocated in the array of buckets in qht_map.
+ * 2. all "non-head" buckets (i.e. all others) are members of a chain that
+ * starts from a head bucket.
+ * Note that the seqlock and spinlock of a head bucket applies to all buckets
+ * chained to it; these two fields are unused in non-head buckets.
+ *
+ * On removals, we move the last valid item in the chain to the position of the
+ * just-removed entry. This makes lookups slightly faster, since the moment an
+ * invalid entry is found, the (failed) lookup is over.
+ *
+ * Resizing is done by taking all bucket spinlocks (so that no other writers can
+ * race with us) and then copying all entries into a new hash map. Then, the
+ * ht->map pointer is set, and the old map is freed once no RCU readers can see
+ * it anymore.
+ *
+ * Writers check for concurrent resizes by comparing ht->map before and after
+ * acquiring their bucket lock. If they don't match, a resize has occurred
+ * while the bucket spinlock was being acquired.
+ *
+ * Related Work:
+ * - Idea of cacheline-sized buckets with full hashes taken from:
+ * David, Guerraoui & Trigonakis, "Asynchronized Concurrency:
+ * The Secret to Scaling Concurrent Search Data Structures", ASPLOS'15.
+ * - Why not RCU-based hash tables? They would allow us to get rid of the
+ * seqlock, but resizing would take forever since RCU read critical
+ * sections in QEMU take quite a long time.
+ * More info on relativistic hash tables:
+ * + Triplett, McKenney & Walpole, "Resizable, Scalable, Concurrent Hash
+ * Tables via Relativistic Programming", USENIX ATC'11.
+ * + Corbet, "Relativistic hash tables, part 1: Algorithms", @ lwn.net, 2014.
+ * https://lwn.net/Articles/612021/
+ */
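+
+/*
+ * Example (illustrative sketch; MyEntry, my_cmp and the choice of
+ * qemu_xxhash2() as the hash are hypothetical -- any stable hash works,
+ * as long as insertion, lookup and removal all use the same one).
+ * Lookups must run inside an RCU read-side critical section, per the
+ * rules above:
+ *
+ *     typedef struct { uint64_t key; void *val; } MyEntry;
+ *
+ *     static bool my_cmp(const void *a, const void *b)
+ *     {
+ *         return ((const MyEntry *)a)->key == ((const MyEntry *)b)->key;
+ *     }
+ *
+ *     struct qht ht;
+ *     qht_init(&ht, my_cmp, 128, QHT_MODE_AUTO_RESIZE);
+ *
+ *     MyEntry *e = g_new(MyEntry, 1);
+ *     e->key = 42;
+ *     qht_insert(&ht, e, qemu_xxhash2(e->key), NULL);
+ *
+ *     MyEntry needle = { .key = 42 };
+ *     rcu_read_lock();
+ *     MyEntry *found = qht_lookup(&ht, &needle, qemu_xxhash2(42));
+ *     rcu_read_unlock();
+ */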
+#include "qemu/osdep.h"
+#include "qemu/qht.h"
+#include "qemu/atomic.h"
+#include "qemu/rcu.h"
+
+//#define QHT_DEBUG
+
+/*
+ * We want to avoid false sharing of cache lines. Most systems have 64-byte
+ * cache lines so we go with it for simplicity.
+ *
+ * Note that systems with smaller cache lines will be fine (the struct is
+ * almost 64 bytes); systems with larger cache lines might suffer from
+ * some false sharing.
+ */
+#define QHT_BUCKET_ALIGN 64
+
+/* define these to keep sizeof(qht_bucket) within QHT_BUCKET_ALIGN */
+#if HOST_LONG_BITS == 32
+#define QHT_BUCKET_ENTRIES 6
+#else /* 64-bit */
+#define QHT_BUCKET_ENTRIES 4
+#endif
+
+enum qht_iter_type {
+ QHT_ITER_VOID, /* do nothing; use retvoid */
+ QHT_ITER_RM, /* remove element if retbool returns true */
+};
+
+struct qht_iter {
+ union {
+ qht_iter_func_t retvoid;
+ qht_iter_bool_func_t retbool;
+ } f;
+ enum qht_iter_type type;
+};
+
+/*
+ * Do _not_ use qemu_mutex_[try]lock directly! Use these macros, otherwise
+ * the profiler (QSP) will deadlock.
+ */
+static inline void qht_lock(struct qht *ht)
+{
+ if (ht->mode & QHT_MODE_RAW_MUTEXES) {
+ qemu_mutex_lock__raw(&ht->lock);
+ } else {
+ qemu_mutex_lock(&ht->lock);
+ }
+}
+
+static inline int qht_trylock(struct qht *ht)
+{
+ if (ht->mode & QHT_MODE_RAW_MUTEXES) {
+ return qemu_mutex_trylock__raw(&ht->lock);
+ }
+ return qemu_mutex_trylock(&ht->lock);
+}
+
+/* this inline is not really necessary, but it helps keep code consistent */
+static inline void qht_unlock(struct qht *ht)
+{
+ qemu_mutex_unlock(&ht->lock);
+}
+
+/*
+ * Note: reading partially-updated pointers in @pointers could lead to
+ * segfaults. We thus access them with qatomic_read/set; this guarantees
+ * that the compiler makes all those accesses atomic. We also need the
+ * volatile-like behavior in qatomic_read, since otherwise the compiler
+ * might refetch the pointer.
+ * qatomic_read's are of course not necessary when the bucket lock is held.
+ *
+ * If both ht->lock and b->lock are grabbed, ht->lock should always
+ * be grabbed first.
+ */
+struct qht_bucket {
+ QemuSpin lock;
+ QemuSeqLock sequence;
+ uint32_t hashes[QHT_BUCKET_ENTRIES];
+ void *pointers[QHT_BUCKET_ENTRIES];
+ struct qht_bucket *next;
+} QEMU_ALIGNED(QHT_BUCKET_ALIGN);
+
+QEMU_BUILD_BUG_ON(sizeof(struct qht_bucket) > QHT_BUCKET_ALIGN);
+
+/**
+ * struct qht_map - structure to track an array of buckets
+ * @rcu: used by RCU. Keep it as the top field in the struct to help valgrind
+ * find the whole struct.
+ * @buckets: array of head buckets. It is constant once the map is created.
+ * @n_buckets: number of head buckets. It is constant once the map is created.
+ * @n_added_buckets: number of added (i.e. "non-head") buckets
+ * @n_added_buckets_threshold: threshold to trigger an upward resize once the
+ * number of added buckets surpasses it.
+ *
+ * Buckets are tracked in what we call a "map", i.e. this structure.
+ */
+struct qht_map {
+ struct rcu_head rcu;
+ struct qht_bucket *buckets;
+ size_t n_buckets;
+ size_t n_added_buckets;
+ size_t n_added_buckets_threshold;
+};
+
+/* trigger a resize when n_added_buckets > n_buckets / div */
+#define QHT_NR_ADDED_BUCKETS_THRESHOLD_DIV 8
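+/* e.g. with 1024 head buckets, an upward resize triggers once more than
+ * 1024 / 8 = 128 non-head buckets have been added */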
+
+static void qht_do_resize_reset(struct qht *ht, struct qht_map *new,
+ bool reset);
+static void qht_grow_maybe(struct qht *ht);
+
+#ifdef QHT_DEBUG
+
+#define qht_debug_assert(X) do { assert(X); } while (0)
+
+static void qht_bucket_debug__locked(struct qht_bucket *b)
+{
+ bool seen_empty = false;
+ bool corrupt = false;
+ int i;
+
+ do {
+ for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+ if (b->pointers[i] == NULL) {
+ seen_empty = true;
+ continue;
+ }
+ if (seen_empty) {
+ fprintf(stderr, "%s: b: %p, pos: %i, hash: 0x%x, p: %p\n",
+ __func__, b, i, b->hashes[i], b->pointers[i]);
+ corrupt = true;
+ }
+ }
+ b = b->next;
+ } while (b);
+ qht_debug_assert(!corrupt);
+}
+
+static void qht_map_debug__all_locked(struct qht_map *map)
+{
+ int i;
+
+ for (i = 0; i < map->n_buckets; i++) {
+ qht_bucket_debug__locked(&map->buckets[i]);
+ }
+}
+#else
+
+#define qht_debug_assert(X) do { (void)(X); } while (0)
+
+static inline void qht_bucket_debug__locked(struct qht_bucket *b)
+{ }
+
+static inline void qht_map_debug__all_locked(struct qht_map *map)
+{ }
+#endif /* QHT_DEBUG */
+
+static inline size_t qht_elems_to_buckets(size_t n_elems)
+{
+ return pow2ceil(n_elems / QHT_BUCKET_ENTRIES);
+}
+
+static inline void qht_head_init(struct qht_bucket *b)
+{
+ memset(b, 0, sizeof(*b));
+ qemu_spin_init(&b->lock);
+ seqlock_init(&b->sequence);
+}
+
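+/* n_buckets is always a power of two (see qht_elems_to_buckets), so the
+ * mask below is equivalent to hash % n_buckets */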
+static inline
+struct qht_bucket *qht_map_to_bucket(const struct qht_map *map, uint32_t hash)
+{
+ return &map->buckets[hash & (map->n_buckets - 1)];
+}
+
+/* acquire all bucket locks from a map */
+static void qht_map_lock_buckets(struct qht_map *map)
+{
+ size_t i;
+
+ for (i = 0; i < map->n_buckets; i++) {
+ struct qht_bucket *b = &map->buckets[i];
+
+ qemu_spin_lock(&b->lock);
+ }
+}
+
+static void qht_map_unlock_buckets(struct qht_map *map)
+{
+ size_t i;
+
+ for (i = 0; i < map->n_buckets; i++) {
+ struct qht_bucket *b = &map->buckets[i];
+
+ qemu_spin_unlock(&b->lock);
+ }
+}
+
+/*
+ * Call with at least a bucket lock held.
+ * @map should be the value read before acquiring the lock (or locks).
+ */
+static inline bool qht_map_is_stale__locked(const struct qht *ht,
+ const struct qht_map *map)
+{
+ return map != ht->map;
+}
+
+/*
+ * Grab all bucket locks, and set @pmap after making sure the map isn't stale.
+ *
+ * Pairs with qht_map_unlock_buckets(), hence the pass-by-reference.
+ *
+ * Note: callers cannot have ht->lock held.
+ */
+static inline
+void qht_map_lock_buckets__no_stale(struct qht *ht, struct qht_map **pmap)
+{
+ struct qht_map *map;
+
+ map = qatomic_rcu_read(&ht->map);
+ qht_map_lock_buckets(map);
+ if (likely(!qht_map_is_stale__locked(ht, map))) {
+ *pmap = map;
+ return;
+ }
+ qht_map_unlock_buckets(map);
+
+ /* we raced with a resize; acquire ht->lock to see the updated ht->map */
+ qht_lock(ht);
+ map = ht->map;
+ qht_map_lock_buckets(map);
+ qht_unlock(ht);
+ *pmap = map;
+ return;
+}
+
+/*
+ * Get a head bucket and lock it, making sure its parent map is not stale.
+ * @pmap is filled with a pointer to the bucket's parent map.
+ *
+ * Unlock with qemu_spin_unlock(&b->lock).
+ *
+ * Note: callers cannot have ht->lock held.
+ */
+static inline
+struct qht_bucket *qht_bucket_lock__no_stale(struct qht *ht, uint32_t hash,
+ struct qht_map **pmap)
+{
+ struct qht_bucket *b;
+ struct qht_map *map;
+
+ map = qatomic_rcu_read(&ht->map);
+ b = qht_map_to_bucket(map, hash);
+
+ qemu_spin_lock(&b->lock);
+ if (likely(!qht_map_is_stale__locked(ht, map))) {
+ *pmap = map;
+ return b;
+ }
+ qemu_spin_unlock(&b->lock);
+
+ /* we raced with a resize; acquire ht->lock to see the updated ht->map */
+ qht_lock(ht);
+ map = ht->map;
+ b = qht_map_to_bucket(map, hash);
+ qemu_spin_lock(&b->lock);
+ qht_unlock(ht);
+ *pmap = map;
+ return b;
+}
+
+static inline bool qht_map_needs_resize(const struct qht_map *map)
+{
+ return qatomic_read(&map->n_added_buckets) >
+ map->n_added_buckets_threshold;
+}
+
+static inline void qht_chain_destroy(const struct qht_bucket *head)
+{
+ struct qht_bucket *curr = head->next;
+ struct qht_bucket *prev;
+
+ qemu_spin_destroy(&head->lock);
+ while (curr) {
+ prev = curr;
+ curr = curr->next;
+ qemu_vfree(prev);
+ }
+}
+
+/* pass only an orphan map */
+static void qht_map_destroy(struct qht_map *map)
+{
+ size_t i;
+
+ for (i = 0; i < map->n_buckets; i++) {
+ qht_chain_destroy(&map->buckets[i]);
+ }
+ qemu_vfree(map->buckets);
+ g_free(map);
+}
+
+static struct qht_map *qht_map_create(size_t n_buckets)
+{
+ struct qht_map *map;
+ size_t i;
+
+ map = g_malloc(sizeof(*map));
+ map->n_buckets = n_buckets;
+
+ map->n_added_buckets = 0;
+ map->n_added_buckets_threshold = n_buckets /
+ QHT_NR_ADDED_BUCKETS_THRESHOLD_DIV;
+
+ /* let tiny hash tables add at least one non-head bucket */
+ if (unlikely(map->n_added_buckets_threshold == 0)) {
+ map->n_added_buckets_threshold = 1;
+ }
+
+ map->buckets = qemu_memalign(QHT_BUCKET_ALIGN,
+ sizeof(*map->buckets) * n_buckets);
+ for (i = 0; i < n_buckets; i++) {
+ qht_head_init(&map->buckets[i]);
+ }
+ return map;
+}
+
+void qht_init(struct qht *ht, qht_cmp_func_t cmp, size_t n_elems,
+ unsigned int mode)
+{
+ struct qht_map *map;
+ size_t n_buckets = qht_elems_to_buckets(n_elems);
+
+ g_assert(cmp);
+ ht->cmp = cmp;
+ ht->mode = mode;
+ qemu_mutex_init(&ht->lock);
+ map = qht_map_create(n_buckets);
+ qatomic_rcu_set(&ht->map, map);
+}
+
+/* call only when there are no readers/writers left */
+void qht_destroy(struct qht *ht)
+{
+ qht_map_destroy(ht->map);
+ memset(ht, 0, sizeof(*ht));
+}
+
+static void qht_bucket_reset__locked(struct qht_bucket *head)
+{
+ struct qht_bucket *b = head;
+ int i;
+
+ seqlock_write_begin(&head->sequence);
+ do {
+ for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+ if (b->pointers[i] == NULL) {
+ goto done;
+ }
+ qatomic_set(&b->hashes[i], 0);
+ qatomic_set(&b->pointers[i], NULL);
+ }
+ b = b->next;
+ } while (b);
+ done:
+ seqlock_write_end(&head->sequence);
+}
+
+/* call with all bucket locks held */
+static void qht_map_reset__all_locked(struct qht_map *map)
+{
+ size_t i;
+
+ for (i = 0; i < map->n_buckets; i++) {
+ qht_bucket_reset__locked(&map->buckets[i]);
+ }
+ qht_map_debug__all_locked(map);
+}
+
+void qht_reset(struct qht *ht)
+{
+ struct qht_map *map;
+
+ qht_map_lock_buckets__no_stale(ht, &map);
+ qht_map_reset__all_locked(map);
+ qht_map_unlock_buckets(map);
+}
+
+static inline void qht_do_resize(struct qht *ht, struct qht_map *new)
+{
+ qht_do_resize_reset(ht, new, false);
+}
+
+static inline void qht_do_resize_and_reset(struct qht *ht, struct qht_map *new)
+{
+ qht_do_resize_reset(ht, new, true);
+}
+
+bool qht_reset_size(struct qht *ht, size_t n_elems)
+{
+ struct qht_map *new = NULL;
+ struct qht_map *map;
+ size_t n_buckets;
+
+ n_buckets = qht_elems_to_buckets(n_elems);
+
+ qht_lock(ht);
+ map = ht->map;
+ if (n_buckets != map->n_buckets) {
+ new = qht_map_create(n_buckets);
+ }
+ qht_do_resize_and_reset(ht, new);
+ qht_unlock(ht);
+
+ return !!new;
+}
+
+static inline
+void *qht_do_lookup(const struct qht_bucket *head, qht_lookup_func_t func,
+ const void *userp, uint32_t hash)
+{
+ const struct qht_bucket *b = head;
+ int i;
+
+ do {
+ for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+ if (qatomic_read(&b->hashes[i]) == hash) {
+ /* The pointer is dereferenced before seqlock_read_retry,
+ * so (unlike qht_insert__locked) we need to use
+ * qatomic_rcu_read here.
+ */
+ void *p = qatomic_rcu_read(&b->pointers[i]);
+
+ if (likely(p) && likely(func(p, userp))) {
+ return p;
+ }
+ }
+ }
+ b = qatomic_rcu_read(&b->next);
+ } while (b);
+
+ return NULL;
+}
+
+static __attribute__((noinline))
+void *qht_lookup__slowpath(const struct qht_bucket *b, qht_lookup_func_t func,
+ const void *userp, uint32_t hash)
+{
+ unsigned int version;
+ void *ret;
+
+ do {
+ version = seqlock_read_begin(&b->sequence);
+ ret = qht_do_lookup(b, func, userp, hash);
+ } while (seqlock_read_retry(&b->sequence, version));
+ return ret;
+}
+
+void *qht_lookup_custom(const struct qht *ht, const void *userp, uint32_t hash,
+ qht_lookup_func_t func)
+{
+ const struct qht_bucket *b;
+ const struct qht_map *map;
+ unsigned int version;
+ void *ret;
+
+ map = qatomic_rcu_read(&ht->map);
+ b = qht_map_to_bucket(map, hash);
+
+ version = seqlock_read_begin(&b->sequence);
+ ret = qht_do_lookup(b, func, userp, hash);
+ if (likely(!seqlock_read_retry(&b->sequence, version))) {
+ return ret;
+ }
+ /*
+ * Removing the do/while from the fastpath gives a 4% perf. increase when
+ * running a 100%-lookup microbenchmark.
+ */
+ return qht_lookup__slowpath(b, func, userp, hash);
+}
+
+void *qht_lookup(const struct qht *ht, const void *userp, uint32_t hash)
+{
+ return qht_lookup_custom(ht, userp, hash, ht->cmp);
+}
+
+/*
+ * call with head->lock held
+ * @ht is const since it is only used for ht->cmp()
+ */
+static void *qht_insert__locked(const struct qht *ht, struct qht_map *map,
+ struct qht_bucket *head, void *p, uint32_t hash,
+ bool *needs_resize)
+{
+ struct qht_bucket *b = head;
+ struct qht_bucket *prev = NULL;
+ struct qht_bucket *new = NULL;
+ int i;
+
+ do {
+ for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+ if (b->pointers[i]) {
+ if (unlikely(b->hashes[i] == hash &&
+ ht->cmp(b->pointers[i], p))) {
+ return b->pointers[i];
+ }
+ } else {
+ goto found;
+ }
+ }
+ prev = b;
+ b = b->next;
+ } while (b);
+
+ b = qemu_memalign(QHT_BUCKET_ALIGN, sizeof(*b));
+ memset(b, 0, sizeof(*b));
+ new = b;
+ i = 0;
+ qatomic_inc(&map->n_added_buckets);
+ if (unlikely(qht_map_needs_resize(map)) && needs_resize) {
+ *needs_resize = true;
+ }
+
+ found:
+ /* found an empty slot: acquire the seqlock and write */
+ seqlock_write_begin(&head->sequence);
+ if (new) {
+ qatomic_rcu_set(&prev->next, b);
+ }
+ /* smp_wmb() implicit in seqlock_write_begin. */
+ qatomic_set(&b->hashes[i], hash);
+ qatomic_set(&b->pointers[i], p);
+ seqlock_write_end(&head->sequence);
+ return NULL;
+}
+
+static __attribute__((noinline)) void qht_grow_maybe(struct qht *ht)
+{
+ struct qht_map *map;
+
+ /*
+ * If the lock is taken it probably means there's an ongoing resize,
+ * so bail out.
+ */
+ if (qht_trylock(ht)) {
+ return;
+ }
+ map = ht->map;
+ /* another thread might have just performed the resize we were after */
+ if (qht_map_needs_resize(map)) {
+ struct qht_map *new = qht_map_create(map->n_buckets * 2);
+
+ qht_do_resize(ht, new);
+ }
+ qht_unlock(ht);
+}
+
+bool qht_insert(struct qht *ht, void *p, uint32_t hash, void **existing)
+{
+ struct qht_bucket *b;
+ struct qht_map *map;
+ bool needs_resize = false;
+ void *prev;
+
+ /* NULL pointers are not supported */
+ qht_debug_assert(p);
+
+ b = qht_bucket_lock__no_stale(ht, hash, &map);
+ prev = qht_insert__locked(ht, map, b, p, hash, &needs_resize);
+ qht_bucket_debug__locked(b);
+ qemu_spin_unlock(&b->lock);
+
+ if (unlikely(needs_resize) && ht->mode & QHT_MODE_AUTO_RESIZE) {
+ qht_grow_maybe(ht);
+ }
+ if (likely(prev == NULL)) {
+ return true;
+ }
+ if (existing) {
+ *existing = prev;
+ }
+ return false;
+}
+
+static inline bool qht_entry_is_last(const struct qht_bucket *b, int pos)
+{
+ if (pos == QHT_BUCKET_ENTRIES - 1) {
+ if (b->next == NULL) {
+ return true;
+ }
+ return b->next->pointers[0] == NULL;
+ }
+ return b->pointers[pos + 1] == NULL;
+}
+
+static void
+qht_entry_move(struct qht_bucket *to, int i, struct qht_bucket *from, int j)
+{
+ qht_debug_assert(!(to == from && i == j));
+ qht_debug_assert(to->pointers[i]);
+ qht_debug_assert(from->pointers[j]);
+
+ qatomic_set(&to->hashes[i], from->hashes[j]);
+ qatomic_set(&to->pointers[i], from->pointers[j]);
+
+ qatomic_set(&from->hashes[j], 0);
+ qatomic_set(&from->pointers[j], NULL);
+}
+
+/*
+ * Find the last valid entry in @orig, and swap it with @orig[pos], which has
+ * just been invalidated.
+ */
+static inline void qht_bucket_remove_entry(struct qht_bucket *orig, int pos)
+{
+ struct qht_bucket *b = orig;
+ struct qht_bucket *prev = NULL;
+ int i;
+
+ if (qht_entry_is_last(orig, pos)) {
+ orig->hashes[pos] = 0;
+ qatomic_set(&orig->pointers[pos], NULL);
+ return;
+ }
+ do {
+ for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+ if (b->pointers[i]) {
+ continue;
+ }
+ if (i > 0) {
+ return qht_entry_move(orig, pos, b, i - 1);
+ }
+ qht_debug_assert(prev);
+ return qht_entry_move(orig, pos, prev, QHT_BUCKET_ENTRIES - 1);
+ }
+ prev = b;
+ b = b->next;
+ } while (b);
+ /* no free entries other than orig[pos], so swap it with the last one */
+ qht_entry_move(orig, pos, prev, QHT_BUCKET_ENTRIES - 1);
+}
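+
+/*
+ * For example, removing the second entry from a bucket holding
+ * [ A B C D ] yields [ A D C ]: the last valid entry (D) is moved into
+ * the vacated slot, so lookups can still stop at the first NULL pointer.
+ */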
+
+/* call with b->lock held */
+static inline
+bool qht_remove__locked(struct qht_bucket *head, const void *p, uint32_t hash)
+{
+ struct qht_bucket *b = head;
+ int i;
+
+ do {
+ for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+ void *q = b->pointers[i];
+
+ if (unlikely(q == NULL)) {
+ return false;
+ }
+ if (q == p) {
+ qht_debug_assert(b->hashes[i] == hash);
+ seqlock_write_begin(&head->sequence);
+ qht_bucket_remove_entry(b, i);
+ seqlock_write_end(&head->sequence);
+ return true;
+ }
+ }
+ b = b->next;
+ } while (b);
+ return false;
+}
+
+bool qht_remove(struct qht *ht, const void *p, uint32_t hash)
+{
+ struct qht_bucket *b;
+ struct qht_map *map;
+ bool ret;
+
+ /* NULL pointers are not supported */
+ qht_debug_assert(p);
+
+ b = qht_bucket_lock__no_stale(ht, hash, &map);
+ ret = qht_remove__locked(b, p, hash);
+ qht_bucket_debug__locked(b);
+ qemu_spin_unlock(&b->lock);
+ return ret;
+}
+
+static inline void qht_bucket_iter(struct qht_bucket *head,
+ const struct qht_iter *iter, void *userp)
+{
+ struct qht_bucket *b = head;
+ int i;
+
+ do {
+ for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+ if (b->pointers[i] == NULL) {
+ return;
+ }
+ switch (iter->type) {
+ case QHT_ITER_VOID:
+ iter->f.retvoid(b->pointers[i], b->hashes[i], userp);
+ break;
+ case QHT_ITER_RM:
+ if (iter->f.retbool(b->pointers[i], b->hashes[i], userp)) {
+ /* replace i with the last valid element in the bucket */
+ seqlock_write_begin(&head->sequence);
+ qht_bucket_remove_entry(b, i);
+ seqlock_write_end(&head->sequence);
+ qht_bucket_debug__locked(b);
+ /* reevaluate i, since it just got replaced */
+ i--;
+ continue;
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+ b = b->next;
+ } while (b);
+}
+
+/* call with all of the map's locks held */
+static inline void qht_map_iter__all_locked(struct qht_map *map,
+ const struct qht_iter *iter,
+ void *userp)
+{
+ size_t i;
+
+ for (i = 0; i < map->n_buckets; i++) {
+ qht_bucket_iter(&map->buckets[i], iter, userp);
+ }
+}
+
+static inline void
+do_qht_iter(struct qht *ht, const struct qht_iter *iter, void *userp)
+{
+ struct qht_map *map;
+
+ map = qatomic_rcu_read(&ht->map);
+ qht_map_lock_buckets(map);
+ qht_map_iter__all_locked(map, iter, userp);
+ qht_map_unlock_buckets(map);
+}
+
+void qht_iter(struct qht *ht, qht_iter_func_t func, void *userp)
+{
+ const struct qht_iter iter = {
+ .f.retvoid = func,
+ .type = QHT_ITER_VOID,
+ };
+
+ do_qht_iter(ht, &iter, userp);
+}
+
+void qht_iter_remove(struct qht *ht, qht_iter_bool_func_t func, void *userp)
+{
+ const struct qht_iter iter = {
+ .f.retbool = func,
+ .type = QHT_ITER_RM,
+ };
+
+ do_qht_iter(ht, &iter, userp);
+}
+
+struct qht_map_copy_data {
+ struct qht *ht;
+ struct qht_map *new;
+};
+
+static void qht_map_copy(void *p, uint32_t hash, void *userp)
+{
+ struct qht_map_copy_data *data = userp;
+ struct qht *ht = data->ht;
+ struct qht_map *new = data->new;
+ struct qht_bucket *b = qht_map_to_bucket(new, hash);
+
+ /* no need to acquire b->lock because no thread has seen this map yet */
+ qht_insert__locked(ht, new, b, p, hash, NULL);
+}
+
+/*
+ * Atomically perform a resize and/or reset.
+ * Call with ht->lock held.
+ */
+static void qht_do_resize_reset(struct qht *ht, struct qht_map *new, bool reset)
+{
+ struct qht_map *old;
+ const struct qht_iter iter = {
+ .f.retvoid = qht_map_copy,
+ .type = QHT_ITER_VOID,
+ };
+ struct qht_map_copy_data data;
+
+ old = ht->map;
+ qht_map_lock_buckets(old);
+
+ if (reset) {
+ qht_map_reset__all_locked(old);
+ }
+
+ if (new == NULL) {
+ qht_map_unlock_buckets(old);
+ return;
+ }
+
+ g_assert(new->n_buckets != old->n_buckets);
+ data.ht = ht;
+ data.new = new;
+ qht_map_iter__all_locked(old, &iter, &data);
+ qht_map_debug__all_locked(new);
+
+ qatomic_rcu_set(&ht->map, new);
+ qht_map_unlock_buckets(old);
+ call_rcu(old, qht_map_destroy, rcu);
+}
+
+bool qht_resize(struct qht *ht, size_t n_elems)
+{
+ size_t n_buckets = qht_elems_to_buckets(n_elems);
+ bool ret = false;
+
+ qht_lock(ht);
+ if (n_buckets != ht->map->n_buckets) {
+ struct qht_map *new;
+
+ new = qht_map_create(n_buckets);
+ qht_do_resize(ht, new);
+ ret = true;
+ }
+ qht_unlock(ht);
+
+ return ret;
+}
+
+/* pass @stats to qht_statistics_destroy() when done */
+void qht_statistics_init(const struct qht *ht, struct qht_stats *stats)
+{
+ const struct qht_map *map;
+ int i;
+
+ map = qatomic_rcu_read(&ht->map);
+
+ stats->used_head_buckets = 0;
+ stats->entries = 0;
+ qdist_init(&stats->chain);
+ qdist_init(&stats->occupancy);
+ /* bail out if the qht has not yet been initialized */
+ if (unlikely(map == NULL)) {
+ stats->head_buckets = 0;
+ return;
+ }
+ stats->head_buckets = map->n_buckets;
+
+ for (i = 0; i < map->n_buckets; i++) {
+ const struct qht_bucket *head = &map->buckets[i];
+ const struct qht_bucket *b;
+ unsigned int version;
+ size_t buckets;
+ size_t entries;
+ int j;
+
+ do {
+ version = seqlock_read_begin(&head->sequence);
+ buckets = 0;
+ entries = 0;
+ b = head;
+ do {
+ for (j = 0; j < QHT_BUCKET_ENTRIES; j++) {
+ if (qatomic_read(&b->pointers[j]) == NULL) {
+ break;
+ }
+ entries++;
+ }
+ buckets++;
+ b = qatomic_rcu_read(&b->next);
+ } while (b);
+ } while (seqlock_read_retry(&head->sequence, version));
+
+ if (entries) {
+ qdist_inc(&stats->chain, buckets);
+ qdist_inc(&stats->occupancy,
+ (double)entries / QHT_BUCKET_ENTRIES / buckets);
+ stats->used_head_buckets++;
+ stats->entries += entries;
+ } else {
+ qdist_inc(&stats->occupancy, 0);
+ }
+ }
+}
+
+void qht_statistics_destroy(struct qht_stats *stats)
+{
+ qdist_destroy(&stats->occupancy);
+ qdist_destroy(&stats->chain);
+}
diff --git a/util/qsp.c b/util/qsp.c
new file mode 100644
index 000000000..8562b14a8
--- /dev/null
+++ b/util/qsp.c
@@ -0,0 +1,813 @@
+/*
+ * qsp.c - QEMU Synchronization Profiler
+ *
+ * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
+ *
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ * QSP profiles the time spent in synchronization primitives, which can
+ * help diagnose performance problems, e.g. scalability issues when
+ * contention is high.
+ *
+ * The primitives currently supported are mutexes, recursive mutexes and
+ * condition variables. Note that not all related functions are intercepted;
+ * instead we profile only those functions that can have a performance impact,
+ * either due to blocking (e.g. cond_wait, mutex_lock) or cache line
+ * contention (e.g. mutex_lock, mutex_trylock).
+ *
+ * QSP's design focuses on speed and scalability. This is achieved
+ * by having threads do their profiling entirely on thread-local data.
+ * The appropriate thread-local data is found via a QHT, i.e. a concurrent hash
+ * table. To aggregate data in order to generate a report, we iterate over
+ * all entries in the hash table. Depending on the number of threads and
+ * synchronization objects this might be expensive, but note that it is
+ * very rarely called -- reports are generated only when requested by users.
+ *
+ * Reports are generated as a table where each row represents a call site. A
+ * call site is the triplet formed by the __file__ and __LINE__ of the caller
+ * as well as the address of the "object" (i.e. mutex, rec. mutex or condvar)
+ * being operated on. Optionally, call sites that operate on different objects
+ * of the same type can be coalesced, which can be particularly useful when
+ * profiling dynamically-allocated objects.
+ *
+ * Alternative designs considered:
+ *
+ * - Use an off-the-shelf profiler such as mutrace. This is not a viable option
+ * for us because QEMU has __malloc_hook set (by one of the libraries it
+ * uses); leaving this hook unset is required to avoid deadlock in mutrace.
+ *
+ * - Use a glib HT for each thread, protecting each HT with its own lock.
+ * This isn't simpler than the current design, and is 10% slower in the
+ * atomic_add-bench microbenchmark (-m option).
+ *
+ * - For reports, just use a binary tree as we aggregate data, instead of having
+ * an intermediate hash table. This would simplify the code only slightly, but
+ * would perform badly if there were many threads and objects to track.
+ *
+ * - Wrap operations on qsp entries with RCU read-side critical sections, so
+ * that qsp_reset() can delete entries. Unfortunately, the overhead of calling
+ * rcu_read_lock/unlock slows down atomic_add-bench -m by 24%. Having
+ * a snapshot that is updated on qsp_reset() avoids this overhead.
+ *
+ * Related Work:
+ * - Lennart Poettering's mutrace: http://0pointer.de/blog/projects/mutrace.html
+ * - Lozi, David, Thomas, Lawall and Muller. "Remote Core Locking: Migrating
+ * Critical-Section Execution to Improve the Performance of Multithreaded
+ * Applications", USENIX ATC'12.
+ */
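+
+/*
+ * Example (illustrative sketch; the report size and sort order below are
+ * arbitrary choices):
+ *
+ *     qsp_enable();
+ *     (run the workload to be profiled)
+ *     qsp_report(20, QSP_SORT_BY_TOTAL_WAIT_TIME, false);
+ *
+ * qsp_reset() starts a new measurement window: reports generated after it
+ * only include activity recorded since the reset.
+ */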
+
+#include "qemu/osdep.h"
+#include "qemu/qemu-print.h"
+#include "qemu/thread.h"
+#include "qemu/timer.h"
+#include "qemu/qht.h"
+#include "qemu/rcu.h"
+#include "qemu/xxhash.h"
+
+enum QSPType {
+ QSP_MUTEX,
+ QSP_BQL_MUTEX,
+ QSP_REC_MUTEX,
+ QSP_CONDVAR,
+};
+
+struct QSPCallSite {
+ const void *obj;
+ const char *file; /* i.e. __FILE__; shortened later */
+ int line;
+ enum QSPType type;
+};
+typedef struct QSPCallSite QSPCallSite;
+
+struct QSPEntry {
+ void *thread_ptr;
+ const QSPCallSite *callsite;
+ aligned_uint64_t n_acqs;
+ aligned_uint64_t ns;
+ unsigned int n_objs; /* count of coalesced objs; only used for reporting */
+};
+typedef struct QSPEntry QSPEntry;
+
+struct QSPSnapshot {
+ struct rcu_head rcu;
+ struct qht ht;
+};
+typedef struct QSPSnapshot QSPSnapshot;
+
+/* initial sizing for hash tables */
+#define QSP_INITIAL_SIZE 64
+
+/* If this file is moved, QSP_REL_PATH should be updated accordingly */
+#define QSP_REL_PATH "util/qsp.c"
+
+/* this file's full path. Used to present all call sites with relative paths */
+static size_t qsp_qemu_path_len;
+
+/* the address of qsp_thread gives us a unique 'thread ID' */
+static __thread int qsp_thread;
+
+/*
+ * Call sites are the same for all threads, so we track them in a separate hash
+ * table to save memory.
+ */
+static struct qht qsp_callsite_ht;
+
+static struct qht qsp_ht;
+static QSPSnapshot *qsp_snapshot;
+static bool qsp_initialized, qsp_initializing;
+
+static const char * const qsp_typenames[] = {
+ [QSP_MUTEX] = "mutex",
+ [QSP_BQL_MUTEX] = "BQL mutex",
+ [QSP_REC_MUTEX] = "rec_mutex",
+ [QSP_CONDVAR] = "condvar",
+};
+
+QemuMutexLockFunc qemu_bql_mutex_lock_func = qemu_mutex_lock_impl;
+QemuMutexLockFunc qemu_mutex_lock_func = qemu_mutex_lock_impl;
+QemuMutexTrylockFunc qemu_mutex_trylock_func = qemu_mutex_trylock_impl;
+QemuRecMutexLockFunc qemu_rec_mutex_lock_func = qemu_rec_mutex_lock_impl;
+QemuRecMutexTrylockFunc qemu_rec_mutex_trylock_func =
+ qemu_rec_mutex_trylock_impl;
+QemuCondWaitFunc qemu_cond_wait_func = qemu_cond_wait_impl;
+QemuCondTimedWaitFunc qemu_cond_timedwait_func = qemu_cond_timedwait_impl;
+
+/*
+ * It pays off to _not_ hash callsite->file; hashing a string is slow, and
+ * without it we still get a pretty unique hash.
+ */
+static inline
+uint32_t do_qsp_callsite_hash(const QSPCallSite *callsite, uint64_t ab)
+{
+ uint64_t cd = (uint64_t)(uintptr_t)callsite->obj;
+ uint32_t e = callsite->line;
+ uint32_t f = callsite->type;
+
+ return qemu_xxhash6(ab, cd, e, f);
+}
+
+static inline
+uint32_t qsp_callsite_hash(const QSPCallSite *callsite)
+{
+ return do_qsp_callsite_hash(callsite, 0);
+}
+
+static inline uint32_t do_qsp_entry_hash(const QSPEntry *entry, uint64_t a)
+{
+ return do_qsp_callsite_hash(entry->callsite, a);
+}
+
+static uint32_t qsp_entry_hash(const QSPEntry *entry)
+{
+ return do_qsp_entry_hash(entry, (uint64_t)(uintptr_t)entry->thread_ptr);
+}
+
+static uint32_t qsp_entry_no_thread_hash(const QSPEntry *entry)
+{
+ return do_qsp_entry_hash(entry, 0);
+}
+
+/* without the objects we need to hash the file name to get a decent hash */
+static uint32_t qsp_entry_no_thread_obj_hash(const QSPEntry *entry)
+{
+ const QSPCallSite *callsite = entry->callsite;
+ uint64_t ab = g_str_hash(callsite->file);
+ uint64_t cd = callsite->line;
+ uint32_t e = callsite->type;
+
+ return qemu_xxhash5(ab, cd, e);
+}
+
+static bool qsp_callsite_cmp(const void *ap, const void *bp)
+{
+ const QSPCallSite *a = ap;
+ const QSPCallSite *b = bp;
+
+ return a == b ||
+ (a->obj == b->obj &&
+ a->line == b->line &&
+ a->type == b->type &&
+ (a->file == b->file || !strcmp(a->file, b->file)));
+}
+
+static bool qsp_callsite_no_obj_cmp(const void *ap, const void *bp)
+{
+ const QSPCallSite *a = ap;
+ const QSPCallSite *b = bp;
+
+ return a == b ||
+ (a->line == b->line &&
+ a->type == b->type &&
+ (a->file == b->file || !strcmp(a->file, b->file)));
+}
+
+static bool qsp_entry_no_thread_cmp(const void *ap, const void *bp)
+{
+ const QSPEntry *a = ap;
+ const QSPEntry *b = bp;
+
+ return qsp_callsite_cmp(a->callsite, b->callsite);
+}
+
+static bool qsp_entry_no_thread_obj_cmp(const void *ap, const void *bp)
+{
+ const QSPEntry *a = ap;
+ const QSPEntry *b = bp;
+
+ return qsp_callsite_no_obj_cmp(a->callsite, b->callsite);
+}
+
+static bool qsp_entry_cmp(const void *ap, const void *bp)
+{
+ const QSPEntry *a = ap;
+ const QSPEntry *b = bp;
+
+ return a->thread_ptr == b->thread_ptr &&
+ qsp_callsite_cmp(a->callsite, b->callsite);
+}
+
+/*
+ * Normally we'd call this from a constructor function, but we want it to work
+ * via libutil as well.
+ */
+static void qsp_do_init(void)
+{
+ /* make sure this file's path in the tree is up to date with QSP_REL_PATH */
+ g_assert(strstr(__FILE__, QSP_REL_PATH));
+ qsp_qemu_path_len = strlen(__FILE__) - strlen(QSP_REL_PATH);
+
+ qht_init(&qsp_ht, qsp_entry_cmp, QSP_INITIAL_SIZE,
+ QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
+ qht_init(&qsp_callsite_ht, qsp_callsite_cmp, QSP_INITIAL_SIZE,
+ QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
+}
+
+static __attribute__((noinline)) void qsp_init__slowpath(void)
+{
+ if (qatomic_cmpxchg(&qsp_initializing, false, true) == false) {
+ qsp_do_init();
+ qatomic_set(&qsp_initialized, true);
+ } else {
+ while (!qatomic_read(&qsp_initialized)) {
+ cpu_relax();
+ }
+ }
+}
+
+/* qsp_init() must be called from _all_ exported functions */
+static inline void qsp_init(void)
+{
+ if (likely(qatomic_read(&qsp_initialized))) {
+ return;
+ }
+ qsp_init__slowpath();
+}
+
+static QSPCallSite *qsp_callsite_find(const QSPCallSite *orig)
+{
+ QSPCallSite *callsite;
+ uint32_t hash;
+
+ hash = qsp_callsite_hash(orig);
+ callsite = qht_lookup(&qsp_callsite_ht, orig, hash);
+ if (callsite == NULL) {
+ void *existing = NULL;
+
+ callsite = g_new(QSPCallSite, 1);
+ memcpy(callsite, orig, sizeof(*callsite));
+ qht_insert(&qsp_callsite_ht, callsite, hash, &existing);
+ if (unlikely(existing)) {
+ g_free(callsite);
+ callsite = existing;
+ }
+ }
+ return callsite;
+}
+
+static QSPEntry *
+qsp_entry_create(struct qht *ht, const QSPEntry *entry, uint32_t hash)
+{
+ QSPEntry *e;
+ void *existing = NULL;
+
+ e = g_new0(QSPEntry, 1);
+ e->thread_ptr = entry->thread_ptr;
+ e->callsite = qsp_callsite_find(entry->callsite);
+
+ qht_insert(ht, e, hash, &existing);
+ if (unlikely(existing)) {
+ g_free(e);
+ e = existing;
+ }
+ return e;
+}
+
+static QSPEntry *
+qsp_entry_find(struct qht *ht, const QSPEntry *entry, uint32_t hash)
+{
+ QSPEntry *e;
+
+ e = qht_lookup(ht, entry, hash);
+ if (e == NULL) {
+ e = qsp_entry_create(ht, entry, hash);
+ }
+ return e;
+}
+
+/*
+ * Note: Entries are never removed, so callers do not have to be in an RCU
+ * read-side critical section.
+ */
+static QSPEntry *qsp_entry_get(const void *obj, const char *file, int line,
+ enum QSPType type)
+{
+ QSPCallSite callsite = {
+ .obj = obj,
+ .file = file,
+ .line = line,
+ .type = type,
+ };
+ QSPEntry orig;
+ uint32_t hash;
+
+ qsp_init();
+
+ orig.thread_ptr = &qsp_thread;
+ orig.callsite = &callsite;
+
+ hash = qsp_entry_hash(&orig);
+ return qsp_entry_find(&qsp_ht, &orig, hash);
+}
+
+/*
+ * @e is in the global hash table; it is only written to by the current thread,
+ * so we write to it atomically (as in "write once") to prevent torn reads.
+ */
+static inline void do_qsp_entry_record(QSPEntry *e, int64_t delta, bool acq)
+{
+ qatomic_set_u64(&e->ns, e->ns + delta);
+ if (acq) {
+ qatomic_set_u64(&e->n_acqs, e->n_acqs + 1);
+ }
+}
+
+static inline void qsp_entry_record(QSPEntry *e, int64_t delta)
+{
+ do_qsp_entry_record(e, delta, true);
+}
+
+#define QSP_GEN_VOID(type_, qsp_t_, func_, impl_) \
+ static void func_(type_ *obj, const char *file, int line) \
+ { \
+ QSPEntry *e; \
+ int64_t t0, t1; \
+ \
+ t0 = get_clock(); \
+ impl_(obj, file, line); \
+ t1 = get_clock(); \
+ \
+ e = qsp_entry_get(obj, file, line, qsp_t_); \
+ qsp_entry_record(e, t1 - t0); \
+ }
+
+#define QSP_GEN_RET1(type_, qsp_t_, func_, impl_) \
+ static int func_(type_ *obj, const char *file, int line) \
+ { \
+ QSPEntry *e; \
+ int64_t t0, t1; \
+ int err; \
+ \
+ t0 = get_clock(); \
+ err = impl_(obj, file, line); \
+ t1 = get_clock(); \
+ \
+ e = qsp_entry_get(obj, file, line, qsp_t_); \
+ do_qsp_entry_record(e, t1 - t0, !err); \
+ return err; \
+ }
+
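+/* e.g. the first line below generates qsp_bql_mutex_lock(), a wrapper
+ * that times qemu_mutex_lock_impl() and charges the elapsed ns to the
+ * caller's (file, line, object) entry */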
+QSP_GEN_VOID(QemuMutex, QSP_BQL_MUTEX, qsp_bql_mutex_lock, qemu_mutex_lock_impl)
+QSP_GEN_VOID(QemuMutex, QSP_MUTEX, qsp_mutex_lock, qemu_mutex_lock_impl)
+QSP_GEN_RET1(QemuMutex, QSP_MUTEX, qsp_mutex_trylock, qemu_mutex_trylock_impl)
+
+QSP_GEN_VOID(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_lock,
+ qemu_rec_mutex_lock_impl)
+QSP_GEN_RET1(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_trylock,
+ qemu_rec_mutex_trylock_impl)
+
+#undef QSP_GEN_RET1
+#undef QSP_GEN_VOID
+
+static void
+qsp_cond_wait(QemuCond *cond, QemuMutex *mutex, const char *file, int line)
+{
+ QSPEntry *e;
+ int64_t t0, t1;
+
+ t0 = get_clock();
+ qemu_cond_wait_impl(cond, mutex, file, line);
+ t1 = get_clock();
+
+ e = qsp_entry_get(cond, file, line, QSP_CONDVAR);
+ qsp_entry_record(e, t1 - t0);
+}
+
+static bool
+qsp_cond_timedwait(QemuCond *cond, QemuMutex *mutex, int ms,
+ const char *file, int line)
+{
+ QSPEntry *e;
+ int64_t t0, t1;
+ bool ret;
+
+ t0 = get_clock();
+ ret = qemu_cond_timedwait_impl(cond, mutex, ms, file, line);
+ t1 = get_clock();
+
+ e = qsp_entry_get(cond, file, line, QSP_CONDVAR);
+ qsp_entry_record(e, t1 - t0);
+ return ret;
+}
+
+bool qsp_is_enabled(void)
+{
+ return qatomic_read(&qemu_mutex_lock_func) == qsp_mutex_lock;
+}
+
+void qsp_enable(void)
+{
+ qatomic_set(&qemu_mutex_lock_func, qsp_mutex_lock);
+ qatomic_set(&qemu_mutex_trylock_func, qsp_mutex_trylock);
+ qatomic_set(&qemu_bql_mutex_lock_func, qsp_bql_mutex_lock);
+ qatomic_set(&qemu_rec_mutex_lock_func, qsp_rec_mutex_lock);
+ qatomic_set(&qemu_rec_mutex_trylock_func, qsp_rec_mutex_trylock);
+ qatomic_set(&qemu_cond_wait_func, qsp_cond_wait);
+ qatomic_set(&qemu_cond_timedwait_func, qsp_cond_timedwait);
+}
+
+void qsp_disable(void)
+{
+ qatomic_set(&qemu_mutex_lock_func, qemu_mutex_lock_impl);
+ qatomic_set(&qemu_mutex_trylock_func, qemu_mutex_trylock_impl);
+ qatomic_set(&qemu_bql_mutex_lock_func, qemu_mutex_lock_impl);
+ qatomic_set(&qemu_rec_mutex_lock_func, qemu_rec_mutex_lock_impl);
+ qatomic_set(&qemu_rec_mutex_trylock_func, qemu_rec_mutex_trylock_impl);
+ qatomic_set(&qemu_cond_wait_func, qemu_cond_wait_impl);
+ qatomic_set(&qemu_cond_timedwait_func, qemu_cond_timedwait_impl);
+}
+
+static gint qsp_tree_cmp(gconstpointer ap, gconstpointer bp, gpointer up)
+{
+ const QSPEntry *a = ap;
+ const QSPEntry *b = bp;
+ enum QSPSortBy sort_by = *(enum QSPSortBy *)up;
+ const QSPCallSite *ca;
+ const QSPCallSite *cb;
+
+ switch (sort_by) {
+ case QSP_SORT_BY_TOTAL_WAIT_TIME:
+ if (a->ns > b->ns) {
+ return -1;
+ } else if (a->ns < b->ns) {
+ return 1;
+ }
+ break;
+ case QSP_SORT_BY_AVG_WAIT_TIME:
+ {
+ double avg_a = a->n_acqs ? a->ns / a->n_acqs : 0;
+ double avg_b = b->n_acqs ? b->ns / b->n_acqs : 0;
+
+ if (avg_a > avg_b) {
+ return -1;
+ } else if (avg_a < avg_b) {
+ return 1;
+ }
+ break;
+ }
+ default:
+ g_assert_not_reached();
+ }
+
+ ca = a->callsite;
+ cb = b->callsite;
+ /* Break the tie with the object's address */
+ if (ca->obj < cb->obj) {
+ return -1;
+ } else if (ca->obj > cb->obj) {
+ return 1;
+ } else {
+ int cmp;
+
+ /* same obj. Break the tie with the callsite's file */
+ cmp = strcmp(ca->file, cb->file);
+ if (cmp) {
+ return cmp;
+ }
+ /* same callsite file. Break the tie with the callsite's line */
+ g_assert(ca->line != cb->line);
+ if (ca->line < cb->line) {
+ return -1;
+ } else if (ca->line > cb->line) {
+ return 1;
+ } else {
+ /* break the tie with the callsite's type */
+ return cb->type - ca->type;
+ }
+ }
+}
+
+static void qsp_sort(void *p, uint32_t h, void *userp)
+{
+ QSPEntry *e = p;
+ GTree *tree = userp;
+
+ g_tree_insert(tree, e, NULL);
+}
+
+static void qsp_aggregate(void *p, uint32_t h, void *up)
+{
+ struct qht *ht = up;
+ const QSPEntry *e = p;
+ QSPEntry *agg;
+ uint32_t hash;
+
+ hash = qsp_entry_no_thread_hash(e);
+ agg = qsp_entry_find(ht, e, hash);
+ /*
+ * The entry is in the global hash table; read from it atomically (as in
+ * "read once").
+ */
+ agg->ns += qatomic_read_u64(&e->ns);
+ agg->n_acqs += qatomic_read_u64(&e->n_acqs);
+}
+
+static void qsp_iter_diff(void *p, uint32_t hash, void *htp)
+{
+ struct qht *ht = htp;
+ QSPEntry *old = p;
+ QSPEntry *new;
+
+ new = qht_lookup(ht, old, hash);
+ /* entries are never deleted, so we must have this one */
+ g_assert(new != NULL);
+ /* our reading of the stats happened after the snapshot was taken */
+ g_assert(new->n_acqs >= old->n_acqs);
+ g_assert(new->ns >= old->ns);
+
+ new->n_acqs -= old->n_acqs;
+ new->ns -= old->ns;
+
+ /* No point in reporting an empty entry */
+ if (new->n_acqs == 0 && new->ns == 0) {
+ bool removed = qht_remove(ht, new, hash);
+
+ g_assert(removed);
+ g_free(new);
+ }
+}
+
+static void qsp_diff(struct qht *orig, struct qht *new)
+{
+ qht_iter(orig, qsp_iter_diff, new);
+}
+
+static void qsp_iter_callsite_coalesce(void *p, uint32_t h, void *htp)
+{
+ struct qht *ht = htp;
+ QSPEntry *old = p;
+ QSPEntry *e;
+ uint32_t hash;
+
+ hash = qsp_entry_no_thread_obj_hash(old);
+ e = qht_lookup(ht, old, hash);
+ if (e == NULL) {
+ e = qsp_entry_create(ht, old, hash);
+ e->n_objs = 1;
+ } else if (e->callsite->obj != old->callsite->obj) {
+ e->n_objs++;
+ }
+ e->ns += old->ns;
+ e->n_acqs += old->n_acqs;
+}
+
+static void qsp_ht_delete(void *p, uint32_t h, void *htp)
+{
+ g_free(p);
+}
+
+static void qsp_mktree(GTree *tree, bool callsite_coalesce)
+{
+ struct qht ht, coalesce_ht;
+ struct qht *htp;
+
+ /*
+ * First, see if there's a prior snapshot, so that we read the global hash
+ * table _after_ the snapshot has been created, which guarantees that
+ * the entries we'll read will be a superset of the snapshot's entries.
+ *
+ * We must remain in an RCU read-side critical section until we're done
+ * with the snapshot.
+ */
+ WITH_RCU_READ_LOCK_GUARD() {
+ QSPSnapshot *snap = qatomic_rcu_read(&qsp_snapshot);
+
+ /* Aggregate all results from the global hash table into a local one */
+ qht_init(&ht, qsp_entry_no_thread_cmp, QSP_INITIAL_SIZE,
+ QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
+ qht_iter(&qsp_ht, qsp_aggregate, &ht);
+
+ /* compute the difference wrt the snapshot, if any */
+ if (snap) {
+ qsp_diff(&snap->ht, &ht);
+ }
+ }
+
+ htp = &ht;
+ if (callsite_coalesce) {
+ qht_init(&coalesce_ht, qsp_entry_no_thread_obj_cmp, QSP_INITIAL_SIZE,
+ QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
+ qht_iter(&ht, qsp_iter_callsite_coalesce, &coalesce_ht);
+
+ /* free the previous hash table, and point htp to coalesce_ht */
+ qht_iter(&ht, qsp_ht_delete, NULL);
+ qht_destroy(&ht);
+ htp = &coalesce_ht;
+ }
+
+ /* sort the hash table elements by using a tree */
+ qht_iter(htp, qsp_sort, tree);
+
+ /* free the hash table, but keep the elements (those are in the tree now) */
+ qht_destroy(htp);
+}
+
+/* the returned string must be freed with g_free() */
+static char *qsp_at(const QSPCallSite *callsite)
+{
+ GString *s = g_string_new(NULL);
+ const char *shortened;
+
+ /* remove the absolute path to qemu */
+ if (unlikely(strlen(callsite->file) < qsp_qemu_path_len)) {
+ shortened = callsite->file;
+ } else {
+ shortened = callsite->file + qsp_qemu_path_len;
+ }
+ g_string_append_printf(s, "%s:%u", shortened, callsite->line);
+ return g_string_free(s, FALSE);
+}
+
+struct QSPReportEntry {
+ const void *obj;
+ char *callsite_at;
+ const char *typename;
+ double time_s;
+ double ns_avg;
+ uint64_t n_acqs;
+ unsigned int n_objs;
+};
+typedef struct QSPReportEntry QSPReportEntry;
+
+struct QSPReport {
+ QSPReportEntry *entries;
+ size_t n_entries;
+ size_t max_n_entries;
+};
+typedef struct QSPReport QSPReport;
+
+static gboolean qsp_tree_report(gpointer key, gpointer value, gpointer udata)
+{
+ const QSPEntry *e = key;
+ QSPReport *report = udata;
+ QSPReportEntry *entry;
+
+ if (report->n_entries == report->max_n_entries) {
+ return TRUE;
+ }
+ entry = &report->entries[report->n_entries];
+ report->n_entries++;
+
+ entry->obj = e->callsite->obj;
+ entry->n_objs = e->n_objs;
+ entry->callsite_at = qsp_at(e->callsite);
+ entry->typename = qsp_typenames[e->callsite->type];
+ entry->time_s = e->ns * 1e-9;
+ entry->n_acqs = e->n_acqs;
+ entry->ns_avg = e->n_acqs ? e->ns / e->n_acqs : 0;
+ return FALSE;
+}
+
+static void pr_report(const QSPReport *rep)
+{
+ char *dashes;
+ size_t max_len = 0;
+ int callsite_len = 0;
+ int callsite_rspace;
+ int n_dashes;
+ size_t i;
+
+ /* find out the maximum length of all 'callsite' fields */
+ for (i = 0; i < rep->n_entries; i++) {
+ const QSPReportEntry *e = &rep->entries[i];
+ size_t len = strlen(e->callsite_at);
+
+ if (len > max_len) {
+ max_len = len;
+ }
+ }
+
+ callsite_len = MAX(max_len, strlen("Call site"));
+ /* white space to leave to the right of "Call site" */
+ callsite_rspace = callsite_len - strlen("Call site");
+
+ qemu_printf("Type Object Call site%*s Wait Time (s) "
+ " Count Average (us)\n", callsite_rspace, "");
+
+ /* build a horizontal rule with dashes */
+ n_dashes = 79 + callsite_rspace;
+ dashes = g_malloc(n_dashes + 1);
+ memset(dashes, '-', n_dashes);
+ dashes[n_dashes] = '\0';
+ qemu_printf("%s\n", dashes);
+
+ for (i = 0; i < rep->n_entries; i++) {
+ const QSPReportEntry *e = &rep->entries[i];
+ GString *s = g_string_new(NULL);
+
+ g_string_append_printf(s, "%-9s ", e->typename);
+ if (e->n_objs > 1) {
+ g_string_append_printf(s, "[%12u]", e->n_objs);
+ } else {
+ g_string_append_printf(s, "%14p", e->obj);
+ }
+ g_string_append_printf(s, " %s%*s %13.5f %12" PRIu64 " %12.2f\n",
+ e->callsite_at,
+ callsite_len - (int)strlen(e->callsite_at), "",
+ e->time_s, e->n_acqs, e->ns_avg * 1e-3);
+ qemu_printf("%s", s->str);
+ g_string_free(s, TRUE);
+ }
+
+ qemu_printf("%s\n", dashes);
+ g_free(dashes);
+}
+
+static void report_destroy(QSPReport *rep)
+{
+ size_t i;
+
+ for (i = 0; i < rep->n_entries; i++) {
+ QSPReportEntry *e = &rep->entries[i];
+
+ g_free(e->callsite_at);
+ }
+ g_free(rep->entries);
+}
+
+void qsp_report(size_t max, enum QSPSortBy sort_by,
+ bool callsite_coalesce)
+{
+ GTree *tree = g_tree_new_full(qsp_tree_cmp, &sort_by, g_free, NULL);
+ QSPReport rep;
+
+ qsp_init();
+
+ rep.entries = g_new0(QSPReportEntry, max);
+ rep.n_entries = 0;
+ rep.max_n_entries = max;
+
+ qsp_mktree(tree, callsite_coalesce);
+ g_tree_foreach(tree, qsp_tree_report, &rep);
+ g_tree_destroy(tree);
+
+ pr_report(&rep);
+ report_destroy(&rep);
+}
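+
+/*
+ * Note (informal): the report is computed relative to the last snapshot
+ * taken by qsp_reset(), so "qsp_reset(); <workload>; qsp_report(...)"
+ * profiles only the contention incurred by that workload.
+ */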
+
+static void qsp_snapshot_destroy(QSPSnapshot *snap)
+{
+ qht_iter(&snap->ht, qsp_ht_delete, NULL);
+ qht_destroy(&snap->ht);
+ g_free(snap);
+}
+
+void qsp_reset(void)
+{
+ QSPSnapshot *new = g_new(QSPSnapshot, 1);
+ QSPSnapshot *old;
+
+ qsp_init();
+
+ qht_init(&new->ht, qsp_entry_cmp, QSP_INITIAL_SIZE,
+ QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
+
+ /* take a snapshot of the current state */
+ qht_iter(&qsp_ht, qsp_aggregate, &new->ht);
+
+ /* replace the previous snapshot, if any */
+ old = qatomic_xchg(&qsp_snapshot, new);
+ if (old) {
+ call_rcu(old, qsp_snapshot_destroy, rcu);
+ }
+}
diff --git a/util/range.c b/util/range.c
new file mode 100644
index 000000000..098d9d2dc
--- /dev/null
+++ b/util/range.c
@@ -0,0 +1,72 @@
+/*
+ * QEMU 64-bit address ranges
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/range.h"
+
+/*
+ * Return -1 if @a < @b, 1 if @a > @b, and 0 if they touch or overlap.
+ * Both @a and @b must not be empty.
+ */
+static inline int range_compare(Range *a, Range *b)
+{
+ assert(!range_is_empty(a) && !range_is_empty(b));
+
+ /* Careful, avoid wraparound */
+ if (b->lob && b->lob - 1 > a->upb) {
+ return -1;
+ }
+ if (a->lob && a->lob - 1 > b->upb) {
+ return 1;
+ }
+ return 0;
+}
+
+/* Insert @data into @list of ranges; caller no longer owns @data */
+GList *range_list_insert(GList *list, Range *data)
+{
+ GList *l;
+
+ assert(!range_is_empty(data));
+
+ /* Skip all list elements strictly less than data */
+ for (l = list; l && range_compare(l->data, data) < 0; l = l->next) {
+ }
+
+ if (!l || range_compare(l->data, data) > 0) {
+ /* Rest of the list (if any) is strictly greater than @data */
+ return g_list_insert_before(list, l, data);
+ }
+
+ /* Current list element overlaps @data, merge the two */
+ range_extend(l->data, data);
+ g_free(data);
+
+ /* Merge any subsequent list elements that now also overlap */
+ while (l->next && range_compare(l->data, l->next->data) == 0) {
+ GList *new_l;
+
+ range_extend(l->data, l->next->data);
+ g_free(l->next->data);
+ new_l = g_list_delete_link(list, l->next);
+ assert(new_l == list);
+ }
+
+ return list;
+}
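+
+/*
+ * Usage sketch (illustrative only; the list and the bounds are made
+ * up). The caller transfers ownership of the Range to the list:
+ *
+ *     GList *list = NULL;
+ *     Range *r = g_new0(Range, 1);
+ *
+ *     range_set_bounds(r, 0x1000, 0x1fff);
+ *     list = range_list_insert(list, r);
+ */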
diff --git a/util/rcu.c b/util/rcu.c
new file mode 100644
index 000000000..c91da9f13
--- /dev/null
+++ b/util/rcu.c
@@ -0,0 +1,455 @@
+/*
+ * urcu-mb.c
+ *
+ * Userspace RCU library with explicit memory barriers
+ *
+ * Copyright (c) 2009 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * Copyright (c) 2009 Paul E. McKenney, IBM Corporation.
+ * Copyright 2015 Red Hat, Inc.
+ *
+ * Ported to QEMU by Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * IBM's contributions to this file may be relicensed under LGPLv2 or later.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/rcu.h"
+#include "qemu/atomic.h"
+#include "qemu/thread.h"
+#include "qemu/main-loop.h"
+#include "qemu/lockable.h"
+#if defined(CONFIG_MALLOC_TRIM)
+#include <malloc.h>
+#endif
+
+/*
+ * Global grace period counter. Bit 0 is always one in rcu_gp_ctr.
+ * Bits 1 and above are defined in synchronize_rcu.
+ */
+#define RCU_GP_LOCKED (1UL << 0)
+#define RCU_GP_CTR (1UL << 1)
+
+unsigned long rcu_gp_ctr = RCU_GP_LOCKED;
+
+QemuEvent rcu_gp_event;
+static int in_drain_call_rcu;
+static QemuMutex rcu_registry_lock;
+static QemuMutex rcu_sync_lock;
+
+/*
+ * Check whether a quiescent state was crossed between the beginning of
+ * update_counter_and_wait and now.
+ */
+static inline int rcu_gp_ongoing(unsigned long *ctr)
+{
+ unsigned long v;
+
+ v = qatomic_read(ctr);
+ return v && (v != rcu_gp_ctr);
+}
+
+/* Written to only by each individual reader. Read by both the reader and the
+ * writers.
+ */
+__thread struct rcu_reader_data rcu_reader;
+
+/* Protected by rcu_registry_lock. */
+typedef QLIST_HEAD(, rcu_reader_data) ThreadList;
+static ThreadList registry = QLIST_HEAD_INITIALIZER(registry);
+
+/* Wait for previous parity/grace period to be empty of readers. */
+static void wait_for_readers(void)
+{
+ ThreadList qsreaders = QLIST_HEAD_INITIALIZER(qsreaders);
+ struct rcu_reader_data *index, *tmp;
+
+ for (;;) {
+ /* We want to be notified of changes made to rcu_gp_ongoing
+ * while we walk the list.
+ */
+ qemu_event_reset(&rcu_gp_event);
+
+ /* Instead of using qatomic_mb_set for index->waiting, and
+ * qatomic_mb_read for index->ctr, memory barriers are placed
+ * manually since writes to different threads are independent.
+ * qemu_event_reset has acquire semantics, so no memory barrier
+ * is needed here.
+ */
+ QLIST_FOREACH(index, &registry, node) {
+ qatomic_set(&index->waiting, true);
+ }
+
+ /* Here, order the stores to index->waiting before the loads of
+ * index->ctr. Pairs with smp_mb_placeholder() in rcu_read_unlock(),
+ * ensuring that the loads of index->ctr are sequentially consistent.
+ */
+ smp_mb_global();
+
+ QLIST_FOREACH_SAFE(index, &registry, node, tmp) {
+ if (!rcu_gp_ongoing(&index->ctr)) {
+ QLIST_REMOVE(index, node);
+ QLIST_INSERT_HEAD(&qsreaders, index, node);
+
+ /* No need for mb_set here, worst of all we
+ * get some extra futex wakeups.
+ */
+ qatomic_set(&index->waiting, false);
+ } else if (qatomic_read(&in_drain_call_rcu)) {
+ notifier_list_notify(&index->force_rcu, NULL);
+ }
+ }
+
+ if (QLIST_EMPTY(&registry)) {
+ break;
+ }
+
+ /* Wait for one thread to report a quiescent state and try again.
+ * Release rcu_registry_lock, so rcu_(un)register_thread() doesn't
+ * wait too much time.
+ *
+ * rcu_register_thread() may add nodes to &registry; it will not
+ * wake up synchronize_rcu, but that is okay because at least another
+ * thread must exit its RCU read-side critical section before
+ * synchronize_rcu is done. The next iteration of the loop will
+ * move the new thread's rcu_reader from &registry to &qsreaders,
+ * because rcu_gp_ongoing() will return false.
+ *
+ * rcu_unregister_thread() may remove nodes from &qsreaders instead
+ * of &registry if it runs during qemu_event_wait. That's okay;
+ * the node then will not be added back to &registry by QLIST_SWAP
+ * below. The invariant is that the node is part of one list when
+ * rcu_registry_lock is released.
+ */
+ qemu_mutex_unlock(&rcu_registry_lock);
+ qemu_event_wait(&rcu_gp_event);
+ qemu_mutex_lock(&rcu_registry_lock);
+ }
+
+ /* put back the reader list in the registry */
+ QLIST_SWAP(&registry, &qsreaders, node);
+}
+
+void synchronize_rcu(void)
+{
+ QEMU_LOCK_GUARD(&rcu_sync_lock);
+
+ /* Write RCU-protected pointers before reading p_rcu_reader->ctr.
+ * Pairs with smp_mb_placeholder() in rcu_read_lock().
+ */
+ smp_mb_global();
+
+ QEMU_LOCK_GUARD(&rcu_registry_lock);
+ if (!QLIST_EMPTY(&registry)) {
+ /* In either case, the qatomic_mb_set below blocks stores that free
+ * old RCU-protected pointers.
+ */
+ if (sizeof(rcu_gp_ctr) < 8) {
+ /* For architectures with 32-bit longs, a two-subphase algorithm
+ * ensures we do not encounter overflow bugs.
+ *
+ * Switch parity: 0 -> 1, 1 -> 0.
+ */
+ qatomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR);
+ wait_for_readers();
+ qatomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR);
+ } else {
+ /* Increment current grace period. */
+ qatomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr + RCU_GP_CTR);
+ }
+
+ wait_for_readers();
+ }
+}
+
+
+#define RCU_CALL_MIN_SIZE 30
+
+/* Multi-producer, single-consumer queue based on urcu/static/wfqueue.h
+ * from liburcu. Note that head is only used by the consumer.
+ */
+static struct rcu_head dummy;
+static struct rcu_head *head = &dummy, **tail = &dummy.next;
+static int rcu_call_count;
+static QemuEvent rcu_call_ready_event;
+
+static void enqueue(struct rcu_head *node)
+{
+ struct rcu_head **old_tail;
+
+ node->next = NULL;
+ old_tail = qatomic_xchg(&tail, &node->next);
+ qatomic_mb_set(old_tail, node);
+}
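+
+/*
+ * Note (informal): the qatomic_xchg() above serializes concurrent
+ * producers on the tail, and the qatomic_mb_set() then publishes the
+ * node. In the window between the two, the consumer may observe a NULL
+ * ->next pointer and must wait, which is what try_dequeue() checks for.
+ */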
+
+static struct rcu_head *try_dequeue(void)
+{
+ struct rcu_head *node, *next;
+
+retry:
+ /* Test for an empty list, which we do not expect. Note that for
+ * the consumer, head and tail are always consistent. The head
+ * is consistent because only the consumer reads/writes it; the
+ * tail, because updating it is the first step of the enqueue.
+ * It is only the next pointers that might be inconsistent.
+ */
+ if (head == &dummy && qatomic_mb_read(&tail) == &dummy.next) {
+ abort();
+ }
+
+ /* If the head node has NULL in its next pointer, the value is
+ * wrong and we need to wait until its enqueuer finishes the update.
+ */
+ node = head;
+ next = qatomic_mb_read(&head->next);
+ if (!next) {
+ return NULL;
+ }
+
+ /* Since we are the sole consumer, and we excluded the empty case
+ * above, the queue will always have at least two nodes: the
+ * dummy node, and the one being removed. So we do not need to update
+ * the tail pointer.
+ */
+ head = next;
+
+ /* If we dequeued the dummy node, add it back at the end and retry. */
+ if (node == &dummy) {
+ enqueue(node);
+ goto retry;
+ }
+
+ return node;
+}
+
+static void *call_rcu_thread(void *opaque)
+{
+ struct rcu_head *node;
+
+ rcu_register_thread();
+
+ for (;;) {
+ int tries = 0;
+ int n = qatomic_read(&rcu_call_count);
+
+ /* Heuristically wait for a decent number of callbacks to pile up.
+ * Fetch rcu_call_count now; we must only process elements that were
+ * added before synchronize_rcu() starts.
+ */
+ while (n == 0 || (n < RCU_CALL_MIN_SIZE && ++tries <= 5)) {
+ g_usleep(10000);
+ if (n == 0) {
+ qemu_event_reset(&rcu_call_ready_event);
+ n = qatomic_read(&rcu_call_count);
+ if (n == 0) {
+#if defined(CONFIG_MALLOC_TRIM)
+ malloc_trim(4 * 1024 * 1024);
+#endif
+ qemu_event_wait(&rcu_call_ready_event);
+ }
+ }
+ n = qatomic_read(&rcu_call_count);
+ }
+
+ qatomic_sub(&rcu_call_count, n);
+ synchronize_rcu();
+ qemu_mutex_lock_iothread();
+ while (n > 0) {
+ node = try_dequeue();
+ while (!node) {
+ qemu_mutex_unlock_iothread();
+ qemu_event_reset(&rcu_call_ready_event);
+ node = try_dequeue();
+ if (!node) {
+ qemu_event_wait(&rcu_call_ready_event);
+ node = try_dequeue();
+ }
+ qemu_mutex_lock_iothread();
+ }
+
+ n--;
+ node->func(node);
+ }
+ qemu_mutex_unlock_iothread();
+ }
+ abort();
+}
+
+void call_rcu1(struct rcu_head *node, void (*func)(struct rcu_head *node))
+{
+ node->func = func;
+ enqueue(node);
+ qatomic_inc(&rcu_call_count);
+ qemu_event_set(&rcu_call_ready_event);
+}
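+
+/*
+ * Usage sketch (illustrative only; struct Foo and foo_free are
+ * hypothetical). Reclaimers typically embed a struct rcu_head in the
+ * object and free the whole object from the callback:
+ *
+ *     struct Foo { struct rcu_head rcu; int val; };
+ *
+ *     static void foo_free(struct rcu_head *head)
+ *     {
+ *         g_free(container_of(head, struct Foo, rcu));
+ *     }
+ *
+ *     call_rcu1(&foo->rcu, foo_free);
+ */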
+
+
+struct rcu_drain {
+ struct rcu_head rcu;
+ QemuEvent drain_complete_event;
+};
+
+static void drain_rcu_callback(struct rcu_head *node)
+{
+ struct rcu_drain *event = (struct rcu_drain *)node;
+ qemu_event_set(&event->drain_complete_event);
+}
+
+/*
+ * This function ensures that all pending RCU callbacks
+ * on the current thread are done executing.
+ *
+ * It drops the big QEMU lock during the wait to allow the RCU
+ * thread to process the callbacks.
+ */
+
+void drain_call_rcu(void)
+{
+ struct rcu_drain rcu_drain;
+ bool locked = qemu_mutex_iothread_locked();
+
+ memset(&rcu_drain, 0, sizeof(struct rcu_drain));
+ qemu_event_init(&rcu_drain.drain_complete_event, false);
+
+ if (locked) {
+ qemu_mutex_unlock_iothread();
+ }
+
+ /*
+ * RCU callbacks are invoked in the same order as in which they
+ * are registered, thus we can be sure that when 'drain_rcu_callback'
+ * is called, all RCU callbacks that were registered on this thread
+ * prior to calling this function are completed.
+ *
+ * Note that since we have only one global queue of the RCU callbacks,
+ * we also end up waiting for most of the RCU callbacks that were
+ * registered on the other threads, but this is a side effect that
+ * shouldn't be relied upon.
+ */
+
+ qatomic_inc(&in_drain_call_rcu);
+ call_rcu1(&rcu_drain.rcu, drain_rcu_callback);
+ qemu_event_wait(&rcu_drain.drain_complete_event);
+ qatomic_dec(&in_drain_call_rcu);
+
+ if (locked) {
+ qemu_mutex_lock_iothread();
+ }
+}
+
+void rcu_register_thread(void)
+{
+ assert(rcu_reader.ctr == 0);
+ qemu_mutex_lock(&rcu_registry_lock);
+ QLIST_INSERT_HEAD(&registry, &rcu_reader, node);
+ qemu_mutex_unlock(&rcu_registry_lock);
+}
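+
+/*
+ * Reader-side usage sketch (illustrative only; global_foo is a
+ * hypothetical RCU-protected pointer). A registered thread brackets
+ * its accesses with a read-side critical section:
+ *
+ *     rcu_register_thread();
+ *     WITH_RCU_READ_LOCK_GUARD() {
+ *         Foo *p = qatomic_rcu_read(&global_foo);
+ *         use(p); // p stays valid until the end of the critical section
+ *     }
+ */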
+
+void rcu_unregister_thread(void)
+{
+ qemu_mutex_lock(&rcu_registry_lock);
+ QLIST_REMOVE(&rcu_reader, node);
+ qemu_mutex_unlock(&rcu_registry_lock);
+}
+
+void rcu_add_force_rcu_notifier(Notifier *n)
+{
+ qemu_mutex_lock(&rcu_registry_lock);
+ notifier_list_add(&rcu_reader.force_rcu, n);
+ qemu_mutex_unlock(&rcu_registry_lock);
+}
+
+void rcu_remove_force_rcu_notifier(Notifier *n)
+{
+ qemu_mutex_lock(&rcu_registry_lock);
+ notifier_remove(n);
+ qemu_mutex_unlock(&rcu_registry_lock);
+}
+
+static void rcu_init_complete(void)
+{
+ QemuThread thread;
+
+ qemu_mutex_init(&rcu_registry_lock);
+ qemu_mutex_init(&rcu_sync_lock);
+ qemu_event_init(&rcu_gp_event, true);
+
+ qemu_event_init(&rcu_call_ready_event, false);
+
+ /* The caller is assumed to have iothread lock, so the call_rcu thread
+ * must have been quiescent even after forking, just recreate it.
+ */
+ qemu_thread_create(&thread, "call_rcu", call_rcu_thread,
+ NULL, QEMU_THREAD_DETACHED);
+
+ rcu_register_thread();
+}
+
+static int atfork_depth = 1;
+
+void rcu_enable_atfork(void)
+{
+ atfork_depth++;
+}
+
+void rcu_disable_atfork(void)
+{
+ atfork_depth--;
+}
+
+#ifdef CONFIG_POSIX
+static void rcu_init_lock(void)
+{
+ if (atfork_depth < 1) {
+ return;
+ }
+
+ qemu_mutex_lock(&rcu_sync_lock);
+ qemu_mutex_lock(&rcu_registry_lock);
+}
+
+static void rcu_init_unlock(void)
+{
+ if (atfork_depth < 1) {
+ return;
+ }
+
+ qemu_mutex_unlock(&rcu_registry_lock);
+ qemu_mutex_unlock(&rcu_sync_lock);
+}
+
+static void rcu_init_child(void)
+{
+ if (atfork_depth < 1) {
+ return;
+ }
+
+ memset(&registry, 0, sizeof(registry));
+ rcu_init_complete();
+}
+#endif
+
+static void __attribute__((__constructor__)) rcu_init(void)
+{
+ smp_mb_global_init();
+#ifdef CONFIG_POSIX
+ pthread_atfork(rcu_init_lock, rcu_init_unlock, rcu_init_child);
+#endif
+ rcu_init_complete();
+}
diff --git a/util/readline.c b/util/readline.c
new file mode 100644
index 000000000..f1ac6e476
--- /dev/null
+++ b/util/readline.c
@@ -0,0 +1,549 @@
+/*
+ * QEMU readline utility
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/readline.h"
+#include "qemu/ctype.h"
+#include "qemu/cutils.h"
+
+#define IS_NORM 0
+#define IS_ESC 1
+#define IS_CSI 2
+#define IS_SS3 3
+
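+/*
+ * Input is handled as a small state machine: IS_NORM processes plain
+ * bytes and control characters; ESC switches to IS_CSI for "\033["
+ * sequences or to IS_SS3 for "\033O" sequences (cursor and Home/End
+ * keys).
+ */
+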
+void readline_show_prompt(ReadLineState *rs)
+{
+ rs->printf_func(rs->opaque, "%s", rs->prompt);
+ rs->flush_func(rs->opaque);
+ rs->last_cmd_buf_index = 0;
+ rs->last_cmd_buf_size = 0;
+ rs->esc_state = IS_NORM;
+}
+
+/* update the displayed command line */
+static void readline_update(ReadLineState *rs)
+{
+ int i, delta, len;
+
+ if (rs->cmd_buf_size != rs->last_cmd_buf_size ||
+ memcmp(rs->cmd_buf, rs->last_cmd_buf, rs->cmd_buf_size) != 0) {
+ for (i = 0; i < rs->last_cmd_buf_index; i++) {
+ rs->printf_func(rs->opaque, "\033[D");
+ }
+ rs->cmd_buf[rs->cmd_buf_size] = '\0';
+ if (rs->read_password) {
+ len = strlen(rs->cmd_buf);
+ for (i = 0; i < len; i++) {
+ rs->printf_func(rs->opaque, "*");
+ }
+ } else {
+ rs->printf_func(rs->opaque, "%s", rs->cmd_buf);
+ }
+ rs->printf_func(rs->opaque, "\033[K");
+ memcpy(rs->last_cmd_buf, rs->cmd_buf, rs->cmd_buf_size);
+ rs->last_cmd_buf_size = rs->cmd_buf_size;
+ rs->last_cmd_buf_index = rs->cmd_buf_size;
+ }
+ if (rs->cmd_buf_index != rs->last_cmd_buf_index) {
+ delta = rs->cmd_buf_index - rs->last_cmd_buf_index;
+ if (delta > 0) {
+ for (i = 0; i < delta; i++) {
+ rs->printf_func(rs->opaque, "\033[C");
+ }
+ } else {
+ delta = -delta;
+ for (i = 0; i < delta; i++) {
+ rs->printf_func(rs->opaque, "\033[D");
+ }
+ }
+ rs->last_cmd_buf_index = rs->cmd_buf_index;
+ }
+ rs->flush_func(rs->opaque);
+}
+
+static void readline_insert_char(ReadLineState *rs, int ch)
+{
+ if (rs->cmd_buf_index < READLINE_CMD_BUF_SIZE) {
+ memmove(rs->cmd_buf + rs->cmd_buf_index + 1,
+ rs->cmd_buf + rs->cmd_buf_index,
+ rs->cmd_buf_size - rs->cmd_buf_index);
+ rs->cmd_buf[rs->cmd_buf_index] = ch;
+ rs->cmd_buf_size++;
+ rs->cmd_buf_index++;
+ }
+}
+
+static void readline_backward_char(ReadLineState *rs)
+{
+ if (rs->cmd_buf_index > 0) {
+ rs->cmd_buf_index--;
+ }
+}
+
+static void readline_forward_char(ReadLineState *rs)
+{
+ if (rs->cmd_buf_index < rs->cmd_buf_size) {
+ rs->cmd_buf_index++;
+ }
+}
+
+static void readline_delete_char(ReadLineState *rs)
+{
+ if (rs->cmd_buf_index < rs->cmd_buf_size) {
+ memmove(rs->cmd_buf + rs->cmd_buf_index,
+ rs->cmd_buf + rs->cmd_buf_index + 1,
+ rs->cmd_buf_size - rs->cmd_buf_index - 1);
+ rs->cmd_buf_size--;
+ }
+}
+
+static void readline_backspace(ReadLineState *rs)
+{
+ if (rs->cmd_buf_index > 0) {
+ readline_backward_char(rs);
+ readline_delete_char(rs);
+ }
+}
+
+static void readline_backword(ReadLineState *rs)
+{
+ int start;
+
+ if (rs->cmd_buf_index == 0 || rs->cmd_buf_index > rs->cmd_buf_size) {
+ return;
+ }
+
+ start = rs->cmd_buf_index - 1;
+
+ /* find first word (backwards) */
+ while (start > 0) {
+ if (!qemu_isspace(rs->cmd_buf[start])) {
+ break;
+ }
+
+ --start;
+ }
+
+ /* find first space (backwards) */
+ while (start > 0) {
+ if (qemu_isspace(rs->cmd_buf[start])) {
+ ++start;
+ break;
+ }
+
+ --start;
+ }
+
+ /* remove word */
+ if (start < rs->cmd_buf_index) {
+ memmove(rs->cmd_buf + start,
+ rs->cmd_buf + rs->cmd_buf_index,
+ rs->cmd_buf_size - rs->cmd_buf_index);
+ rs->cmd_buf_size -= rs->cmd_buf_index - start;
+ rs->cmd_buf_index = start;
+ }
+}
+
+static void readline_bol(ReadLineState *rs)
+{
+ rs->cmd_buf_index = 0;
+}
+
+static void readline_eol(ReadLineState *rs)
+{
+ rs->cmd_buf_index = rs->cmd_buf_size;
+}
+
+static void readline_up_char(ReadLineState *rs)
+{
+ int idx;
+
+ if (rs->hist_entry == 0) {
+ return;
+ }
+ if (rs->hist_entry == -1) {
+ /* Find latest entry */
+ for (idx = 0; idx < READLINE_MAX_CMDS; idx++) {
+ if (rs->history[idx] == NULL) {
+ break;
+ }
+ }
+ rs->hist_entry = idx;
+ }
+ rs->hist_entry--;
+ if (rs->hist_entry >= 0) {
+ pstrcpy(rs->cmd_buf, sizeof(rs->cmd_buf),
+ rs->history[rs->hist_entry]);
+ rs->cmd_buf_index = rs->cmd_buf_size = strlen(rs->cmd_buf);
+ }
+}
+
+static void readline_down_char(ReadLineState *rs)
+{
+ if (rs->hist_entry == -1) {
+ return;
+ }
+ if (rs->hist_entry < READLINE_MAX_CMDS - 1 &&
+ rs->history[++rs->hist_entry] != NULL) {
+ pstrcpy(rs->cmd_buf, sizeof(rs->cmd_buf),
+ rs->history[rs->hist_entry]);
+ } else {
+ rs->cmd_buf[0] = 0;
+ rs->hist_entry = -1;
+ }
+ rs->cmd_buf_index = rs->cmd_buf_size = strlen(rs->cmd_buf);
+}
+
+static void readline_hist_add(ReadLineState *rs, const char *cmdline)
+{
+ char *hist_entry, *new_entry;
+ int idx;
+
+ if (cmdline[0] == '\0') {
+ return;
+ }
+ new_entry = NULL;
+ if (rs->hist_entry != -1) {
+ /* We were editing an existing history entry: replace it */
+ hist_entry = rs->history[rs->hist_entry];
+ idx = rs->hist_entry;
+ if (strcmp(hist_entry, cmdline) == 0) {
+ goto same_entry;
+ }
+ }
+ /* Search cmdline in history buffers */
+ for (idx = 0; idx < READLINE_MAX_CMDS; idx++) {
+ hist_entry = rs->history[idx];
+ if (hist_entry == NULL) {
+ break;
+ }
+ if (strcmp(hist_entry, cmdline) == 0) {
+ same_entry:
+ if (idx == READLINE_MAX_CMDS - 1) {
+ return;
+ }
+ new_entry = hist_entry;
+ /* Put this entry at the end of history */
+ memmove(&rs->history[idx], &rs->history[idx + 1],
+ (READLINE_MAX_CMDS - (idx + 1)) * sizeof(char *));
+ rs->history[READLINE_MAX_CMDS - 1] = NULL;
+ for (; idx < READLINE_MAX_CMDS; idx++) {
+ if (rs->history[idx] == NULL) {
+ break;
+ }
+ }
+ break;
+ }
+ }
+ if (idx == READLINE_MAX_CMDS) {
+ /* Need to get one free slot */
+ g_free(rs->history[0]);
+ memmove(rs->history, &rs->history[1],
+ (READLINE_MAX_CMDS - 1) * sizeof(char *));
+ rs->history[READLINE_MAX_CMDS - 1] = NULL;
+ idx = READLINE_MAX_CMDS - 1;
+ }
+ if (new_entry == NULL) {
+ new_entry = g_strdup(cmdline);
+ }
+ rs->history[idx] = new_entry;
+ rs->hist_entry = -1;
+}
+
+/* completion support */
+
+void readline_add_completion(ReadLineState *rs, const char *str)
+{
+ if (rs->nb_completions < READLINE_MAX_COMPLETIONS) {
+ int i;
+ for (i = 0; i < rs->nb_completions; i++) {
+ if (!strcmp(rs->completions[i], str)) {
+ return;
+ }
+ }
+ rs->completions[rs->nb_completions++] = g_strdup(str);
+ }
+}
+
+void readline_set_completion_index(ReadLineState *rs, int index)
+{
+ rs->completion_index = index;
+}
+
+static int completion_comp(const void *a, const void *b)
+{
+ return strcmp(*(const char **) a, *(const char **) b);
+}
+
+static void readline_completion(ReadLineState *rs)
+{
+ int len, i, j, max_width, nb_cols, max_prefix;
+ char *cmdline;
+
+ rs->nb_completions = 0;
+
+ cmdline = g_strndup(rs->cmd_buf, rs->cmd_buf_index);
+ rs->completion_finder(rs->opaque, cmdline);
+ g_free(cmdline);
+
+ /* no completion found */
+ if (rs->nb_completions <= 0) {
+ return;
+ }
+ if (rs->nb_completions == 1) {
+ len = strlen(rs->completions[0]);
+ for (i = rs->completion_index; i < len; i++) {
+ readline_insert_char(rs, rs->completions[0][i]);
+ }
+ /* extra space for next argument. XXX: make it more generic */
+ if (len > 0 && rs->completions[0][len - 1] != '/') {
+ readline_insert_char(rs, ' ');
+ }
+ } else {
+ qsort(rs->completions, rs->nb_completions, sizeof(char *),
+ completion_comp);
+ rs->printf_func(rs->opaque, "\n");
+ max_width = 0;
+ max_prefix = 0;
+ for (i = 0; i < rs->nb_completions; i++) {
+ len = strlen(rs->completions[i]);
+ if (i == 0) {
+ max_prefix = len;
+ } else {
+ if (len < max_prefix) {
+ max_prefix = len;
+ }
+ for (j = 0; j < max_prefix; j++) {
+ if (rs->completions[i][j] != rs->completions[0][j]) {
+ max_prefix = j;
+ }
+ }
+ }
+ if (len > max_width) {
+ max_width = len;
+ }
+ }
+ if (max_prefix > 0) {
+ for (i = rs->completion_index; i < max_prefix; i++) {
+ readline_insert_char(rs, rs->completions[0][i]);
+ }
+ }
+ max_width += 2;
+ if (max_width < 10) {
+ max_width = 10;
+ } else if (max_width > 80) {
+ max_width = 80;
+ }
+ nb_cols = 80 / max_width;
+ j = 0;
+ for (i = 0; i < rs->nb_completions; i++) {
+ rs->printf_func(rs->opaque, "%-*s", max_width, rs->completions[i]);
+ if (++j == nb_cols || i == (rs->nb_completions - 1)) {
+ rs->printf_func(rs->opaque, "\n");
+ j = 0;
+ }
+ }
+ readline_show_prompt(rs);
+ }
+ for (i = 0; i < rs->nb_completions; i++) {
+ g_free(rs->completions[i]);
+ }
+}
+
+static void readline_clear_screen(ReadLineState *rs)
+{
+ rs->printf_func(rs->opaque, "\033[2J\033[1;1H");
+ readline_show_prompt(rs);
+}
+
+/* process one input byte and update the command line accordingly */
+void readline_handle_byte(ReadLineState *rs, int ch)
+{
+ switch (rs->esc_state) {
+ case IS_NORM:
+ switch (ch) {
+ case 1:
+ readline_bol(rs);
+ break;
+ case 4:
+ readline_delete_char(rs);
+ break;
+ case 5:
+ readline_eol(rs);
+ break;
+ case 9:
+ readline_completion(rs);
+ break;
+ case 12:
+ readline_clear_screen(rs);
+ break;
+ case 10:
+ case 13:
+ rs->cmd_buf[rs->cmd_buf_size] = '\0';
+ if (!rs->read_password) {
+ readline_hist_add(rs, rs->cmd_buf);
+ }
+ rs->printf_func(rs->opaque, "\n");
+ rs->cmd_buf_index = 0;
+ rs->cmd_buf_size = 0;
+ rs->last_cmd_buf_index = 0;
+ rs->last_cmd_buf_size = 0;
+ rs->readline_func(rs->opaque, rs->cmd_buf, rs->readline_opaque);
+ break;
+ case 23:
+ /* ^W */
+ readline_backword(rs);
+ break;
+ case 27:
+ rs->esc_state = IS_ESC;
+ break;
+ case 127:
+ case 8:
+ readline_backspace(rs);
+ break;
+ case 155:
+ rs->esc_state = IS_CSI;
+ break;
+ default:
+ if (ch >= 32) {
+ readline_insert_char(rs, ch);
+ }
+ break;
+ }
+ break;
+ case IS_ESC:
+ if (ch == '[') {
+ rs->esc_state = IS_CSI;
+ rs->esc_param = 0;
+ } else if (ch == 'O') {
+ rs->esc_state = IS_SS3;
+ rs->esc_param = 0;
+ } else {
+ rs->esc_state = IS_NORM;
+ }
+ break;
+ case IS_CSI:
+ switch (ch) {
+ case 'A':
+ case 'F':
+ readline_up_char(rs);
+ break;
+ case 'B':
+ case 'E':
+ readline_down_char(rs);
+ break;
+ case 'D':
+ readline_backward_char(rs);
+ break;
+ case 'C':
+ readline_forward_char(rs);
+ break;
+ case '0' ... '9':
+ rs->esc_param = rs->esc_param * 10 + (ch - '0');
+ goto the_end;
+ case '~':
+ switch (rs->esc_param) {
+ case 1:
+ readline_bol(rs);
+ break;
+ case 3:
+ readline_delete_char(rs);
+ break;
+ case 4:
+ readline_eol(rs);
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ rs->esc_state = IS_NORM;
+ the_end:
+ break;
+ case IS_SS3:
+ switch (ch) {
+ case 'F':
+ readline_eol(rs);
+ break;
+ case 'H':
+ readline_bol(rs);
+ break;
+ }
+ rs->esc_state = IS_NORM;
+ break;
+ }
+ readline_update(rs);
+}
+
+void readline_start(ReadLineState *rs, const char *prompt, int read_password,
+ ReadLineFunc *readline_func, void *opaque)
+{
+ pstrcpy(rs->prompt, sizeof(rs->prompt), prompt);
+ rs->readline_func = readline_func;
+ rs->readline_opaque = opaque;
+ rs->read_password = read_password;
+ readline_restart(rs);
+}
+
+void readline_restart(ReadLineState *rs)
+{
+ rs->cmd_buf_index = 0;
+ rs->cmd_buf_size = 0;
+}
+
+const char *readline_get_history(ReadLineState *rs, unsigned int index)
+{
+ if (index >= READLINE_MAX_CMDS) {
+ return NULL;
+ }
+ return rs->history[index];
+}
+
+void readline_free(ReadLineState *rs)
+{
+ int i;
+
+ if (!rs) {
+ return;
+ }
+ for (i = 0; i < READLINE_MAX_CMDS; i++) {
+ g_free(rs->history[i]);
+ }
+ g_free(rs);
+}
+
+ReadLineState *readline_init(ReadLinePrintfFunc *printf_func,
+ ReadLineFlushFunc *flush_func,
+ void *opaque,
+ ReadLineCompletionFunc *completion_finder)
+{
+ ReadLineState *rs = g_new0(ReadLineState, 1);
+
+ rs->hist_entry = -1;
+ rs->opaque = opaque;
+ rs->printf_func = printf_func;
+ rs->flush_func = flush_func;
+ rs->completion_finder = completion_finder;
+
+ return rs;
+}
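+
+/*
+ * Usage sketch (illustrative only; the monitor_* callbacks and the mon
+ * pointer are hypothetical):
+ *
+ *     ReadLineState *rs = readline_init(monitor_printf_cb,
+ *                                       monitor_flush_cb,
+ *                                       mon, monitor_find_completion);
+ *
+ *     readline_start(rs, "(qemu) ", 0, handle_command, mon);
+ *     readline_show_prompt(rs);
+ *     // then feed input bytes with readline_handle_byte(rs, ch)
+ */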
diff --git a/util/selfmap.c b/util/selfmap.c
new file mode 100644
index 000000000..2c14f019c
--- /dev/null
+++ b/util/selfmap.c
@@ -0,0 +1,83 @@
+/*
+ * Utility function to get QEMU's own process map
+ *
+ * Copyright (c) 2020 Linaro Ltd
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "qemu/selfmap.h"
+
+GSList *read_self_maps(void)
+{
+ gchar *maps;
+ GSList *map_info = NULL;
+
+ if (g_file_get_contents("/proc/self/maps", &maps, NULL, NULL)) {
+ gchar **lines = g_strsplit(maps, "\n", 0);
+ int i, entries = g_strv_length(lines);
+
+ for (i = 0; i < entries; i++) {
+ gchar **fields = g_strsplit(lines[i], " ", 6);
+ if (g_strv_length(fields) > 4) {
+ MapInfo *e = g_new0(MapInfo, 1);
+ int errors = 0;
+ const char *end;
+
+ errors |= qemu_strtoul(fields[0], &end, 16, &e->start);
+ errors |= qemu_strtoul(end + 1, NULL, 16, &e->end);
+
+ e->is_read = fields[1][0] == 'r';
+ e->is_write = fields[1][1] == 'w';
+ e->is_exec = fields[1][2] == 'x';
+ e->is_priv = fields[1][3] == 'p';
+
+ errors |= qemu_strtoul(fields[2], NULL, 16, &e->offset);
+ e->dev = g_strdup(fields[3]);
+ errors |= qemu_strtou64(fields[4], NULL, 10, &e->inode);
+
+ if (!errors) {
+ /*
+ * The last field may have leading spaces which we
+ * need to strip.
+ */
+ if (g_strv_length(fields) == 6) {
+ e->path = g_strdup(g_strchug(fields[5]));
+ }
+ map_info = g_slist_prepend(map_info, e);
+ } else {
+ g_free(e->dev);
+ g_free(e);
+ }
+ }
+
+ g_strfreev(fields);
+ }
+ g_strfreev(lines);
+ g_free(maps);
+ }
+
+ /* ensure the map data is in the same order we collected it */
+ return g_slist_reverse(map_info);
+}
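+
+/*
+ * For reference, a typical /proc/self/maps line parsed above looks like
+ * this (fields: start-end perms offset dev inode path):
+ *
+ *     7f0123400000-7f0123420000 r-xp 00000000 fd:01 131090 /usr/lib/libz.so
+ */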
+
+/**
+ * free_self_maps:
+ * @info: a GSList of MapInfo structures
+ *
+ * Free a list of MapInfo structures.
+ */
+static void free_info(gpointer data)
+{
+ MapInfo *e = (MapInfo *) data;
+ g_free(e->dev);
+ g_free(e->path);
+ g_free(e);
+}
+
+void free_self_maps(GSList *info)
+{
+ g_slist_free_full(info, &free_info);
+}
diff --git a/util/stats64.c b/util/stats64.c
new file mode 100644
index 000000000..897613c94
--- /dev/null
+++ b/util/stats64.c
@@ -0,0 +1,137 @@
+/*
+ * Atomic operations on 64-bit quantities.
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/atomic.h"
+#include "qemu/stats64.h"
+#include "qemu/processor.h"
+
+#ifndef CONFIG_ATOMIC64
+static inline void stat64_rdlock(Stat64 *s)
+{
+ /* Keep out incoming writers to avoid them starving us. */
+ qatomic_add(&s->lock, 2);
+
+ /* If there is a concurrent writer, wait for it. */
+ while (qatomic_read(&s->lock) & 1) {
+ cpu_relax();
+ }
+}
+
+static inline void stat64_rdunlock(Stat64 *s)
+{
+ qatomic_sub(&s->lock, 2);
+}
+
+static inline bool stat64_wrtrylock(Stat64 *s)
+{
+ return qatomic_cmpxchg(&s->lock, 0, 1) == 0;
+}
+
+static inline void stat64_wrunlock(Stat64 *s)
+{
+ qatomic_dec(&s->lock);
+}
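+
+/*
+ * Note (informal): s->lock is a reader count with a writer flag in bit
+ * 0. Readers add 2 and spin while bit 0 is set; writers cmpxchg 0 -> 1,
+ * so they only succeed when the lock is entirely free. Writers never
+ * block: on contention the slow paths below return false and the
+ * callers (in stats64.h) retry.
+ */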
+
+uint64_t stat64_get(const Stat64 *s)
+{
+ uint32_t high, low;
+
+ stat64_rdlock((Stat64 *)s);
+
+ /* 64-bit writes always take the lock, so we can read in
+ * any order.
+ */
+ high = qatomic_read(&s->high);
+ low = qatomic_read(&s->low);
+ stat64_rdunlock((Stat64 *)s);
+
+ return ((uint64_t)high << 32) | low;
+}
+
+bool stat64_add32_carry(Stat64 *s, uint32_t low, uint32_t high)
+{
+ uint32_t old;
+
+ if (!stat64_wrtrylock(s)) {
+ cpu_relax();
+ return false;
+ }
+
+ /* 64-bit reads always take the lock, so they don't care about the
+ * order of our update. By updating s->low first, we can check
+ * whether we have to carry into s->high.
+ */
+ old = qatomic_fetch_add(&s->low, low);
+ high += (old + low) < old;
+ qatomic_add(&s->high, high);
+ stat64_wrunlock(s);
+ return true;
+}
+
+bool stat64_min_slow(Stat64 *s, uint64_t value)
+{
+ uint32_t high, low;
+ uint64_t orig;
+
+ if (!stat64_wrtrylock(s)) {
+ cpu_relax();
+ return false;
+ }
+
+ high = qatomic_read(&s->high);
+ low = qatomic_read(&s->low);
+
+ orig = ((uint64_t)high << 32) | low;
+ if (value < orig) {
+ /* We have to set low before high, just like stat64_min reads
+ * high before low. The value may become higher temporarily, but
+ * stat64_get does not notice (it takes the lock) and the only ill
+ * effect on stat64_min is that the slow path may be triggered
+ * unnecessarily.
+ */
+ qatomic_set(&s->low, (uint32_t)value);
+ smp_wmb();
+ qatomic_set(&s->high, value >> 32);
+ }
+ stat64_wrunlock(s);
+ return true;
+}
+
+bool stat64_max_slow(Stat64 *s, uint64_t value)
+{
+ uint32_t high, low;
+ uint64_t orig;
+
+ if (!stat64_wrtrylock(s)) {
+ cpu_relax();
+ return false;
+ }
+
+ high = qatomic_read(&s->high);
+ low = qatomic_read(&s->low);
+
+ orig = ((uint64_t)high << 32) | low;
+ if (value > orig) {
+ /* We have to set low before high, just like stat64_max reads
+ * high before low. The value may become lower temporarily, but
+ * stat64_get does not notice (it takes the lock) and the only ill
+ * effect on stat64_max is that the slow path may be triggered
+ * unnecessarily.
+ */
+ qatomic_set(&s->low, (uint32_t)value);
+ smp_wmb();
+ qatomic_set(&s->high, value >> 32);
+ }
+ stat64_wrunlock(s);
+ return true;
+}
+#endif
diff --git a/util/sys_membarrier.c b/util/sys_membarrier.c
new file mode 100644
index 000000000..1362c0c4c
--- /dev/null
+++ b/util/sys_membarrier.c
@@ -0,0 +1,50 @@
+/*
+ * Process-global memory barriers
+ *
+ * Copyright (c) 2018 Red Hat, Inc.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/sys_membarrier.h"
+#include "qemu/error-report.h"
+
+#ifdef CONFIG_LINUX
+#include <linux/membarrier.h>
+#include <sys/syscall.h>
+
+static int
+membarrier(int cmd, int flags)
+{
+ return syscall(__NR_membarrier, cmd, flags);
+}
+#endif
+
+void smp_mb_global(void)
+{
+#if defined CONFIG_WIN32
+ FlushProcessWriteBuffers();
+#elif defined CONFIG_LINUX
+ membarrier(MEMBARRIER_CMD_SHARED, 0);
+#else
+#error --enable-membarrier is not supported on this operating system.
+#endif
+}
+
+void smp_mb_global_init(void)
+{
+#ifdef CONFIG_LINUX
+ int ret = membarrier(MEMBARRIER_CMD_QUERY, 0);
+ if (ret < 0) {
+ error_report("This QEMU binary requires the membarrier system call.");
+ error_report("Please upgrade your system to a newer version of Linux");
+ exit(1);
+ }
+ if (!(ret & MEMBARRIER_CMD_SHARED)) {
+ error_report("This QEMU binary requires MEMBARRIER_CMD_SHARED support.");
+ error_report("Please upgrade your system to a newer version of Linux");
+ exit(1);
+ }
+#endif
+}
diff --git a/util/systemd.c b/util/systemd.c
new file mode 100644
index 000000000..5bcac9b40
--- /dev/null
+++ b/util/systemd.c
@@ -0,0 +1,79 @@
+/*
+ * systemd socket activation support
+ *
+ * Copyright 2017 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Richard W.M. Jones <rjones@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/systemd.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+
+#ifndef _WIN32
+unsigned int check_socket_activation(void)
+{
+ const char *s;
+ unsigned long pid;
+ unsigned long nr_fds;
+ unsigned int i;
+ int fd;
+ int f;
+ int err;
+
+ s = getenv("LISTEN_PID");
+ if (s == NULL) {
+ return 0;
+ }
+ err = qemu_strtoul(s, NULL, 10, &pid);
+ if (err) {
+ return 0;
+ }
+ if (pid != getpid()) {
+ return 0;
+ }
+
+ s = getenv("LISTEN_FDS");
+ if (s == NULL) {
+ return 0;
+ }
+ err = qemu_strtoul(s, NULL, 10, &nr_fds);
+ if (err) {
+ return 0;
+ }
+ assert(nr_fds <= UINT_MAX);
+
+ /* So these are not passed to any child processes we might start. */
+ unsetenv("LISTEN_FDS");
+ unsetenv("LISTEN_PID");
+
+ /* So the file descriptors don't leak into child processes. */
+ for (i = 0; i < nr_fds; ++i) {
+ fd = FIRST_SOCKET_ACTIVATION_FD + i;
+ f = fcntl(fd, F_GETFD);
+ if (f == -1 || fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1) {
+ /* If we cannot set FD_CLOEXEC then it probably means the file
+ * descriptor is invalid, so socket activation has gone wrong
+ * and we should exit.
+ */
+ error_report("Socket activation failed: "
+ "invalid file descriptor fd = %d: %s",
+ fd, g_strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ return (unsigned int) nr_fds;
+}
+
+#else /* !_WIN32 */
+unsigned int check_socket_activation(void)
+{
+ return 0;
+}
+#endif
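+
+/*
+ * Usage sketch (illustrative only):
+ *
+ *     unsigned int n = check_socket_activation();
+ *
+ *     if (n) {
+ *         // the inherited sockets are FIRST_SOCKET_ACTIVATION_FD,
+ *         // FIRST_SOCKET_ACTIVATION_FD + 1, ... (n fds in total);
+ *         // listen()/accept() on them instead of opening new sockets
+ *     }
+ */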
diff --git a/util/thread-pool.c b/util/thread-pool.c
new file mode 100644
index 000000000..d763cea50
--- /dev/null
+++ b/util/thread-pool.c
@@ -0,0 +1,352 @@
+/*
+ * QEMU block layer thread pool
+ *
+ * Copyright IBM, Corp. 2008
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+#include "qemu/osdep.h"
+#include "qemu/queue.h"
+#include "qemu/thread.h"
+#include "qemu/coroutine.h"
+#include "trace.h"
+#include "block/thread-pool.h"
+#include "qemu/main-loop.h"
+
+static void do_spawn_thread(ThreadPool *pool);
+
+typedef struct ThreadPoolElement ThreadPoolElement;
+
+enum ThreadState {
+ THREAD_QUEUED,
+ THREAD_ACTIVE,
+ THREAD_DONE,
+};
+
+struct ThreadPoolElement {
+ BlockAIOCB common;
+ ThreadPool *pool;
+ ThreadPoolFunc *func;
+ void *arg;
+
+ /* Moving state out of THREAD_QUEUED is protected by lock. After
+ * that, only the worker thread can write to it. Reads and writes
+ * of state and ret are ordered with memory barriers.
+ */
+ enum ThreadState state;
+ int ret;
+
+ /* Access to this list is protected by lock. */
+ QTAILQ_ENTRY(ThreadPoolElement) reqs;
+
+ /* Access to this list is protected by the global mutex. */
+ QLIST_ENTRY(ThreadPoolElement) all;
+};
+
+struct ThreadPool {
+ AioContext *ctx;
+ QEMUBH *completion_bh;
+ QemuMutex lock;
+ QemuCond worker_stopped;
+ QemuSemaphore sem;
+ int max_threads;
+ QEMUBH *new_thread_bh;
+
+ /* The following variables are only accessed from one AioContext. */
+ QLIST_HEAD(, ThreadPoolElement) head;
+
+ /* The following variables are protected by lock. */
+ QTAILQ_HEAD(, ThreadPoolElement) request_list;
+ int cur_threads;
+ int idle_threads;
+ int new_threads; /* backlog of threads we need to create */
+ int pending_threads; /* threads created but not running yet */
+ bool stopping;
+};
+
+static void *worker_thread(void *opaque)
+{
+ ThreadPool *pool = opaque;
+
+ qemu_mutex_lock(&pool->lock);
+ pool->pending_threads--;
+ do_spawn_thread(pool);
+
+ while (!pool->stopping) {
+ ThreadPoolElement *req;
+ int ret;
+
+ do {
+ pool->idle_threads++;
+ qemu_mutex_unlock(&pool->lock);
+ ret = qemu_sem_timedwait(&pool->sem, 10000);
+ qemu_mutex_lock(&pool->lock);
+ pool->idle_threads--;
+ } while (ret == -1 && !QTAILQ_EMPTY(&pool->request_list));
+ if (ret == -1 || pool->stopping) {
+ break;
+ }
+
+ req = QTAILQ_FIRST(&pool->request_list);
+ QTAILQ_REMOVE(&pool->request_list, req, reqs);
+ req->state = THREAD_ACTIVE;
+ qemu_mutex_unlock(&pool->lock);
+
+ ret = req->func(req->arg);
+
+ req->ret = ret;
+ /* Write ret before state. */
+ smp_wmb();
+ req->state = THREAD_DONE;
+
+ qemu_mutex_lock(&pool->lock);
+
+ qemu_bh_schedule(pool->completion_bh);
+ }
+
+ pool->cur_threads--;
+ qemu_cond_signal(&pool->worker_stopped);
+ qemu_mutex_unlock(&pool->lock);
+ return NULL;
+}
+
+static void do_spawn_thread(ThreadPool *pool)
+{
+ QemuThread t;
+
+ /* Runs with lock taken. */
+ if (!pool->new_threads) {
+ return;
+ }
+
+ pool->new_threads--;
+ pool->pending_threads++;
+
+ qemu_thread_create(&t, "worker", worker_thread, pool, QEMU_THREAD_DETACHED);
+}
+
+static void spawn_thread_bh_fn(void *opaque)
+{
+ ThreadPool *pool = opaque;
+
+ qemu_mutex_lock(&pool->lock);
+ do_spawn_thread(pool);
+ qemu_mutex_unlock(&pool->lock);
+}
+
+static void spawn_thread(ThreadPool *pool)
+{
+ pool->cur_threads++;
+ pool->new_threads++;
+ /* If there are threads being created, they will spawn new workers, so
+ * we don't spend time creating many threads in a loop holding a mutex or
+ * starving the current vcpu.
+ *
+ * If there are no idle threads, ask the main thread to create one, so we
+ * inherit the correct affinity instead of the vcpu affinity.
+ */
+ if (!pool->pending_threads) {
+ qemu_bh_schedule(pool->new_thread_bh);
+ }
+}
+
+static void thread_pool_completion_bh(void *opaque)
+{
+ ThreadPool *pool = opaque;
+ ThreadPoolElement *elem, *next;
+
+ aio_context_acquire(pool->ctx);
+restart:
+ QLIST_FOREACH_SAFE(elem, &pool->head, all, next) {
+ if (elem->state != THREAD_DONE) {
+ continue;
+ }
+
+ trace_thread_pool_complete(pool, elem, elem->common.opaque,
+ elem->ret);
+ QLIST_REMOVE(elem, all);
+
+ if (elem->common.cb) {
+ /* Read state before ret. */
+ smp_rmb();
+
+ /* Schedule ourselves in case elem->common.cb() calls aio_poll() to
+ * wait for another request that completed at the same time.
+ */
+ qemu_bh_schedule(pool->completion_bh);
+
+ aio_context_release(pool->ctx);
+ elem->common.cb(elem->common.opaque, elem->ret);
+ aio_context_acquire(pool->ctx);
+
+ /* We can safely cancel the completion_bh here regardless of someone
+ * else having scheduled it meanwhile because we reenter the
+ * completion function anyway (goto restart).
+ */
+ qemu_bh_cancel(pool->completion_bh);
+
+ qemu_aio_unref(elem);
+ goto restart;
+ } else {
+ qemu_aio_unref(elem);
+ }
+ }
+ aio_context_release(pool->ctx);
+}
+
+static void thread_pool_cancel(BlockAIOCB *acb)
+{
+ ThreadPoolElement *elem = (ThreadPoolElement *)acb;
+ ThreadPool *pool = elem->pool;
+
+ trace_thread_pool_cancel(elem, elem->common.opaque);
+
+ QEMU_LOCK_GUARD(&pool->lock);
+ if (elem->state == THREAD_QUEUED &&
+ /* No thread has yet started working on elem; we can try to "steal"
+ * the item from the worker if we can get a signal from the
+ * semaphore. Because this is non-blocking, we can do it with
+ * the lock taken and ensure that elem will remain THREAD_QUEUED.
+ */
+ qemu_sem_timedwait(&pool->sem, 0) == 0) {
+ QTAILQ_REMOVE(&pool->request_list, elem, reqs);
+ qemu_bh_schedule(pool->completion_bh);
+
+ elem->state = THREAD_DONE;
+ elem->ret = -ECANCELED;
+ }
+}
+
+static AioContext *thread_pool_get_aio_context(BlockAIOCB *acb)
+{
+ ThreadPoolElement *elem = (ThreadPoolElement *)acb;
+ ThreadPool *pool = elem->pool;
+ return pool->ctx;
+}
+
+static const AIOCBInfo thread_pool_aiocb_info = {
+ .aiocb_size = sizeof(ThreadPoolElement),
+ .cancel_async = thread_pool_cancel,
+ .get_aio_context = thread_pool_get_aio_context,
+};
+
+BlockAIOCB *thread_pool_submit_aio(ThreadPool *pool,
+ ThreadPoolFunc *func, void *arg,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ ThreadPoolElement *req;
+
+ req = qemu_aio_get(&thread_pool_aiocb_info, NULL, cb, opaque);
+ req->func = func;
+ req->arg = arg;
+ req->state = THREAD_QUEUED;
+ req->pool = pool;
+
+ QLIST_INSERT_HEAD(&pool->head, req, all);
+
+ trace_thread_pool_submit(pool, req, arg);
+
+ qemu_mutex_lock(&pool->lock);
+ if (pool->idle_threads == 0 && pool->cur_threads < pool->max_threads) {
+ spawn_thread(pool);
+ }
+ QTAILQ_INSERT_TAIL(&pool->request_list, req, reqs);
+ qemu_mutex_unlock(&pool->lock);
+ qemu_sem_post(&pool->sem);
+ return &req->common;
+}
+
+typedef struct ThreadPoolCo {
+ Coroutine *co;
+ int ret;
+} ThreadPoolCo;
+
+static void thread_pool_co_cb(void *opaque, int ret)
+{
+ ThreadPoolCo *co = opaque;
+
+ co->ret = ret;
+ aio_co_wake(co->co);
+}
+
+int coroutine_fn thread_pool_submit_co(ThreadPool *pool, ThreadPoolFunc *func,
+ void *arg)
+{
+ ThreadPoolCo tpc = { .co = qemu_coroutine_self(), .ret = -EINPROGRESS };
+ assert(qemu_in_coroutine());
+ thread_pool_submit_aio(pool, func, arg, thread_pool_co_cb, &tpc);
+ qemu_coroutine_yield();
+ return tpc.ret;
+}
+
+void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg)
+{
+ thread_pool_submit_aio(pool, func, arg, NULL, NULL);
+}
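+
+/*
+ * Usage sketch (illustrative only; worker_fn and do_blocking_io are
+ * hypothetical). The function runs in a pool thread and its return
+ * value becomes the request's result:
+ *
+ *     static int worker_fn(void *arg)
+ *     {
+ *         return do_blocking_io(arg);
+ *     }
+ *
+ *     // from coroutine context, given a ThreadPool *pool:
+ *     int ret = thread_pool_submit_co(pool, worker_fn, arg);
+ */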
+
+static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx)
+{
+ if (!ctx) {
+ ctx = qemu_get_aio_context();
+ }
+
+ memset(pool, 0, sizeof(*pool));
+ pool->ctx = ctx;
+ pool->completion_bh = aio_bh_new(ctx, thread_pool_completion_bh, pool);
+ qemu_mutex_init(&pool->lock);
+ qemu_cond_init(&pool->worker_stopped);
+ qemu_sem_init(&pool->sem, 0);
+ pool->max_threads = 64;
+ pool->new_thread_bh = aio_bh_new(ctx, spawn_thread_bh_fn, pool);
+
+ QLIST_INIT(&pool->head);
+ QTAILQ_INIT(&pool->request_list);
+}
+
+ThreadPool *thread_pool_new(AioContext *ctx)
+{
+ ThreadPool *pool = g_new(ThreadPool, 1);
+ thread_pool_init_one(pool, ctx);
+ return pool;
+}
+
+void thread_pool_free(ThreadPool *pool)
+{
+ if (!pool) {
+ return;
+ }
+
+ assert(QLIST_EMPTY(&pool->head));
+
+ qemu_mutex_lock(&pool->lock);
+
+ /* Stop new threads from spawning */
+ qemu_bh_delete(pool->new_thread_bh);
+ pool->cur_threads -= pool->new_threads;
+ pool->new_threads = 0;
+
+ /* Wait for worker threads to terminate */
+ pool->stopping = true;
+ while (pool->cur_threads > 0) {
+ qemu_sem_post(&pool->sem);
+ qemu_cond_wait(&pool->worker_stopped, &pool->lock);
+ }
+
+ qemu_mutex_unlock(&pool->lock);
+
+ qemu_bh_delete(pool->completion_bh);
+ qemu_sem_destroy(&pool->sem);
+ qemu_cond_destroy(&pool->worker_stopped);
+ qemu_mutex_destroy(&pool->lock);
+ g_free(pool);
+}
diff --git a/util/throttle.c b/util/throttle.c
new file mode 100644
index 000000000..81f247a8d
--- /dev/null
+++ b/util/throttle.c
@@ -0,0 +1,637 @@
+/*
+ * QEMU throttling infrastructure
+ *
+ * Copyright (C) Nodalink, EURL. 2013-2014
+ * Copyright (C) Igalia, S.L. 2015
+ *
+ * Authors:
+ * Benoît Canet <benoit.canet@nodalink.com>
+ * Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/throttle.h"
+#include "qemu/timer.h"
+#include "block/aio.h"
+
+/* This function makes a bucket leak
+ *
+ * @bkt: the bucket to leak
+ * @delta_ns: the time delta
+ */
+void throttle_leak_bucket(LeakyBucket *bkt, int64_t delta_ns)
+{
+ double leak;
+
+ /* compute how much to leak */
+ leak = (bkt->avg * (double) delta_ns) / NANOSECONDS_PER_SECOND;
+
+ /* make the bucket leak */
+ bkt->level = MAX(bkt->level - leak, 0);
+
+ /* if we allow bursts for more than one second we also need to
+ * keep track of bkt->burst_level so the bkt->max goal per second
+ * is attained */
+ if (bkt->burst_length > 1) {
+ leak = (bkt->max * (double) delta_ns) / NANOSECONDS_PER_SECOND;
+ bkt->burst_level = MAX(bkt->burst_level - leak, 0);
+ }
+}
+
+/* Calculate the time delta since the last leak and make proportional leaks
+ *
+ * @now: the current timestamp in ns
+ */
+static void throttle_do_leak(ThrottleState *ts, int64_t now)
+{
+ /* compute the time elapsed since the last leak */
+ int64_t delta_ns = now - ts->previous_leak;
+ int i;
+
+ ts->previous_leak = now;
+
+ if (delta_ns <= 0) {
+ return;
+ }
+
+ /* make each bucket leak */
+ for (i = 0; i < BUCKETS_COUNT; i++) {
+ throttle_leak_bucket(&ts->cfg.buckets[i], delta_ns);
+ }
+}
+
+/* do the real job of computing the time to wait
+ *
+ * @limit: the throttling limit
+ * @extra: the number of operations to delay
+ * @ret: the time to wait in ns
+ */
+static int64_t throttle_do_compute_wait(double limit, double extra)
+{
+ double wait = extra * NANOSECONDS_PER_SECOND;
+ wait /= limit;
+ return wait;
+}
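+
+/*
+ * Worked example (informal): with @limit = 100 units/s and @extra = 5
+ * pending units, the wait is 5 * NANOSECONDS_PER_SECOND / 100, i.e.
+ * 50000000 ns or 50 ms.
+ */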
+
+/* This function computes the wait time in ns that a leaky bucket should trigger
+ *
+ * @bkt: the leaky bucket we operate on
+ * @ret: the resulting wait time in ns or 0 if the operation can go through
+ */
+int64_t throttle_compute_wait(LeakyBucket *bkt)
+{
+ double extra; /* the number of extra units blocking the io */
+ double bucket_size; /* I/O before throttling to bkt->avg */
+ double burst_bucket_size; /* Before throttling to bkt->max */
+
+ if (!bkt->avg) {
+ return 0;
+ }
+
+ if (!bkt->max) {
+ /* If bkt->max is 0 we still want to allow short bursts of I/O
+ * from the guest, otherwise every other request will be throttled
+ * and performance will suffer considerably. */
+ bucket_size = (double) bkt->avg / 10;
+ burst_bucket_size = 0;
+ } else {
+ /* If we have a burst limit then we have to wait until all I/O
+ * at burst rate has finished before throttling to bkt->avg */
+ bucket_size = bkt->max * bkt->burst_length;
+ burst_bucket_size = (double) bkt->max / 10;
+ }
+
+ /* If the main bucket is full then we have to wait */
+ extra = bkt->level - bucket_size;
+ if (extra > 0) {
+ return throttle_do_compute_wait(bkt->avg, extra);
+ }
+
+ /* If the main bucket is not full yet we still have to check the
+ * burst bucket in order to enforce the burst limit */
+ if (bkt->burst_length > 1) {
+ assert(bkt->max > 0); /* see throttle_is_valid() */
+ extra = bkt->burst_level - burst_bucket_size;
+ if (extra > 0) {
+ return throttle_do_compute_wait(bkt->max, extra);
+ }
+ }
+
+ return 0;
+}
+
+/* This function computes the time that must be waited for this I/O
+ *
+ * @is_write: true if the current IO is a write, false if it's a read
+ * @ret: time to wait
+ */
+static int64_t throttle_compute_wait_for(ThrottleState *ts,
+ bool is_write)
+{
+ BucketType to_check[2][4] = { {THROTTLE_BPS_TOTAL,
+ THROTTLE_OPS_TOTAL,
+ THROTTLE_BPS_READ,
+ THROTTLE_OPS_READ},
+ {THROTTLE_BPS_TOTAL,
+ THROTTLE_OPS_TOTAL,
+ THROTTLE_BPS_WRITE,
+ THROTTLE_OPS_WRITE}, };
+ int64_t wait, max_wait = 0;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ BucketType index = to_check[is_write][i];
+ wait = throttle_compute_wait(&ts->cfg.buckets[index]);
+ if (wait > max_wait) {
+ max_wait = wait;
+ }
+ }
+
+ return max_wait;
+}
+
+/* compute the timer for this type of operation
+ *
+ * @is_write: the type of operation
+ * @now: the current clock timestamp
+ * @next_timestamp: the resulting timer
+ * @ret: true if a timer must be set
+ */
+static bool throttle_compute_timer(ThrottleState *ts,
+ bool is_write,
+ int64_t now,
+ int64_t *next_timestamp)
+{
+ int64_t wait;
+
+ /* leak proportionally to the time elapsed */
+ throttle_do_leak(ts, now);
+
+ /* compute the wait time if any */
+ wait = throttle_compute_wait_for(ts, is_write);
+
+ /* if the code must wait compute when the next timer should fire */
+ if (wait) {
+ *next_timestamp = now + wait;
+ return true;
+ }
+
+ /* else no need to wait at all */
+ *next_timestamp = now;
+ return false;
+}
+
+/* Add timers to event loop */
+void throttle_timers_attach_aio_context(ThrottleTimers *tt,
+ AioContext *new_context)
+{
+ tt->timers[0] = aio_timer_new(new_context, tt->clock_type, SCALE_NS,
+ tt->read_timer_cb, tt->timer_opaque);
+ tt->timers[1] = aio_timer_new(new_context, tt->clock_type, SCALE_NS,
+ tt->write_timer_cb, tt->timer_opaque);
+}
+
+/*
+ * Initialize the ThrottleConfig structure to a valid state
+ * @cfg: the config to initialize
+ */
+void throttle_config_init(ThrottleConfig *cfg)
+{
+ unsigned i;
+ memset(cfg, 0, sizeof(*cfg));
+ for (i = 0; i < BUCKETS_COUNT; i++) {
+ cfg->buckets[i].burst_length = 1;
+ }
+}
+
+/* To be called first on the ThrottleState */
+void throttle_init(ThrottleState *ts)
+{
+ memset(ts, 0, sizeof(ThrottleState));
+ throttle_config_init(&ts->cfg);
+}
+
+/* To be called first on the ThrottleTimers */
+void throttle_timers_init(ThrottleTimers *tt,
+ AioContext *aio_context,
+ QEMUClockType clock_type,
+ QEMUTimerCB *read_timer_cb,
+ QEMUTimerCB *write_timer_cb,
+ void *timer_opaque)
+{
+ memset(tt, 0, sizeof(ThrottleTimers));
+
+ tt->clock_type = clock_type;
+ tt->read_timer_cb = read_timer_cb;
+ tt->write_timer_cb = write_timer_cb;
+ tt->timer_opaque = timer_opaque;
+ throttle_timers_attach_aio_context(tt, aio_context);
+}
+
+/* destroy a timer */
+static void throttle_timer_destroy(QEMUTimer **timer)
+{
+ assert(*timer != NULL);
+
+ timer_free(*timer);
+ *timer = NULL;
+}
+
+/* Remove timers from event loop */
+void throttle_timers_detach_aio_context(ThrottleTimers *tt)
+{
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ throttle_timer_destroy(&tt->timers[i]);
+ }
+}
+
+/* To be called last on the ThrottleTimers */
+void throttle_timers_destroy(ThrottleTimers *tt)
+{
+ throttle_timers_detach_aio_context(tt);
+}
+
+/* is any throttling timer configured */
+bool throttle_timers_are_initialized(ThrottleTimers *tt)
+{
+ if (tt->timers[0]) {
+ return true;
+ }
+
+ return false;
+}
+
+/* Check whether any throttling must be done
+ *
+ * @cfg: the throttling configuration to inspect
+ * @ret: true if throttling must be done else false
+ */
+bool throttle_enabled(ThrottleConfig *cfg)
+{
+ int i;
+
+ for (i = 0; i < BUCKETS_COUNT; i++) {
+ if (cfg->buckets[i].avg > 0) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/* check if a throttling configuration is valid
+ * @cfg: the throttling configuration to inspect
+ * @ret: true if valid else false
+ * @errp: error object
+ */
+bool throttle_is_valid(ThrottleConfig *cfg, Error **errp)
+{
+ int i;
+ bool bps_flag, ops_flag;
+ bool bps_max_flag, ops_max_flag;
+
+ bps_flag = cfg->buckets[THROTTLE_BPS_TOTAL].avg &&
+ (cfg->buckets[THROTTLE_BPS_READ].avg ||
+ cfg->buckets[THROTTLE_BPS_WRITE].avg);
+
+ ops_flag = cfg->buckets[THROTTLE_OPS_TOTAL].avg &&
+ (cfg->buckets[THROTTLE_OPS_READ].avg ||
+ cfg->buckets[THROTTLE_OPS_WRITE].avg);
+
+ bps_max_flag = cfg->buckets[THROTTLE_BPS_TOTAL].max &&
+ (cfg->buckets[THROTTLE_BPS_READ].max ||
+ cfg->buckets[THROTTLE_BPS_WRITE].max);
+
+ ops_max_flag = cfg->buckets[THROTTLE_OPS_TOTAL].max &&
+ (cfg->buckets[THROTTLE_OPS_READ].max ||
+ cfg->buckets[THROTTLE_OPS_WRITE].max);
+
+ if (bps_flag || ops_flag || bps_max_flag || ops_max_flag) {
+ error_setg(errp, "bps/iops/max total values and read/write values"
+ " cannot be used at the same time");
+ return false;
+ }
+
+ if (cfg->op_size &&
+ !cfg->buckets[THROTTLE_OPS_TOTAL].avg &&
+ !cfg->buckets[THROTTLE_OPS_READ].avg &&
+ !cfg->buckets[THROTTLE_OPS_WRITE].avg) {
+ error_setg(errp, "iops size requires an iops value to be set");
+ return false;
+ }
+
+ for (i = 0; i < BUCKETS_COUNT; i++) {
+ LeakyBucket *bkt = &cfg->buckets[i];
+ if (bkt->avg > THROTTLE_VALUE_MAX || bkt->max > THROTTLE_VALUE_MAX) {
+ error_setg(errp, "bps/iops/max values must be within [0, %lld]",
+ THROTTLE_VALUE_MAX);
+ return false;
+ }
+
+ if (!bkt->burst_length) {
+ error_setg(errp, "the burst length cannot be 0");
+ return false;
+ }
+
+ if (bkt->burst_length > 1 && !bkt->max) {
+ error_setg(errp, "burst length set without burst rate");
+ return false;
+ }
+
+ if (bkt->max && bkt->burst_length > THROTTLE_VALUE_MAX / bkt->max) {
+ error_setg(errp, "burst length too high for this burst rate");
+ return false;
+ }
+
+ if (bkt->max && !bkt->avg) {
+ error_setg(errp, "bps_max/iops_max require corresponding"
+ " bps/iops values");
+ return false;
+ }
+
+ if (bkt->max && bkt->max < bkt->avg) {
+ error_setg(errp, "bps_max/iops_max cannot be lower than bps/iops");
+ return false;
+ }
+ }
+
+ return true;
+}
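+
+/*
+ * Example (editor's addition, not part of the original patch): the
+ * checks above reject mixing a total rate with a per-direction rate,
+ * so a sketch like the following fails validation:
+ *
+ *     ThrottleConfig cfg;
+ *     Error *err = NULL;
+ *
+ *     throttle_config_init(&cfg);
+ *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1000;
+ *     cfg.buckets[THROTTLE_BPS_READ].avg = 500;
+ *     assert(!throttle_is_valid(&cfg, &err));  // total + read rejected
+ *     error_free(err);
+ */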
+
+/* Used to configure the throttle
+ *
+ * @ts: the throttle state we are working on
+ * @clock_type: the group's clock_type
+ * @cfg: the config to set
+ */
+void throttle_config(ThrottleState *ts,
+ QEMUClockType clock_type,
+ ThrottleConfig *cfg)
+{
+ int i;
+
+ ts->cfg = *cfg;
+
+ /* Zero bucket level */
+ for (i = 0; i < BUCKETS_COUNT; i++) {
+ ts->cfg.buckets[i].level = 0;
+ ts->cfg.buckets[i].burst_level = 0;
+ }
+
+ ts->previous_leak = qemu_clock_get_ns(clock_type);
+}
+
+/* used to get config
+ *
+ * @ts: the throttle state we are working on
+ * @cfg: the config to write
+ */
+void throttle_get_config(ThrottleState *ts, ThrottleConfig *cfg)
+{
+ *cfg = ts->cfg;
+}
+
+
+/* Schedule the read or write timer if needed
+ *
+ * NOTE: this function is not unit tested due to its usage of timer_mod
+ *
+ * @ts: the throttle state
+ * @tt: the timers structure
+ * @is_write: the type of operation (read/write)
+ * @ret: true if the timer has been scheduled else false
+ */
+bool throttle_schedule_timer(ThrottleState *ts,
+ ThrottleTimers *tt,
+ bool is_write)
+{
+ int64_t now = qemu_clock_get_ns(tt->clock_type);
+ int64_t next_timestamp;
+ bool must_wait;
+
+ must_wait = throttle_compute_timer(ts,
+ is_write,
+ now,
+ &next_timestamp);
+
+ /* request not throttled */
+ if (!must_wait) {
+ return false;
+ }
+
+ /* request throttled and timer pending -> do nothing */
+ if (timer_pending(tt->timers[is_write])) {
+ return true;
+ }
+
+ /* request throttled and timer not pending -> arm timer */
+ timer_mod(tt->timers[is_write], next_timestamp);
+ return true;
+}
+
+/* do the accounting for this operation
+ *
+ * @ts: the throttle state
+ * @is_write: the type of operation (read/write)
+ * @size: the size of the operation
+ */
+void throttle_account(ThrottleState *ts, bool is_write, uint64_t size)
+{
+ const BucketType bucket_types_size[2][2] = {
+ { THROTTLE_BPS_TOTAL, THROTTLE_BPS_READ },
+ { THROTTLE_BPS_TOTAL, THROTTLE_BPS_WRITE }
+ };
+ const BucketType bucket_types_units[2][2] = {
+ { THROTTLE_OPS_TOTAL, THROTTLE_OPS_READ },
+ { THROTTLE_OPS_TOTAL, THROTTLE_OPS_WRITE }
+ };
+ double units = 1.0;
+ unsigned i;
+
+    /* if cfg.op_size is defined and smaller than size, compute the unit count */
+ if (ts->cfg.op_size && size > ts->cfg.op_size) {
+ units = (double) size / ts->cfg.op_size;
+ }
+
+ for (i = 0; i < 2; i++) {
+ LeakyBucket *bkt;
+
+ bkt = &ts->cfg.buckets[bucket_types_size[is_write][i]];
+ bkt->level += size;
+ if (bkt->burst_length > 1) {
+ bkt->burst_level += size;
+ }
+
+ bkt = &ts->cfg.buckets[bucket_types_units[is_write][i]];
+ bkt->level += units;
+ if (bkt->burst_length > 1) {
+ bkt->burst_level += units;
+ }
+ }
+}
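+
+/*
+ * Usage sketch (editor's addition, not part of the original patch):
+ * a typical throttled-I/O flow ties the pieces above together.  The
+ * AioContext `ctx`, callbacks `my_read_cb`/`my_write_cb`, opaque
+ * pointer `obj` and byte count `nbytes` are hypothetical:
+ *
+ *     ThrottleState ts;
+ *     ThrottleTimers tt;
+ *     ThrottleConfig cfg;
+ *     Error *err = NULL;
+ *
+ *     throttle_init(&ts);
+ *     throttle_timers_init(&tt, ctx, QEMU_CLOCK_REALTIME,
+ *                          my_read_cb, my_write_cb, obj);
+ *
+ *     throttle_config_init(&cfg);
+ *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024; // 10 MiB/s
+ *     if (throttle_is_valid(&cfg, &err)) {
+ *         throttle_config(&ts, QEMU_CLOCK_REALTIME, &cfg);
+ *     }
+ *
+ *     // Per request: account the bytes, then arm a timer if throttled.
+ *     throttle_account(&ts, false, nbytes);
+ *     if (throttle_schedule_timer(&ts, &tt, false)) {
+ *         // Throttled: wait for the read timer callback to fire
+ *         // before resuming request processing.
+ *     }
+ */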
+
+/* fill in a ThrottleConfig from the options in a ThrottleLimits
+ *
+ * @arg: the ThrottleLimits object to read from
+ * @cfg: the ThrottleConfig to edit
+ * @errp: error object
+ */
+void throttle_limits_to_config(ThrottleLimits *arg, ThrottleConfig *cfg,
+ Error **errp)
+{
+ if (arg->has_bps_total) {
+ cfg->buckets[THROTTLE_BPS_TOTAL].avg = arg->bps_total;
+ }
+ if (arg->has_bps_read) {
+ cfg->buckets[THROTTLE_BPS_READ].avg = arg->bps_read;
+ }
+ if (arg->has_bps_write) {
+ cfg->buckets[THROTTLE_BPS_WRITE].avg = arg->bps_write;
+ }
+
+ if (arg->has_iops_total) {
+ cfg->buckets[THROTTLE_OPS_TOTAL].avg = arg->iops_total;
+ }
+ if (arg->has_iops_read) {
+ cfg->buckets[THROTTLE_OPS_READ].avg = arg->iops_read;
+ }
+ if (arg->has_iops_write) {
+ cfg->buckets[THROTTLE_OPS_WRITE].avg = arg->iops_write;
+ }
+
+ if (arg->has_bps_total_max) {
+ cfg->buckets[THROTTLE_BPS_TOTAL].max = arg->bps_total_max;
+ }
+ if (arg->has_bps_read_max) {
+ cfg->buckets[THROTTLE_BPS_READ].max = arg->bps_read_max;
+ }
+ if (arg->has_bps_write_max) {
+ cfg->buckets[THROTTLE_BPS_WRITE].max = arg->bps_write_max;
+ }
+ if (arg->has_iops_total_max) {
+ cfg->buckets[THROTTLE_OPS_TOTAL].max = arg->iops_total_max;
+ }
+ if (arg->has_iops_read_max) {
+ cfg->buckets[THROTTLE_OPS_READ].max = arg->iops_read_max;
+ }
+ if (arg->has_iops_write_max) {
+ cfg->buckets[THROTTLE_OPS_WRITE].max = arg->iops_write_max;
+ }
+
+ if (arg->has_bps_total_max_length) {
+ if (arg->bps_total_max_length > UINT_MAX) {
+ error_setg(errp, "bps-total-max-length value must be in"
+ " the range [0, %u]", UINT_MAX);
+ return;
+ }
+ cfg->buckets[THROTTLE_BPS_TOTAL].burst_length = arg->bps_total_max_length;
+ }
+ if (arg->has_bps_read_max_length) {
+ if (arg->bps_read_max_length > UINT_MAX) {
+ error_setg(errp, "bps-read-max-length value must be in"
+ " the range [0, %u]", UINT_MAX);
+ return;
+ }
+ cfg->buckets[THROTTLE_BPS_READ].burst_length = arg->bps_read_max_length;
+ }
+ if (arg->has_bps_write_max_length) {
+ if (arg->bps_write_max_length > UINT_MAX) {
+ error_setg(errp, "bps-write-max-length value must be in"
+ " the range [0, %u]", UINT_MAX);
+ return;
+ }
+ cfg->buckets[THROTTLE_BPS_WRITE].burst_length = arg->bps_write_max_length;
+ }
+ if (arg->has_iops_total_max_length) {
+ if (arg->iops_total_max_length > UINT_MAX) {
+ error_setg(errp, "iops-total-max-length value must be in"
+ " the range [0, %u]", UINT_MAX);
+ return;
+ }
+ cfg->buckets[THROTTLE_OPS_TOTAL].burst_length = arg->iops_total_max_length;
+ }
+ if (arg->has_iops_read_max_length) {
+ if (arg->iops_read_max_length > UINT_MAX) {
+ error_setg(errp, "iops-read-max-length value must be in"
+ " the range [0, %u]", UINT_MAX);
+ return;
+ }
+ cfg->buckets[THROTTLE_OPS_READ].burst_length = arg->iops_read_max_length;
+ }
+ if (arg->has_iops_write_max_length) {
+ if (arg->iops_write_max_length > UINT_MAX) {
+ error_setg(errp, "iops-write-max-length value must be in"
+ " the range [0, %u]", UINT_MAX);
+ return;
+ }
+ cfg->buckets[THROTTLE_OPS_WRITE].burst_length = arg->iops_write_max_length;
+ }
+
+ if (arg->has_iops_size) {
+ cfg->op_size = arg->iops_size;
+ }
+
+ throttle_is_valid(cfg, errp);
+}
+
+/* write the options of a ThrottleConfig to a ThrottleLimits
+ *
+ * @cfg: the ThrottleConfig to read from
+ * @var: the ThrottleLimits to write to
+ */
+void throttle_config_to_limits(ThrottleConfig *cfg, ThrottleLimits *var)
+{
+ var->bps_total = cfg->buckets[THROTTLE_BPS_TOTAL].avg;
+ var->bps_read = cfg->buckets[THROTTLE_BPS_READ].avg;
+ var->bps_write = cfg->buckets[THROTTLE_BPS_WRITE].avg;
+ var->iops_total = cfg->buckets[THROTTLE_OPS_TOTAL].avg;
+ var->iops_read = cfg->buckets[THROTTLE_OPS_READ].avg;
+ var->iops_write = cfg->buckets[THROTTLE_OPS_WRITE].avg;
+ var->bps_total_max = cfg->buckets[THROTTLE_BPS_TOTAL].max;
+ var->bps_read_max = cfg->buckets[THROTTLE_BPS_READ].max;
+ var->bps_write_max = cfg->buckets[THROTTLE_BPS_WRITE].max;
+ var->iops_total_max = cfg->buckets[THROTTLE_OPS_TOTAL].max;
+ var->iops_read_max = cfg->buckets[THROTTLE_OPS_READ].max;
+ var->iops_write_max = cfg->buckets[THROTTLE_OPS_WRITE].max;
+ var->bps_total_max_length = cfg->buckets[THROTTLE_BPS_TOTAL].burst_length;
+ var->bps_read_max_length = cfg->buckets[THROTTLE_BPS_READ].burst_length;
+ var->bps_write_max_length = cfg->buckets[THROTTLE_BPS_WRITE].burst_length;
+ var->iops_total_max_length = cfg->buckets[THROTTLE_OPS_TOTAL].burst_length;
+ var->iops_read_max_length = cfg->buckets[THROTTLE_OPS_READ].burst_length;
+ var->iops_write_max_length = cfg->buckets[THROTTLE_OPS_WRITE].burst_length;
+ var->iops_size = cfg->op_size;
+
+ var->has_bps_total = true;
+ var->has_bps_read = true;
+ var->has_bps_write = true;
+ var->has_iops_total = true;
+ var->has_iops_read = true;
+ var->has_iops_write = true;
+ var->has_bps_total_max = true;
+ var->has_bps_read_max = true;
+ var->has_bps_write_max = true;
+ var->has_iops_total_max = true;
+ var->has_iops_read_max = true;
+ var->has_iops_write_max = true;
+ var->has_bps_read_max_length = true;
+ var->has_bps_total_max_length = true;
+ var->has_bps_write_max_length = true;
+ var->has_iops_total_max_length = true;
+ var->has_iops_read_max_length = true;
+ var->has_iops_write_max_length = true;
+ var->has_iops_size = true;
+}
diff --git a/util/timed-average.c b/util/timed-average.c
new file mode 100644
index 000000000..2b49d532c
--- /dev/null
+++ b/util/timed-average.c
@@ -0,0 +1,231 @@
+/*
+ * QEMU timed average computation
+ *
+ * Copyright (C) Nodalink, EURL. 2014
+ * Copyright (C) Igalia, S.L. 2015
+ *
+ * Authors:
+ * Benoît Canet <benoit.canet@nodalink.com>
+ * Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) version 3 or any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/timed-average.h"
+
+/* This module computes an average of a set of values within a time
+ * window.
+ *
+ * Algorithm:
+ *
+ * - Create two windows with a certain expiration period, and
+ *   offset by period / 2.
+ * - Each time you want to account a new value, do it in both windows.
+ * - The minimum / maximum / average values are always returned from
+ * the oldest window.
+ *
+ * Example:
+ *
+ * t=0 |t=0.5 |t=1 |t=1.5 |t=2
+ * wnd0: [0,0.5)|wnd0: [0.5,1.5) | |wnd0: [1.5,2.5) |
+ * wnd1: [0,1) | |wnd1: [1,2) | |
+ *
+ * Values are returned from:
+ *
+ * wnd0---------|wnd1------------|wnd0---------|wnd1-------------|
+ */
+
+/* Update the expiration of a time window
+ *
+ * @w: the window used
+ * @now: the current time in nanoseconds
+ * @period: the expiration period in nanoseconds
+ */
+static void update_expiration(TimedAverageWindow *w, int64_t now,
+ int64_t period)
+{
+ /* time elapsed since the last theoretical expiration */
+ int64_t elapsed = (now - w->expiration) % period;
+    /* time remaining until the next expiration */
+ int64_t remaining = period - elapsed;
+ /* compute expiration */
+ w->expiration = now + remaining;
+}
+
+/* Reset a window
+ *
+ * @w: the window to reset
+ */
+static void window_reset(TimedAverageWindow *w)
+{
+ w->min = UINT64_MAX;
+ w->max = 0;
+ w->sum = 0;
+ w->count = 0;
+}
+
+/* Get the current window (that is, the one with the earliest
+ * expiration time).
+ *
+ * @ta: the TimedAverage structure
+ * @ret: a pointer to the current window
+ */
+static TimedAverageWindow *current_window(TimedAverage *ta)
+{
+ return &ta->windows[ta->current];
+}
+
+/* Initialize a TimedAverage structure
+ *
+ * @ta: the TimedAverage structure
+ * @clock_type: the type of clock to use
+ * @period: the time window period in nanoseconds
+ */
+void timed_average_init(TimedAverage *ta, QEMUClockType clock_type,
+ uint64_t period)
+{
+ int64_t now = qemu_clock_get_ns(clock_type);
+
+    /* Returned values are from the oldest window, so they belong to
+     * the interval [ta->period/2, ta->period). By scaling the
+     * requested period by 4/3, we guarantee that they're in the
+     * interval [2/3 period, 4/3 period), closer to the requested
+     * period on average */
+ ta->period = (uint64_t) period * 4 / 3;
+ ta->clock_type = clock_type;
+ ta->current = 0;
+
+ window_reset(&ta->windows[0]);
+ window_reset(&ta->windows[1]);
+
+    /* The two windows' expirations are offset by half a period */
+ ta->windows[0].expiration = now + ta->period / 2;
+ ta->windows[1].expiration = now + ta->period;
+}
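+
+/*
+ * Worked example (editor's addition): for a requested period of 900 ms,
+ * ta->period becomes 1200 ms and the two windows expire 600 ms apart,
+ * so the window that answers queries always covers between 600 ms and
+ * 1200 ms of history, i.e. 900 ms on average, as requested.
+ */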
+
+/* Check if the time windows have expired, updating their counters and
+ * expiration time if that's the case.
+ *
+ * @ta: the TimedAverage structure
+ * @elapsed: if non-NULL, the elapsed time (in ns) within the current
+ * window will be stored here
+ */
+static void check_expirations(TimedAverage *ta, uint64_t *elapsed)
+{
+ int64_t now = qemu_clock_get_ns(ta->clock_type);
+ int i;
+
+ assert(ta->period != 0);
+
+ /* Check if the windows have expired */
+ for (i = 0; i < 2; i++) {
+ TimedAverageWindow *w = &ta->windows[i];
+ if (w->expiration <= now) {
+ window_reset(w);
+ update_expiration(w, now, ta->period);
+ }
+ }
+
+ /* Make ta->current point to the oldest window */
+ if (ta->windows[0].expiration < ta->windows[1].expiration) {
+ ta->current = 0;
+ } else {
+ ta->current = 1;
+ }
+
+ /* Calculate the elapsed time within the current window */
+ if (elapsed) {
+ int64_t remaining = ta->windows[ta->current].expiration - now;
+ *elapsed = ta->period - remaining;
+ }
+}
+
+/* Account a value
+ *
+ * @ta: the TimedAverage structure
+ * @value: the value to account
+ */
+void timed_average_account(TimedAverage *ta, uint64_t value)
+{
+ int i;
+ check_expirations(ta, NULL);
+
+ /* Do the accounting in both windows at the same time */
+ for (i = 0; i < 2; i++) {
+ TimedAverageWindow *w = &ta->windows[i];
+
+ w->sum += value;
+ w->count++;
+
+ if (value < w->min) {
+ w->min = value;
+ }
+
+ if (value > w->max) {
+ w->max = value;
+ }
+ }
+}
+
+/* Get the minimum value
+ *
+ * @ta: the TimedAverage structure
+ * @ret: the minimum value
+ */
+uint64_t timed_average_min(TimedAverage *ta)
+{
+ TimedAverageWindow *w;
+ check_expirations(ta, NULL);
+ w = current_window(ta);
+ return w->min < UINT64_MAX ? w->min : 0;
+}
+
+/* Get the average value
+ *
+ * @ta: the TimedAverage structure
+ * @ret: the average value
+ */
+uint64_t timed_average_avg(TimedAverage *ta)
+{
+ TimedAverageWindow *w;
+ check_expirations(ta, NULL);
+ w = current_window(ta);
+ return w->count > 0 ? w->sum / w->count : 0;
+}
+
+/* Get the maximum value
+ *
+ * @ta: the TimedAverage structure
+ * @ret: the maximum value
+ */
+uint64_t timed_average_max(TimedAverage *ta)
+{
+ check_expirations(ta, NULL);
+ return current_window(ta)->max;
+}
+
+/* Get the sum of all accounted values
+ * @ta: the TimedAverage structure
+ * @elapsed: if non-NULL, the elapsed time (in ns) will be stored here
+ * @ret: the sum of all accounted values
+ */
+uint64_t timed_average_sum(TimedAverage *ta, uint64_t *elapsed)
+{
+ TimedAverageWindow *w;
+ check_expirations(ta, elapsed);
+ w = current_window(ta);
+ return w->sum;
+}
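+
+/*
+ * Usage sketch (editor's addition, not part of the original patch):
+ * tracking request latency over a one-second window, where
+ * `latency_ns` is a hypothetical per-request sample:
+ *
+ *     TimedAverage ta;
+ *
+ *     timed_average_init(&ta, QEMU_CLOCK_REALTIME, 1000000000); // 1 s
+ *
+ *     timed_average_account(&ta, latency_ns);  // once per request
+ *
+ *     uint64_t min = timed_average_min(&ta);
+ *     uint64_t avg = timed_average_avg(&ta);
+ *     uint64_t max = timed_average_max(&ta);
+ */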
diff --git a/util/trace-events b/util/trace-events
new file mode 100644
index 000000000..c8f53d7d9
--- /dev/null
+++ b/util/trace-events
@@ -0,0 +1,106 @@
+# See docs/devel/tracing.rst for syntax documentation.
+
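+# Editor's note, assuming QEMU's usual tracetool code generation (this
+# file is its input): each declaration below defines one trace point,
+# callable from C after code generation as trace_<name>(); e.g.
+# socket_listen(int num) is invoked as trace_socket_listen(backlog) and
+# rendered with its printf-style format string.
+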
+# aio-posix.c
+run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_ns %"PRId64 " timeout %"PRId64
+run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64
+poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
+poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
+poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x"
+poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d"
+
+# async.c
+aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
+aio_co_schedule_bh_cb(void *ctx, void *co) "ctx %p co %p"
+
+# thread-pool.c
+thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
+thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
+thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
+
+# buffer.c
+buffer_resize(const char *buf, size_t olen, size_t len) "%s: old %zd, new %zd"
+buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s"
+buffer_move(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s"
+buffer_free(const char *buf, size_t len) "%s: capacity %zd"
+
+# filemonitor-inotify.c
+qemu_file_monitor_add_watch(void *mon, const char *dirpath, const char *filename, void *cb, void *opaque, int64_t id) "File monitor %p add watch dir='%s' file='%s' cb=%p opaque=%p id=%" PRId64
+qemu_file_monitor_remove_watch(void *mon, const char *dirpath, int64_t id) "File monitor %p remove watch dir='%s' id=%" PRId64
+qemu_file_monitor_new(void *mon, int fd) "File monitor %p created fd=%d"
+qemu_file_monitor_enable_watch(void *mon, const char *dirpath, int id) "File monitor %p enable watch dir='%s' id=%u"
+qemu_file_monitor_disable_watch(void *mon, const char *dirpath, int id) "File monitor %p disable watch dir='%s' id=%u"
+qemu_file_monitor_event(void *mon, const char *dirpath, const char *filename, int mask, unsigned int id) "File monitor %p event dir='%s' file='%s' mask=0x%x id=%u"
+qemu_file_monitor_dispatch(void *mon, const char *dirpath, const char *filename, int ev, void *cb, void *opaque, int64_t id) "File monitor %p dispatch dir='%s' file='%s' ev=%d cb=%p opaque=%p id=%" PRId64
+
+# qemu-coroutine.c
+qemu_aio_coroutine_enter(void *ctx, void *from, void *to, void *opaque) "ctx %p from %p to %p opaque %p"
+qemu_coroutine_yield(void *from, void *to) "from %p to %p"
+qemu_coroutine_terminate(void *co) "self %p"
+
+# qemu-coroutine-lock.c
+qemu_co_mutex_lock_uncontended(void *mutex, void *self) "mutex %p self %p"
+qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
+qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
+qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
+qemu_co_mutex_unlock_return(void *mutex, void *self) "mutex %p self %p"
+
+# oslib-posix.c
+# oslib-win32.c
+qemu_memalign(size_t alignment, size_t size, void *ptr) "alignment %zu size %zu ptr %p"
+qemu_anon_ram_alloc(size_t size, void *ptr) "size %zu ptr %p"
+qemu_vfree(void *ptr) "ptr %p"
+qemu_anon_ram_free(void *ptr, size_t size) "ptr %p size %zu"
+
+# hbitmap.c
+hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx"
+hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64
+hbitmap_set(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64
+
+# lockcnt.c
+lockcnt_fast_path_attempt(const void *lockcnt, int expected, int new) "lockcnt %p fast path %d->%d"
+lockcnt_fast_path_success(const void *lockcnt, int expected, int new) "lockcnt %p fast path %d->%d succeeded"
+lockcnt_unlock_attempt(const void *lockcnt, int expected, int new) "lockcnt %p unlock %d->%d"
+lockcnt_unlock_success(const void *lockcnt, int expected, int new) "lockcnt %p unlock %d->%d succeeded"
+lockcnt_futex_wait_prepare(const void *lockcnt, int expected, int new) "lockcnt %p preparing slow path %d->%d"
+lockcnt_futex_wait(const void *lockcnt, int val) "lockcnt %p waiting on %d"
+lockcnt_futex_wait_resume(const void *lockcnt, int new) "lockcnt %p after wait: %d"
+lockcnt_futex_wake(const void *lockcnt) "lockcnt %p waking up one waiter"
+
+# qemu-sockets.c
+socket_listen(int num) "backlog: %d"
+
+# qemu-thread-common.h
+# qemu-thread-posix.c
+# qemu-thread-win32.c
+qemu_mutex_lock(void *mutex, const char *file, const int line) "waiting on mutex %p (%s:%d)"
+qemu_mutex_locked(void *mutex, const char *file, const int line) "taken mutex %p (%s:%d)"
+qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex %p (%s:%d)"
+
+# vfio-helpers.c
+qemu_vfio_dma_reset_temporary(void *s) "s %p"
+qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+qemu_vfio_dump_mapping(void *host, uint64_t iova, size_t size) "vfio mapping %p to iova 0x%08" PRIx64 " size 0x%zx"
+qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
+qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64
+qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx"
+qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d &iova %p"
+qemu_vfio_dma_mapped(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64" size 0x%zx"
+qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
+qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32
+qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
+
+# userfaultfd.c
+uffd_query_features_nosys(int err) "errno: %i"
+uffd_query_features_api_failed(int err) "errno: %i"
+uffd_create_fd_nosys(int err) "errno: %i"
+uffd_create_fd_api_failed(int err) "errno: %i"
+uffd_create_fd_api_noioctl(uint64_t ioctl_req, uint64_t ioctl_supp) "ioctl_req: 0x%" PRIx64 " ioctl_supp: 0x%" PRIx64
+uffd_register_memory_failed(void *addr, uint64_t length, uint64_t mode, int err) "addr: %p length: %" PRIu64 " mode: 0x%" PRIx64 " errno: %i"
+uffd_unregister_memory_failed(void *addr, uint64_t length, int err) "addr: %p length: %" PRIu64 " errno: %i"
+
+# module.c
+module_load_module(const char *name) "file %s"
+module_lookup_object_type(const char *name) "name %s"
diff --git a/util/trace.h b/util/trace.h
new file mode 100644
index 000000000..86ff7a390
--- /dev/null
+++ b/util/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-util.h"
diff --git a/util/transactions.c b/util/transactions.c
new file mode 100644
index 000000000..2dbdedce9
--- /dev/null
+++ b/util/transactions.c
@@ -0,0 +1,100 @@
+/*
+ * Simple transactions API
+ *
+ * Copyright (c) 2021 Virtuozzo International GmbH.
+ *
+ * Author:
+ * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/transactions.h"
+#include "qemu/queue.h"
+
+typedef struct TransactionAction {
+ TransactionActionDrv *drv;
+ void *opaque;
+ QSLIST_ENTRY(TransactionAction) entry;
+} TransactionAction;
+
+struct Transaction {
+ QSLIST_HEAD(, TransactionAction) actions;
+};
+
+Transaction *tran_new(void)
+{
+ Transaction *tran = g_new(Transaction, 1);
+
+ QSLIST_INIT(&tran->actions);
+
+ return tran;
+}
+
+void tran_add(Transaction *tran, TransactionActionDrv *drv, void *opaque)
+{
+ TransactionAction *act;
+
+ act = g_new(TransactionAction, 1);
+ *act = (TransactionAction) {
+ .drv = drv,
+ .opaque = opaque
+ };
+
+ QSLIST_INSERT_HEAD(&tran->actions, act, entry);
+}
+
+void tran_abort(Transaction *tran)
+{
+ TransactionAction *act, *next;
+
+ QSLIST_FOREACH(act, &tran->actions, entry) {
+ if (act->drv->abort) {
+ act->drv->abort(act->opaque);
+ }
+ }
+
+ QSLIST_FOREACH_SAFE(act, &tran->actions, entry, next) {
+ if (act->drv->clean) {
+ act->drv->clean(act->opaque);
+ }
+
+ g_free(act);
+ }
+
+ g_free(tran);
+}
+
+void tran_commit(Transaction *tran)
+{
+ TransactionAction *act, *next;
+
+ QSLIST_FOREACH(act, &tran->actions, entry) {
+ if (act->drv->commit) {
+ act->drv->commit(act->opaque);
+ }
+ }
+
+ QSLIST_FOREACH_SAFE(act, &tran->actions, entry, next) {
+ if (act->drv->clean) {
+ act->drv->clean(act->opaque);
+ }
+
+ g_free(act);
+ }
+
+ g_free(tran);
+}
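+
+/*
+ * Usage sketch (editor's addition, not part of the original patch):
+ * a caller registers an undo action while making a change, then either
+ * commits or rolls back.  All names below are hypothetical:
+ *
+ *     static void set_flag_abort(void *opaque)
+ *     {
+ *         *(bool *)opaque = false;  // undo the change
+ *     }
+ *
+ *     static TransactionActionDrv set_flag_drv = {
+ *         .abort = set_flag_abort,
+ *     };
+ *
+ *     bool flag = false;
+ *     Transaction *tran = tran_new();
+ *
+ *     flag = true;                           // perform the action
+ *     tran_add(tran, &set_flag_drv, &flag);  // register its undo
+ *
+ *     if (ok) {
+ *         tran_commit(tran);  // keep the change, run .clean handlers
+ *     } else {
+ *         tran_abort(tran);   // run .abort handlers, newest first
+ *     }
+ */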
diff --git a/util/unicode.c b/util/unicode.c
new file mode 100644
index 000000000..8580bc598
--- /dev/null
+++ b/util/unicode.c
@@ -0,0 +1,156 @@
+/*
+ * Dealing with Unicode
+ *
+ * Copyright (C) 2013 Red Hat, Inc.
+ *
+ * Authors:
+ * Markus Armbruster <armbru@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/unicode.h"
+
+static bool is_valid_codepoint(int codepoint)
+{
+ if (codepoint > 0x10FFFFu) {
+ return false; /* beyond Unicode range */
+ }
+ if ((codepoint >= 0xFDD0 && codepoint <= 0xFDEF)
+ || (codepoint & 0xFFFE) == 0xFFFE) {
+ return false; /* noncharacter */
+ }
+ if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
+ return false; /* surrogate code point */
+ }
+ return true;
+}
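+
+/*
+ * For example (editor's note): 0x41 ('A') and 0x10FFFD are valid, while
+ * 0xD800 (a surrogate), 0xFFFE and 0x10FFFF (noncharacters) and
+ * 0x110000 (beyond the Unicode range) are rejected by the checks above.
+ */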
+
+/**
+ * mod_utf8_codepoint:
+ * @s: string encoded in modified UTF-8
+ * @n: maximum number of bytes to read from @s (at most 6 are needed)
+ * @end: set to end of sequence on return
+ *
+ * Convert the modified UTF-8 sequence at the start of @s. Modified
+ * UTF-8 is exactly like UTF-8, except U+0000 is encoded as
+ * "\xC0\x80".
+ *
+ * If @n is zero or @s points to a zero byte, the sequence is invalid,
+ * and @end is set to @s.
+ *
+ * If @s points to an impossible byte (0xFE or 0xFF) or a continuation
+ * byte, the sequence is invalid, and @end is set to @s + 1.
+ *
+ * Else, the first byte determines how many continuation bytes are
+ * expected. If there are fewer, the sequence is invalid, and @end is
+ * set to @s + 1 + actual number of continuation bytes. Else, the
+ * sequence is well-formed, and @end is set to @s + 1 + expected
+ * number of continuation bytes.
+ *
+ * A well-formed sequence is valid unless it encodes a codepoint
+ * outside the Unicode range U+0000..U+10FFFF, one of Unicode's 66
+ * noncharacters, a surrogate codepoint, or is overlong.  As the one
+ * exception, the overlong sequence "\xC0\x80" is valid.
+ *
+ * Conversion succeeds if and only if the sequence is valid.
+ *
+ * Returns: the Unicode codepoint on success, -1 on failure.
+ */
+int mod_utf8_codepoint(const char *s, size_t n, char **end)
+{
+ static int min_cp[5] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
+ const unsigned char *p;
+ unsigned byte, mask, len, i;
+ int cp;
+
+ if (n == 0 || *s == 0) {
+ /* empty sequence */
+ *end = (char *)s;
+ return -1;
+ }
+
+ p = (const unsigned char *)s;
+ byte = *p++;
+ if (byte < 0x80) {
+ cp = byte; /* one byte sequence */
+ } else if (byte >= 0xFE) {
+ cp = -1; /* impossible bytes 0xFE, 0xFF */
+ } else if ((byte & 0x40) == 0) {
+ cp = -1; /* unexpected continuation byte */
+ } else {
+ /* multi-byte sequence */
+ len = 0;
+ for (mask = 0x80; byte & mask; mask >>= 1) {
+ len++;
+ }
+ assert(len > 1 && len < 7);
+ cp = byte & (mask - 1);
+ for (i = 1; i < len; i++) {
+ byte = i < n ? *p : 0;
+ if ((byte & 0xC0) != 0x80) {
+ cp = -1; /* continuation byte missing */
+ goto out;
+ }
+ p++;
+ cp <<= 6;
+ cp |= byte & 0x3F;
+ }
+ if (!is_valid_codepoint(cp)) {
+ cp = -1;
+ } else if (cp < min_cp[len - 2] && !(cp == 0 && len == 2)) {
+ cp = -1; /* overlong, not \xC0\x80 */
+ }
+ }
+
+out:
+ *end = (char *)p;
+ return cp;
+}
+
+/**
+ * mod_utf8_encode:
+ * @buf: Destination buffer
+ * @bufsz: size of @buf, at least 5.
+ * @codepoint: Unicode codepoint to encode
+ *
+ * Convert Unicode codepoint @codepoint to modified UTF-8.
+ *
+ * Returns: the length of the UTF-8 sequence on success, -1 when
+ * @codepoint is invalid.
+ */
+ssize_t mod_utf8_encode(char buf[], size_t bufsz, int codepoint)
+{
+ assert(bufsz >= 5);
+
+ if (!is_valid_codepoint(codepoint)) {
+ return -1;
+ }
+
+ if (codepoint > 0 && codepoint <= 0x7F) {
+ buf[0] = codepoint & 0x7F;
+ buf[1] = 0;
+ return 1;
+ }
+ if (codepoint <= 0x7FF) {
+ buf[0] = 0xC0 | ((codepoint >> 6) & 0x1F);
+ buf[1] = 0x80 | (codepoint & 0x3F);
+ buf[2] = 0;
+ return 2;
+ }
+ if (codepoint <= 0xFFFF) {
+ buf[0] = 0xE0 | ((codepoint >> 12) & 0x0F);
+ buf[1] = 0x80 | ((codepoint >> 6) & 0x3F);
+ buf[2] = 0x80 | (codepoint & 0x3F);
+ buf[3] = 0;
+ return 3;
+ }
+ buf[0] = 0xF0 | ((codepoint >> 18) & 0x07);
+ buf[1] = 0x80 | ((codepoint >> 12) & 0x3F);
+ buf[2] = 0x80 | ((codepoint >> 6) & 0x3F);
+ buf[3] = 0x80 | (codepoint & 0x3F);
+ buf[4] = 0;
+ return 4;
+}
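+
+/*
+ * Round-trip sketch (editor's addition, not part of the original
+ * patch), encoding U+20AC (the euro sign) and decoding it back:
+ *
+ *     char buf[5];
+ *     char *end;
+ *
+ *     ssize_t len = mod_utf8_encode(buf, sizeof(buf), 0x20AC);
+ *     assert(len == 3);  // "\xE2\x82\xAC"
+ *
+ *     int cp = mod_utf8_codepoint(buf, len, &end);
+ *     assert(cp == 0x20AC && end == buf + 3);
+ */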
diff --git a/util/uri.c b/util/uri.c
new file mode 100644
index 000000000..ff72c6005
--- /dev/null
+++ b/util/uri.c
@@ -0,0 +1,2314 @@
+/**
+ * uri.c: set of generic URI related routines
+ *
+ * Reference: RFCs 3986, 2732 and 2373
+ *
+ * Copyright (C) 1998-2003 Daniel Veillard. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * DANIEL VEILLARD BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Except as contained in this notice, the name of Daniel Veillard shall not
+ * be used in advertising or otherwise to promote the sale, use or other
+ * dealings in this Software without prior written authorization from him.
+ *
+ * daniel@veillard.com
+ *
+ **
+ *
+ * Copyright (C) 2007, 2009-2010 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Authors:
+ * Richard W.M. Jones <rjones@redhat.com>
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+
+#include "qemu/uri.h"
+
+static void uri_clean(URI *uri);
+
+/*
+ * Old rule from 2396 used in legacy handling code
+ * alpha = lowalpha | upalpha
+ */
+#define IS_ALPHA(x) (IS_LOWALPHA(x) || IS_UPALPHA(x))
+
+/*
+ * lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" |
+ * "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" |
+ * "u" | "v" | "w" | "x" | "y" | "z"
+ */
+
+#define IS_LOWALPHA(x) (((x) >= 'a') && ((x) <= 'z'))
+
+/*
+ * upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" |
+ * "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" |
+ * "U" | "V" | "W" | "X" | "Y" | "Z"
+ */
+#define IS_UPALPHA(x) (((x) >= 'A') && ((x) <= 'Z'))
+
+#ifdef IS_DIGIT
+#undef IS_DIGIT
+#endif
+/*
+ * digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
+ */
+#define IS_DIGIT(x) (((x) >= '0') && ((x) <= '9'))
+
+/*
+ * alphanum = alpha | digit
+ */
+
+#define IS_ALPHANUM(x) (IS_ALPHA(x) || IS_DIGIT(x))
+
+/*
+ * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
+ */
+
+#define IS_MARK(x) (((x) == '-') || ((x) == '_') || ((x) == '.') || \
+ ((x) == '!') || ((x) == '~') || ((x) == '*') || ((x) == '\'') || \
+ ((x) == '(') || ((x) == ')'))
+
+/*
+ * unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
+ */
+
+#define IS_UNWISE(p) \
+ (((*(p) == '{')) || ((*(p) == '}')) || ((*(p) == '|')) || \
+ ((*(p) == '\\')) || ((*(p) == '^')) || ((*(p) == '[')) || \
+ ((*(p) == ']')) || ((*(p) == '`')))
+/*
+ * reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | "," |
+ * "[" | "]"
+ */
+
+#define IS_RESERVED(x) (((x) == ';') || ((x) == '/') || ((x) == '?') || \
+ ((x) == ':') || ((x) == '@') || ((x) == '&') || ((x) == '=') || \
+ ((x) == '+') || ((x) == '$') || ((x) == ',') || ((x) == '[') || \
+ ((x) == ']'))
+
+/*
+ * unreserved = alphanum | mark
+ */
+
+#define IS_UNRESERVED(x) (IS_ALPHANUM(x) || IS_MARK(x))
+
+/*
+ * Advance the pointer to the next character, treating a
+ * percent-escaped sequence ("%XX") as a single character
+ */
+
+#define NEXT(p) ((*p == '%') ? p += 3 : p++)
+
+/*
+ * Productions from the spec.
+ *
+ * authority = server | reg_name
+ * reg_name = 1*( unreserved | escaped | "$" | "," |
+ * ";" | ":" | "@" | "&" | "=" | "+" )
+ *
+ * path = [ abs_path | opaque_part ]
+ */
+
+/************************************************************************
+ * *
+ * RFC 3986 parser *
+ * *
+ ************************************************************************/
+
+#define ISA_DIGIT(p) ((*(p) >= '0') && (*(p) <= '9'))
+#define ISA_ALPHA(p) (((*(p) >= 'a') && (*(p) <= 'z')) || \
+ ((*(p) >= 'A') && (*(p) <= 'Z')))
+#define ISA_HEXDIG(p) \
+ (ISA_DIGIT(p) || ((*(p) >= 'a') && (*(p) <= 'f')) || \
+ ((*(p) >= 'A') && (*(p) <= 'F')))
+
+/*
+ * sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
+ * / "*" / "+" / "," / ";" / "="
+ */
+#define ISA_SUB_DELIM(p) \
+ (((*(p) == '!')) || ((*(p) == '$')) || ((*(p) == '&')) || \
+ ((*(p) == '(')) || ((*(p) == ')')) || ((*(p) == '*')) || \
+ ((*(p) == '+')) || ((*(p) == ',')) || ((*(p) == ';')) || \
+ ((*(p) == '=')) || ((*(p) == '\'')))
+
+/*
+ * gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+ */
+#define ISA_GEN_DELIM(p) \
+ (((*(p) == ':')) || ((*(p) == '/')) || ((*(p) == '?')) || \
+ ((*(p) == '#')) || ((*(p) == '[')) || ((*(p) == ']')) || \
+ ((*(p) == '@')))
+
+/*
+ * reserved = gen-delims / sub-delims
+ */
+#define ISA_RESERVED(p) (ISA_GEN_DELIM(p) || (ISA_SUB_DELIM(p)))
+
+/*
+ * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
+ */
+#define ISA_UNRESERVED(p) \
+ ((ISA_ALPHA(p)) || (ISA_DIGIT(p)) || ((*(p) == '-')) || \
+ ((*(p) == '.')) || ((*(p) == '_')) || ((*(p) == '~')))
+
+/*
+ * pct-encoded = "%" HEXDIG HEXDIG
+ */
+#define ISA_PCT_ENCODED(p) \
+ ((*(p) == '%') && (ISA_HEXDIG(p + 1)) && (ISA_HEXDIG(p + 2)))
+
+/*
+ * pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
+ */
+#define ISA_PCHAR(p) \
+ (ISA_UNRESERVED(p) || ISA_PCT_ENCODED(p) || ISA_SUB_DELIM(p) || \
+ ((*(p) == ':')) || ((*(p) == '@')))
+
+/**
+ * rfc3986_parse_scheme:
+ * @uri: pointer to an URI structure
+ * @str: pointer to the string to analyze
+ *
+ * Parse a URI scheme
+ *
+ * ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_scheme(URI *uri, const char **str)
+{
+ const char *cur;
+
+ if (str == NULL) {
+ return -1;
+ }
+
+ cur = *str;
+ if (!ISA_ALPHA(cur)) {
+ return 2;
+ }
+ cur++;
+ while (ISA_ALPHA(cur) || ISA_DIGIT(cur) || (*cur == '+') || (*cur == '-') ||
+ (*cur == '.')) {
+ cur++;
+ }
+ if (uri != NULL) {
+ g_free(uri->scheme);
+ uri->scheme = g_strndup(*str, cur - *str);
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_fragment:
+ * @uri: pointer to an URI structure
+ * @str: pointer to the string to analyze
+ *
+ * Parse the fragment part of a URI
+ *
+ * fragment = *( pchar / "/" / "?" )
+ * NOTE: the strict syntax as defined by 3986 does not allow '[' and ']'
+ * in the fragment identifier but this is used very broadly for
+ * xpointer scheme selection, so we are allowing it here to not break
+ * for example all the DocBook processing chains.
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_fragment(URI *uri, const char **str)
+{
+ const char *cur;
+
+ if (str == NULL) {
+ return -1;
+ }
+
+ cur = *str;
+
+ while ((ISA_PCHAR(cur)) || (*cur == '/') || (*cur == '?') ||
+ (*cur == '[') || (*cur == ']') ||
+ ((uri != NULL) && (uri->cleanup & 1) && (IS_UNWISE(cur)))) {
+ NEXT(cur);
+ }
+ if (uri != NULL) {
+ g_free(uri->fragment);
+ if (uri->cleanup & 2) {
+ uri->fragment = g_strndup(*str, cur - *str);
+ } else {
+ uri->fragment = uri_string_unescape(*str, cur - *str, NULL);
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_query:
+ * @uri: pointer to an URI structure
+ * @str: pointer to the string to analyze
+ *
+ * Parse the query part of a URI
+ *
+ * query = *uric
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_query(URI *uri, const char **str)
+{
+ const char *cur;
+
+ if (str == NULL) {
+ return -1;
+ }
+
+ cur = *str;
+
+ while ((ISA_PCHAR(cur)) || (*cur == '/') || (*cur == '?') ||
+ ((uri != NULL) && (uri->cleanup & 1) && (IS_UNWISE(cur)))) {
+ NEXT(cur);
+ }
+ if (uri != NULL) {
+ g_free(uri->query);
+ uri->query = g_strndup(*str, cur - *str);
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_port:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a port part and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * port = *DIGIT
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_port(URI *uri, const char **str)
+{
+ const char *cur = *str;
+ int port = 0;
+
+ if (ISA_DIGIT(cur)) {
+ while (ISA_DIGIT(cur)) {
+ port = port * 10 + (*cur - '0');
+ if (port > 65535) {
+ return 1;
+ }
+ cur++;
+ }
+ if (uri) {
+ uri->port = port;
+ }
+ *str = cur;
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * rfc3986_parse_user_info:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a user information part and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_user_info(URI *uri, const char **str)
+{
+ const char *cur;
+
+ cur = *str;
+ while (ISA_UNRESERVED(cur) || ISA_PCT_ENCODED(cur) || ISA_SUB_DELIM(cur) ||
+ (*cur == ':')) {
+ NEXT(cur);
+ }
+ if (*cur == '@') {
+ if (uri != NULL) {
+ g_free(uri->user);
+ if (uri->cleanup & 2) {
+ uri->user = g_strndup(*str, cur - *str);
+ } else {
+ uri->user = uri_string_unescape(*str, cur - *str, NULL);
+ }
+ }
+ *str = cur;
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * rfc3986_parse_dec_octet:
+ * @str: the string to analyze
+ *
+ * dec-octet = DIGIT ; 0-9
+ * / %x31-39 DIGIT ; 10-99
+ * / "1" 2DIGIT ; 100-199
+ * / "2" %x30-34 DIGIT ; 200-249
+ * / "25" %x30-35 ; 250-255
+ *
+ * Skip a dec-octet.
+ *
+ * Returns 0 if found and skipped, 1 otherwise
+ */
+static int rfc3986_parse_dec_octet(const char **str)
+{
+ const char *cur = *str;
+
+ if (!(ISA_DIGIT(cur))) {
+ return 1;
+ }
+ if (!ISA_DIGIT(cur + 1)) {
+ cur++;
+ } else if ((*cur != '0') && (ISA_DIGIT(cur + 1)) && (!ISA_DIGIT(cur + 2))) {
+ cur += 2;
+ } else if ((*cur == '1') && (ISA_DIGIT(cur + 1)) && (ISA_DIGIT(cur + 2))) {
+ cur += 3;
+ } else if ((*cur == '2') && (*(cur + 1) >= '0') && (*(cur + 1) <= '4') &&
+ (ISA_DIGIT(cur + 2))) {
+ cur += 3;
+    } else if ((*cur == '2') && (*(cur + 1) == '5') && (*(cur + 2) >= '0') &&
+               (*(cur + 2) <= '5')) {
+ cur += 3;
+ } else {
+ return 1;
+ }
+ *str = cur;
+ return 0;
+}
+/**
+ * rfc3986_parse_host:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a host part and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * host = IP-literal / IPv4address / reg-name
+ * IP-literal = "[" ( IPv6address / IPvFuture ) "]"
+ * IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
+ * reg-name = *( unreserved / pct-encoded / sub-delims )
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_host(URI *uri, const char **str)
+{
+ const char *cur = *str;
+ const char *host;
+
+ host = cur;
+ /*
+     * IPv6 and future addressing schemes are enclosed in brackets
+ */
+ if (*cur == '[') {
+ cur++;
+ while ((*cur != ']') && (*cur != 0)) {
+ cur++;
+ }
+ if (*cur != ']') {
+ return 1;
+ }
+ cur++;
+ goto found;
+ }
+ /*
+ * try to parse an IPv4
+ */
+ if (ISA_DIGIT(cur)) {
+ if (rfc3986_parse_dec_octet(&cur) != 0) {
+ goto not_ipv4;
+ }
+ if (*cur != '.') {
+ goto not_ipv4;
+ }
+ cur++;
+ if (rfc3986_parse_dec_octet(&cur) != 0) {
+ goto not_ipv4;
+ }
+        if (*cur != '.') {
+            goto not_ipv4;
+        }
+        cur++;
+        if (rfc3986_parse_dec_octet(&cur) != 0) {
+            goto not_ipv4;
+        }
+        if (*cur != '.') {
+            goto not_ipv4;
+        }
+        cur++;
+ if (rfc3986_parse_dec_octet(&cur) != 0) {
+ goto not_ipv4;
+ }
+ goto found;
+ not_ipv4:
+ cur = *str;
+ }
+ /*
+ * then this should be a hostname which can be empty
+ */
+ while (ISA_UNRESERVED(cur) || ISA_PCT_ENCODED(cur) || ISA_SUB_DELIM(cur)) {
+ NEXT(cur);
+ }
+found:
+ if (uri != NULL) {
+ g_free(uri->authority);
+ uri->authority = NULL;
+ g_free(uri->server);
+ if (cur != host) {
+ if (uri->cleanup & 2) {
+ uri->server = g_strndup(host, cur - host);
+ } else {
+ uri->server = uri_string_unescape(host, cur - host, NULL);
+ }
+ } else {
+ uri->server = NULL;
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_authority:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse an authority part and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * authority = [ userinfo "@" ] host [ ":" port ]
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_authority(URI *uri, const char **str)
+{
+ const char *cur;
+ int ret;
+
+ cur = *str;
+ /*
+ * try to parse a userinfo and check for the trailing @
+ */
+ ret = rfc3986_parse_user_info(uri, &cur);
+ if ((ret != 0) || (*cur != '@')) {
+ cur = *str;
+ } else {
+ cur++;
+ }
+ ret = rfc3986_parse_host(uri, &cur);
+ if (ret != 0) {
+ return ret;
+ }
+ if (*cur == ':') {
+ cur++;
+ ret = rfc3986_parse_port(uri, &cur);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_segment:
+ * @str: the string to analyze
+ * @forbid: an optional forbidden character
+ * @empty: allow an empty segment
+ *
+ * Parse a segment, advancing the string cursor past it; this
+ * helper does not fill in any @uri fields
+ *
+ * segment = *pchar
+ * segment-nz = 1*pchar
+ * segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
+ * ; non-zero-length segment without any colon ":"
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_segment(const char **str, char forbid, int empty)
+{
+ const char *cur;
+
+ cur = *str;
+ if (!ISA_PCHAR(cur)) {
+ if (empty) {
+ return 0;
+ }
+ return 1;
+ }
+ while (ISA_PCHAR(cur) && (*cur != forbid)) {
+ NEXT(cur);
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_path_ab_empty:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a path-abempty (absolute or empty path) and fill in the
+ * appropriate fields of the @uri structure
+ *
+ * path-abempty = *( "/" segment )
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_path_ab_empty(URI *uri, const char **str)
+{
+ const char *cur;
+ int ret;
+
+ cur = *str;
+
+ while (*cur == '/') {
+ cur++;
+ ret = rfc3986_parse_segment(&cur, 0, 1);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (uri != NULL) {
+ g_free(uri->path);
+ if (*str != cur) {
+ if (uri->cleanup & 2) {
+ uri->path = g_strndup(*str, cur - *str);
+ } else {
+ uri->path = uri_string_unescape(*str, cur - *str, NULL);
+ }
+ } else {
+ uri->path = NULL;
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_path_absolute:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse an absolute path and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * path-absolute = "/" [ segment-nz *( "/" segment ) ]
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_path_absolute(URI *uri, const char **str)
+{
+ const char *cur;
+ int ret;
+
+ cur = *str;
+
+ if (*cur != '/') {
+ return 1;
+ }
+ cur++;
+ ret = rfc3986_parse_segment(&cur, 0, 0);
+ if (ret == 0) {
+ while (*cur == '/') {
+ cur++;
+ ret = rfc3986_parse_segment(&cur, 0, 1);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ }
+ if (uri != NULL) {
+ g_free(uri->path);
+ if (cur != *str) {
+ if (uri->cleanup & 2) {
+ uri->path = g_strndup(*str, cur - *str);
+ } else {
+ uri->path = uri_string_unescape(*str, cur - *str, NULL);
+ }
+ } else {
+ uri->path = NULL;
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_path_rootless:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a rootless path and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * path-rootless = segment-nz *( "/" segment )
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_path_rootless(URI *uri, const char **str)
+{
+ const char *cur;
+ int ret;
+
+ cur = *str;
+
+ ret = rfc3986_parse_segment(&cur, 0, 0);
+ if (ret != 0) {
+ return ret;
+ }
+ while (*cur == '/') {
+ cur++;
+ ret = rfc3986_parse_segment(&cur, 0, 1);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (uri != NULL) {
+ g_free(uri->path);
+ if (cur != *str) {
+ if (uri->cleanup & 2) {
+ uri->path = g_strndup(*str, cur - *str);
+ } else {
+ uri->path = uri_string_unescape(*str, cur - *str, NULL);
+ }
+ } else {
+ uri->path = NULL;
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_path_no_scheme:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a path-noscheme (a path that cannot start with a "segment:"
+ * prefix) and fill in the appropriate fields of the @uri structure
+ *
+ * path-noscheme = segment-nz-nc *( "/" segment )
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_path_no_scheme(URI *uri, const char **str)
+{
+ const char *cur;
+ int ret;
+
+ cur = *str;
+
+ ret = rfc3986_parse_segment(&cur, ':', 0);
+ if (ret != 0) {
+ return ret;
+ }
+ while (*cur == '/') {
+ cur++;
+ ret = rfc3986_parse_segment(&cur, 0, 1);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (uri != NULL) {
+ g_free(uri->path);
+ if (cur != *str) {
+ if (uri->cleanup & 2) {
+ uri->path = g_strndup(*str, cur - *str);
+ } else {
+ uri->path = uri_string_unescape(*str, cur - *str, NULL);
+ }
+ } else {
+ uri->path = NULL;
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_hier_part:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a hierarchical part and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * hier-part = "//" authority path-abempty
+ * / path-absolute
+ * / path-rootless
+ * / path-empty
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_hier_part(URI *uri, const char **str)
+{
+ const char *cur;
+ int ret;
+
+ cur = *str;
+
+ if ((*cur == '/') && (*(cur + 1) == '/')) {
+ cur += 2;
+ ret = rfc3986_parse_authority(uri, &cur);
+ if (ret != 0) {
+ return ret;
+ }
+ ret = rfc3986_parse_path_ab_empty(uri, &cur);
+ if (ret != 0) {
+ return ret;
+ }
+ *str = cur;
+ return 0;
+ } else if (*cur == '/') {
+ ret = rfc3986_parse_path_absolute(uri, &cur);
+ if (ret != 0) {
+ return ret;
+ }
+ } else if (ISA_PCHAR(cur)) {
+ ret = rfc3986_parse_path_rootless(uri, &cur);
+ if (ret != 0) {
+ return ret;
+ }
+ } else {
+ /* path-empty is effectively empty */
+ if (uri != NULL) {
+ g_free(uri->path);
+ uri->path = NULL;
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_relative_ref:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a relative reference and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * relative-ref = relative-part [ "?" query ] [ "#" fragment ]
+ * relative-part = "//" authority path-abempty
+ * / path-absolute
+ * / path-noscheme
+ * / path-empty
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_relative_ref(URI *uri, const char *str)
+{
+ int ret;
+
+ if ((*str == '/') && (*(str + 1) == '/')) {
+ str += 2;
+ ret = rfc3986_parse_authority(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ ret = rfc3986_parse_path_ab_empty(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ } else if (*str == '/') {
+ ret = rfc3986_parse_path_absolute(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ } else if (ISA_PCHAR(str)) {
+ ret = rfc3986_parse_path_no_scheme(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ } else {
+ /* path-empty is effectively empty */
+ if (uri != NULL) {
+ g_free(uri->path);
+ uri->path = NULL;
+ }
+ }
+
+ if (*str == '?') {
+ str++;
+ ret = rfc3986_parse_query(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (*str == '#') {
+ str++;
+ ret = rfc3986_parse_fragment(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (*str != 0) {
+ uri_clean(uri);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * rfc3986_parse:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a URI string and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse(URI *uri, const char *str)
+{
+ int ret;
+
+ ret = rfc3986_parse_scheme(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ if (*str != ':') {
+ return 1;
+ }
+ str++;
+ ret = rfc3986_parse_hier_part(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ if (*str == '?') {
+ str++;
+ ret = rfc3986_parse_query(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (*str == '#') {
+ str++;
+ ret = rfc3986_parse_fragment(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (*str != 0) {
+ uri_clean(uri);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * rfc3986_parse_uri_reference:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a URI reference string and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * URI-reference = URI / relative-ref
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_uri_reference(URI *uri, const char *str)
+{
+ int ret;
+
+ if (str == NULL) {
+ return -1;
+ }
+ uri_clean(uri);
+
+ /*
+ * Try first to parse absolute refs, then fallback to relative if
+ * it fails.
+ */
+ ret = rfc3986_parse(uri, str);
+ if (ret != 0) {
+ uri_clean(uri);
+ ret = rfc3986_parse_relative_ref(uri, str);
+ if (ret != 0) {
+ uri_clean(uri);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/**
+ * uri_parse:
+ * @str: the URI string to analyze
+ *
+ * Parse a URI based on RFC 3986
+ *
+ * URI-reference = URI / relative-ref
+ *
+ * Returns a newly built URI or NULL in case of error
+ */
+URI *uri_parse(const char *str)
+{
+ URI *uri;
+ int ret;
+
+ if (str == NULL) {
+ return NULL;
+ }
+ uri = uri_new();
+ ret = rfc3986_parse_uri_reference(uri, str);
+ if (ret) {
+ uri_free(uri);
+ return NULL;
+ }
+ return uri;
+}
+
+/**
+ * uri_parse_into:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a URI reference string based on RFC 3986 and fill in the
+ * appropriate fields of the @uri structure
+ *
+ * URI-reference = URI / relative-ref
+ *
+ * Returns 0 or the error code
+ */
+int uri_parse_into(URI *uri, const char *str)
+{
+ return rfc3986_parse_uri_reference(uri, str);
+}
+
+/**
+ * uri_parse_raw:
+ * @str: the URI string to analyze
+ * @raw: if 1, unescaping of URI pieces is disabled
+ *
+ * Parse a URI, optionally keeping the original pieces intact
+ * (no unescaping).
+ *
+ * URI-reference = URI / relative-ref
+ *
+ * Returns a newly built URI or NULL in case of error
+ */
+URI *uri_parse_raw(const char *str, int raw)
+{
+ URI *uri;
+ int ret;
+
+ if (str == NULL) {
+ return NULL;
+ }
+ uri = uri_new();
+ if (raw) {
+ uri->cleanup |= 2;
+ }
+ ret = uri_parse_into(uri, str);
+ if (ret) {
+ uri_free(uri);
+ return NULL;
+ }
+ return uri;
+}
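+
+/*
+ * Usage sketch (editor's addition, not part of the original patch):
+ * parsing a URI, inspecting its fields and re-serializing it with
+ * uri_to_string() (defined below):
+ *
+ *     URI *uri = uri_parse("http://alice@example.com:8080/a/b?q=1#frag");
+ *
+ *     if (uri) {
+ *         // uri->scheme is "http", uri->user "alice", uri->server
+ *         // "example.com", uri->port 8080, uri->path "/a/b",
+ *         // uri->query "q=1" and uri->fragment "frag".
+ *         char *s = uri_to_string(uri);
+ *         g_free(s);
+ *         uri_free(uri);
+ *     }
+ */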
+
+/************************************************************************
+ * *
+ * Generic URI structure functions *
+ * *
+ ************************************************************************/
+
+/**
+ * uri_new:
+ *
+ * Simply creates an empty URI
+ *
+ * Returns the new structure or NULL in case of error
+ */
+URI *uri_new(void)
+{
+ return g_new0(URI, 1);
+}
+
+/**
+ * realloc2n:
+ *
+ * Double the buffer used when saving a URI string, handling the
+ * reallocation properly as the escaped output grows
+ */
+static char *realloc2n(char *ret, int *max)
+{
+ char *temp;
+ int tmp;
+
+ tmp = *max * 2;
+ temp = g_realloc(ret, (tmp + 1));
+ *max = tmp;
+ return temp;
+}
+
+/**
+ * uri_to_string:
+ * @uri: pointer to an URI
+ *
+ * Save the URI as an escaped string
+ *
+ * Returns a new string (to be deallocated by caller)
+ */
+char *uri_to_string(URI *uri)
+{
+ char *ret = NULL;
+ char *temp;
+ const char *p;
+ int len;
+ int max;
+
+ if (uri == NULL) {
+ return NULL;
+ }
+
+ max = 80;
+ ret = g_malloc(max + 1);
+ len = 0;
+
+ if (uri->scheme != NULL) {
+ p = uri->scheme;
+ while (*p != 0) {
+ if (len >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = *p++;
+ }
+ if (len >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = ':';
+ }
+ if (uri->opaque != NULL) {
+ p = uri->opaque;
+ while (*p != 0) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ if (IS_RESERVED(*(p)) || IS_UNRESERVED(*(p))) {
+ ret[len++] = *p++;
+ } else {
+ int val = *(unsigned char *)p++;
+ int hi = val / 0x10, lo = val % 0x10;
+ ret[len++] = '%';
+ ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0');
+ ret[len++] = lo + (lo > 9 ? 'A' - 10 : '0');
+ }
+ }
+ } else {
+ if (uri->server != NULL) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = '/';
+ ret[len++] = '/';
+ if (uri->user != NULL) {
+ p = uri->user;
+ while (*p != 0) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ if ((IS_UNRESERVED(*(p))) || ((*(p) == ';')) ||
+ ((*(p) == ':')) || ((*(p) == '&')) || ((*(p) == '=')) ||
+ ((*(p) == '+')) || ((*(p) == '$')) || ((*(p) == ','))) {
+ ret[len++] = *p++;
+ } else {
+ int val = *(unsigned char *)p++;
+ int hi = val / 0x10, lo = val % 0x10;
+ ret[len++] = '%';
+ ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0');
+ ret[len++] = lo + (lo > 9 ? 'A' - 10 : '0');
+ }
+ }
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = '@';
+ }
+ p = uri->server;
+ while (*p != 0) {
+ if (len >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = *p++;
+ }
+ if (uri->port > 0) {
+ if (len + 10 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ len += snprintf(&ret[len], max - len, ":%d", uri->port);
+ }
+ } else if (uri->authority != NULL) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = '/';
+ ret[len++] = '/';
+ p = uri->authority;
+ while (*p != 0) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ if ((IS_UNRESERVED(*(p))) || ((*(p) == '$')) ||
+ ((*(p) == ',')) || ((*(p) == ';')) || ((*(p) == ':')) ||
+ ((*(p) == '@')) || ((*(p) == '&')) || ((*(p) == '=')) ||
+ ((*(p) == '+'))) {
+ ret[len++] = *p++;
+ } else {
+ int val = *(unsigned char *)p++;
+ int hi = val / 0x10, lo = val % 0x10;
+ ret[len++] = '%';
+ ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0');
+ ret[len++] = lo + (lo > 9 ? 'A' - 10 : '0');
+ }
+ }
+ } else if (uri->scheme != NULL) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = '/';
+ ret[len++] = '/';
+ }
+ if (uri->path != NULL) {
+ p = uri->path;
+ /*
+ * the colon in file:///d: should not be escaped or
+ * Windows accesses fail later.
+ */
+ if ((uri->scheme != NULL) && (p[0] == '/') &&
+ (((p[1] >= 'a') && (p[1] <= 'z')) ||
+ ((p[1] >= 'A') && (p[1] <= 'Z'))) &&
+ (p[2] == ':') && (!strcmp(uri->scheme, "file"))) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = *p++;
+ ret[len++] = *p++;
+ ret[len++] = *p++;
+ }
+ while (*p != 0) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ if ((IS_UNRESERVED(*(p))) || ((*(p) == '/')) ||
+ ((*(p) == ';')) || ((*(p) == '@')) || ((*(p) == '&')) ||
+ ((*(p) == '=')) || ((*(p) == '+')) || ((*(p) == '$')) ||
+ ((*(p) == ','))) {
+ ret[len++] = *p++;
+ } else {
+ int val = *(unsigned char *)p++;
+ int hi = val / 0x10, lo = val % 0x10;
+ ret[len++] = '%';
+ ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0');
+ ret[len++] = lo + (lo > 9 ? 'A' - 10 : '0');
+ }
+ }
+ }
+ if (uri->query != NULL) {
+ if (len + 1 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = '?';
+ p = uri->query;
+ while (*p != 0) {
+ if (len + 1 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = *p++;
+ }
+ }
+ }
+ if (uri->fragment != NULL) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = '#';
+ p = uri->fragment;
+ while (*p != 0) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ if ((IS_UNRESERVED(*(p))) || (IS_RESERVED(*(p)))) {
+ ret[len++] = *p++;
+ } else {
+ int val = *(unsigned char *)p++;
+ int hi = val / 0x10, lo = val % 0x10;
+ ret[len++] = '%';
+ ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0');
+ ret[len++] = lo + (lo > 9 ? 'A' - 10 : '0');
+ }
+ }
+ }
+ if (len >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len] = 0;
+ return ret;
+}
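+
+/*
+ * A minimal usage sketch (assuming uri_parse_into() accepts the example
+ * string); the serialized form must be released with g_free():
+ *
+ *     URI *u = uri_new();
+ *     if (uri_parse_into(u, "http://example.org/dir/file?x=1") == 0) {
+ *         char *s = uri_to_string(u);
+ *         // s holds an equivalent, escaped form of the input
+ *         g_free(s);
+ *     }
+ *     uri_free(u);
+ */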
+
+/**
+ * uri_clean:
+ * @uri: pointer to an URI
+ *
+ * Make sure the URI struct is free of content
+ */
+static void uri_clean(URI *uri)
+{
+ if (uri == NULL) {
+ return;
+ }
+
+ g_free(uri->scheme);
+ uri->scheme = NULL;
+ g_free(uri->server);
+ uri->server = NULL;
+ g_free(uri->user);
+ uri->user = NULL;
+ g_free(uri->path);
+ uri->path = NULL;
+ g_free(uri->fragment);
+ uri->fragment = NULL;
+ g_free(uri->opaque);
+ uri->opaque = NULL;
+ g_free(uri->authority);
+ uri->authority = NULL;
+ g_free(uri->query);
+ uri->query = NULL;
+}
+
+/**
+ * uri_free:
+ * @uri: pointer to an URI, NULL is ignored
+ *
+ * Free up the URI struct
+ */
+void uri_free(URI *uri)
+{
+ uri_clean(uri);
+ g_free(uri);
+}
+
+/************************************************************************
+ * *
+ * Helper functions *
+ * *
+ ************************************************************************/
+
+/**
+ * normalize_uri_path:
+ * @path: pointer to the path string
+ *
+ * Applies the 5 normalization steps to a path string--that is, RFC 2396
+ * Section 5.2, steps 6.c through 6.g.
+ *
+ * Normalization occurs directly on the string, no new allocation is done
+ *
+ * Returns 0 or an error code
+ */
+static int normalize_uri_path(char *path)
+{
+ char *cur, *out;
+
+ if (path == NULL) {
+ return -1;
+ }
+
+ /* Skip all initial "/" chars. We want to get to the beginning of the
+ * first non-empty segment.
+ */
+ cur = path;
+ while (cur[0] == '/') {
+ ++cur;
+ }
+ if (cur[0] == '\0') {
+ return 0;
+ }
+
+ /* Keep everything we've seen so far. */
+ out = cur;
+
+ /*
+ * Analyze each segment in sequence for cases (c) and (d).
+ */
+ while (cur[0] != '\0') {
+ /*
+ * c) All occurrences of "./", where "." is a complete path segment,
+ * are removed from the buffer string.
+ */
+ if ((cur[0] == '.') && (cur[1] == '/')) {
+ cur += 2;
+ /* '//' normalization should be done at this point too */
+ while (cur[0] == '/') {
+ cur++;
+ }
+ continue;
+ }
+
+ /*
+ * d) If the buffer string ends with "." as a complete path segment,
+ * that "." is removed.
+ */
+ if ((cur[0] == '.') && (cur[1] == '\0')) {
+ break;
+ }
+
+ /* Otherwise keep the segment. */
+ while (cur[0] != '/') {
+ if (cur[0] == '\0') {
+ goto done_cd;
+ }
+ (out++)[0] = (cur++)[0];
+ }
+ /* normalize '//' */
+ while ((cur[0] == '/') && (cur[1] == '/')) {
+ cur++;
+ }
+
+ (out++)[0] = (cur++)[0];
+ }
+done_cd:
+ out[0] = '\0';
+
+ /* Reset to the beginning of the first segment for the next sequence. */
+ cur = path;
+ while (cur[0] == '/') {
+ ++cur;
+ }
+ if (cur[0] == '\0') {
+ return 0;
+ }
+
+ /*
+ * Analyze each segment in sequence for cases (e) and (f).
+ *
+ * e) All occurrences of "<segment>/../", where <segment> is a
+ * complete path segment not equal to "..", are removed from the
+ * buffer string. Removal of these path segments is performed
+ * iteratively, removing the leftmost matching pattern on each
+ * iteration, until no matching pattern remains.
+ *
+ * f) If the buffer string ends with "<segment>/..", where <segment>
+ * is a complete path segment not equal to "..", that
+ * "<segment>/.." is removed.
+ *
+ * To satisfy the "iterative" clause in (e), we need to collapse the
+ * string every time we find something that needs to be removed. Thus,
+ * we don't need to keep two pointers into the string: we only need a
+ * "current position" pointer.
+ */
+ while (1) {
+ char *segp, *tmp;
+
+ /* At the beginning of each iteration of this loop, "cur" points to
+ * the first character of the segment we want to examine.
+ */
+
+ /* Find the end of the current segment. */
+ segp = cur;
+ while ((segp[0] != '/') && (segp[0] != '\0')) {
+ ++segp;
+ }
+
+ /* If this is the last segment, we're done (we need at least two
+ * segments to meet the criteria for the (e) and (f) cases).
+ */
+ if (segp[0] == '\0') {
+ break;
+ }
+
+ /* If the first segment is "..", or if the next segment _isn't_ "..",
+ * keep this segment and try the next one.
+ */
+ ++segp;
+ if (((cur[0] == '.') && (cur[1] == '.') && (segp == cur + 3)) ||
+ ((segp[0] != '.') || (segp[1] != '.') ||
+ ((segp[2] != '/') && (segp[2] != '\0')))) {
+ cur = segp;
+ continue;
+ }
+
+ /* If we get here, remove this segment and the next one and back up
+ * to the previous segment (if there is one), to implement the
+ * "iteratively" clause. It's pretty much impossible to back up
+ * while maintaining two pointers into the buffer, so just compact
+ * the whole buffer now.
+ */
+
+ /* If this is the end of the buffer, we're done. */
+ if (segp[2] == '\0') {
+ cur[0] = '\0';
+ break;
+ }
+ /* Valgrind complained, strcpy(cur, segp + 3); */
+ /* string will overlap, do not use strcpy */
+ tmp = cur;
+ segp += 3;
+ while ((*tmp++ = *segp++) != 0) {
+ /* No further work */
+ }
+
+ /* If there are no previous segments, then keep going from here. */
+ segp = cur;
+ while ((segp > path) && ((--segp)[0] == '/')) {
+ /* No further work */
+ }
+ if (segp == path) {
+ continue;
+ }
+
+ /* "segp" is pointing to the end of a previous segment; find it's
+ * start. We need to back up to the previous segment and start
+ * over with that to handle things like "foo/bar/../..". If we
+ * don't do this, then on the first pass we'll remove the "bar/..",
+ * but be pointing at the second ".." so we won't realize we can also
+ * remove the "foo/..".
+ */
+ cur = segp;
+ while ((cur > path) && (cur[-1] != '/')) {
+ --cur;
+ }
+ }
+ out[0] = '\0';
+
+ /*
+ * g) If the resulting buffer string still begins with one or more
+ * complete path segments of "..", then the reference is
+ * considered to be in error. Implementations may handle this
+ * error by retaining these components in the resolved path (i.e.,
+ * treating them as part of the final URI), by removing them from
+ * the resolved path (i.e., discarding relative levels above the
+ * root), or by avoiding traversal of the reference.
+ *
+ * We discard them from the final path.
+ */
+ if (path[0] == '/') {
+ cur = path;
+ while ((cur[0] == '/') && (cur[1] == '.') && (cur[2] == '.') &&
+ ((cur[3] == '/') || (cur[3] == '\0'))) {
+ cur += 3;
+ }
+
+ if (cur != path) {
+ out = path;
+ while (cur[0] != '\0') {
+ (out++)[0] = (cur++)[0];
+ }
+ out[0] = 0;
+ }
+ }
+
+ return 0;
+}
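+
+/*
+ * Illustration of the transformation performed (a sketch; the helper is
+ * static and rewrites a writable buffer in place):
+ *
+ *     char p[] = "/a/./b/../c//d";
+ *     normalize_uri_path(p);
+ *     // p is now "/a/c/d"
+ */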
+
+static int is_hex(char c)
+{
+ if (((c >= '0') && (c <= '9')) || ((c >= 'a') && (c <= 'f')) ||
+ ((c >= 'A') && (c <= 'F'))) {
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * uri_string_unescape:
+ * @str: the string to unescape
+ * @len: the length in bytes to unescape (or <= 0 to indicate full string)
+ * @target: optional destination buffer
+ *
+ * Unescaping routine, but does not check that the string is an URI. The
+ * output is a direct unsigned char translation of %XX values (no encoding)
+ * Note that the result can only be the same size as or smaller than the
+ * input string.
+ *
+ * Returns a copy of the string, but unescaped, will return NULL only in case
+ * of error
+ */
+char *uri_string_unescape(const char *str, int len, char *target)
+{
+ char *ret, *out;
+ const char *in;
+
+ if (str == NULL) {
+ return NULL;
+ }
+ if (len <= 0) {
+ len = strlen(str);
+ }
+ if (len < 0) {
+ return NULL;
+ }
+
+ if (target == NULL) {
+ ret = g_malloc(len + 1);
+ } else {
+ ret = target;
+ }
+ in = str;
+ out = ret;
+ while (len > 0) {
+ if ((len > 2) && (*in == '%') && (is_hex(in[1])) && (is_hex(in[2]))) {
+ in++;
+ if ((*in >= '0') && (*in <= '9')) {
+ *out = (*in - '0');
+ } else if ((*in >= 'a') && (*in <= 'f')) {
+ *out = (*in - 'a') + 10;
+ } else if ((*in >= 'A') && (*in <= 'F')) {
+ *out = (*in - 'A') + 10;
+ }
+ in++;
+ if ((*in >= '0') && (*in <= '9')) {
+ *out = *out * 16 + (*in - '0');
+ } else if ((*in >= 'a') && (*in <= 'f')) {
+ *out = *out * 16 + (*in - 'a') + 10;
+ } else if ((*in >= 'A') && (*in <= 'F')) {
+ *out = *out * 16 + (*in - 'A') + 10;
+ }
+ in++;
+ len -= 3;
+ out++;
+ } else {
+ *out++ = *in++;
+ len--;
+ }
+ }
+ *out = 0;
+ return ret;
+}
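+
+/*
+ * Sketch: decode a percent-escaped component; with @target == NULL a
+ * fresh buffer is allocated and must be g_free()'d:
+ *
+ *     char *s = uri_string_unescape("a%20b%2Fc", -1, NULL);
+ *     // s == "a b/c"
+ *     g_free(s);
+ */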
+
+/**
+ * uri_string_escape:
+ * @str: string to escape
+ * @list: exception list string of chars not to escape
+ *
+ * This routine percent-escapes a string, leaving '@', unreserved
+ * characters, and the characters in the exception list untouched.
+ *
+ * Returns a new escaped string or NULL in case of error.
+ */
+char *uri_string_escape(const char *str, const char *list)
+{
+ char *ret, ch;
+ char *temp;
+ const char *in;
+ int len, out;
+
+ if (str == NULL) {
+ return NULL;
+ }
+ if (str[0] == 0) {
+ return g_strdup(str);
+ }
+ len = strlen(str);
+ if (!(len > 0)) {
+ return NULL;
+ }
+
+ len += 20;
+ ret = g_malloc(len);
+ in = str;
+ out = 0;
+ while (*in != 0) {
+ if (len - out <= 3) {
+ temp = realloc2n(ret, &len);
+ ret = temp;
+ }
+
+ ch = *in;
+
+ if ((ch != '@') && (!IS_UNRESERVED(ch)) && (!strchr(list, ch))) {
+ unsigned char val;
+ ret[out++] = '%';
+ val = ch >> 4;
+ if (val <= 9) {
+ ret[out++] = '0' + val;
+ } else {
+ ret[out++] = 'A' + val - 0xA;
+ }
+ val = ch & 0xF;
+ if (val <= 9) {
+ ret[out++] = '0' + val;
+ } else {
+ ret[out++] = 'A' + val - 0xA;
+ }
+ in++;
+ } else {
+ ret[out++] = *in++;
+ }
+ }
+ ret[out] = 0;
+ return ret;
+}
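+
+/*
+ * Sketch: '@', unreserved characters, and the exception list (here just
+ * '/') pass through; everything else becomes %XX:
+ *
+ *     char *s = uri_string_escape("a b/c", "/");
+ *     // s == "a%20b/c"
+ *     g_free(s);
+ */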
+
+/************************************************************************
+ * *
+ * Public functions *
+ * *
+ ************************************************************************/
+
+/**
+ * uri_resolve:
+ * @uri: the URI instance found in the document
+ * @base: the base value
+ *
+ * Computes the final URI of the reference by checking that
+ * the given URI is valid and building the final URI using the
+ * base URI. This is processed according to Section 5.2 of
+ * RFC 2396:
+ *
+ * 5.2. Resolving Relative References to Absolute Form
+ *
+ * Returns a new URI string (to be freed by the caller) or NULL in case
+ * of error.
+ */
+char *uri_resolve(const char *uri, const char *base)
+{
+ char *val = NULL;
+ int ret, len, indx, cur, out;
+ URI *ref = NULL;
+ URI *bas = NULL;
+ URI *res = NULL;
+
+ /*
+ * 1) The URI reference is parsed into the potential four components and
+ * fragment identifier, as described in Section 4.3.
+ *
+ * NOTE that a completely empty URI is treated by modern browsers
+ * as a reference to "." rather than as a synonym for the current
+ * URI. Should we do that here?
+ */
+ if (uri == NULL) {
+ ret = -1;
+ } else {
+ if (*uri) {
+ ref = uri_new();
+ ret = uri_parse_into(ref, uri);
+ } else {
+ ret = 0;
+ }
+ }
+ if (ret != 0) {
+ goto done;
+ }
+ if ((ref != NULL) && (ref->scheme != NULL)) {
+ /*
+ * The URI is absolute don't modify.
+ */
+ val = g_strdup(uri);
+ goto done;
+ }
+ if (base == NULL) {
+ ret = -1;
+ } else {
+ bas = uri_new();
+ ret = uri_parse_into(bas, base);
+ }
+ if (ret != 0) {
+ if (ref) {
+ val = uri_to_string(ref);
+ }
+ goto done;
+ }
+ if (ref == NULL) {
+ /*
+ * the base fragment must be ignored
+ */
+ g_free(bas->fragment);
+ bas->fragment = NULL;
+ val = uri_to_string(bas);
+ goto done;
+ }
+
+ /*
+ * 2) If the path component is empty and the scheme, authority, and
+ * query components are undefined, then it is a reference to the
+ * current document and we are done. Otherwise, the reference URI's
+ * query and fragment components are defined as found (or not found)
+ * within the URI reference and not inherited from the base URI.
+ *
+ * NOTE that in modern browsers, the parsing differs from the above
+ * in the following aspect: the query component is allowed to be
+ * defined while still treating this as a reference to the current
+ * document.
+ */
+ res = uri_new();
+ if ((ref->scheme == NULL) && (ref->path == NULL) &&
+ ((ref->authority == NULL) && (ref->server == NULL))) {
+ res->scheme = g_strdup(bas->scheme);
+ if (bas->authority != NULL) {
+ res->authority = g_strdup(bas->authority);
+ } else if (bas->server != NULL) {
+ res->server = g_strdup(bas->server);
+ res->user = g_strdup(bas->user);
+ res->port = bas->port;
+ }
+ res->path = g_strdup(bas->path);
+ if (ref->query != NULL) {
+ res->query = g_strdup(ref->query);
+ } else {
+ res->query = g_strdup(bas->query);
+ }
+ res->fragment = g_strdup(ref->fragment);
+ goto step_7;
+ }
+
+ /*
+ * 3) If the scheme component is defined, indicating that the reference
+ * starts with a scheme name, then the reference is interpreted as an
+ * absolute URI and we are done. Otherwise, the reference URI's
+ * scheme is inherited from the base URI's scheme component.
+ */
+ if (ref->scheme != NULL) {
+ val = uri_to_string(ref);
+ goto done;
+ }
+ res->scheme = g_strdup(bas->scheme);
+
+ res->query = g_strdup(ref->query);
+ res->fragment = g_strdup(ref->fragment);
+
+ /*
+ * 4) If the authority component is defined, then the reference is a
+ * network-path and we skip to step 7. Otherwise, the reference
+ * URI's authority is inherited from the base URI's authority
+ * component, which will also be undefined if the URI scheme does not
+ * use an authority component.
+ */
+ if ((ref->authority != NULL) || (ref->server != NULL)) {
+ if (ref->authority != NULL) {
+ res->authority = g_strdup(ref->authority);
+ } else {
+ res->server = g_strdup(ref->server);
+ res->user = g_strdup(ref->user);
+ res->port = ref->port;
+ }
+ res->path = g_strdup(ref->path);
+ goto step_7;
+ }
+ if (bas->authority != NULL) {
+ res->authority = g_strdup(bas->authority);
+ } else if (bas->server != NULL) {
+ res->server = g_strdup(bas->server);
+ res->user = g_strdup(bas->user);
+ res->port = bas->port;
+ }
+
+ /*
+ * 5) If the path component begins with a slash character ("/"), then
+ * the reference is an absolute-path and we skip to step 7.
+ */
+ if ((ref->path != NULL) && (ref->path[0] == '/')) {
+ res->path = g_strdup(ref->path);
+ goto step_7;
+ }
+
+ /*
+ * 6) If this step is reached, then we are resolving a relative-path
+ * reference. The relative path needs to be merged with the base
+ * URI's path. Although there are many ways to do this, we will
+ * describe a simple method using a separate string buffer.
+ *
+ * Allocate a buffer large enough for the result string.
+ */
+ len = 2; /* extra / and 0 */
+ if (ref->path != NULL) {
+ len += strlen(ref->path);
+ }
+ if (bas->path != NULL) {
+ len += strlen(bas->path);
+ }
+ res->path = g_malloc(len);
+ res->path[0] = 0;
+
+ /*
+ * a) All but the last segment of the base URI's path component is
+ * copied to the buffer. In other words, any characters after the
+ * last (right-most) slash character, if any, are excluded.
+ */
+ cur = 0;
+ out = 0;
+ if (bas->path != NULL) {
+ while (bas->path[cur] != 0) {
+ while ((bas->path[cur] != 0) && (bas->path[cur] != '/')) {
+ cur++;
+ }
+ if (bas->path[cur] == 0) {
+ break;
+ }
+
+ cur++;
+ while (out < cur) {
+ res->path[out] = bas->path[out];
+ out++;
+ }
+ }
+ }
+ res->path[out] = 0;
+
+ /*
+ * b) The reference's path component is appended to the buffer
+ * string.
+ */
+ if (ref->path != NULL && ref->path[0] != 0) {
+ indx = 0;
+ /*
+ * Ensure the path includes a '/'
+ */
+ if ((out == 0) && (bas->server != NULL)) {
+ res->path[out++] = '/';
+ }
+ while (ref->path[indx] != 0) {
+ res->path[out++] = ref->path[indx++];
+ }
+ }
+ res->path[out] = 0;
+
+ /*
+ * Steps c) to h) are really path normalization steps
+ */
+ normalize_uri_path(res->path);
+
+step_7:
+
+ /*
+ * 7) The resulting URI components, including any inherited from the
+ * base URI, are recombined to give the absolute form of the URI
+ * reference.
+ */
+ val = uri_to_string(res);
+
+done:
+ uri_free(ref);
+ uri_free(bas);
+ uri_free(res);
+ return val;
+}
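+
+/*
+ * Sketch, using a relative-reference example from RFC 2396 Appendix C:
+ *
+ *     char *s = uri_resolve("../g", "http://a/b/c/d;p?q");
+ *     // s == "http://a/b/g"
+ *     g_free(s);
+ */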
+
+/**
+ * uri_resolve_relative:
+ * @uri: the URI reference under consideration
+ * @base: the base value
+ *
+ * Expresses the URI of the reference in terms relative to the
+ * base. Some examples of this operation include:
+ * base = "http://site1.com/docs/book1.html"
+ * URI input URI returned
+ * docs/pic1.gif pic1.gif
+ * docs/img/pic1.gif img/pic1.gif
+ * img/pic1.gif ../img/pic1.gif
+ * http://site1.com/docs/pic1.gif pic1.gif
+ * http://site2.com/docs/pic1.gif http://site2.com/docs/pic1.gif
+ *
+ * base = "docs/book1.html"
+ * URI input URI returned
+ * docs/pic1.gif pic1.gif
+ * docs/img/pic1.gif img/pic1.gif
+ * img/pic1.gif ../img/pic1.gif
+ * http://site1.com/docs/pic1.gif http://site1.com/docs/pic1.gif
+ *
+ *
+ * Note: if the URI reference is really weird or complicated, it may be
+ * worthwhile to first convert it into a "nice" one by calling
+ * uri_resolve (using 'base') before calling this routine,
+ * since this routine (for reasonable efficiency) assumes URI has
+ * already been through some validation.
+ *
+ * Returns a new URI string (to be freed by the caller) or NULL in case
+ * of error.
+ */
+char *uri_resolve_relative(const char *uri, const char *base)
+{
+ char *val = NULL;
+ int ret;
+ int ix;
+ int pos = 0;
+ int nbslash = 0;
+ int len;
+ URI *ref = NULL;
+ URI *bas = NULL;
+ char *bptr, *uptr, *vptr;
+ int remove_path = 0;
+
+ if ((uri == NULL) || (*uri == 0)) {
+ return NULL;
+ }
+
+ /*
+ * First parse URI into a standard form
+ */
+ ref = uri_new();
+ /* If URI not already in "relative" form */
+ if (uri[0] != '.') {
+ ret = uri_parse_into(ref, uri);
+ if (ret != 0) {
+ goto done; /* Error in URI, return NULL */
+ }
+ } else {
+ ref->path = g_strdup(uri);
+ }
+
+ /*
+ * Next parse base into the same standard form
+ */
+ if ((base == NULL) || (*base == 0)) {
+ val = g_strdup(uri);
+ goto done;
+ }
+ bas = uri_new();
+ if (base[0] != '.') {
+ ret = uri_parse_into(bas, base);
+ if (ret != 0) {
+ goto done; /* Error in base, return NULL */
+ }
+ } else {
+ bas->path = g_strdup(base);
+ }
+
+ /*
+ * If the scheme / server on the URI differs from the base,
+ * just return the URI
+ */
+ if ((ref->scheme != NULL) &&
+ ((bas->scheme == NULL) || (strcmp(bas->scheme, ref->scheme)) ||
+ (strcmp(bas->server, ref->server)))) {
+ val = g_strdup(uri);
+ goto done;
+ }
+ if (bas->path == ref->path ||
+ (bas->path && ref->path && !strcmp(bas->path, ref->path))) {
+ val = g_strdup("");
+ goto done;
+ }
+ if (bas->path == NULL) {
+ val = g_strdup(ref->path);
+ goto done;
+ }
+ if (ref->path == NULL) {
+ ref->path = (char *)"/";
+ remove_path = 1;
+ }
+
+ /*
+ * At this point (at last!) we can compare the two paths
+ *
+ * First we take care of the special case where either of the
+ * two path components may be missing (bug 316224)
+ */
+ if (bas->path == NULL) {
+ if (ref->path != NULL) {
+ uptr = ref->path;
+ if (*uptr == '/') {
+ uptr++;
+ }
+ /* exception characters from uri_to_string */
+ val = uri_string_escape(uptr, "/;&=+$,");
+ }
+ goto done;
+ }
+ bptr = bas->path;
+ if (ref->path == NULL) {
+ for (ix = 0; bptr[ix] != 0; ix++) {
+ if (bptr[ix] == '/') {
+ nbslash++;
+ }
+ }
+ uptr = NULL;
+ len = 1; /* this is for a string terminator only */
+ } else {
+ /*
+ * Next we compare the two strings and find where they first differ
+ */
+ if ((ref->path[pos] == '.') && (ref->path[pos + 1] == '/')) {
+ pos += 2;
+ }
+ if ((*bptr == '.') && (bptr[1] == '/')) {
+ bptr += 2;
+ } else if ((*bptr == '/') && (ref->path[pos] != '/')) {
+ bptr++;
+ }
+ while ((bptr[pos] == ref->path[pos]) && (bptr[pos] != 0)) {
+ pos++;
+ }
+
+ if (bptr[pos] == ref->path[pos]) {
+ val = g_strdup("");
+ goto done; /* (I can't imagine why anyone would do this) */
+ }
+
+ /*
+ * In URI, "back up" to the last '/' encountered. This will be the
+ * beginning of the "unique" suffix of URI
+ */
+ ix = pos;
+ if ((ref->path[ix] == '/') && (ix > 0)) {
+ ix--;
+ } else if ((ref->path[ix] == 0) && (ix > 1)
+ && (ref->path[ix - 1] == '/')) {
+ ix -= 2;
+ }
+ for (; ix > 0; ix--) {
+ if (ref->path[ix] == '/') {
+ break;
+ }
+ }
+ if (ix == 0) {
+ uptr = ref->path;
+ } else {
+ ix++;
+ uptr = &ref->path[ix];
+ }
+
+ /*
+ * In base, count the number of '/' from the differing point
+ */
+ if (bptr[pos] != ref->path[pos]) { /* check for trivial URI == base */
+ for (; bptr[ix] != 0; ix++) {
+ if (bptr[ix] == '/') {
+ nbslash++;
+ }
+ }
+ }
+ len = strlen(uptr) + 1;
+ }
+
+ if (nbslash == 0) {
+ if (uptr != NULL) {
+ /* exception characters from uri_to_string */
+ val = uri_string_escape(uptr, "/;&=+$,");
+ }
+ goto done;
+ }
+
+ /*
+ * Allocate just enough space for the returned string -
+ * length of the remainder of the URI, plus enough space
+ * for the "../" groups, plus one for the terminator
+ */
+ val = g_malloc(len + 3 * nbslash);
+ vptr = val;
+ /*
+ * Put in as many "../" as needed
+ */
+ for (; nbslash > 0; nbslash--) {
+ *vptr++ = '.';
+ *vptr++ = '.';
+ *vptr++ = '/';
+ }
+ /*
+ * Finish up with the end of the URI
+ */
+ if (uptr != NULL) {
+ if ((vptr > val) && (len > 0) && (uptr[0] == '/') &&
+ (vptr[-1] == '/')) {
+ memcpy(vptr, uptr + 1, len - 1);
+ vptr[len - 2] = 0;
+ } else {
+ memcpy(vptr, uptr, len);
+ vptr[len - 1] = 0;
+ }
+ } else {
+ vptr[len - 1] = 0;
+ }
+
+ /* escape the freshly-built path */
+ vptr = val;
+ /* exception characters from uri_to_string */
+ val = uri_string_escape(vptr, "/;&=+$,");
+ g_free(vptr);
+
+done:
+ /*
+ * Free the working variables
+ */
+ if (remove_path != 0) {
+ ref->path = NULL;
+ }
+ uri_free(ref);
+ uri_free(bas);
+
+ return val;
+}
+
+/*
+ * Utility functions to help parse and assemble query strings.
+ */
+
+struct QueryParams *query_params_new(int init_alloc)
+{
+ struct QueryParams *ps;
+
+ if (init_alloc <= 0) {
+ init_alloc = 1;
+ }
+
+ ps = g_new(QueryParams, 1);
+ ps->n = 0;
+ ps->alloc = init_alloc;
+ ps->p = g_new(QueryParam, ps->alloc);
+
+ return ps;
+}
+
+/* Append one parameter to the set, growing the
+ * backing array if necessary.
+ */
+static int query_params_append(struct QueryParams *ps, const char *name,
+ const char *value)
+{
+ if (ps->n >= ps->alloc) {
+ ps->p = g_renew(QueryParam, ps->p, ps->alloc * 2);
+ ps->alloc *= 2;
+ }
+
+ ps->p[ps->n].name = g_strdup(name);
+ ps->p[ps->n].value = g_strdup(value);
+ ps->p[ps->n].ignore = 0;
+ ps->n++;
+
+ return 0;
+}
+
+void query_params_free(struct QueryParams *ps)
+{
+ int i;
+
+ for (i = 0; i < ps->n; ++i) {
+ g_free(ps->p[i].name);
+ g_free(ps->p[i].value);
+ }
+ g_free(ps->p);
+ g_free(ps);
+}
+
+struct QueryParams *query_params_parse(const char *query)
+{
+ struct QueryParams *ps;
+ const char *end, *eq;
+
+ ps = query_params_new(0);
+ if (!query || query[0] == '\0') {
+ return ps;
+ }
+
+ while (*query) {
+ char *name = NULL, *value = NULL;
+
+ /* Find the next separator, or end of the string. */
+ end = strchr(query, '&');
+ if (!end) {
+ end = qemu_strchrnul(query, ';');
+ }
+
+ /* Find the first '=' character between here and end. */
+ eq = strchr(query, '=');
+ if (eq && eq >= end) {
+ eq = NULL;
+ }
+
+ /* Empty section (eg. "&&"). */
+ if (end == query) {
+ goto next;
+ }
+
+ /* If there is no '=' character, then we have just "name"
+ * and consistent with CGI.pm we assume value is "".
+ */
+ else if (!eq) {
+ name = uri_string_unescape(query, end - query, NULL);
+ value = NULL;
+ }
+ /* Or if we have "name=" here (works around annoying
+ * problem when calling uri_string_unescape with len = 0).
+ */
+ else if (eq + 1 == end) {
+ name = uri_string_unescape(query, eq - query, NULL);
+ value = g_new0(char, 1);
+ }
+ /* If the '=' character is at the beginning then we have
+ * "=value" and consistent with CGI.pm we _ignore_ this.
+ */
+ else if (query == eq) {
+ goto next;
+ }
+
+ /* Otherwise it's "name=value". */
+ else {
+ name = uri_string_unescape(query, eq - query, NULL);
+ value = uri_string_unescape(eq + 1, end - (eq + 1), NULL);
+ }
+
+ /* Append to the parameter set. */
+ query_params_append(ps, name, value);
+ g_free(name);
+ g_free(value);
+
+ next:
+ query = end;
+ if (*query) {
+ query++; /* skip '&' separator */
+ }
+ }
+
+ return ps;
+}
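+
+/*
+ * Sketch: a bare name with no '=' yields a NULL value:
+ *
+ *     QueryParams *qp = query_params_parse("a=1&b=2&flag");
+ *     // qp->n == 3; qp->p[0] is {"a", "1"}, qp->p[2] is {"flag", NULL}
+ *     query_params_free(qp);
+ */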
diff --git a/util/userfaultfd.c b/util/userfaultfd.c
new file mode 100644
index 000000000..f1cd6af2b
--- /dev/null
+++ b/util/userfaultfd.c
@@ -0,0 +1,345 @@
+/*
+ * Linux UFFD-WP support
+ *
+ * Copyright Virtuozzo GmbH, 2020
+ *
+ * Authors:
+ * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/bitops.h"
+#include "qemu/error-report.h"
+#include "qemu/userfaultfd.h"
+#include "trace.h"
+#include <poll.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+
+/**
+ * uffd_query_features: query UFFD features
+ *
+ * Returns: 0 on success, negative value in case of an error
+ *
+ * @features: parameter to receive 'uffdio_api.features'
+ */
+int uffd_query_features(uint64_t *features)
+{
+ int uffd_fd;
+ struct uffdio_api api_struct = { 0 };
+ int ret = -1;
+
+ uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC);
+ if (uffd_fd < 0) {
+ trace_uffd_query_features_nosys(errno);
+ return -1;
+ }
+
+ api_struct.api = UFFD_API;
+ api_struct.features = 0;
+
+ if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
+ trace_uffd_query_features_api_failed(errno);
+ goto out;
+ }
+ *features = api_struct.features;
+ ret = 0;
+
+out:
+ close(uffd_fd);
+ return ret;
+}
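+
+/*
+ * Sketch: probe for write-protect fault support before relying on
+ * UFFD-WP (UFFD_FEATURE_PAGEFAULT_FLAG_WP is from <linux/userfaultfd.h>):
+ *
+ *     uint64_t features;
+ *     if (!uffd_query_features(&features) &&
+ *         (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
+ *         // wr-protect faults are available
+ *     }
+ */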
+
+/**
+ * uffd_create_fd: create UFFD file descriptor
+ *
+ * Returns non-negative file descriptor or negative value in case of an error
+ *
+ * @features: UFFD features to request
+ * @non_blocking: create UFFD file descriptor for non-blocking operation
+ */
+int uffd_create_fd(uint64_t features, bool non_blocking)
+{
+ int uffd_fd;
+ int flags;
+ struct uffdio_api api_struct = { 0 };
+ uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
+
+ flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
+ uffd_fd = syscall(__NR_userfaultfd, flags);
+ if (uffd_fd < 0) {
+ trace_uffd_create_fd_nosys(errno);
+ return -1;
+ }
+
+ api_struct.api = UFFD_API;
+ api_struct.features = features;
+ if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
+ trace_uffd_create_fd_api_failed(errno);
+ goto fail;
+ }
+ if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
+ trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
+ goto fail;
+ }
+
+ return uffd_fd;
+
+fail:
+ close(uffd_fd);
+ return -1;
+}
+
+/**
+ * uffd_close_fd: close UFFD file descriptor
+ *
+ * @uffd_fd: UFFD file descriptor
+ */
+void uffd_close_fd(int uffd_fd)
+{
+ assert(uffd_fd >= 0);
+ close(uffd_fd);
+}
+
+/**
+ * uffd_register_memory: register memory range via UFFD-IO
+ *
+ * Returns 0 in case of success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address of memory range
+ * @length: length of memory range
+ * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
+ * @ioctls: optional pointer to receive supported IOCTL mask
+ */
+int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
+ uint64_t mode, uint64_t *ioctls)
+{
+ struct uffdio_register uffd_register;
+
+ uffd_register.range.start = (uintptr_t) addr;
+ uffd_register.range.len = length;
+ uffd_register.mode = mode;
+
+ if (ioctl(uffd_fd, UFFDIO_REGISTER, &uffd_register)) {
+ trace_uffd_register_memory_failed(addr, length, mode, errno);
+ return -1;
+ }
+ if (ioctls) {
+ *ioctls = uffd_register.ioctls;
+ }
+
+ return 0;
+}
+
+/**
+ * uffd_unregister_memory: un-register memory range with UFFD-IO
+ *
+ * Returns 0 in case of success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address of memory range
+ * @length: length of memory range
+ */
+int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
+{
+ struct uffdio_range uffd_range;
+
+ uffd_range.start = (uintptr_t) addr;
+ uffd_range.len = length;
+
+ if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &uffd_range)) {
+ trace_uffd_unregister_memory_failed(addr, length, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
+ *
+ * Returns 0 on success, negative value in case of error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address of memory range
+ * @length: length of memory range
+ * @wp: write-protect/unprotect
+ * @dont_wake: do not wake threads waiting on wr-protected page
+ */
+int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
+ bool wp, bool dont_wake)
+{
+ struct uffdio_writeprotect uffd_writeprotect;
+
+ uffd_writeprotect.range.start = (uintptr_t) addr;
+ uffd_writeprotect.range.len = length;
+ if (!wp && dont_wake) {
+ /* DONTWAKE is meaningful only on protection release */
+ uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
+ } else {
+ uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0);
+ }
+
+ if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
+ error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
+ " mode=%" PRIx64 " errno=%i", addr, length,
+ (uint64_t) uffd_writeprotect.mode, errno);
+ return -1;
+ }
+
+ return 0;
+}
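+
+/*
+ * Sketch of the typical write-protect round trip (error handling
+ * omitted; fd, addr, and len are illustrative):
+ *
+ *     int fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
+ *     uffd_register_memory(fd, addr, len, UFFDIO_REGISTER_MODE_WP, NULL);
+ *     uffd_change_protection(fd, addr, len, true, false);   // protect
+ *     ...
+ *     uffd_change_protection(fd, addr, len, false, false);  // unprotect
+ *     uffd_unregister_memory(fd, addr, len);
+ *     uffd_close_fd(fd);
+ */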
+
+/**
+ * uffd_copy_page: copy range of pages to destination via UFFD-IO
+ *
+ * Copy range of source pages to the destination to resolve
+ * missing page fault somewhere in the destination range.
+ *
+ * Returns 0 on success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @dst_addr: destination base address
+ * @src_addr: source base address
+ * @length: length of the range to copy
+ * @dont_wake: do not wake threads waiting on missing page
+ */
+int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
+ uint64_t length, bool dont_wake)
+{
+ struct uffdio_copy uffd_copy;
+
+ uffd_copy.dst = (uintptr_t) dst_addr;
+ uffd_copy.src = (uintptr_t) src_addr;
+ uffd_copy.len = length;
+ uffd_copy.mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0;
+
+ if (ioctl(uffd_fd, UFFDIO_COPY, &uffd_copy)) {
+ error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
+ " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
+ length, (uint64_t) uffd_copy.mode, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
+ *
+ * Fill a range of pages with zeroes to resolve a missing page fault within the range.
+ *
+ * Returns 0 on success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address
+ * @length: length of the range to fill with zeroes
+ * @dont_wake: do not wake threads waiting on missing page
+ */
+int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
+{
+ struct uffdio_zeropage uffd_zeropage;
+
+ uffd_zeropage.range.start = (uintptr_t) addr;
+ uffd_zeropage.range.len = length;
+ uffd_zeropage.mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0;
+
+ if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &uffd_zeropage)) {
+ error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
+ " mode=%" PRIx64 " errno=%i", addr, length,
+ (uint64_t) uffd_zeropage.mode, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * uffd_wakeup: wake up threads waiting on UFFD-managed page fault resolution
+ *
+ * Wake up threads waiting on any page/pages from the designated range.
+ * The main use case is when during some period, page faults are resolved
+ * via UFFD-IO IOCTLs with MODE_DONTWAKE flag set, then after that all waits
+ * for the whole memory range are satisfied in a single call to uffd_wakeup().
+ *
+ * Returns 0 on success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address
+ * @length: length of the range
+ */
+int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
+{
+ struct uffdio_range uffd_range;
+
+ uffd_range.start = (uintptr_t) addr;
+ uffd_range.len = length;
+
+ if (ioctl(uffd_fd, UFFDIO_WAKE, &uffd_range)) {
+ error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
+ addr, length, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * uffd_read_events: read pending UFFD events
+ *
+ * Returns the number of fetched messages, 0 if none are available, or a
+ * negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @msgs: pointer to message buffer
+ * @count: number of messages that can fit in the buffer
+ */
+int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
+{
+ ssize_t res;
+ do {
+ res = read(uffd_fd, msgs, count * sizeof(struct uffd_msg));
+ } while (res < 0 && errno == EINTR);
+
+ if ((res < 0 && errno == EAGAIN)) {
+ return 0;
+ }
+ if (res < 0) {
+ error_report("uffd_read_events() failed: errno=%i", errno);
+ return -1;
+ }
+
+ return (int) (res / sizeof(struct uffd_msg));
+}
+
+/**
+ * uffd_poll_events: poll UFFD file descriptor for read
+ *
+ * Returns true if events are available for read, false otherwise
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @tmo: timeout value
+ */
+bool uffd_poll_events(int uffd_fd, int tmo)
+{
+ int res;
+ struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
+
+ do {
+ res = poll(&poll_fd, 1, tmo);
+ } while (res < 0 && errno == EINTR);
+
+ if (res == 0) {
+ return false;
+ }
+ if (res < 0) {
+ error_report("uffd_poll_events() failed: errno=%i", errno);
+ return false;
+ }
+
+ return (poll_fd.revents & POLLIN) != 0;
+}
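+
+/*
+ * Sketch of a minimal fault-draining loop; MAX_MSGS and the 100 ms
+ * timeout are arbitrary:
+ *
+ *     struct uffd_msg msgs[MAX_MSGS];
+ *     while (uffd_poll_events(fd, 100)) {
+ *         int n = uffd_read_events(fd, msgs, MAX_MSGS);
+ *         for (int i = 0; i < n; i++) {
+ *             if (msgs[i].event == UFFD_EVENT_PAGEFAULT) {
+ *                 // resolve with uffd_copy_page() or uffd_zero_page()
+ *             }
+ *         }
+ *     }
+ */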
diff --git a/util/uuid.c b/util/uuid.c
new file mode 100644
index 000000000..b1108dde7
--- /dev/null
+++ b/util/uuid.c
@@ -0,0 +1,118 @@
+/*
+ * QEMU UUID functions
+ *
+ * Copyright 2016 Red Hat, Inc.
+ *
+ * Authors:
+ * Fam Zheng <famz@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/uuid.h"
+#include "qemu/bswap.h"
+
+void qemu_uuid_generate(QemuUUID *uuid)
+{
+ int i;
+ uint32_t tmp[4];
+
+ QEMU_BUILD_BUG_ON(sizeof(QemuUUID) != 16);
+
+ for (i = 0; i < 4; ++i) {
+ tmp[i] = g_random_int();
+ }
+ memcpy(uuid, tmp, sizeof(tmp));
+ /* Set the two most significant bits (bits 6 and 7) of the
+ clock_seq_hi_and_reserved to zero and one, respectively. */
+ uuid->data[8] = (uuid->data[8] & 0x3f) | 0x80;
+ /* Set the four most significant bits (bits 12 through 15) of the
+ time_hi_and_version field to the 4-bit version number.
+ */
+ uuid->data[6] = (uuid->data[6] & 0xf) | 0x40;
+}
+
+int qemu_uuid_is_null(const QemuUUID *uu)
+{
+ static QemuUUID null_uuid;
+ return qemu_uuid_is_equal(uu, &null_uuid);
+}
+
+int qemu_uuid_is_equal(const QemuUUID *lhv, const QemuUUID *rhv)
+{
+ return memcmp(lhv, rhv, sizeof(QemuUUID)) == 0;
+}
+
+void qemu_uuid_unparse(const QemuUUID *uuid, char *out)
+{
+ const unsigned char *uu = &uuid->data[0];
+ snprintf(out, UUID_FMT_LEN + 1, UUID_FMT,
+ uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7],
+ uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]);
+}
+
+char *qemu_uuid_unparse_strdup(const QemuUUID *uuid)
+{
+ const unsigned char *uu = &uuid->data[0];
+ return g_strdup_printf(UUID_FMT,
+ uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6],
+ uu[7], uu[8], uu[9], uu[10], uu[11], uu[12],
+ uu[13], uu[14], uu[15]);
+}
+
+static bool qemu_uuid_is_valid(const char *str)
+{
+ int i;
+
+ for (i = 0; i < strlen(str); i++) {
+ const char c = str[i];
+ if (i == 8 || i == 13 || i == 18 || i == 23) {
+ if (str[i] != '-') {
+ return false;
+ }
+ } else {
+ if ((c >= '0' && c <= '9') ||
+ (c >= 'A' && c <= 'F') ||
+ (c >= 'a' && c <= 'f')) {
+ continue;
+ }
+ return false;
+ }
+ }
+ return i == 36;
+}
+
+int qemu_uuid_parse(const char *str, QemuUUID *uuid)
+{
+ unsigned char *uu = &uuid->data[0];
+ int ret;
+
+ if (!qemu_uuid_is_valid(str)) {
+ return -1;
+ }
+
+ ret = sscanf(str, UUID_FMT, &uu[0], &uu[1], &uu[2], &uu[3],
+ &uu[4], &uu[5], &uu[6], &uu[7], &uu[8], &uu[9],
+ &uu[10], &uu[11], &uu[12], &uu[13], &uu[14],
+ &uu[15]);
+
+ if (ret != 16) {
+ return -1;
+ }
+ return 0;
+}
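+
+/*
+ * Sketch: generate/unparse/parse round trip; UUID_FMT_LEN comes from
+ * "qemu/uuid.h":
+ *
+ *     QemuUUID u, v;
+ *     char buf[UUID_FMT_LEN + 1];
+ *     qemu_uuid_generate(&u);
+ *     qemu_uuid_unparse(&u, buf);
+ *     if (!qemu_uuid_parse(buf, &v)) {
+ *         assert(qemu_uuid_is_equal(&u, &v));
+ *     }
+ */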
+
+/* Swap from UUID format endianness (BE) to the opposite, or vice versa.
+ */
+QemuUUID qemu_uuid_bswap(QemuUUID uuid)
+{
+ bswap32s(&uuid.fields.time_low);
+ bswap16s(&uuid.fields.time_mid);
+ bswap16s(&uuid.fields.time_high_and_version);
+ return uuid;
+}
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
new file mode 100644
index 000000000..00a80431a
--- /dev/null
+++ b/util/vfio-helpers.c
@@ -0,0 +1,861 @@
+/*
+ * VFIO utility
+ *
+ * Copyright 2016 - 2018 Red Hat, Inc.
+ *
+ * Authors:
+ * Fam Zheng <famz@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include <sys/ioctl.h>
+#include <linux/vfio.h>
+#include "qapi/error.h"
+#include "exec/ramlist.h"
+#include "exec/cpu-common.h"
+#include "exec/memory.h"
+#include "trace.h"
+#include "qemu/error-report.h"
+#include "standard-headers/linux/pci_regs.h"
+#include "qemu/event_notifier.h"
+#include "qemu/vfio-helpers.h"
+#include "qemu/lockable.h"
+#include "trace.h"
+
+#define QEMU_VFIO_DEBUG 0
+
+#define QEMU_VFIO_IOVA_MIN 0x10000ULL
+/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
+ * we can use a runtime limit; alternatively it's also possible to do platform
+ * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
+ **/
+#define QEMU_VFIO_IOVA_MAX (1ULL << 39)
+
+typedef struct {
+ /* Page aligned addr. */
+ void *host;
+ size_t size;
+ uint64_t iova;
+} IOVAMapping;
+
+struct IOVARange {
+ uint64_t start;
+ uint64_t end;
+};
+
+struct QEMUVFIOState {
+ QemuMutex lock;
+
+ /* These fields are protected by BQL */
+ int container;
+ int group;
+ int device;
+ RAMBlockNotifier ram_notifier;
+ struct vfio_region_info config_region_info, bar_region_info[6];
+ struct IOVARange *usable_iova_ranges;
+ uint8_t nb_iova_ranges;
+
+ /* These fields are protected by @lock */
+ /* VFIO's IO virtual address space is managed by splitting it into a few
+ * sections:
+ *
+ * --------------- <= 0
+ * |xxxxxxxxxxxxx|
+ * |-------------| <= QEMU_VFIO_IOVA_MIN
+ * | |
+ * | Fixed |
+ * | |
+ * |-------------| <= low_water_mark
+ * | |
+ * | Free |
+ * | |
+ * |-------------| <= high_water_mark
+ * | |
+ * | Temp |
+ * | |
+ * |-------------| <= QEMU_VFIO_IOVA_MAX
+ * |xxxxxxxxxxxxx|
+ * |xxxxxxxxxxxxx|
+ * ---------------
+ *
+ * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
+ *
+ * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
+ * [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be
+ * reclaimed - low_water_mark never shrinks;
+ *
+ * - IOVAs in range [low_water_mark, high_water_mark) are free;
+ *
+ * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
+ * mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
+ * is recycled. The caller should make sure I/O's depending on these
+ * mappings are completed before calling.
+ **/
+ uint64_t low_water_mark;
+ uint64_t high_water_mark;
+ IOVAMapping *mappings;
+ int nr_mappings;
+};
+
+/**
+ * Find group file by PCI device address as specified @device, and return the
+ * path. The returned string is owned by caller and should be g_free'ed later.
+ */
+static char *sysfs_find_group_file(const char *device, Error **errp)
+{
+ char *sysfs_link;
+ char *sysfs_group;
+ char *p;
+ char *path = NULL;
+
+ sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
+ sysfs_group = g_malloc0(PATH_MAX);
+ if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
+ error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
+ goto out;
+ }
+ p = strrchr(sysfs_group, '/');
+ if (!p) {
+ error_setg(errp, "Failed to find iommu group number");
+ goto out;
+ }
+
+ path = g_strdup_printf("/dev/vfio/%s", p + 1);
+out:
+ g_free(sysfs_link);
+ g_free(sysfs_group);
+ return path;
+}
+
+static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
+{
+ assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
+}
+
+static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
+{
+ g_autofree char *barname = NULL;
+ assert_bar_index_valid(s, index);
+ s->bar_region_info[index] = (struct vfio_region_info) {
+ .index = VFIO_PCI_BAR0_REGION_INDEX + index,
+ .argsz = sizeof(struct vfio_region_info),
+ };
+ if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
+ error_setg_errno(errp, errno, "Failed to get BAR region info");
+ return -errno;
+ }
+ barname = g_strdup_printf("bar[%d]", index);
+ trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset,
+ s->bar_region_info[index].size,
+ s->bar_region_info[index].cap_offset);
+
+ return 0;
+}
+
+/**
+ * Map a PCI bar area.
+ */
+void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
+ uint64_t offset, uint64_t size, int prot,
+ Error **errp)
+{
+ void *p;
+ assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size));
+ assert_bar_index_valid(s, index);
+ p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
+ prot, MAP_SHARED,
+ s->device, s->bar_region_info[index].offset + offset);
+ trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset ,
+ size, offset, p);
+ if (p == MAP_FAILED) {
+ error_setg_errno(errp, errno, "Failed to map BAR region");
+ p = NULL;
+ }
+ return p;
+}
+
+/**
+ * Unmap a PCI bar area.
+ */
+void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
+ uint64_t offset, uint64_t size)
+{
+ if (bar) {
+ munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
+ }
+}
+
+/**
+ * Initialize device IRQ with @irq_type and register an event notifier.
+ */
+int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
+ int irq_type, Error **errp)
+{
+ int r;
+ struct vfio_irq_set *irq_set;
+ size_t irq_set_size;
+ struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
+
+ irq_info.index = irq_type;
+ if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
+ error_setg_errno(errp, errno, "Failed to get device interrupt info");
+ return -errno;
+ }
+ if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+ error_setg(errp, "Device interrupt doesn't support eventfd");
+ return -EINVAL;
+ }
+
+ irq_set_size = sizeof(*irq_set) + sizeof(int);
+ irq_set = g_malloc0(irq_set_size);
+
+ /* Get to a known IRQ state */
+ *irq_set = (struct vfio_irq_set) {
+ .argsz = irq_set_size,
+ .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+ .index = irq_info.index,
+ .start = 0,
+ .count = 1,
+ };
+
+ *(int *)&irq_set->data = event_notifier_get_fd(e);
+ r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
+ g_free(irq_set);
+ if (r) {
+ error_setg_errno(errp, errno, "Failed to setup device interrupt");
+ return -errno;
+ }
+ return 0;
+}
+
+static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
+ int size, int ofs)
+{
+ int ret;
+
+ trace_qemu_vfio_pci_read_config(buf, ofs, size,
+ s->config_region_info.offset,
+ s->config_region_info.size);
+ assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
+ do {
+ ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
+ } while (ret == -1 && errno == EINTR);
+ return ret == size ? 0 : -errno;
+}
+
+static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
+{
+ int ret;
+
+ trace_qemu_vfio_pci_write_config(buf, ofs, size,
+ s->config_region_info.offset,
+ s->config_region_info.size);
+ assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
+ do {
+ ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
+ } while (ret == -1 && errno == EINTR);
+ return ret == size ? 0 : -errno;
+}
+
+static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
+{
+ struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf;
+ struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset;
+ struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
+ int i;
+
+ while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
+ if (!cap->next) {
+ return;
+ }
+ cap = (struct vfio_info_cap_header *)(buf + cap->next);
+ }
+
+ cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
+
+ s->nb_iova_ranges = cap_iova_range->nr_iovas;
+ if (s->nb_iova_ranges > 1) {
+ s->usable_iova_ranges =
+ g_realloc(s->usable_iova_ranges,
+ s->nb_iova_ranges * sizeof(struct IOVARange));
+ }
+
+ for (i = 0; i < s->nb_iova_ranges; i++) {
+ s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
+ s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
+ }
+}
+
+static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
+ Error **errp)
+{
+ int ret;
+ int i;
+ uint16_t pci_cmd;
+ struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
+ struct vfio_iommu_type1_info *iommu_info = NULL;
+ size_t iommu_info_size = sizeof(*iommu_info);
+ struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+ char *group_file = NULL;
+
+ s->usable_iova_ranges = NULL;
+
+ /* Create a new container */
+ s->container = open("/dev/vfio/vfio", O_RDWR);
+
+ if (s->container == -1) {
+ error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
+ return -errno;
+ }
+ if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
+ error_setg(errp, "Invalid VFIO version");
+ ret = -EINVAL;
+ goto fail_container;
+ }
+
+ if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
+ error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported");
+ ret = -EINVAL;
+ goto fail_container;
+ }
+
+ /* Open the group */
+ group_file = sysfs_find_group_file(device, errp);
+ if (!group_file) {
+ ret = -EINVAL;
+ goto fail_container;
+ }
+
+ s->group = open(group_file, O_RDWR);
+ if (s->group == -1) {
+ error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
+ group_file);
+ g_free(group_file);
+ ret = -errno;
+ goto fail_container;
+ }
+ g_free(group_file);
+
+ /* Test the group is viable and available */
+ if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
+ error_setg_errno(errp, errno, "Failed to get VFIO group status");
+ ret = -errno;
+ goto fail;
+ }
+
+ if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+ error_setg(errp, "VFIO group is not viable");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* Add the group to the container */
+ if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
+ error_setg_errno(errp, errno, "Failed to add group to VFIO container");
+ ret = -errno;
+ goto fail;
+ }
+
+ /* Enable the IOMMU model we want */
+ if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
+ error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
+ ret = -errno;
+ goto fail;
+ }
+
+ iommu_info = g_malloc0(iommu_info_size);
+ iommu_info->argsz = iommu_info_size;
+
+ /* Get additional IOMMU info */
+ if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
+ error_setg_errno(errp, errno, "Failed to get IOMMU info");
+ ret = -errno;
+ goto fail;
+ }
+
+ /*
+ * if the kernel does not report usable IOVA regions, choose
+ * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
+ */
+ s->nb_iova_ranges = 1;
+ s->usable_iova_ranges = g_new0(struct IOVARange, 1);
+ s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
+ s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
+
+ if (iommu_info->argsz > iommu_info_size) {
+ iommu_info_size = iommu_info->argsz;
+ iommu_info = g_realloc(iommu_info, iommu_info_size);
+ if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
+ ret = -errno;
+ goto fail;
+ }
+ collect_usable_iova_ranges(s, iommu_info);
+ }
+
+ s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
+
+ if (s->device < 0) {
+ error_setg_errno(errp, errno, "Failed to get device fd");
+ ret = -errno;
+ goto fail;
+ }
+
+ /* Test and setup the device */
+ if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
+ error_setg_errno(errp, errno, "Failed to get device info");
+ ret = -errno;
+ goto fail;
+ }
+
+ if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
+ error_setg(errp, "Invalid device regions");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ s->config_region_info = (struct vfio_region_info) {
+ .index = VFIO_PCI_CONFIG_REGION_INDEX,
+ .argsz = sizeof(struct vfio_region_info),
+ };
+ if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
+ error_setg_errno(errp, errno, "Failed to get config region info");
+ ret = -errno;
+ goto fail;
+ }
+ trace_qemu_vfio_region_info("config", s->config_region_info.offset,
+ s->config_region_info.size,
+ s->config_region_info.cap_offset);
+
+ for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
+ ret = qemu_vfio_pci_init_bar(s, i, errp);
+ if (ret) {
+ goto fail;
+ }
+ }
+
+ /* Enable bus master */
+ ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
+ if (ret) {
+ goto fail;
+ }
+ pci_cmd |= PCI_COMMAND_MASTER;
+ ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
+ if (ret) {
+ goto fail;
+ }
+ g_free(iommu_info);
+ return 0;
+fail:
+ g_free(s->usable_iova_ranges);
+ s->usable_iova_ranges = NULL;
+ s->nb_iova_ranges = 0;
+ g_free(iommu_info);
+ close(s->group);
+fail_container:
+ close(s->container);
+ return ret;
+}
+
+static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, void *host,
+ size_t size, size_t max_size)
+{
+ QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
+ Error *local_err = NULL;
+ int ret;
+
+ trace_qemu_vfio_ram_block_added(s, host, max_size);
+ ret = qemu_vfio_dma_map(s, host, max_size, false, NULL, &local_err);
+ if (ret) {
+ error_reportf_err(local_err,
+ "qemu_vfio_dma_map(%p, %zu) failed: ",
+ host, max_size);
+ }
+}
+
+static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, void *host,
+ size_t size, size_t max_size)
+{
+ QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
+ if (host) {
+ trace_qemu_vfio_ram_block_removed(s, host, max_size);
+ qemu_vfio_dma_unmap(s, host);
+ }
+}
+
+static void qemu_vfio_open_common(QEMUVFIOState *s)
+{
+ qemu_mutex_init(&s->lock);
+ s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
+ s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
+ s->low_water_mark = QEMU_VFIO_IOVA_MIN;
+ s->high_water_mark = QEMU_VFIO_IOVA_MAX;
+ ram_block_notifier_add(&s->ram_notifier);
+}
+
+/**
+ * Open a PCI device, e.g. "0000:00:01.0".
+ */
+QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
+{
+ int r;
+ QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);
+
+ /*
+ * VFIO may pin all memory inside mappings, resulting in it pinning
+ * all memory inside RAM blocks unconditionally.
+ */
+ r = ram_block_discard_disable(true);
+ if (r) {
+ error_setg_errno(errp, -r, "Cannot set discarding of RAM broken");
+ g_free(s);
+ return NULL;
+ }
+
+ r = qemu_vfio_init_pci(s, device, errp);
+ if (r) {
+ ram_block_discard_disable(false);
+ g_free(s);
+ return NULL;
+ }
+ qemu_vfio_open_common(s);
+ return s;
+}
+
+static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
+{
+ for (int i = 0; i < s->nr_mappings; ++i) {
+ trace_qemu_vfio_dump_mapping(s->mappings[i].host,
+ s->mappings[i].iova,
+ s->mappings[i].size);
+ }
+}
+
+/**
+ * Find the mapping entry that contains [host, host + size) and set @index to
+ * the position. If no entry contains it, @index is the position _after_ which
+ * to insert the new mapping. IOW, it is the index of the largest element that
+ * is smaller than @host, or -1 if no entry is.
+ */
+static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
+ int *index)
+{
+ IOVAMapping *p = s->mappings;
+ IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
+ IOVAMapping *mid;
+ trace_qemu_vfio_find_mapping(s, host);
+ if (!p) {
+ *index = -1;
+ return NULL;
+ }
+ while (true) {
+ mid = p + (q - p) / 2;
+ if (mid == p) {
+ break;
+ }
+ if (mid->host > host) {
+ q = mid;
+ } else if (mid->host < host) {
+ p = mid;
+ } else {
+ break;
+ }
+ }
+ if (mid->host > host) {
+ mid--;
+ } else if (mid < &s->mappings[s->nr_mappings - 1]
+ && (mid + 1)->host <= host) {
+ mid++;
+ }
+ *index = mid - &s->mappings[0];
+ if (mid >= &s->mappings[0] &&
+ mid->host <= host && mid->host + mid->size > host) {
+ assert(mid < &s->mappings[s->nr_mappings]);
+ return mid;
+ }
+ /* At this point *index + 1 is the right position to insert the new
+ * mapping. */
+ return NULL;
+}
+
+/**
+ * Allocate IOVA and create a new mapping record and insert it in @s.
+ */
+static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
+ void *host, size_t size,
+ int index, uint64_t iova)
+{
+ int shift;
+ IOVAMapping m = {.host = host, .size = size, .iova = iova};
+ IOVAMapping *insert;
+
+ assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
+ assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size));
+ assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size));
+ trace_qemu_vfio_new_mapping(s, host, size, index, iova);
+
+ assert(index >= 0);
+ s->nr_mappings++;
+ s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
+ insert = &s->mappings[index];
+ shift = s->nr_mappings - index - 1;
+ if (shift) {
+ memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
+ }
+ *insert = m;
+ return insert;
+}
+
+/* Do the DMA mapping with VFIO. */
+static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
+ uint64_t iova, Error **errp)
+{
+ struct vfio_iommu_type1_dma_map dma_map = {
+ .argsz = sizeof(dma_map),
+ .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+ .iova = iova,
+ .vaddr = (uintptr_t)host,
+ .size = size,
+ };
+ trace_qemu_vfio_do_mapping(s, host, iova, size);
+
+ if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
+ error_setg_errno(errp, errno, "VFIO_MAP_DMA failed");
+ return -errno;
+ }
+ return 0;
+}
+
+/**
+ * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
+ */
+static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
+ Error **errp)
+{
+ int index;
+ struct vfio_iommu_type1_dma_unmap unmap = {
+ .argsz = sizeof(unmap),
+ .flags = 0,
+ .iova = mapping->iova,
+ .size = mapping->size,
+ };
+
+ index = mapping - s->mappings;
+ assert(mapping->size > 0);
+ assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size));
+ assert(index >= 0 && index < s->nr_mappings);
+ if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+ error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
+ }
+ memmove(mapping, &s->mappings[index + 1],
+ sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
+ s->nr_mappings--;
+ s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
+}
+
+/* Check if the mapping list is (ascending) ordered. */
+static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
+{
+ int i;
+ if (QEMU_VFIO_DEBUG) {
+ for (i = 0; i < s->nr_mappings - 1; ++i) {
+ if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
+ error_report("item %d not sorted!", i);
+ qemu_vfio_dump_mappings(s);
+ return false;
+ }
+ if (!(s->mappings[i].host + s->mappings[i].size <=
+ s->mappings[i + 1].host)) {
+ error_report("item %d overlap with next!", i);
+ qemu_vfio_dump_mappings(s);
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static bool qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size,
+ uint64_t *iova, Error **errp)
+{
+ int i;
+
+ for (i = 0; i < s->nb_iova_ranges; i++) {
+ if (s->usable_iova_ranges[i].end < s->low_water_mark) {
+ continue;
+ }
+ s->low_water_mark =
+ MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
+
+ if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
+ s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
+ *iova = s->low_water_mark;
+ s->low_water_mark += size;
+ return true;
+ }
+ }
+ error_setg(errp, "fixed iova range not found");
+
+ return false;
+}
+
+static bool qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size,
+ uint64_t *iova, Error **errp)
+{
+ int i;
+
+ for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
+ if (s->usable_iova_ranges[i].start > s->high_water_mark) {
+ continue;
+ }
+ s->high_water_mark =
+ MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);
+
+ if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
+ s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
+ *iova = s->high_water_mark - size;
+ s->high_water_mark = *iova;
+ return true;
+ }
+ }
+ error_setg(errp, "temporary iova range not found");
+
+ return false;
+}
+
+/**
+ * qemu_vfio_water_mark_reached:
+ *
+ * Returns %true if an allocation of @size bytes would make the low and high
+ * water marks cross, i.e. the free IOVA space is exhausted, %false otherwise.
+ */
+static bool qemu_vfio_water_mark_reached(QEMUVFIOState *s, size_t size,
+ Error **errp)
+{
+ if (s->high_water_mark - s->low_water_mark + 1 < size) {
+ error_setg(errp, "iova exhausted (water mark reached)");
+ return true;
+ }
+ return false;
+}
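+
+/*
+ * Informal sketch of the IOVA allocation scheme used by qemu_vfio_dma_map()
+ * below: fixed mappings are carved out bottom-up and advance the low water
+ * mark, temporary mappings are carved out top-down and lower the high water
+ * mark, and allocation fails once the two marks would cross:
+ *
+ *   [fixed ...] low_water_mark --> free <-- high_water_mark [... temporary]
+ */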
+
+/* Map the [host, host + size) area into a contiguous IOVA address space, and
+ * store the result in @iova if not NULL. The caller needs to make sure the
+ * area is aligned to page size and doesn't overlap existing mapping areas
+ * (partially overlapping an existing mapping is not allowed).
+ */
+int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
+ bool temporary, uint64_t *iova, Error **errp)
+{
+ int index;
+ IOVAMapping *mapping;
+ uint64_t iova0;
+
+ assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size));
+ assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
+ trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
+ QEMU_LOCK_GUARD(&s->lock);
+ mapping = qemu_vfio_find_mapping(s, host, &index);
+ if (mapping) {
+ iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
+ } else {
+ int ret;
+
+ if (qemu_vfio_water_mark_reached(s, size, errp)) {
+ return -ENOMEM;
+ }
+ if (!temporary) {
+ if (!qemu_vfio_find_fixed_iova(s, size, &iova0, errp)) {
+ return -ENOMEM;
+ }
+
+ mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
+ assert(qemu_vfio_verify_mappings(s));
+ ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
+ if (ret < 0) {
+ qemu_vfio_undo_mapping(s, mapping, NULL);
+ return ret;
+ }
+ qemu_vfio_dump_mappings(s);
+ } else {
+ if (!qemu_vfio_find_temp_iova(s, size, &iova0, errp)) {
+ return -ENOMEM;
+ }
+ ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ }
+ trace_qemu_vfio_dma_mapped(s, host, iova0, size);
+ if (iova) {
+ *iova = iova0;
+ }
+ return 0;
+}
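+
+/*
+ * Example call sequence (a sketch, not taken from real code; the buffer and
+ * its size are hypothetical):
+ *
+ *   uint64_t iova;
+ *   void *buf = qemu_memalign(qemu_real_host_page_size, 8192);
+ *
+ *   if (qemu_vfio_dma_map(s, buf, 8192, false, &iova, errp) == 0) {
+ *       ... program the device with iova ...
+ *       qemu_vfio_dma_unmap(s, buf);
+ *   }
+ */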
+
+/* Reset the high water mark and free all "temporary" mappings. */
+int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
+{
+ struct vfio_iommu_type1_dma_unmap unmap = {
+ .argsz = sizeof(unmap),
+ .flags = 0,
+ .iova = s->high_water_mark,
+ .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
+ };
+ trace_qemu_vfio_dma_reset_temporary(s);
+ QEMU_LOCK_GUARD(&s->lock);
+ if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+ error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
+ return -errno;
+ }
+ s->high_water_mark = QEMU_VFIO_IOVA_MAX;
+ return 0;
+}
+
+/* Unmap the whole area that was previously mapped with
+ * qemu_vfio_dma_map(). */
+void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
+{
+ int index = 0;
+ IOVAMapping *m;
+
+ if (!host) {
+ return;
+ }
+
+ trace_qemu_vfio_dma_unmap(s, host);
+ QEMU_LOCK_GUARD(&s->lock);
+ m = qemu_vfio_find_mapping(s, host, &index);
+ if (!m) {
+ return;
+ }
+ qemu_vfio_undo_mapping(s, m, NULL);
+}
+
+static void qemu_vfio_reset(QEMUVFIOState *s)
+{
+ ioctl(s->device, VFIO_DEVICE_RESET);
+}
+
+/* Close and free the VFIO resources. */
+void qemu_vfio_close(QEMUVFIOState *s)
+{
+    if (!s) {
+        return;
+    }
+    /*
+     * qemu_vfio_undo_mapping() removes the entry and shifts the remaining
+     * ones down, so keep removing the last mapping until none are left
+     * instead of indexing forward (which would skip entries).
+     */
+    while (s->nr_mappings) {
+        qemu_vfio_undo_mapping(s, &s->mappings[s->nr_mappings - 1], NULL);
+    }
+ ram_block_notifier_remove(&s->ram_notifier);
+ g_free(s->usable_iova_ranges);
+ s->nb_iova_ranges = 0;
+ qemu_vfio_reset(s);
+ close(s->device);
+ close(s->group);
+ close(s->container);
+ ram_block_discard_disable(false);
+}
diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
new file mode 100644
index 000000000..783d847a6
--- /dev/null
+++ b/util/vhost-user-server.c
@@ -0,0 +1,446 @@
+/*
+ * Sharing QEMU devices via vhost-user protocol
+ *
+ * Copyright (c) Coiby Xu <coiby.xu@gmail.com>.
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "qemu/vhost-user-server.h"
+#include "block/aio-wait.h"
+
+/*
+ * Theory of operation:
+ *
+ * VuServer is started and stopped by vhost_user_server_start() and
+ * vhost_user_server_stop() from the main loop thread. Starting the server
+ * opens a vhost-user UNIX domain socket and listens for incoming connections.
+ * Only one connection is allowed at a time.
+ *
+ * The connection is handled by the vu_client_trip() coroutine in the
+ * VuServer->ctx AioContext. The coroutine consists of a vu_dispatch() loop
+ * where libvhost-user calls vu_message_read() to receive the next vhost-user
+ * protocol message over the UNIX domain socket.
+ *
+ * When virtqueues are set up, libvhost-user calls set_watch() to monitor kick
+ * fds. These fds are also handled in the VuServer->ctx AioContext.
+ *
+ * Both vu_client_trip() and kick fd monitoring can be stopped by shutting down
+ * the socket connection. Shutting down the socket connection causes
+ * vu_message_read() to fail since no more data can be received from the socket.
+ * After vu_dispatch() fails, vu_client_trip() calls vu_deinit() to stop
+ * libvhost-user before terminating the coroutine. vu_deinit() calls
+ * remove_watch() to stop monitoring kick fds and this stops virtqueue
+ * processing.
+ *
+ * When vu_client_trip() has finished cleaning up it schedules a BH in the main
+ * loop thread to accept the next client connection.
+ *
+ * When libvhost-user detects an error it calls panic_cb() and sets the
+ * dev->broken flag. Both vu_client_trip() and kick fd processing stop when
+ * the dev->broken flag is set.
+ *
+ * It is possible to switch AioContexts using
+ * vhost_user_server_detach_aio_context() and
+ * vhost_user_server_attach_aio_context(). They stop monitoring fds in the old
+ * AioContext and resume monitoring in the new AioContext. The vu_client_trip()
+ * coroutine remains in a yielded state during the switch. This is made
+ * possible by QIOChannel's support for spurious coroutine re-entry in
+ * qio_channel_yield(). The coroutine will restart I/O when re-entered from the
+ * new AioContext.
+ */
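+
+/*
+ * Typical usage from the main loop thread (a sketch under assumed names:
+ * "addr" is a caller-provided UNIX SocketAddress and "iface" a VuDevIface):
+ *
+ *   static VuServer server;
+ *
+ *   if (!vhost_user_server_start(&server, addr, ctx, num_queues, &iface,
+ *                                errp)) {
+ *       return;
+ *   }
+ *   ... serve clients; when shutting down: ...
+ *   vhost_user_server_stop(&server);
+ */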
+
+static void vmsg_close_fds(VhostUserMsg *vmsg)
+{
+ int i;
+ for (i = 0; i < vmsg->fd_num; i++) {
+ close(vmsg->fds[i]);
+ }
+}
+
+static void vmsg_unblock_fds(VhostUserMsg *vmsg)
+{
+ int i;
+ for (i = 0; i < vmsg->fd_num; i++) {
+ qemu_set_nonblock(vmsg->fds[i]);
+ }
+}
+
+static void panic_cb(VuDev *vu_dev, const char *buf)
+{
+ error_report("vu_panic: %s", buf);
+}
+
+static bool coroutine_fn
+vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
+{
+ struct iovec iov = {
+ .iov_base = (char *)vmsg,
+ .iov_len = VHOST_USER_HDR_SIZE,
+ };
+ int rc, read_bytes = 0;
+ Error *local_err = NULL;
+ const size_t max_fds = G_N_ELEMENTS(vmsg->fds);
+ VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+ QIOChannel *ioc = server->ioc;
+
+ vmsg->fd_num = 0;
+ if (!ioc) {
+        error_report("vhost-user channel is not available");
+ goto fail;
+ }
+
+ assert(qemu_in_coroutine());
+ do {
+ size_t nfds = 0;
+ int *fds = NULL;
+
+        /*
+         * qio_channel_readv_full() may return a short read; keep calling it
+         * until we have received VHOST_USER_HDR_SIZE bytes in total, or 0
+         * bytes (socket closed).
+         */
+ rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, &local_err);
+ if (rc < 0) {
+ if (rc == QIO_CHANNEL_ERR_BLOCK) {
+ assert(local_err == NULL);
+ qio_channel_yield(ioc, G_IO_IN);
+ continue;
+ } else {
+ error_report_err(local_err);
+ goto fail;
+ }
+ }
+
+ if (nfds > 0) {
+ if (vmsg->fd_num + nfds > max_fds) {
+ error_report("A maximum of %zu fds are allowed, "
+ "however got %zu fds now",
+ max_fds, vmsg->fd_num + nfds);
+ g_free(fds);
+ goto fail;
+ }
+ memcpy(vmsg->fds + vmsg->fd_num, fds, nfds * sizeof(vmsg->fds[0]));
+ vmsg->fd_num += nfds;
+ g_free(fds);
+ }
+
+ if (rc == 0) { /* socket closed */
+ goto fail;
+ }
+
+ iov.iov_base += rc;
+ iov.iov_len -= rc;
+ read_bytes += rc;
+ } while (read_bytes != VHOST_USER_HDR_SIZE);
+
+    /* qio_channel_readv_full() makes the received fds blocking; unblock them */
+ vmsg_unblock_fds(vmsg);
+ if (vmsg->size > sizeof(vmsg->payload)) {
+ error_report("Error: too big message request: %d, "
+ "size: vmsg->size: %u, "
+ "while sizeof(vmsg->payload) = %zu",
+ vmsg->request, vmsg->size, sizeof(vmsg->payload));
+ goto fail;
+ }
+
+ struct iovec iov_payload = {
+ .iov_base = (char *)&vmsg->payload,
+ .iov_len = vmsg->size,
+ };
+ if (vmsg->size) {
+ rc = qio_channel_readv_all_eof(ioc, &iov_payload, 1, &local_err);
+ if (rc != 1) {
+ if (local_err) {
+ error_report_err(local_err);
+ }
+ goto fail;
+ }
+ }
+
+ return true;
+
+fail:
+ vmsg_close_fds(vmsg);
+
+ return false;
+}
+
+static coroutine_fn void vu_client_trip(void *opaque)
+{
+ VuServer *server = opaque;
+ VuDev *vu_dev = &server->vu_dev;
+
+ while (!vu_dev->broken && vu_dispatch(vu_dev)) {
+ /* Keep running */
+ }
+
+ vu_deinit(vu_dev);
+
+ /* vu_deinit() should have called remove_watch() */
+ assert(QTAILQ_EMPTY(&server->vu_fd_watches));
+
+ object_unref(OBJECT(server->sioc));
+ server->sioc = NULL;
+
+ object_unref(OBJECT(server->ioc));
+ server->ioc = NULL;
+
+ server->co_trip = NULL;
+ if (server->restart_listener_bh) {
+ qemu_bh_schedule(server->restart_listener_bh);
+ }
+ aio_wait_kick();
+}
+
+/*
+ * A wrapper for vu_kick_cb.
+ *
+ * Since aio_dispatch() can only pass one user data pointer to the callback
+ * function, VuDev and pvt are packed into a VuFdWatch struct; unpack it here
+ * and pass both on to vu_kick_cb().
+ */
+static void kick_handler(void *opaque)
+{
+ VuFdWatch *vu_fd_watch = opaque;
+ VuDev *vu_dev = vu_fd_watch->vu_dev;
+
+ vu_fd_watch->cb(vu_dev, 0, vu_fd_watch->pvt);
+
+ /* Stop vu_client_trip() if an error occurred in vu_fd_watch->cb() */
+ if (vu_dev->broken) {
+ VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+
+ qio_channel_shutdown(server->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+ }
+}
+
+static VuFdWatch *find_vu_fd_watch(VuServer *server, int fd)
+{
+    VuFdWatch *vu_fd_watch;
+
+    QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) {
+ if (vu_fd_watch->fd == fd) {
+ return vu_fd_watch;
+ }
+ }
+ return NULL;
+}
+
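+/*
+ * set_watch() and remove_watch() are the libvhost-user callbacks for starting
+ * and stopping kick fd monitoring. They translate those requests into
+ * aio_set_fd_handler() calls on the server's AioContext.
+ */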
+static void
+set_watch(VuDev *vu_dev, int fd, int vu_evt,
+ vu_watch_cb cb, void *pvt)
+{
+ VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+ g_assert(vu_dev);
+ g_assert(fd >= 0);
+ g_assert(cb);
+
+ VuFdWatch *vu_fd_watch = find_vu_fd_watch(server, fd);
+
+ if (!vu_fd_watch) {
+        vu_fd_watch = g_new0(VuFdWatch, 1);
+
+ QTAILQ_INSERT_TAIL(&server->vu_fd_watches, vu_fd_watch, next);
+
+ vu_fd_watch->fd = fd;
+ vu_fd_watch->cb = cb;
+ qemu_set_nonblock(fd);
+ aio_set_fd_handler(server->ioc->ctx, fd, true, kick_handler,
+ NULL, NULL, vu_fd_watch);
+ vu_fd_watch->vu_dev = vu_dev;
+ vu_fd_watch->pvt = pvt;
+ }
+}
+
+static void remove_watch(VuDev *vu_dev, int fd)
+{
+ VuServer *server;
+ g_assert(vu_dev);
+ g_assert(fd >= 0);
+
+ server = container_of(vu_dev, VuServer, vu_dev);
+
+ VuFdWatch *vu_fd_watch = find_vu_fd_watch(server, fd);
+
+ if (!vu_fd_watch) {
+ return;
+ }
+ aio_set_fd_handler(server->ioc->ctx, fd, true, NULL, NULL, NULL, NULL);
+
+ QTAILQ_REMOVE(&server->vu_fd_watches, vu_fd_watch, next);
+ g_free(vu_fd_watch);
+}
+
+static void vu_accept(QIONetListener *listener, QIOChannelSocket *sioc,
+ gpointer opaque)
+{
+ VuServer *server = opaque;
+
+ if (server->sioc) {
+ warn_report("Only one vhost-user client is allowed to "
+ "connect the server one time");
+ return;
+ }
+
+ if (!vu_init(&server->vu_dev, server->max_queues, sioc->fd, panic_cb,
+ vu_message_read, set_watch, remove_watch, server->vu_iface)) {
+ error_report("Failed to initialize libvhost-user");
+ return;
+ }
+
+    /*
+     * Unset the callback function for the network listener so that other
+     * vhost-user clients keep waiting until this client disconnects.
+     */
+ qio_net_listener_set_client_func(server->listener,
+ NULL,
+ NULL,
+ NULL);
+ server->sioc = sioc;
+    /*
+     * Increase the object reference, so that sioc is not freed by
+     * qio_net_listener_channel_func(), which calls object_unref(OBJECT(sioc)).
+     */
+ object_ref(OBJECT(server->sioc));
+ qio_channel_set_name(QIO_CHANNEL(sioc), "vhost-user client");
+ server->ioc = QIO_CHANNEL(sioc);
+ object_ref(OBJECT(server->ioc));
+
+ /* TODO vu_message_write() spins if non-blocking! */
+ qio_channel_set_blocking(server->ioc, false, NULL);
+
+ server->co_trip = qemu_coroutine_create(vu_client_trip, server);
+
+ aio_context_acquire(server->ctx);
+ vhost_user_server_attach_aio_context(server, server->ctx);
+ aio_context_release(server->ctx);
+}
+
+void vhost_user_server_stop(VuServer *server)
+{
+ aio_context_acquire(server->ctx);
+
+ qemu_bh_delete(server->restart_listener_bh);
+ server->restart_listener_bh = NULL;
+
+ if (server->sioc) {
+ VuFdWatch *vu_fd_watch;
+
+ QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) {
+ aio_set_fd_handler(server->ctx, vu_fd_watch->fd, true,
+ NULL, NULL, NULL, vu_fd_watch);
+ }
+
+ qio_channel_shutdown(server->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+
+ AIO_WAIT_WHILE(server->ctx, server->co_trip);
+ }
+
+ aio_context_release(server->ctx);
+
+ if (server->listener) {
+ qio_net_listener_disconnect(server->listener);
+ object_unref(OBJECT(server->listener));
+ }
+}
+
+/*
+ * Allow the next client to connect to the server. Called from a BH in the main
+ * loop.
+ */
+static void restart_listener_bh(void *opaque)
+{
+ VuServer *server = opaque;
+
+ qio_net_listener_set_client_func(server->listener, vu_accept, server,
+ NULL);
+}
+
+/* Called with ctx acquired */
+void vhost_user_server_attach_aio_context(VuServer *server, AioContext *ctx)
+{
+ VuFdWatch *vu_fd_watch;
+
+ server->ctx = ctx;
+
+ if (!server->sioc) {
+ return;
+ }
+
+ qio_channel_attach_aio_context(server->ioc, ctx);
+
+ QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) {
+ aio_set_fd_handler(ctx, vu_fd_watch->fd, true, kick_handler, NULL,
+ NULL, vu_fd_watch);
+ }
+
+ aio_co_schedule(ctx, server->co_trip);
+}
+
+/* Called with server->ctx acquired */
+void vhost_user_server_detach_aio_context(VuServer *server)
+{
+ if (server->sioc) {
+ VuFdWatch *vu_fd_watch;
+
+ QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) {
+ aio_set_fd_handler(server->ctx, vu_fd_watch->fd, true,
+ NULL, NULL, NULL, vu_fd_watch);
+ }
+
+ qio_channel_detach_aio_context(server->ioc);
+ }
+
+ server->ctx = NULL;
+}
+
+bool vhost_user_server_start(VuServer *server,
+ SocketAddress *socket_addr,
+ AioContext *ctx,
+ uint16_t max_queues,
+ const VuDevIface *vu_iface,
+ Error **errp)
+{
+ QEMUBH *bh;
+ QIONetListener *listener;
+
+ if (socket_addr->type != SOCKET_ADDRESS_TYPE_UNIX &&
+ socket_addr->type != SOCKET_ADDRESS_TYPE_FD) {
+ error_setg(errp, "Only socket address types 'unix' and 'fd' are supported");
+ return false;
+ }
+
+ listener = qio_net_listener_new();
+ if (qio_net_listener_open_sync(listener, socket_addr, 1,
+ errp) < 0) {
+ object_unref(OBJECT(listener));
+ return false;
+ }
+
+ bh = qemu_bh_new(restart_listener_bh, server);
+
+ /* zero out unspecified fields */
+ *server = (VuServer) {
+ .listener = listener,
+ .restart_listener_bh = bh,
+ .vu_iface = vu_iface,
+ .max_queues = max_queues,
+ .ctx = ctx,
+ };
+
+ qio_net_listener_set_name(server->listener, "vhost-user-backend-listener");
+
+ qio_net_listener_set_client_func(server->listener,
+ vu_accept,
+ server,
+ NULL);
+
+ QTAILQ_INIT(&server->vu_fd_watches);
+ return true;
+}
diff --git a/util/yank.c b/util/yank.c
new file mode 100644
index 000000000..abf47c346
--- /dev/null
+++ b/util/yank.c
@@ -0,0 +1,199 @@
+/*
+ * QEMU yank feature
+ *
+ * Copyright (c) Lukas Straub <lukasstraub2@web.de>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/thread.h"
+#include "qemu/queue.h"
+#include "qemu/lockable.h"
+#include "qapi/qapi-commands-yank.h"
+#include "qapi/qapi-visit-yank.h"
+#include "qapi/clone-visitor.h"
+#include "qemu/yank.h"
+
+struct YankFuncAndParam {
+ YankFn *func;
+ void *opaque;
+ QLIST_ENTRY(YankFuncAndParam) next;
+};
+
+struct YankInstanceEntry {
+ YankInstance *instance;
+ QLIST_HEAD(, YankFuncAndParam) yankfns;
+ QLIST_ENTRY(YankInstanceEntry) next;
+};
+
+typedef struct YankFuncAndParam YankFuncAndParam;
+typedef struct YankInstanceEntry YankInstanceEntry;
+
+/*
+ * This lock protects the yank_instance_list below. Because it's taken by
+ * OOB-capable commands, it must be "fast", i.e. it may only be held for a
+ * bounded, short time. See docs/devel/qapi-code-gen.txt for additional
+ * information.
+ */
+static QemuMutex yank_lock;
+
+static QLIST_HEAD(, YankInstanceEntry) yank_instance_list
+ = QLIST_HEAD_INITIALIZER(yank_instance_list);
+
+static bool yank_instance_equal(const YankInstance *a, const YankInstance *b)
+{
+ if (a->type != b->type) {
+ return false;
+ }
+
+ switch (a->type) {
+ case YANK_INSTANCE_TYPE_BLOCK_NODE:
+ return g_str_equal(a->u.block_node.node_name,
+ b->u.block_node.node_name);
+
+ case YANK_INSTANCE_TYPE_CHARDEV:
+ return g_str_equal(a->u.chardev.id, b->u.chardev.id);
+
+ case YANK_INSTANCE_TYPE_MIGRATION:
+ return true;
+
+ default:
+ abort();
+ }
+}
+
+static YankInstanceEntry *yank_find_entry(const YankInstance *instance)
+{
+ YankInstanceEntry *entry;
+
+ QLIST_FOREACH(entry, &yank_instance_list, next) {
+ if (yank_instance_equal(entry->instance, instance)) {
+ return entry;
+ }
+ }
+ return NULL;
+}
+
+bool yank_register_instance(const YankInstance *instance, Error **errp)
+{
+ YankInstanceEntry *entry;
+
+ QEMU_LOCK_GUARD(&yank_lock);
+
+ if (yank_find_entry(instance)) {
+ error_setg(errp, "duplicate yank instance");
+ return false;
+ }
+
+ entry = g_new0(YankInstanceEntry, 1);
+ entry->instance = QAPI_CLONE(YankInstance, instance);
+ QLIST_INIT(&entry->yankfns);
+ QLIST_INSERT_HEAD(&yank_instance_list, entry, next);
+
+ return true;
+}
+
+void yank_unregister_instance(const YankInstance *instance)
+{
+ YankInstanceEntry *entry;
+
+ QEMU_LOCK_GUARD(&yank_lock);
+ entry = yank_find_entry(instance);
+ assert(entry);
+
+ assert(QLIST_EMPTY(&entry->yankfns));
+ QLIST_REMOVE(entry, next);
+ qapi_free_YankInstance(entry->instance);
+ g_free(entry);
+}
+
+void yank_register_function(const YankInstance *instance,
+ YankFn *func,
+ void *opaque)
+{
+ YankInstanceEntry *entry;
+ YankFuncAndParam *func_entry;
+
+ QEMU_LOCK_GUARD(&yank_lock);
+ entry = yank_find_entry(instance);
+ assert(entry);
+
+ func_entry = g_new0(YankFuncAndParam, 1);
+ func_entry->func = func;
+ func_entry->opaque = opaque;
+
+ QLIST_INSERT_HEAD(&entry->yankfns, func_entry, next);
+}
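+
+/*
+ * Typical registration sequence (a sketch; the chardev id and the shutdown
+ * helper are hypothetical):
+ *
+ *   YankInstance instance = {
+ *       .type = YANK_INSTANCE_TYPE_CHARDEV,
+ *       .u.chardev.id = (char *)"chardev0",
+ *   };
+ *
+ *   if (yank_register_instance(&instance, errp)) {
+ *       yank_register_function(&instance, my_shutdown_fn, state);
+ *   }
+ */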
+
+void yank_unregister_function(const YankInstance *instance,
+ YankFn *func,
+ void *opaque)
+{
+ YankInstanceEntry *entry;
+ YankFuncAndParam *func_entry;
+
+ QEMU_LOCK_GUARD(&yank_lock);
+ entry = yank_find_entry(instance);
+ assert(entry);
+
+ QLIST_FOREACH(func_entry, &entry->yankfns, next) {
+ if (func_entry->func == func && func_entry->opaque == opaque) {
+ QLIST_REMOVE(func_entry, next);
+ g_free(func_entry);
+ return;
+ }
+ }
+
+ abort();
+}
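+
+/*
+ * qmp_yank() makes two passes over @instances: the first only verifies that
+ * every requested instance exists, the second invokes the registered yank
+ * functions. This keeps the command all-or-nothing: if any instance is
+ * missing, no yank function runs.
+ */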
+
+void qmp_yank(YankInstanceList *instances,
+ Error **errp)
+{
+ YankInstanceList *tail;
+ YankInstanceEntry *entry;
+ YankFuncAndParam *func_entry;
+
+ QEMU_LOCK_GUARD(&yank_lock);
+ for (tail = instances; tail; tail = tail->next) {
+ entry = yank_find_entry(tail->value);
+ if (!entry) {
+ error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, "Instance not found");
+ return;
+ }
+ }
+ for (tail = instances; tail; tail = tail->next) {
+ entry = yank_find_entry(tail->value);
+ assert(entry);
+ QLIST_FOREACH(func_entry, &entry->yankfns, next) {
+ func_entry->func(func_entry->opaque);
+ }
+ }
+}
+
+YankInstanceList *qmp_query_yank(Error **errp)
+{
+ YankInstanceEntry *entry;
+ YankInstanceList *ret;
+
+ ret = NULL;
+
+ QEMU_LOCK_GUARD(&yank_lock);
+ QLIST_FOREACH(entry, &yank_instance_list, next) {
+ YankInstanceList *new_entry;
+ new_entry = g_new0(YankInstanceList, 1);
+ new_entry->value = QAPI_CLONE(YankInstance, entry->instance);
+ new_entry->next = ret;
+ ret = new_entry;
+ }
+
+ return ret;
+}
+
+static void __attribute__((__constructor__)) yank_init(void)
+{
+ qemu_mutex_init(&yank_lock);
+}