author    Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>  2023-10-10 11:40:56 +0000
committer Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>  2023-10-10 11:40:56 +0000
commit    e02cda008591317b1625707ff8e115a4841aa889 (patch)
tree      aee302e3cf8b59ec2d32ec481be3d1afddfc8968 /util
parent    cc668e6b7e0ffd8c9d130513d12053cf5eda1d3b (diff)
Introduce Virtio-loopback epsilon release:
The epsilon release introduces a new compatibility layer which makes the virtio-loopback design work with QEMU and the rust-vmm vhost-user backend without requiring any changes.

Signed-off-by: Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>
Change-Id: I52e57563e08a7d0bdc002f8e928ee61ba0c53dd9
Diffstat (limited to 'util')
-rw-r--r--  util/aio-posix.c                 730
-rw-r--r--  util/aio-posix.h                  81
-rw-r--r--  util/aio-wait.c                   72
-rw-r--r--  util/aio-win32.c                 447
-rw-r--r--  util/aiocb.c                      55
-rw-r--r--  util/async.c                     690
-rw-r--r--  util/atomic64.c                   83
-rw-r--r--  util/base64.c                     60
-rw-r--r--  util/bitmap.c                    489
-rw-r--r--  util/bitops.c                    157
-rw-r--r--  util/block-helpers.c              46
-rw-r--r--  util/block-helpers.h              19
-rw-r--r--  util/buffer.c                    173
-rw-r--r--  util/bufferiszero.c              355
-rw-r--r--  util/cacheflush.c                146
-rw-r--r--  util/cacheinfo.c                 199
-rw-r--r--  util/compatfd.c                  106
-rw-r--r--  util/coroutine-sigaltstack.c     304
-rw-r--r--  util/coroutine-ucontext.c        332
-rw-r--r--  util/coroutine-win32.c           102
-rw-r--r--  util/crc-ccitt.c                 127
-rw-r--r--  util/crc32c.c                    115
-rw-r--r--  util/cutils.c                   1059
-rw-r--r--  util/dbus.c                       57
-rw-r--r--  util/drm.c                        75
-rw-r--r--  util/envlist.c                   232
-rw-r--r--  util/error.c                     305
-rw-r--r--  util/event_notifier-posix.c      140
-rw-r--r--  util/event_notifier-win32.c       50
-rw-r--r--  util/fdmon-epoll.c               155
-rw-r--r--  util/fdmon-io_uring.c            361
-rw-r--r--  util/fdmon-poll.c                108
-rw-r--r--  util/fifo8.c                     118
-rw-r--r--  util/filemonitor-inotify.c       339
-rw-r--r--  util/filemonitor-stub.c           59
-rw-r--r--  util/getauxval.c                 118
-rw-r--r--  util/guest-random.c              101
-rw-r--r--  util/hbitmap.c                   933
-rw-r--r--  util/hexdump.c                    65
-rw-r--r--  util/host-utils.c                268
-rw-r--r--  util/id.c                         69
-rw-r--r--  util/iov.c                       764
-rw-r--r--  util/iova-tree.c                 114
-rw-r--r--  util/keyval.c                    577
-rw-r--r--  util/lockcnt.c                   399
-rw-r--r--  util/log.c                       389
-rw-r--r--  util/main-loop.c                 594
-rw-r--r--  util/memfd.c                     206
-rw-r--r--  util/meson.build                  89
-rw-r--r--  util/mmap-alloc.c                306
-rw-r--r--  util/module.c                    387
-rw-r--r--  util/notify.c                     76
-rw-r--r--  util/nvdimm-utils.c               30
-rw-r--r--  util/osdep.c                     617
-rw-r--r--  util/oslib-posix.c               860
-rw-r--r--  util/oslib-win32.c               654
-rw-r--r--  util/pagesize.c                   18
-rw-r--r--  util/path.c                       70
-rw-r--r--  util/qdist.c                     397
-rw-r--r--  util/qemu-co-shared-resource.c    90
-rw-r--r--  util/qemu-config.c               565
-rw-r--r--  util/qemu-coroutine-io.c          93
-rw-r--r--  util/qemu-coroutine-lock.c       467
-rw-r--r--  util/qemu-coroutine-sleep.c       80
-rw-r--r--  util/qemu-coroutine.c            204
-rw-r--r--  util/qemu-error.c                413
-rw-r--r--  util/qemu-openpty.c              139
-rw-r--r--  util/qemu-option.c              1226
-rw-r--r--  util/qemu-print.c                 70
-rw-r--r--  util/qemu-progress.c             162
-rw-r--r--  util/qemu-sockets.c             1482
-rw-r--r--  util/qemu-thread-common.h         54
-rw-r--r--  util/qemu-thread-posix.c         632
-rw-r--r--  util/qemu-thread-win32.c         461
-rw-r--r--  util/qemu-timer-common.c          63
-rw-r--r--  util/qemu-timer.c                674
-rw-r--r--  util/qht.c                       963
-rw-r--r--  util/qsp.c                       813
-rw-r--r--  util/range.c                      72
-rw-r--r--  util/rcu.c                       455
-rw-r--r--  util/readline.c                  549
-rw-r--r--  util/selfmap.c                    83
-rw-r--r--  util/stats64.c                   137
-rw-r--r--  util/sys_membarrier.c             50
-rw-r--r--  util/systemd.c                    79
-rw-r--r--  util/thread-pool.c               352
-rw-r--r--  util/throttle.c                  637
-rw-r--r--  util/timed-average.c             231
-rw-r--r--  util/trace-events                106
-rw-r--r--  util/trace.h                       1
-rw-r--r--  util/transactions.c              100
-rw-r--r--  util/unicode.c                   156
-rw-r--r--  util/uri.c                      2314
-rw-r--r--  util/userfaultfd.c               345
-rw-r--r--  util/uuid.c                      118
-rw-r--r--  util/vfio-helpers.c              861
-rw-r--r--  util/vhost-user-server.c         446
-rw-r--r--  util/yank.c                      199
98 files changed, 31989 insertions, 0 deletions
diff --git a/util/aio-posix.c b/util/aio-posix.c
new file mode 100644
index 000000000..2b86777e9
--- /dev/null
+++ b/util/aio-posix.c
@@ -0,0 +1,730 @@
+/*
+ * QEMU aio implementation
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "qemu/main-loop.h"
+#include "qemu/rcu.h"
+#include "qemu/rcu_queue.h"
+#include "qemu/sockets.h"
+#include "qemu/cutils.h"
+#include "trace.h"
+#include "aio-posix.h"
+
+/* Stop userspace polling on a handler if it isn't active for some time */
+#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
+
+bool aio_poll_disabled(AioContext *ctx)
+{
+ return qatomic_read(&ctx->poll_disable_cnt);
+}
+
+void aio_add_ready_handler(AioHandlerList *ready_list,
+ AioHandler *node,
+ int revents)
+{
+ QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */
+ node->pfd.revents = revents;
+ QLIST_INSERT_HEAD(ready_list, node, node_ready);
+}
+
+static AioHandler *find_aio_handler(AioContext *ctx, int fd)
+{
+ AioHandler *node;
+
+ QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+ if (node->pfd.fd == fd) {
+ if (!QLIST_IS_INSERTED(node, node_deleted)) {
+ return node;
+ }
+ }
+ }
+
+ return NULL;
+}
+
+static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
+{
+ /* If the GSource is in the process of being destroyed then
+ * g_source_remove_poll() causes an assertion failure. Skip
+ * removal in that case, because glib cleans up its state during
+ * destruction anyway.
+ */
+ if (!g_source_is_destroyed(&ctx->source)) {
+ g_source_remove_poll(&ctx->source, &node->pfd);
+ }
+
+ node->pfd.revents = 0;
+
+ /* If the fd monitor has already marked it deleted, leave it alone */
+ if (QLIST_IS_INSERTED(node, node_deleted)) {
+ return false;
+ }
+
+ /* If a read is in progress, just mark the node as deleted */
+ if (qemu_lockcnt_count(&ctx->list_lock)) {
+ QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
+ return false;
+ }
+ /* Otherwise, delete it for real. We can't just mark it as
+ * deleted because deleted nodes are only cleaned up while
+ * no one is walking the handlers list.
+ */
+ QLIST_SAFE_REMOVE(node, node_poll);
+ QLIST_REMOVE(node, node);
+ return true;
+}
+
+void aio_set_fd_handler(AioContext *ctx,
+ int fd,
+ bool is_external,
+ IOHandler *io_read,
+ IOHandler *io_write,
+ AioPollFn *io_poll,
+ void *opaque)
+{
+ AioHandler *node;
+ AioHandler *new_node = NULL;
+ bool is_new = false;
+ bool deleted = false;
+ int poll_disable_change;
+
+ qemu_lockcnt_lock(&ctx->list_lock);
+
+ node = find_aio_handler(ctx, fd);
+
+ /* Are we deleting the fd handler? */
+ if (!io_read && !io_write && !io_poll) {
+ if (node == NULL) {
+ qemu_lockcnt_unlock(&ctx->list_lock);
+ return;
+ }
+ /* Clean events in order to unregister fd from the ctx epoll. */
+ node->pfd.events = 0;
+
+ poll_disable_change = -!node->io_poll;
+ } else {
+ poll_disable_change = !io_poll - (node && !node->io_poll);
+ if (node == NULL) {
+ is_new = true;
+ }
+ /* Alloc and insert if it's not already there */
+ new_node = g_new0(AioHandler, 1);
+
+ /* Update handler with latest information */
+ new_node->io_read = io_read;
+ new_node->io_write = io_write;
+ new_node->io_poll = io_poll;
+ new_node->opaque = opaque;
+ new_node->is_external = is_external;
+
+ if (is_new) {
+ new_node->pfd.fd = fd;
+ } else {
+ new_node->pfd = node->pfd;
+ }
+ g_source_add_poll(&ctx->source, &new_node->pfd);
+
+ new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0);
+ new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
+
+ QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node);
+ }
+
+ /* No need to order poll_disable_cnt writes against other updates;
+ * the counter is only used to avoid wasting time and latency on
+ * iterated polling when the system call will be ultimately necessary.
+ * Changing handlers is a rare event, and a little wasted polling until
+ * the aio_notify below is not an issue.
+ */
+ qatomic_set(&ctx->poll_disable_cnt,
+ qatomic_read(&ctx->poll_disable_cnt) + poll_disable_change);
+
+ ctx->fdmon_ops->update(ctx, node, new_node);
+ if (node) {
+ deleted = aio_remove_fd_handler(ctx, node);
+ }
+ qemu_lockcnt_unlock(&ctx->list_lock);
+ aio_notify(ctx);
+
+ if (deleted) {
+ g_free(node);
+ }
+}
+
+void aio_set_fd_poll(AioContext *ctx, int fd,
+ IOHandler *io_poll_begin,
+ IOHandler *io_poll_end)
+{
+ AioHandler *node = find_aio_handler(ctx, fd);
+
+ if (!node) {
+ return;
+ }
+
+ node->io_poll_begin = io_poll_begin;
+ node->io_poll_end = io_poll_end;
+}
+
+void aio_set_event_notifier(AioContext *ctx,
+ EventNotifier *notifier,
+ bool is_external,
+ EventNotifierHandler *io_read,
+ AioPollFn *io_poll)
+{
+ aio_set_fd_handler(ctx, event_notifier_get_fd(notifier), is_external,
+ (IOHandler *)io_read, NULL, io_poll, notifier);
+}
+
+void aio_set_event_notifier_poll(AioContext *ctx,
+ EventNotifier *notifier,
+ EventNotifierHandler *io_poll_begin,
+ EventNotifierHandler *io_poll_end)
+{
+ aio_set_fd_poll(ctx, event_notifier_get_fd(notifier),
+ (IOHandler *)io_poll_begin,
+ (IOHandler *)io_poll_end);
+}
+
+static bool poll_set_started(AioContext *ctx, bool started)
+{
+ AioHandler *node;
+ bool progress = false;
+
+ if (started == ctx->poll_started) {
+ return false;
+ }
+
+ ctx->poll_started = started;
+
+ qemu_lockcnt_inc(&ctx->list_lock);
+ QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
+ IOHandler *fn;
+
+ if (QLIST_IS_INSERTED(node, node_deleted)) {
+ continue;
+ }
+
+ if (started) {
+ fn = node->io_poll_begin;
+ } else {
+ fn = node->io_poll_end;
+ }
+
+ if (fn) {
+ fn(node->opaque);
+ }
+
+ /* Poll one last time in case ->io_poll_end() raced with the event */
+ if (!started) {
+ progress = node->io_poll(node->opaque) || progress;
+ }
+ }
+ qemu_lockcnt_dec(&ctx->list_lock);
+
+ return progress;
+}
+
+
+bool aio_prepare(AioContext *ctx)
+{
+ /* Poll mode cannot be used with glib's event loop, disable it. */
+ poll_set_started(ctx, false);
+
+ return false;
+}
+
+bool aio_pending(AioContext *ctx)
+{
+ AioHandler *node;
+ bool result = false;
+
+ /*
+ * We have to walk very carefully in case aio_set_fd_handler is
+ * called while we're walking.
+ */
+ qemu_lockcnt_inc(&ctx->list_lock);
+
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ int revents;
+
+ revents = node->pfd.revents & node->pfd.events;
+ if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read &&
+ aio_node_check(ctx, node->is_external)) {
+ result = true;
+ break;
+ }
+ if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write &&
+ aio_node_check(ctx, node->is_external)) {
+ result = true;
+ break;
+ }
+ }
+ qemu_lockcnt_dec(&ctx->list_lock);
+
+ return result;
+}
+
+static void aio_free_deleted_handlers(AioContext *ctx)
+{
+ AioHandler *node;
+
+ if (QLIST_EMPTY_RCU(&ctx->deleted_aio_handlers)) {
+ return;
+ }
+ if (!qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
+ return; /* we are nested, let the parent do the freeing */
+ }
+
+ while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) {
+ QLIST_REMOVE(node, node);
+ QLIST_REMOVE(node, node_deleted);
+ QLIST_SAFE_REMOVE(node, node_poll);
+ g_free(node);
+ }
+
+ qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
+}
+
+static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node)
+{
+ bool progress = false;
+ int revents;
+
+ revents = node->pfd.revents & node->pfd.events;
+ node->pfd.revents = 0;
+
+ /*
+ * Start polling AioHandlers when they become ready because activity is
+ * likely to continue. Note that starvation is theoretically possible when
+ * fdmon_supports_polling(), but only until the fd fires for the first
+ * time.
+ */
+ if (!QLIST_IS_INSERTED(node, node_deleted) &&
+ !QLIST_IS_INSERTED(node, node_poll) &&
+ node->io_poll) {
+ trace_poll_add(ctx, node, node->pfd.fd, revents);
+ if (ctx->poll_started && node->io_poll_begin) {
+ node->io_poll_begin(node->opaque);
+ }
+ QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll);
+ }
+
+ if (!QLIST_IS_INSERTED(node, node_deleted) &&
+ (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) &&
+ aio_node_check(ctx, node->is_external) &&
+ node->io_read) {
+ node->io_read(node->opaque);
+
+ /* aio_notify() does not count as progress */
+ if (node->opaque != &ctx->notifier) {
+ progress = true;
+ }
+ }
+ if (!QLIST_IS_INSERTED(node, node_deleted) &&
+ (revents & (G_IO_OUT | G_IO_ERR)) &&
+ aio_node_check(ctx, node->is_external) &&
+ node->io_write) {
+ node->io_write(node->opaque);
+ progress = true;
+ }
+
+ return progress;
+}
+
+/*
+ * If we have a list of ready handlers then this is more efficient than
+ * scanning all handlers with aio_dispatch_handlers().
+ */
+static bool aio_dispatch_ready_handlers(AioContext *ctx,
+ AioHandlerList *ready_list)
+{
+ bool progress = false;
+ AioHandler *node;
+
+ while ((node = QLIST_FIRST(ready_list))) {
+ QLIST_REMOVE(node, node_ready);
+ progress = aio_dispatch_handler(ctx, node) || progress;
+ }
+
+ return progress;
+}
+
+/* Slower than aio_dispatch_ready_handlers() but only used via glib */
+static bool aio_dispatch_handlers(AioContext *ctx)
+{
+ AioHandler *node, *tmp;
+ bool progress = false;
+
+ QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
+ progress = aio_dispatch_handler(ctx, node) || progress;
+ }
+
+ return progress;
+}
+
+void aio_dispatch(AioContext *ctx)
+{
+ qemu_lockcnt_inc(&ctx->list_lock);
+ aio_bh_poll(ctx);
+ aio_dispatch_handlers(ctx);
+ aio_free_deleted_handlers(ctx);
+ qemu_lockcnt_dec(&ctx->list_lock);
+
+ timerlistgroup_run_timers(&ctx->tlg);
+}
+
+static bool run_poll_handlers_once(AioContext *ctx,
+ int64_t now,
+ int64_t *timeout)
+{
+ bool progress = false;
+ AioHandler *node;
+ AioHandler *tmp;
+
+ QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+ if (aio_node_check(ctx, node->is_external) &&
+ node->io_poll(node->opaque)) {
+ node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+
+ /*
+ * Polling was successful, exit try_poll_mode immediately
+ * to adjust the next polling time.
+ */
+ *timeout = 0;
+ if (node->opaque != &ctx->notifier) {
+ progress = true;
+ }
+ }
+
+ /* Caller handles freeing deleted nodes. Don't do it here. */
+ }
+
+ return progress;
+}
+
+static bool fdmon_supports_polling(AioContext *ctx)
+{
+ return ctx->fdmon_ops->need_wait != aio_poll_disabled;
+}
+
+static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now)
+{
+ AioHandler *node;
+ AioHandler *tmp;
+ bool progress = false;
+
+ /*
+ * File descriptor monitoring implementations without userspace polling
+ * support suffer from starvation when a subset of handlers is polled
+ * because fds will not be processed in a timely fashion. Don't remove
+ * idle poll handlers.
+ */
+ if (!fdmon_supports_polling(ctx)) {
+ return false;
+ }
+
+ QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) {
+ if (node->poll_idle_timeout == 0LL) {
+ node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS;
+ } else if (now >= node->poll_idle_timeout) {
+ trace_poll_remove(ctx, node, node->pfd.fd);
+ node->poll_idle_timeout = 0LL;
+ QLIST_SAFE_REMOVE(node, node_poll);
+ if (ctx->poll_started && node->io_poll_end) {
+ node->io_poll_end(node->opaque);
+
+ /*
+ * Final poll in case ->io_poll_end() races with an event.
+ * Never mind about re-adding the handler in the rare case where
+ * this causes progress.
+ */
+ progress = node->io_poll(node->opaque) || progress;
+ }
+ }
+ }
+
+ return progress;
+}
+
+/* run_poll_handlers:
+ * @ctx: the AioContext
+ * @max_ns: maximum time to poll for, in nanoseconds
+ *
+ * Polls for a given time.
+ *
+ * Note that the caller must have incremented ctx->list_lock.
+ *
+ * Returns: true if progress was made, false otherwise
+ */
+static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
+{
+ bool progress;
+ int64_t start_time, elapsed_time;
+
+ assert(qemu_lockcnt_count(&ctx->list_lock) > 0);
+
+ trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
+
+ /*
+ * Optimization: ->io_poll() handlers often contain RCU read critical
+ * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock()
+ * -> rcu_read_lock() -> ... sequences with expensive memory
+ * synchronization primitives. Make the entire polling loop an RCU
+ * critical section because nested rcu_read_lock()/rcu_read_unlock() calls
+ * are cheap.
+ */
+ RCU_READ_LOCK_GUARD();
+
+ start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+ do {
+ progress = run_poll_handlers_once(ctx, start_time, timeout);
+ elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time;
+ max_ns = qemu_soonest_timeout(*timeout, max_ns);
+ assert(!(max_ns && progress));
+ } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx));
+
+ if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) {
+ *timeout = 0;
+ progress = true;
+ }
+
+ /* If time has passed with no successful polling, adjust *timeout to
+ * keep the same ending time.
+ */
+ if (*timeout != -1) {
+ *timeout -= MIN(*timeout, elapsed_time);
+ }
+
+ trace_run_poll_handlers_end(ctx, progress, *timeout);
+ return progress;
+}
+
+/* try_poll_mode:
+ * @ctx: the AioContext
+ * @timeout: timeout for blocking wait, computed by the caller and updated if
+ * polling succeeds.
+ *
+ * Note that the caller must have incremented ctx->list_lock.
+ *
+ * Returns: true if progress was made, false otherwise
+ */
+static bool try_poll_mode(AioContext *ctx, int64_t *timeout)
+{
+ int64_t max_ns;
+
+ if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
+ return false;
+ }
+
+ max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
+ if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
+ poll_set_started(ctx, true);
+
+ if (run_poll_handlers(ctx, max_ns, timeout)) {
+ return true;
+ }
+ }
+
+ if (poll_set_started(ctx, false)) {
+ *timeout = 0;
+ return true;
+ }
+
+ return false;
+}
+
+bool aio_poll(AioContext *ctx, bool blocking)
+{
+ AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
+ int ret = 0;
+ bool progress;
+ bool use_notify_me;
+ int64_t timeout;
+ int64_t start = 0;
+
+ /*
+ * There cannot be two concurrent aio_poll calls for the same AioContext (or
+ * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
+ * We rely on this below to avoid slow locked accesses to ctx->notify_me.
+ *
+ * aio_poll() may only be called in the AioContext's thread. iohandler_ctx
+ * is special in that it runs in the main thread, but that thread's context
+ * is qemu_aio_context.
+ */
+ assert(in_aio_context_home_thread(ctx == iohandler_get_aio_context() ?
+ qemu_get_aio_context() : ctx));
+
+ qemu_lockcnt_inc(&ctx->list_lock);
+
+ if (ctx->poll_max_ns) {
+ start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+ }
+
+ timeout = blocking ? aio_compute_timeout(ctx) : 0;
+ progress = try_poll_mode(ctx, &timeout);
+ assert(!(timeout && progress));
+
+ /*
+ * aio_notify can avoid the expensive event_notifier_set if
+ * everything (file descriptors, bottom halves, timers) will
+ * be re-evaluated before the next blocking poll(). This is
+ * already true when aio_poll is called with blocking == false;
+ * if blocking == true, it is only true after poll() returns,
+ * so disable the optimization now.
+ */
+ use_notify_me = timeout != 0;
+ if (use_notify_me) {
+ qatomic_set(&ctx->notify_me, qatomic_read(&ctx->notify_me) + 2);
+ /*
+ * Write ctx->notify_me before reading ctx->notified. Pairs with
+ * smp_mb in aio_notify().
+ */
+ smp_mb();
+
+ /* Don't block if aio_notify() was called */
+ if (qatomic_read(&ctx->notified)) {
+ timeout = 0;
+ }
+ }
+
+ /* If polling is allowed, non-blocking aio_poll does not need the
+ * system call---a single round of run_poll_handlers_once suffices.
+ */
+ if (timeout || ctx->fdmon_ops->need_wait(ctx)) {
+ ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout);
+ }
+
+ if (use_notify_me) {
+ /* Finish the poll before clearing the flag. */
+ qatomic_store_release(&ctx->notify_me,
+ qatomic_read(&ctx->notify_me) - 2);
+ }
+
+ aio_notify_accept(ctx);
+
+ /* Adjust polling time */
+ if (ctx->poll_max_ns) {
+ int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start;
+
+ if (block_ns <= ctx->poll_ns) {
+ /* This is the sweet spot, no adjustment needed */
+ } else if (block_ns > ctx->poll_max_ns) {
+ /* We'd have to poll for too long, poll less */
+ int64_t old = ctx->poll_ns;
+
+ if (ctx->poll_shrink) {
+ ctx->poll_ns /= ctx->poll_shrink;
+ } else {
+ ctx->poll_ns = 0;
+ }
+
+ trace_poll_shrink(ctx, old, ctx->poll_ns);
+ } else if (ctx->poll_ns < ctx->poll_max_ns &&
+ block_ns < ctx->poll_max_ns) {
+ /* There is room to grow, poll longer */
+ int64_t old = ctx->poll_ns;
+ int64_t grow = ctx->poll_grow;
+
+ if (grow == 0) {
+ grow = 2;
+ }
+
+ if (ctx->poll_ns) {
+ ctx->poll_ns *= grow;
+ } else {
+ ctx->poll_ns = 4000; /* start polling at 4 microseconds */
+ }
+
+ if (ctx->poll_ns > ctx->poll_max_ns) {
+ ctx->poll_ns = ctx->poll_max_ns;
+ }
+
+ trace_poll_grow(ctx, old, ctx->poll_ns);
+ }
+ }
+
+ progress |= aio_bh_poll(ctx);
+
+ if (ret > 0) {
+ progress |= aio_dispatch_ready_handlers(ctx, &ready_list);
+ }
+
+ aio_free_deleted_handlers(ctx);
+
+ qemu_lockcnt_dec(&ctx->list_lock);
+
+ progress |= timerlistgroup_run_timers(&ctx->tlg);
+
+ return progress;
+}
+
+void aio_context_setup(AioContext *ctx)
+{
+ ctx->fdmon_ops = &fdmon_poll_ops;
+ ctx->epollfd = -1;
+
+ /* Use the fastest fd monitoring implementation if available */
+ if (fdmon_io_uring_setup(ctx)) {
+ return;
+ }
+
+ fdmon_epoll_setup(ctx);
+}
+
+void aio_context_destroy(AioContext *ctx)
+{
+ fdmon_io_uring_destroy(ctx);
+ fdmon_epoll_disable(ctx);
+ aio_free_deleted_handlers(ctx);
+}
+
+void aio_context_use_g_source(AioContext *ctx)
+{
+ /*
+ * Disable io_uring when the glib main loop is used because it doesn't
+ * support mixed glib/aio_poll() usage. It relies on aio_poll() being
+ * called regularly so that changes to the monitored file descriptors are
+ * submitted, otherwise a list of pending fd handlers builds up.
+ */
+ fdmon_io_uring_destroy(ctx);
+ aio_free_deleted_handlers(ctx);
+}
+
+void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
+ int64_t grow, int64_t shrink, Error **errp)
+{
+ /* No thread synchronization here, it doesn't matter if an incorrect value
+ * is used once.
+ */
+ ctx->poll_max_ns = max_ns;
+ ctx->poll_ns = 0;
+ ctx->poll_grow = grow;
+ ctx->poll_shrink = shrink;
+
+ aio_notify(ctx);
+}
+
+void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
+ Error **errp)
+{
+ /*
+ * No thread synchronization here, it doesn't matter if an incorrect value
+ * is used once.
+ */
+ ctx->aio_max_batch = max_batch;
+
+ aio_notify(ctx);
+}
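
For context, here is a minimal usage sketch of the fd-handler API implemented above. It is not part of the patch, and the callback and function names are hypothetical; it only assumes the aio_set_fd_handler() signature shown in this file, where passing NULL for all callbacks unregisters the fd:

#include "qemu/osdep.h"
#include "block/aio.h"

/* Hypothetical callback: runs in ctx's thread when the fd is readable */
static void my_read_cb(void *opaque)
{
}

static void watch_fd(AioContext *ctx, int fd)
{
    /* io_read set; io_write and io_poll left NULL */
    aio_set_fd_handler(ctx, fd, false /* is_external */,
                       my_read_cb, NULL, NULL, NULL /* opaque */);
}

static void unwatch_fd(AioContext *ctx, int fd)
{
    /* all callbacks NULL: the handler is removed */
    aio_set_fd_handler(ctx, fd, false, NULL, NULL, NULL, NULL);
}
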
diff --git a/util/aio-posix.h b/util/aio-posix.h
new file mode 100644
index 000000000..c80c04506
--- /dev/null
+++ b/util/aio-posix.h
@@ -0,0 +1,81 @@
+/*
+ * AioContext POSIX event loop implementation internal APIs
+ *
+ * Copyright IBM, Corp. 2008
+ * Copyright Red Hat, Inc. 2020
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef AIO_POSIX_H
+#define AIO_POSIX_H
+
+#include "block/aio.h"
+
+struct AioHandler {
+ GPollFD pfd;
+ IOHandler *io_read;
+ IOHandler *io_write;
+ AioPollFn *io_poll;
+ IOHandler *io_poll_begin;
+ IOHandler *io_poll_end;
+ void *opaque;
+ QLIST_ENTRY(AioHandler) node;
+ QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */
+ QLIST_ENTRY(AioHandler) node_deleted;
+ QLIST_ENTRY(AioHandler) node_poll;
+#ifdef CONFIG_LINUX_IO_URING
+ QSLIST_ENTRY(AioHandler) node_submitted;
+ unsigned flags; /* see fdmon-io_uring.c */
+#endif
+ int64_t poll_idle_timeout; /* when to stop userspace polling */
+ bool is_external;
+};
+
+/* Add a handler to a ready list */
+void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node,
+ int revents);
+
+extern const FDMonOps fdmon_poll_ops;
+
+#ifdef CONFIG_EPOLL_CREATE1
+bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd);
+void fdmon_epoll_setup(AioContext *ctx);
+void fdmon_epoll_disable(AioContext *ctx);
+#else
+static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
+{
+ return false;
+}
+
+static inline void fdmon_epoll_setup(AioContext *ctx)
+{
+}
+
+static inline void fdmon_epoll_disable(AioContext *ctx)
+{
+}
+#endif /* !CONFIG_EPOLL_CREATE1 */
+
+#ifdef CONFIG_LINUX_IO_URING
+bool fdmon_io_uring_setup(AioContext *ctx);
+void fdmon_io_uring_destroy(AioContext *ctx);
+#else
+static inline bool fdmon_io_uring_setup(AioContext *ctx)
+{
+ return false;
+}
+
+static inline void fdmon_io_uring_destroy(AioContext *ctx)
+{
+}
+#endif /* !CONFIG_LINUX_IO_URING */
+
+#endif /* AIO_POSIX_H */
diff --git a/util/aio-wait.c b/util/aio-wait.c
new file mode 100644
index 000000000..bdb3d3af2
--- /dev/null
+++ b/util/aio-wait.c
@@ -0,0 +1,72 @@
+/*
+ * AioContext wait support
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "block/aio-wait.h"
+
+AioWait global_aio_wait;
+
+static void dummy_bh_cb(void *opaque)
+{
+ /* The point is to make AIO_WAIT_WHILE()'s aio_poll() return */
+}
+
+void aio_wait_kick(void)
+{
+ /* The barrier (or an atomic op) is in the caller. */
+ if (qatomic_read(&global_aio_wait.num_waiters)) {
+ aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
+ }
+}
+
+typedef struct {
+ bool done;
+ QEMUBHFunc *cb;
+ void *opaque;
+} AioWaitBHData;
+
+/* Context: BH in IOThread */
+static void aio_wait_bh(void *opaque)
+{
+ AioWaitBHData *data = opaque;
+
+ data->cb(data->opaque);
+
+ data->done = true;
+ aio_wait_kick();
+}
+
+void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
+{
+ AioWaitBHData data = {
+ .cb = cb,
+ .opaque = opaque,
+ };
+
+ assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+
+ aio_bh_schedule_oneshot(ctx, aio_wait_bh, &data);
+ AIO_WAIT_WHILE(ctx, !data.done);
+}
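
A sketch of how aio_wait_bh_oneshot() above is typically used: run a function in another AioContext's thread and block the caller until it completes. The callback name is hypothetical; the caller must be the main loop thread, per the assert in aio_wait_bh_oneshot():

#include "qemu/osdep.h"
#include "block/aio-wait.h"

/* Hypothetical callback: runs in ctx's thread (e.g. an IOThread) */
static void do_work_cb(void *opaque)
{
}

/* Called from the main loop thread; returns only after do_work_cb
 * has run in ctx */
static void run_in_ctx_and_wait(AioContext *ctx)
{
    aio_wait_bh_oneshot(ctx, do_work_cb, NULL);
}
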
diff --git a/util/aio-win32.c b/util/aio-win32.c
new file mode 100644
index 000000000..d5b09a119
--- /dev/null
+++ b/util/aio-win32.c
@@ -0,0 +1,447 @@
+/*
+ * QEMU aio implementation
+ *
+ * Copyright IBM Corp., 2008
+ * Copyright Red Hat Inc., 2012
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "block/block.h"
+#include "qemu/main-loop.h"
+#include "qemu/queue.h"
+#include "qemu/sockets.h"
+#include "qapi/error.h"
+#include "qemu/rcu_queue.h"
+
+struct AioHandler {
+ EventNotifier *e;
+ IOHandler *io_read;
+ IOHandler *io_write;
+ EventNotifierHandler *io_notify;
+ GPollFD pfd;
+ int deleted;
+ void *opaque;
+ bool is_external;
+ QLIST_ENTRY(AioHandler) node;
+};
+
+static void aio_remove_fd_handler(AioContext *ctx, AioHandler *node)
+{
+ /*
+ * If the GSource is in the process of being destroyed then
+ * g_source_remove_poll() causes an assertion failure. Skip
+ * removal in that case, because glib cleans up its state during
+ * destruction anyway.
+ */
+ if (!g_source_is_destroyed(&ctx->source)) {
+ g_source_remove_poll(&ctx->source, &node->pfd);
+ }
+
+ /* If aio_poll is in progress, just mark the node as deleted */
+ if (qemu_lockcnt_count(&ctx->list_lock)) {
+ node->deleted = 1;
+ node->pfd.revents = 0;
+ } else {
+ /* Otherwise, delete it for real. We can't just mark it as
+ * deleted because deleted nodes are only cleaned up after
+ * releasing the list_lock.
+ */
+ QLIST_REMOVE(node, node);
+ g_free(node);
+ }
+}
+
+void aio_set_fd_handler(AioContext *ctx,
+ int fd,
+ bool is_external,
+ IOHandler *io_read,
+ IOHandler *io_write,
+ AioPollFn *io_poll,
+ void *opaque)
+{
+ /* fd is a SOCKET in our case */
+ AioHandler *old_node;
+ AioHandler *node = NULL;
+
+ qemu_lockcnt_lock(&ctx->list_lock);
+ QLIST_FOREACH(old_node, &ctx->aio_handlers, node) {
+ if (old_node->pfd.fd == fd && !old_node->deleted) {
+ break;
+ }
+ }
+
+ if (io_read || io_write) {
+ HANDLE event;
+ long bitmask = 0;
+
+ /* Alloc and insert if it's not already there */
+ node = g_new0(AioHandler, 1);
+ node->pfd.fd = fd;
+
+ node->pfd.events = 0;
+ /* Check the new callbacks, not node's fields: node was just
+ * zero-allocated, so node->io_read/io_write are still NULL here. */
+ if (io_read) {
+ node->pfd.events |= G_IO_IN;
+ }
+ if (io_write) {
+ node->pfd.events |= G_IO_OUT;
+ }
+
+ node->e = &ctx->notifier;
+
+ /* Update handler with latest information */
+ node->opaque = opaque;
+ node->io_read = io_read;
+ node->io_write = io_write;
+ node->is_external = is_external;
+
+ if (io_read) {
+ bitmask |= FD_READ | FD_ACCEPT | FD_CLOSE;
+ }
+
+ if (io_write) {
+ bitmask |= FD_WRITE | FD_CONNECT;
+ }
+
+ QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, node, node);
+ event = event_notifier_get_handle(&ctx->notifier);
+ WSAEventSelect(node->pfd.fd, event, bitmask);
+ }
+ if (old_node) {
+ aio_remove_fd_handler(ctx, old_node);
+ }
+
+ qemu_lockcnt_unlock(&ctx->list_lock);
+ aio_notify(ctx);
+}
+
+void aio_set_fd_poll(AioContext *ctx, int fd,
+ IOHandler *io_poll_begin,
+ IOHandler *io_poll_end)
+{
+ /* Not implemented */
+}
+
+void aio_set_event_notifier(AioContext *ctx,
+ EventNotifier *e,
+ bool is_external,
+ EventNotifierHandler *io_notify,
+ AioPollFn *io_poll)
+{
+ AioHandler *node;
+
+ qemu_lockcnt_lock(&ctx->list_lock);
+ QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+ if (node->e == e && !node->deleted) {
+ break;
+ }
+ }
+
+ /* Are we deleting the fd handler? */
+ if (!io_notify) {
+ if (node) {
+ aio_remove_fd_handler(ctx, node);
+ }
+ } else {
+ if (node == NULL) {
+ /* Alloc and insert if it's not already there */
+ node = g_new0(AioHandler, 1);
+ node->e = e;
+ node->pfd.fd = (uintptr_t)event_notifier_get_handle(e);
+ node->pfd.events = G_IO_IN;
+ node->is_external = is_external;
+ QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, node, node);
+
+ g_source_add_poll(&ctx->source, &node->pfd);
+ }
+ /* Update handler with latest information */
+ node->io_notify = io_notify;
+ }
+
+ qemu_lockcnt_unlock(&ctx->list_lock);
+ aio_notify(ctx);
+}
+
+void aio_set_event_notifier_poll(AioContext *ctx,
+ EventNotifier *notifier,
+ EventNotifierHandler *io_poll_begin,
+ EventNotifierHandler *io_poll_end)
+{
+ /* Not implemented */
+}
+
+bool aio_prepare(AioContext *ctx)
+{
+ static struct timeval tv0;
+ AioHandler *node;
+ bool have_select_revents = false;
+ fd_set rfds, wfds;
+
+ /*
+ * We have to walk very carefully in case aio_set_fd_handler is
+ * called while we're walking.
+ */
+ qemu_lockcnt_inc(&ctx->list_lock);
+
+ /* fill fd sets */
+ FD_ZERO(&rfds);
+ FD_ZERO(&wfds);
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ if (node->io_read) {
+ FD_SET ((SOCKET)node->pfd.fd, &rfds);
+ }
+ if (node->io_write) {
+ FD_SET ((SOCKET)node->pfd.fd, &wfds);
+ }
+ }
+
+ if (select(0, &rfds, &wfds, NULL, &tv0) > 0) {
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ node->pfd.revents = 0;
+ if (FD_ISSET(node->pfd.fd, &rfds)) {
+ node->pfd.revents |= G_IO_IN;
+ have_select_revents = true;
+ }
+
+ if (FD_ISSET(node->pfd.fd, &wfds)) {
+ node->pfd.revents |= G_IO_OUT;
+ have_select_revents = true;
+ }
+ }
+ }
+
+ qemu_lockcnt_dec(&ctx->list_lock);
+ return have_select_revents;
+}
+
+bool aio_pending(AioContext *ctx)
+{
+ AioHandler *node;
+ bool result = false;
+
+ /*
+ * We have to walk very carefully in case aio_set_fd_handler is
+ * called while we're walking.
+ */
+ qemu_lockcnt_inc(&ctx->list_lock);
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ if (node->pfd.revents && node->io_notify) {
+ result = true;
+ break;
+ }
+
+ if ((node->pfd.revents & G_IO_IN) && node->io_read) {
+ result = true;
+ break;
+ }
+ if ((node->pfd.revents & G_IO_OUT) && node->io_write) {
+ result = true;
+ break;
+ }
+ }
+
+ qemu_lockcnt_dec(&ctx->list_lock);
+ return result;
+}
+
+static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event)
+{
+ AioHandler *node;
+ bool progress = false;
+ AioHandler *tmp;
+
+ /*
+ * We have to walk very carefully in case aio_set_fd_handler is
+ * called while we're walking.
+ */
+ QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) {
+ int revents = node->pfd.revents;
+
+ if (!node->deleted &&
+ (revents || event_notifier_get_handle(node->e) == event) &&
+ node->io_notify) {
+ node->pfd.revents = 0;
+ node->io_notify(node->e);
+
+ /* aio_notify() does not count as progress */
+ if (node->e != &ctx->notifier) {
+ progress = true;
+ }
+ }
+
+ if (!node->deleted &&
+ (node->io_read || node->io_write)) {
+ node->pfd.revents = 0;
+ if ((revents & G_IO_IN) && node->io_read) {
+ node->io_read(node->opaque);
+ progress = true;
+ }
+ if ((revents & G_IO_OUT) && node->io_write) {
+ node->io_write(node->opaque);
+ progress = true;
+ }
+
+ /* if the next select() will return an event, we have progressed */
+ if (event == event_notifier_get_handle(&ctx->notifier)) {
+ WSANETWORKEVENTS ev;
+ WSAEnumNetworkEvents(node->pfd.fd, event, &ev);
+ if (ev.lNetworkEvents) {
+ progress = true;
+ }
+ }
+ }
+
+ if (node->deleted) {
+ if (qemu_lockcnt_dec_if_lock(&ctx->list_lock)) {
+ QLIST_REMOVE(node, node);
+ g_free(node);
+ qemu_lockcnt_inc_and_unlock(&ctx->list_lock);
+ }
+ }
+ }
+
+ return progress;
+}
+
+void aio_dispatch(AioContext *ctx)
+{
+ qemu_lockcnt_inc(&ctx->list_lock);
+ aio_bh_poll(ctx);
+ aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE);
+ qemu_lockcnt_dec(&ctx->list_lock);
+ timerlistgroup_run_timers(&ctx->tlg);
+}
+
+bool aio_poll(AioContext *ctx, bool blocking)
+{
+ AioHandler *node;
+ HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
+ bool progress, have_select_revents, first;
+ int count;
+ int timeout;
+
+ /*
+ * There cannot be two concurrent aio_poll calls for the same AioContext (or
+ * an aio_poll concurrent with a GSource prepare/check/dispatch callback).
+ * We rely on this below to avoid slow locked accesses to ctx->notify_me.
+ *
+ * aio_poll() may only be called in the AioContext's thread. iohandler_ctx
+ * is special in that it runs in the main thread, but that thread's context
+ * is qemu_aio_context.
+ */
+ assert(in_aio_context_home_thread(ctx == iohandler_get_aio_context() ?
+ qemu_get_aio_context() : ctx));
+ progress = false;
+
+ /* aio_notify can avoid the expensive event_notifier_set if
+ * everything (file descriptors, bottom halves, timers) will
+ * be re-evaluated before the next blocking poll(). This is
+ * already true when aio_poll is called with blocking == false;
+ * if blocking == true, it is only true after poll() returns,
+ * so disable the optimization now.
+ */
+ if (blocking) {
+ qatomic_set(&ctx->notify_me, qatomic_read(&ctx->notify_me) + 2);
+ /*
+ * Write ctx->notify_me before computing the timeout
+ * (reading bottom half flags, etc.). Pairs with
+ * smp_mb in aio_notify().
+ */
+ smp_mb();
+ }
+
+ qemu_lockcnt_inc(&ctx->list_lock);
+ have_select_revents = aio_prepare(ctx);
+
+ /* fill fd sets */
+ count = 0;
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ if (!node->deleted && node->io_notify
+ && aio_node_check(ctx, node->is_external)) {
+ events[count++] = event_notifier_get_handle(node->e);
+ }
+ }
+
+ first = true;
+
+ /* ctx->notifier is always registered. */
+ assert(count > 0);
+
+ /* Multiple iterations, all of them non-blocking except the first,
+ * may be necessary to process all pending events. After the first
+ * WaitForMultipleObjects call ctx->notify_me will be decremented.
+ */
+ do {
+ HANDLE event;
+ int ret;
+
+ timeout = blocking && !have_select_revents
+ ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0;
+ ret = WaitForMultipleObjects(count, events, FALSE, timeout);
+ if (blocking) {
+ assert(first);
+ qatomic_store_release(&ctx->notify_me,
+ qatomic_read(&ctx->notify_me) - 2);
+ aio_notify_accept(ctx);
+ }
+
+ if (first) {
+ progress |= aio_bh_poll(ctx);
+ first = false;
+ }
+
+ /* if we have any signaled events, dispatch event */
+ event = NULL;
+ if ((DWORD) (ret - WAIT_OBJECT_0) < count) {
+ event = events[ret - WAIT_OBJECT_0];
+ events[ret - WAIT_OBJECT_0] = events[--count];
+ } else if (!have_select_revents) {
+ break;
+ }
+
+ have_select_revents = false;
+ blocking = false;
+
+ progress |= aio_dispatch_handlers(ctx, event);
+ } while (count > 0);
+
+ qemu_lockcnt_dec(&ctx->list_lock);
+
+ progress |= timerlistgroup_run_timers(&ctx->tlg);
+ return progress;
+}
+
+void aio_context_setup(AioContext *ctx)
+{
+}
+
+void aio_context_destroy(AioContext *ctx)
+{
+}
+
+void aio_context_use_g_source(AioContext *ctx)
+{
+}
+
+void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
+ int64_t grow, int64_t shrink, Error **errp)
+{
+ if (max_ns) {
+ error_setg(errp, "AioContext polling is not implemented on Windows");
+ }
+}
+
+void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
+ Error **errp)
+{
+}
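
On Windows, event notifiers are the primary wakeup mechanism (see the WaitForMultipleObjects() loop above). A hedged sketch of registering one, with hypothetical names, assuming only the aio_set_event_notifier() signature shown in this file:

#include "qemu/osdep.h"
#include "block/aio.h"

static EventNotifier my_notifier;        /* hypothetical */

static void my_notifier_cb(EventNotifier *e)
{
    event_notifier_test_and_clear(e);
    /* ... react to the wakeup ... */
}

static void setup_notifier(AioContext *ctx)
{
    event_notifier_init(&my_notifier, 0);
    aio_set_event_notifier(ctx, &my_notifier, false /* is_external */,
                           my_notifier_cb, NULL /* io_poll */);
}
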
diff --git a/util/aiocb.c b/util/aiocb.c
new file mode 100644
index 000000000..5aef3a069
--- /dev/null
+++ b/util/aiocb.c
@@ -0,0 +1,55 @@
+/*
+ * BlockAIOCB allocation
+ *
+ * Copyright (c) 2003-2017 Fabrice Bellard and other QEMU contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "block/aio.h"
+
+void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ BlockAIOCB *acb;
+
+ acb = g_malloc(aiocb_info->aiocb_size);
+ acb->aiocb_info = aiocb_info;
+ acb->bs = bs;
+ acb->cb = cb;
+ acb->opaque = opaque;
+ acb->refcnt = 1;
+ return acb;
+}
+
+void qemu_aio_ref(void *p)
+{
+ BlockAIOCB *acb = p;
+ acb->refcnt++;
+}
+
+void qemu_aio_unref(void *p)
+{
+ BlockAIOCB *acb = p;
+ assert(acb->refcnt > 0);
+ if (--acb->refcnt == 0) {
+ g_free(acb);
+ }
+}
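
Callers embed BlockAIOCB as the first field of their own AIOCB struct and describe its size via AIOCBInfo; qemu_aio_get() then allocates the whole structure. A sketch under those assumptions (MyAIOCB and my_aio_start are hypothetical):

#include "qemu/osdep.h"
#include "block/aio.h"

typedef struct MyAIOCB {
    BlockAIOCB common;      /* must be first so pointer casts work */
    int my_state;           /* hypothetical per-request state */
} MyAIOCB;

static const AIOCBInfo my_aiocb_info = {
    .aiocb_size = sizeof(MyAIOCB),
};

static BlockAIOCB *my_aio_start(BlockDriverState *bs,
                                BlockCompletionFunc *cb, void *opaque)
{
    MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);

    acb->my_state = 0;
    /* ... kick off the request; on completion invoke acb->common.cb()
     * and then qemu_aio_unref(acb) to drop the initial reference ... */
    return &acb->common;
}
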
diff --git a/util/async.c b/util/async.c
new file mode 100644
index 000000000..6f6717a34
--- /dev/null
+++ b/util/async.c
@@ -0,0 +1,690 @@
+/*
+ * Data plane event loop
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2009-2017 QEMU contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "block/aio.h"
+#include "block/thread-pool.h"
+#include "qemu/main-loop.h"
+#include "qemu/atomic.h"
+#include "qemu/rcu_queue.h"
+#include "block/raw-aio.h"
+#include "qemu/coroutine_int.h"
+#include "trace.h"
+
+/***********************************************************/
+/* bottom halves (can be seen as timers which expire ASAP) */
+
+/* QEMUBH::flags values */
+enum {
+ /* Already enqueued and waiting for aio_bh_poll() */
+ BH_PENDING = (1 << 0),
+
+ /* Invoke the callback */
+ BH_SCHEDULED = (1 << 1),
+
+ /* Delete without invoking callback */
+ BH_DELETED = (1 << 2),
+
+ /* Delete after invoking callback */
+ BH_ONESHOT = (1 << 3),
+
+ /* Schedule periodically when the event loop is idle */
+ BH_IDLE = (1 << 4),
+};
+
+struct QEMUBH {
+ AioContext *ctx;
+ const char *name;
+ QEMUBHFunc *cb;
+ void *opaque;
+ QSLIST_ENTRY(QEMUBH) next;
+ unsigned flags;
+};
+
+/* Called concurrently from any thread */
+static void aio_bh_enqueue(QEMUBH *bh, unsigned new_flags)
+{
+ AioContext *ctx = bh->ctx;
+ unsigned old_flags;
+
+ /*
+ * The memory barrier implicit in qatomic_fetch_or makes sure that:
+ * 1. idle & any writes needed by the callback are done before the
+ * locations are read in the aio_bh_poll.
+ * 2. ctx is loaded before the callback has a chance to execute and bh
+ * could be freed.
+ */
+ old_flags = qatomic_fetch_or(&bh->flags, BH_PENDING | new_flags);
+ if (!(old_flags & BH_PENDING)) {
+ QSLIST_INSERT_HEAD_ATOMIC(&ctx->bh_list, bh, next);
+ }
+
+ aio_notify(ctx);
+}
+
+/* Only called from aio_bh_poll() and aio_ctx_finalize() */
+static QEMUBH *aio_bh_dequeue(BHList *head, unsigned *flags)
+{
+ QEMUBH *bh = QSLIST_FIRST_RCU(head);
+
+ if (!bh) {
+ return NULL;
+ }
+
+ QSLIST_REMOVE_HEAD(head, next);
+
+ /*
+ * The qatomic_and is paired with aio_bh_enqueue(). The implicit memory
+ * barrier ensures that the callback sees all writes done by the scheduling
+ * thread. It also ensures that the scheduling thread sees the cleared
+ * flag before bh->cb has run, and thus will call aio_notify again if
+ * necessary.
+ */
+ *flags = qatomic_fetch_and(&bh->flags,
+ ~(BH_PENDING | BH_SCHEDULED | BH_IDLE));
+ return bh;
+}
+
+void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb,
+ void *opaque, const char *name)
+{
+ QEMUBH *bh;
+ bh = g_new(QEMUBH, 1);
+ *bh = (QEMUBH){
+ .ctx = ctx,
+ .cb = cb,
+ .opaque = opaque,
+ .name = name,
+ };
+ aio_bh_enqueue(bh, BH_SCHEDULED | BH_ONESHOT);
+}
+
+QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
+ const char *name)
+{
+ QEMUBH *bh;
+ bh = g_new(QEMUBH, 1);
+ *bh = (QEMUBH){
+ .ctx = ctx,
+ .cb = cb,
+ .opaque = opaque,
+ .name = name,
+ };
+ return bh;
+}
+
+void aio_bh_call(QEMUBH *bh)
+{
+ bh->cb(bh->opaque);
+}
+
+/* aio_bh_poll() must not be called concurrently with itself. */
+int aio_bh_poll(AioContext *ctx)
+{
+ BHListSlice slice;
+ BHListSlice *s;
+ int ret = 0;
+
+ QSLIST_MOVE_ATOMIC(&slice.bh_list, &ctx->bh_list);
+ QSIMPLEQ_INSERT_TAIL(&ctx->bh_slice_list, &slice, next);
+
+ while ((s = QSIMPLEQ_FIRST(&ctx->bh_slice_list))) {
+ QEMUBH *bh;
+ unsigned flags;
+
+ bh = aio_bh_dequeue(&s->bh_list, &flags);
+ if (!bh) {
+ QSIMPLEQ_REMOVE_HEAD(&ctx->bh_slice_list, next);
+ continue;
+ }
+
+ if ((flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
+ /* Idle BHs don't count as progress */
+ if (!(flags & BH_IDLE)) {
+ ret = 1;
+ }
+ aio_bh_call(bh);
+ }
+ if (flags & (BH_DELETED | BH_ONESHOT)) {
+ g_free(bh);
+ }
+ }
+
+ return ret;
+}
+
+void qemu_bh_schedule_idle(QEMUBH *bh)
+{
+ aio_bh_enqueue(bh, BH_SCHEDULED | BH_IDLE);
+}
+
+void qemu_bh_schedule(QEMUBH *bh)
+{
+ aio_bh_enqueue(bh, BH_SCHEDULED);
+}
+
+/* This function is asynchronous. */
+void qemu_bh_cancel(QEMUBH *bh)
+{
+ qatomic_and(&bh->flags, ~BH_SCHEDULED);
+}
+
+/* This function is asynchronous. The bottom half performs the actual
+ * deletion at the very end.
+ */
+void qemu_bh_delete(QEMUBH *bh)
+{
+ aio_bh_enqueue(bh, BH_DELETED);
+}
+
+static int64_t aio_compute_bh_timeout(BHList *head, int timeout)
+{
+ QEMUBH *bh;
+
+ QSLIST_FOREACH_RCU(bh, head, next) {
+ if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
+ if (bh->flags & BH_IDLE) {
+ /* idle bottom halves will be polled at least
+ * every 10ms */
+ timeout = 10000000;
+ } else {
+ /* non-idle bottom halves will be executed
+ * immediately */
+ return 0;
+ }
+ }
+ }
+
+ return timeout;
+}
+
+int64_t
+aio_compute_timeout(AioContext *ctx)
+{
+ BHListSlice *s;
+ int64_t deadline;
+ int timeout = -1;
+
+ timeout = aio_compute_bh_timeout(&ctx->bh_list, timeout);
+ if (timeout == 0) {
+ return 0;
+ }
+
+ QSIMPLEQ_FOREACH(s, &ctx->bh_slice_list, next) {
+ timeout = aio_compute_bh_timeout(&s->bh_list, timeout);
+ if (timeout == 0) {
+ return 0;
+ }
+ }
+
+ deadline = timerlistgroup_deadline_ns(&ctx->tlg);
+ if (deadline == 0) {
+ return 0;
+ } else {
+ return qemu_soonest_timeout(timeout, deadline);
+ }
+}
+
+static gboolean
+aio_ctx_prepare(GSource *source, gint *timeout)
+{
+ AioContext *ctx = (AioContext *) source;
+
+ qatomic_set(&ctx->notify_me, qatomic_read(&ctx->notify_me) | 1);
+
+ /*
+ * Write ctx->notify_me before computing the timeout
+ * (reading bottom half flags, etc.). Pairs with
+ * smp_mb in aio_notify().
+ */
+ smp_mb();
+
+ /* We assume there is no timeout already supplied */
+ *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
+
+ if (aio_prepare(ctx)) {
+ *timeout = 0;
+ }
+
+ return *timeout == 0;
+}
+
+static gboolean
+aio_ctx_check(GSource *source)
+{
+ AioContext *ctx = (AioContext *) source;
+ QEMUBH *bh;
+ BHListSlice *s;
+
+ /* Finish computing the timeout before clearing the flag. */
+ qatomic_store_release(&ctx->notify_me, qatomic_read(&ctx->notify_me) & ~1);
+ aio_notify_accept(ctx);
+
+ QSLIST_FOREACH_RCU(bh, &ctx->bh_list, next) {
+ if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
+ return true;
+ }
+ }
+
+ QSIMPLEQ_FOREACH(s, &ctx->bh_slice_list, next) {
+ QSLIST_FOREACH_RCU(bh, &s->bh_list, next) {
+ if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) {
+ return true;
+ }
+ }
+ }
+ return aio_pending(ctx) || (timerlistgroup_deadline_ns(&ctx->tlg) == 0);
+}
+
+static gboolean
+aio_ctx_dispatch(GSource *source,
+ GSourceFunc callback,
+ gpointer user_data)
+{
+ AioContext *ctx = (AioContext *) source;
+
+ assert(callback == NULL);
+ aio_dispatch(ctx);
+ return true;
+}
+
+static void
+aio_ctx_finalize(GSource *source)
+{
+ AioContext *ctx = (AioContext *) source;
+ QEMUBH *bh;
+ unsigned flags;
+
+ thread_pool_free(ctx->thread_pool);
+
+#ifdef CONFIG_LINUX_AIO
+ if (ctx->linux_aio) {
+ laio_detach_aio_context(ctx->linux_aio, ctx);
+ laio_cleanup(ctx->linux_aio);
+ ctx->linux_aio = NULL;
+ }
+#endif
+
+#ifdef CONFIG_LINUX_IO_URING
+ if (ctx->linux_io_uring) {
+ luring_detach_aio_context(ctx->linux_io_uring, ctx);
+ luring_cleanup(ctx->linux_io_uring);
+ ctx->linux_io_uring = NULL;
+ }
+#endif
+
+ assert(QSLIST_EMPTY(&ctx->scheduled_coroutines));
+ qemu_bh_delete(ctx->co_schedule_bh);
+
+ /* There must be no aio_bh_poll() calls going on */
+ assert(QSIMPLEQ_EMPTY(&ctx->bh_slice_list));
+
+ while ((bh = aio_bh_dequeue(&ctx->bh_list, &flags))) {
+ /*
+ * qemu_bh_delete() must have been called on BHs in this AioContext. In
+ * many cases memory leaks, hangs, or inconsistent state occur when a
+ * BH is leaked because something still expects it to run.
+ *
+ * If you hit this, fix the lifecycle of the BH so that
+ * qemu_bh_delete() and any associated cleanup is called before the
+ * AioContext is finalized.
+ */
+ if (unlikely(!(flags & BH_DELETED))) {
+ fprintf(stderr, "%s: BH '%s' leaked, aborting...\n",
+ __func__, bh->name);
+ abort();
+ }
+
+ g_free(bh);
+ }
+
+ aio_set_event_notifier(ctx, &ctx->notifier, false, NULL, NULL);
+ event_notifier_cleanup(&ctx->notifier);
+ qemu_rec_mutex_destroy(&ctx->lock);
+ qemu_lockcnt_destroy(&ctx->list_lock);
+ timerlistgroup_deinit(&ctx->tlg);
+ aio_context_destroy(ctx);
+}
+
+static GSourceFuncs aio_source_funcs = {
+ aio_ctx_prepare,
+ aio_ctx_check,
+ aio_ctx_dispatch,
+ aio_ctx_finalize
+};
+
+GSource *aio_get_g_source(AioContext *ctx)
+{
+ aio_context_use_g_source(ctx);
+ g_source_ref(&ctx->source);
+ return &ctx->source;
+}
+
+ThreadPool *aio_get_thread_pool(AioContext *ctx)
+{
+ if (!ctx->thread_pool) {
+ ctx->thread_pool = thread_pool_new(ctx);
+ }
+ return ctx->thread_pool;
+}
+
+#ifdef CONFIG_LINUX_AIO
+LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp)
+{
+ if (!ctx->linux_aio) {
+ ctx->linux_aio = laio_init(errp);
+ if (ctx->linux_aio) {
+ laio_attach_aio_context(ctx->linux_aio, ctx);
+ }
+ }
+ return ctx->linux_aio;
+}
+
+LinuxAioState *aio_get_linux_aio(AioContext *ctx)
+{
+ assert(ctx->linux_aio);
+ return ctx->linux_aio;
+}
+#endif
+
+#ifdef CONFIG_LINUX_IO_URING
+LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp)
+{
+ if (ctx->linux_io_uring) {
+ return ctx->linux_io_uring;
+ }
+
+ ctx->linux_io_uring = luring_init(errp);
+ if (!ctx->linux_io_uring) {
+ return NULL;
+ }
+
+ luring_attach_aio_context(ctx->linux_io_uring, ctx);
+ return ctx->linux_io_uring;
+}
+
+LuringState *aio_get_linux_io_uring(AioContext *ctx)
+{
+ assert(ctx->linux_io_uring);
+ return ctx->linux_io_uring;
+}
+#endif
+
+void aio_notify(AioContext *ctx)
+{
+ /*
+ * Write e.g. bh->flags before writing ctx->notified. Pairs with smp_mb in
+ * aio_notify_accept.
+ */
+ smp_wmb();
+ qatomic_set(&ctx->notified, true);
+
+ /*
+ * Write ctx->notified before reading ctx->notify_me. Pairs
+ * with smp_mb in aio_ctx_prepare or aio_poll.
+ */
+ smp_mb();
+ if (qatomic_read(&ctx->notify_me)) {
+ event_notifier_set(&ctx->notifier);
+ }
+}
+
+void aio_notify_accept(AioContext *ctx)
+{
+ qatomic_set(&ctx->notified, false);
+
+ /*
+ * Write ctx->notified before reading e.g. bh->flags. Pairs with smp_wmb
+ * in aio_notify.
+ */
+ smp_mb();
+}
+
+static void aio_timerlist_notify(void *opaque, QEMUClockType type)
+{
+ aio_notify(opaque);
+}
+
+static void aio_context_notifier_cb(EventNotifier *e)
+{
+ AioContext *ctx = container_of(e, AioContext, notifier);
+
+ event_notifier_test_and_clear(&ctx->notifier);
+}
+
+/* Returns true if aio_notify() was called (e.g. a BH was scheduled) */
+static bool aio_context_notifier_poll(void *opaque)
+{
+ EventNotifier *e = opaque;
+ AioContext *ctx = container_of(e, AioContext, notifier);
+
+ return qatomic_read(&ctx->notified);
+}
+
+static void co_schedule_bh_cb(void *opaque)
+{
+ AioContext *ctx = opaque;
+ QSLIST_HEAD(, Coroutine) straight, reversed;
+
+ QSLIST_MOVE_ATOMIC(&reversed, &ctx->scheduled_coroutines);
+ QSLIST_INIT(&straight);
+
+ while (!QSLIST_EMPTY(&reversed)) {
+ Coroutine *co = QSLIST_FIRST(&reversed);
+ QSLIST_REMOVE_HEAD(&reversed, co_scheduled_next);
+ QSLIST_INSERT_HEAD(&straight, co, co_scheduled_next);
+ }
+
+ while (!QSLIST_EMPTY(&straight)) {
+ Coroutine *co = QSLIST_FIRST(&straight);
+ QSLIST_REMOVE_HEAD(&straight, co_scheduled_next);
+ trace_aio_co_schedule_bh_cb(ctx, co);
+ aio_context_acquire(ctx);
+
+ /* Protected by write barrier in qemu_aio_coroutine_enter */
+ qatomic_set(&co->scheduled, NULL);
+ qemu_aio_coroutine_enter(ctx, co);
+ aio_context_release(ctx);
+ }
+}
+
+AioContext *aio_context_new(Error **errp)
+{
+ int ret;
+ AioContext *ctx;
+
+ ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
+ QSLIST_INIT(&ctx->bh_list);
+ QSIMPLEQ_INIT(&ctx->bh_slice_list);
+ aio_context_setup(ctx);
+
+ ret = event_notifier_init(&ctx->notifier, false);
+ if (ret < 0) {
+ error_setg_errno(errp, -ret, "Failed to initialize event notifier");
+ goto fail;
+ }
+ g_source_set_can_recurse(&ctx->source, true);
+ qemu_lockcnt_init(&ctx->list_lock);
+
+ ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx);
+ QSLIST_INIT(&ctx->scheduled_coroutines);
+
+ aio_set_event_notifier(ctx, &ctx->notifier,
+ false,
+ aio_context_notifier_cb,
+ aio_context_notifier_poll);
+#ifdef CONFIG_LINUX_AIO
+ ctx->linux_aio = NULL;
+#endif
+
+#ifdef CONFIG_LINUX_IO_URING
+ ctx->linux_io_uring = NULL;
+#endif
+
+ ctx->thread_pool = NULL;
+ qemu_rec_mutex_init(&ctx->lock);
+ timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
+
+ ctx->poll_ns = 0;
+ ctx->poll_max_ns = 0;
+ ctx->poll_grow = 0;
+ ctx->poll_shrink = 0;
+
+ ctx->aio_max_batch = 0;
+
+ return ctx;
+fail:
+ g_source_destroy(&ctx->source);
+ return NULL;
+}
+
+void aio_co_schedule(AioContext *ctx, Coroutine *co)
+{
+ trace_aio_co_schedule(ctx, co);
+ const char *scheduled = qatomic_cmpxchg(&co->scheduled, NULL,
+ __func__);
+
+ if (scheduled) {
+ fprintf(stderr,
+ "%s: Co-routine was already scheduled in '%s'\n",
+ __func__, scheduled);
+ abort();
+ }
+
+ /* The coroutine might run and release the last ctx reference before we
+ * invoke qemu_bh_schedule(). Take a reference to keep ctx alive until
+ * we're done.
+ */
+ aio_context_ref(ctx);
+
+ QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines,
+ co, co_scheduled_next);
+ qemu_bh_schedule(ctx->co_schedule_bh);
+
+ aio_context_unref(ctx);
+}
+
+typedef struct AioCoRescheduleSelf {
+ Coroutine *co;
+ AioContext *new_ctx;
+} AioCoRescheduleSelf;
+
+static void aio_co_reschedule_self_bh(void *opaque)
+{
+ AioCoRescheduleSelf *data = opaque;
+ aio_co_schedule(data->new_ctx, data->co);
+}
+
+void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx)
+{
+ AioContext *old_ctx = qemu_get_current_aio_context();
+
+ if (old_ctx != new_ctx) {
+ AioCoRescheduleSelf data = {
+ .co = qemu_coroutine_self(),
+ .new_ctx = new_ctx,
+ };
+ /*
+ * We can't directly schedule the coroutine in the target context
+ * because this would be racy: The other thread could try to enter the
+ * coroutine before it has yielded in this one.
+ */
+ aio_bh_schedule_oneshot(old_ctx, aio_co_reschedule_self_bh, &data);
+ qemu_coroutine_yield();
+ }
+}
+
+void aio_co_wake(struct Coroutine *co)
+{
+ AioContext *ctx;
+
+ /* Read coroutine before co->ctx. Matches smp_wmb in
+ * qemu_coroutine_enter.
+ */
+ smp_read_barrier_depends();
+ ctx = qatomic_read(&co->ctx);
+
+ aio_co_enter(ctx, co);
+}
+
+void aio_co_enter(AioContext *ctx, struct Coroutine *co)
+{
+ if (ctx != qemu_get_current_aio_context()) {
+ aio_co_schedule(ctx, co);
+ return;
+ }
+
+ if (qemu_in_coroutine()) {
+ Coroutine *self = qemu_coroutine_self();
+ assert(self != co);
+ QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next);
+ } else {
+ aio_context_acquire(ctx);
+ qemu_aio_coroutine_enter(ctx, co);
+ aio_context_release(ctx);
+ }
+}
+
+void aio_context_ref(AioContext *ctx)
+{
+ g_source_ref(&ctx->source);
+}
+
+void aio_context_unref(AioContext *ctx)
+{
+ g_source_unref(&ctx->source);
+}
+
+void aio_context_acquire(AioContext *ctx)
+{
+ qemu_rec_mutex_lock(&ctx->lock);
+}
+
+void aio_context_release(AioContext *ctx)
+{
+ qemu_rec_mutex_unlock(&ctx->lock);
+}
+
+static __thread AioContext *my_aiocontext;
+
+AioContext *qemu_get_current_aio_context(void)
+{
+ if (my_aiocontext) {
+ return my_aiocontext;
+ }
+ if (qemu_mutex_iothread_locked()) {
+ /* Possibly in a vCPU thread. */
+ return qemu_get_aio_context();
+ }
+ return NULL;
+}
+
+void qemu_set_current_aio_context(AioContext *ctx)
+{
+ assert(!my_aiocontext);
+ my_aiocontext = ctx;
+}
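
The bottom-half lifecycle in brief: create once, schedule any number of times, delete when done. A sketch using the APIs above (my_bh_cb is a hypothetical callback):

#include "qemu/osdep.h"
#include "block/aio.h"

/* Hypothetical callback: runs once per qemu_bh_schedule(), in ctx's thread */
static void my_bh_cb(void *opaque)
{
}

static void bh_example(AioContext *ctx)
{
    QEMUBH *bh = aio_bh_new(ctx, my_bh_cb, NULL);

    qemu_bh_schedule(bh);   /* safe to call from any thread */
    /* ... */
    qemu_bh_delete(bh);     /* deferred free via BH_DELETED, see above */
}

For one-shot callbacks, aio_bh_schedule_oneshot() above folds this create/schedule/delete sequence into a single call.
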
diff --git a/util/atomic64.c b/util/atomic64.c
new file mode 100644
index 000000000..93037d5b1
--- /dev/null
+++ b/util/atomic64.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
+ *
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qemu/atomic.h"
+#include "qemu/thread.h"
+
+#ifdef CONFIG_ATOMIC64
+#error This file must only be compiled if !CONFIG_ATOMIC64
+#endif
+
+/*
+ * When !CONFIG_ATOMIC64, we serialize both reads and writes with spinlocks.
+ * We use an array of spinlocks, with padding computed at run-time based on
+ * the host's dcache line size.
+ * We point to the array with a void * to simplify the padding's computation.
+ * Consecutive spinlocks are spaced lock_size bytes apart.
+ */
+static void *lock_array;
+static size_t lock_size;
+
+/*
+ * Systems without CONFIG_ATOMIC64 are unlikely to have many cores, so we use a
+ * small array of locks.
+ */
+#define NR_LOCKS 16
+
+static QemuSpin *addr_to_lock(const void *addr)
+{
+ uintptr_t a = (uintptr_t)addr;
+ uintptr_t idx;
+
+ idx = a >> qemu_dcache_linesize_log;
+ idx ^= (idx >> 8) ^ (idx >> 16);
+ idx &= NR_LOCKS - 1;
+ return lock_array + idx * lock_size;
+}
+
+#define GEN_READ(name, type) \
+ type name(const type *ptr) \
+ { \
+ QemuSpin *lock = addr_to_lock(ptr); \
+ type ret; \
+ \
+ qemu_spin_lock(lock); \
+ ret = *ptr; \
+ qemu_spin_unlock(lock); \
+ return ret; \
+ }
+
+GEN_READ(qatomic_read_i64, int64_t)
+GEN_READ(qatomic_read_u64, uint64_t)
+#undef GEN_READ
+
+#define GEN_SET(name, type) \
+ void name(type *ptr, type val) \
+ { \
+ QemuSpin *lock = addr_to_lock(ptr); \
+ \
+ qemu_spin_lock(lock); \
+ *ptr = val; \
+ qemu_spin_unlock(lock); \
+ }
+
+GEN_SET(qatomic_set_i64, int64_t)
+GEN_SET(qatomic_set_u64, uint64_t)
+#undef GEN_SET
+
+void qatomic64_init(void)
+{
+ int i;
+
+ lock_size = ROUND_UP(sizeof(QemuSpin), qemu_dcache_linesize);
+ lock_array = qemu_memalign(qemu_dcache_linesize, lock_size * NR_LOCKS);
+ for (i = 0; i < NR_LOCKS; i++) {
+ QemuSpin *lock = lock_array + i * lock_size;
+
+ qemu_spin_init(lock);
+ }
+}
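
A usage note on this fallback: each accessor call is individually atomic (serialized through the hashed spinlock), but a sequence of calls is not. A hedged sketch with a hypothetical counter:

    #include "qemu/osdep.h"
    #include "qemu/atomic.h"

    static int64_t counter;

    static void bump_counter(void)
    {
        /* Each call takes the spinlock covering &counter's cache line. */
        int64_t v = qatomic_read_i64(&counter);

        /* NOT an atomic increment: another thread can run between the
         * read and the set; a real counter would need its own lock. */
        qatomic_set_i64(&counter, v + 1);
    }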
diff --git a/util/base64.c b/util/base64.c
new file mode 100644
index 000000000..811111ac4
--- /dev/null
+++ b/util/base64.c
@@ -0,0 +1,60 @@
+/*
+ * QEMU base64 helpers
+ *
+ * Copyright (c) 2015 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/base64.h"
+
+static const char *base64_valid_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=\n";
+
+uint8_t *qbase64_decode(const char *input,
+ size_t in_len,
+ size_t *out_len,
+ Error **errp)
+{
+ *out_len = 0;
+
+ if (in_len != -1) {
+ /* Lack of NUL terminator is an error */
+ if (input[in_len] != '\0') {
+ error_setg(errp, "Base64 data is not NUL terminated");
+ return NULL;
+ }
+ /* Check there are no embedded NULs, since we expect
+ * this to be valid base64 data */
+ if (memchr(input, '\0', in_len) != NULL) {
+ error_setg(errp, "Base64 data contains embedded NUL characters");
+ return NULL;
+ }
+
+ /* Now we know it's a valid NUL-terminated string,
+ * so strspn is safe to use... */
+ } else {
+ in_len = strlen(input);
+ }
+
+ if (strspn(input, base64_valid_chars) != in_len) {
+ error_setg(errp, "Base64 data contains invalid characters");
+ return NULL;
+ }
+
+ return g_base64_decode(input, out_len);
+}
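
A typical caller, sketched under the assumption that the declaration comes from "qemu/base64.h" as included above; passing -1 as in_len asks the helper to take strlen() itself:

    #include "qemu/osdep.h"
    #include "qapi/error.h"
    #include "qemu/base64.h"

    static bool use_secret(const char *b64, Error **errp)
    {
        size_t len;
        g_autofree uint8_t *data = qbase64_decode(b64, -1, &len, errp);

        if (!data) {
            return false;       /* errp already set by qbase64_decode() */
        }
        /* ... consume data[0..len) ... */
        return true;
    }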
diff --git a/util/bitmap.c b/util/bitmap.c
new file mode 100644
index 000000000..1f201393a
--- /dev/null
+++ b/util/bitmap.c
@@ -0,0 +1,489 @@
+/*
+ * Bitmap Module
+ *
+ * Stolen from linux/src/lib/bitmap.c
+ *
+ * Copyright (C) 2010 Corentin Chary
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/bitops.h"
+#include "qemu/bitmap.h"
+#include "qemu/atomic.h"
+
+/*
+ * bitmaps provide an array of bits, implemented using an
+ * array of unsigned longs. The number of valid bits in a
+ * given bitmap does _not_ need to be an exact multiple of
+ * BITS_PER_LONG.
+ *
+ * The possible unused bits in the last, partially used word
+ * of a bitmap are 'don't care'. The implementation makes
+ * no particular effort to keep them zero. It ensures that
+ * their value will not affect the results of any operation.
+ * The bitmap operations that return Boolean (bitmap_empty,
+ * for example) or scalar (bitmap_weight, for example) results
+ * carefully filter out these unused bits from impacting their
+ * results.
+ *
+ * These operations actually hold to a slightly stronger rule:
+ * if you don't input any bitmaps to these ops that have some
+ * unused bits set, then they won't output any set unused bits
+ * in output bitmaps.
+ *
+ * The byte ordering of bitmaps is more natural on little
+ * endian architectures.
+ */
+
+int slow_bitmap_empty(const unsigned long *bitmap, long bits)
+{
+ long k, lim = bits/BITS_PER_LONG;
+
+ for (k = 0; k < lim; ++k) {
+ if (bitmap[k]) {
+ return 0;
+ }
+ }
+ if (bits % BITS_PER_LONG) {
+ if (bitmap[k] & BITMAP_LAST_WORD_MASK(bits)) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+int slow_bitmap_full(const unsigned long *bitmap, long bits)
+{
+ long k, lim = bits/BITS_PER_LONG;
+
+ for (k = 0; k < lim; ++k) {
+ if (~bitmap[k]) {
+ return 0;
+ }
+ }
+
+ if (bits % BITS_PER_LONG) {
+ if (~bitmap[k] & BITMAP_LAST_WORD_MASK(bits)) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+int slow_bitmap_equal(const unsigned long *bitmap1,
+ const unsigned long *bitmap2, long bits)
+{
+ long k, lim = bits/BITS_PER_LONG;
+
+ for (k = 0; k < lim; ++k) {
+ if (bitmap1[k] != bitmap2[k]) {
+ return 0;
+ }
+ }
+
+ if (bits % BITS_PER_LONG) {
+ if ((bitmap1[k] ^ bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) {
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+void slow_bitmap_complement(unsigned long *dst, const unsigned long *src,
+ long bits)
+{
+ long k, lim = bits/BITS_PER_LONG;
+
+ for (k = 0; k < lim; ++k) {
+ dst[k] = ~src[k];
+ }
+
+ if (bits % BITS_PER_LONG) {
+ dst[k] = ~src[k] & BITMAP_LAST_WORD_MASK(bits);
+ }
+}
+
+int slow_bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, long bits)
+{
+ long k;
+ long nr = BITS_TO_LONGS(bits);
+ unsigned long result = 0;
+
+ for (k = 0; k < nr; k++) {
+ result |= (dst[k] = bitmap1[k] & bitmap2[k]);
+ }
+ return result != 0;
+}
+
+void slow_bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, long bits)
+{
+ long k;
+ long nr = BITS_TO_LONGS(bits);
+
+ for (k = 0; k < nr; k++) {
+ dst[k] = bitmap1[k] | bitmap2[k];
+ }
+}
+
+void slow_bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, long bits)
+{
+ long k;
+ long nr = BITS_TO_LONGS(bits);
+
+ for (k = 0; k < nr; k++) {
+ dst[k] = bitmap1[k] ^ bitmap2[k];
+ }
+}
+
+int slow_bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
+ const unsigned long *bitmap2, long bits)
+{
+ long k;
+ long nr = BITS_TO_LONGS(bits);
+ unsigned long result = 0;
+
+ for (k = 0; k < nr; k++) {
+ result |= (dst[k] = bitmap1[k] & ~bitmap2[k]);
+ }
+ return result != 0;
+}
+
+void bitmap_set(unsigned long *map, long start, long nr)
+{
+ unsigned long *p = map + BIT_WORD(start);
+ const long size = start + nr;
+ int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+ unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
+
+ assert(start >= 0 && nr >= 0);
+
+ while (nr - bits_to_set >= 0) {
+ *p |= mask_to_set;
+ nr -= bits_to_set;
+ bits_to_set = BITS_PER_LONG;
+ mask_to_set = ~0UL;
+ p++;
+ }
+ if (nr) {
+ mask_to_set &= BITMAP_LAST_WORD_MASK(size);
+ *p |= mask_to_set;
+ }
+}
+
+void bitmap_set_atomic(unsigned long *map, long start, long nr)
+{
+ unsigned long *p = map + BIT_WORD(start);
+ const long size = start + nr;
+ int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
+ unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
+
+ assert(start >= 0 && nr >= 0);
+
+ /* First word */
+ if (nr - bits_to_set > 0) {
+ qatomic_or(p, mask_to_set);
+ nr -= bits_to_set;
+ bits_to_set = BITS_PER_LONG;
+ mask_to_set = ~0UL;
+ p++;
+ }
+
+ /* Full words */
+ if (bits_to_set == BITS_PER_LONG) {
+ while (nr >= BITS_PER_LONG) {
+ *p = ~0UL;
+ nr -= BITS_PER_LONG;
+ p++;
+ }
+ }
+
+ /* Last word */
+ if (nr) {
+ mask_to_set &= BITMAP_LAST_WORD_MASK(size);
+ qatomic_or(p, mask_to_set);
+ } else {
+ /* If we avoided the full barrier in qatomic_or(), issue a
+ * barrier to account for the assignments in the while loop.
+ */
+ smp_mb();
+ }
+}
+
+void bitmap_clear(unsigned long *map, long start, long nr)
+{
+ unsigned long *p = map + BIT_WORD(start);
+ const long size = start + nr;
+ int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+ unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+
+ assert(start >= 0 && nr >= 0);
+
+ while (nr - bits_to_clear >= 0) {
+ *p &= ~mask_to_clear;
+ nr -= bits_to_clear;
+ bits_to_clear = BITS_PER_LONG;
+ mask_to_clear = ~0UL;
+ p++;
+ }
+ if (nr) {
+ mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+ *p &= ~mask_to_clear;
+ }
+}
+
+bool bitmap_test_and_clear_atomic(unsigned long *map, long start, long nr)
+{
+ unsigned long *p = map + BIT_WORD(start);
+ const long size = start + nr;
+ int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
+ unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
+ unsigned long dirty = 0;
+ unsigned long old_bits;
+
+ assert(start >= 0 && nr >= 0);
+
+ /* First word */
+ if (nr - bits_to_clear > 0) {
+ old_bits = qatomic_fetch_and(p, ~mask_to_clear);
+ dirty |= old_bits & mask_to_clear;
+ nr -= bits_to_clear;
+ bits_to_clear = BITS_PER_LONG;
+ mask_to_clear = ~0UL;
+ p++;
+ }
+
+ /* Full words */
+ if (bits_to_clear == BITS_PER_LONG) {
+ while (nr >= BITS_PER_LONG) {
+ if (*p) {
+ old_bits = qatomic_xchg(p, 0);
+ dirty |= old_bits;
+ }
+ nr -= BITS_PER_LONG;
+ p++;
+ }
+ }
+
+ /* Last word */
+ if (nr) {
+ mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
+ old_bits = qatomic_fetch_and(p, ~mask_to_clear);
+ dirty |= old_bits & mask_to_clear;
+ } else {
+ if (!dirty) {
+ smp_mb();
+ }
+ }
+
+ return dirty != 0;
+}
+
+void bitmap_copy_and_clear_atomic(unsigned long *dst, unsigned long *src,
+ long nr)
+{
+ while (nr > 0) {
+ *dst = qatomic_xchg(src, 0);
+ dst++;
+ src++;
+ nr -= BITS_PER_LONG;
+ }
+}
+
+#define ALIGN_MASK(x,mask) (((x)+(mask))&~(mask))
+
+/**
+ * bitmap_find_next_zero_area - find a contiguous aligned zero area
+ * @map: The address to base the search on
+ * @size: The bitmap size in bits
+ * @start: The bitnumber to start searching at
+ * @nr: The number of zeroed bits we're looking for
+ * @align_mask: Alignment mask for zero area
+ *
+ * The @align_mask should be one less than a power of 2; the effect is that
+ * the bit offsets of all zero areas this function finds are multiples of
+ * that power of 2. An @align_mask of 0 means no alignment is required.
+ */
+unsigned long bitmap_find_next_zero_area(unsigned long *map,
+ unsigned long size,
+ unsigned long start,
+ unsigned long nr,
+ unsigned long align_mask)
+{
+ unsigned long index, end, i;
+again:
+ index = find_next_zero_bit(map, size, start);
+
+ /* Align allocation */
+ index = ALIGN_MASK(index, align_mask);
+
+ end = index + nr;
+ if (end > size) {
+ return end;
+ }
+ i = find_next_bit(map, end, index);
+ if (i < end) {
+ start = i + 1;
+ goto again;
+ }
+ return index;
+}
+
+int slow_bitmap_intersects(const unsigned long *bitmap1,
+ const unsigned long *bitmap2, long bits)
+{
+ long k, lim = bits/BITS_PER_LONG;
+
+ for (k = 0; k < lim; ++k) {
+ if (bitmap1[k] & bitmap2[k]) {
+ return 1;
+ }
+ }
+
+ if (bits % BITS_PER_LONG) {
+ if ((bitmap1[k] & bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+long slow_bitmap_count_one(const unsigned long *bitmap, long nbits)
+{
+ long k, lim = nbits / BITS_PER_LONG, result = 0;
+
+ for (k = 0; k < lim; k++) {
+ result += ctpopl(bitmap[k]);
+ }
+
+ if (nbits % BITS_PER_LONG) {
+ result += ctpopl(bitmap[k] & BITMAP_LAST_WORD_MASK(nbits));
+ }
+
+ return result;
+}
+
+static void bitmap_to_from_le(unsigned long *dst,
+ const unsigned long *src, long nbits)
+{
+ long len = BITS_TO_LONGS(nbits);
+
+#ifdef HOST_WORDS_BIGENDIAN
+ long index;
+
+ for (index = 0; index < len; index++) {
+# if HOST_LONG_BITS == 64
+ dst[index] = bswap64(src[index]);
+# else
+ dst[index] = bswap32(src[index]);
+# endif
+ }
+#else
+ memcpy(dst, src, len * sizeof(unsigned long));
+#endif
+}
+
+void bitmap_from_le(unsigned long *dst, const unsigned long *src,
+ long nbits)
+{
+ bitmap_to_from_le(dst, src, nbits);
+}
+
+void bitmap_to_le(unsigned long *dst, const unsigned long *src,
+ long nbits)
+{
+ bitmap_to_from_le(dst, src, nbits);
+}
+
+/*
+ * Copy "src" bitmap with a positive offset and put it into the "dst"
+ * bitmap. The caller needs to make sure the bitmap size of "src"
+ * is bigger than (shift + nbits).
+ */
+void bitmap_copy_with_src_offset(unsigned long *dst, const unsigned long *src,
+ unsigned long shift, unsigned long nbits)
+{
+ unsigned long left_mask, right_mask, last_mask;
+
+ /* Shift the src pointer to the first word to copy from */
+ src += BIT_WORD(shift);
+ shift %= BITS_PER_LONG;
+
+ if (!shift) {
+ /* Fast path */
+ bitmap_copy(dst, src, nbits);
+ return;
+ }
+
+ right_mask = (1ul << shift) - 1;
+ left_mask = ~right_mask;
+
+ while (nbits >= BITS_PER_LONG) {
+ *dst = (*src & left_mask) >> shift;
+ *dst |= (src[1] & right_mask) << (BITS_PER_LONG - shift);
+ dst++;
+ src++;
+ nbits -= BITS_PER_LONG;
+ }
+
+ if (nbits > BITS_PER_LONG - shift) {
+ *dst = (*src & left_mask) >> shift;
+ nbits -= BITS_PER_LONG - shift;
+ last_mask = (1ul << nbits) - 1;
+ *dst |= (src[1] & last_mask) << (BITS_PER_LONG - shift);
+ } else if (nbits) {
+ last_mask = (1ul << nbits) - 1;
+ *dst = (*src >> shift) & last_mask;
+ }
+}
+
+/*
+ * Copy "src" bitmap into the "dst" bitmap with an offset in the
+ * "dst". The caller needs to make sure the bitmap size of "dst" is
+ * bigger than (shift + nbits).
+ */
+void bitmap_copy_with_dst_offset(unsigned long *dst, const unsigned long *src,
+ unsigned long shift, unsigned long nbits)
+{
+ unsigned long left_mask, right_mask, last_mask;
+
+ /* Shift the dst pointer to the first word to copy to */
+ dst += BIT_WORD(shift);
+ shift %= BITS_PER_LONG;
+
+ if (!shift) {
+ /* Fast path */
+ bitmap_copy(dst, src, nbits);
+ return;
+ }
+
+ right_mask = (1ul << (BITS_PER_LONG - shift)) - 1;
+ left_mask = ~right_mask;
+
+ *dst &= (1ul << shift) - 1;
+ while (nbits >= BITS_PER_LONG) {
+ *dst |= (*src & right_mask) << shift;
+ dst[1] = (*src & left_mask) >> (BITS_PER_LONG - shift);
+ dst++;
+ src++;
+ nbits -= BITS_PER_LONG;
+ }
+
+ if (nbits > BITS_PER_LONG - shift) {
+ *dst |= (*src & right_mask) << shift;
+ nbits -= BITS_PER_LONG - shift;
+ last_mask = ((1ul << nbits) - 1) << (BITS_PER_LONG - shift);
+ dst[1] = (*src & last_mask) >> (BITS_PER_LONG - shift);
+ } else if (nbits) {
+ last_mask = (1ul << nbits) - 1;
+ *dst |= (*src & last_mask) << shift;
+ }
+}
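
Taken together, bitmap_set(), bitmap_clear() and bitmap_find_next_zero_area() form a simple first-fit allocator. A sketch, assuming DECLARE_BITMAP() from "qemu/bitmap.h" (illustrative, not from this patch):

    #include "qemu/osdep.h"
    #include "qemu/bitmap.h"

    #define POOL_BITS 1024
    static DECLARE_BITMAP(pool, POOL_BITS);

    static long pool_alloc(unsigned long nr)
    {
        unsigned long idx;

        idx = bitmap_find_next_zero_area(pool, POOL_BITS, 0, nr,
                                         0 /* no alignment */);
        if (idx >= POOL_BITS) {
            return -1;          /* on failure the returned end is > size */
        }
        bitmap_set(pool, idx, nr);
        return idx;
    }

    static void pool_free(long idx, unsigned long nr)
    {
        bitmap_clear(pool, idx, nr);
    }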
diff --git a/util/bitops.c b/util/bitops.c
new file mode 100644
index 000000000..3fe6b1c4f
--- /dev/null
+++ b/util/bitops.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ * Copyright (C) 2008 IBM Corporation
+ * Written by Rusty Russell <rusty@rustcorp.com.au>
+ * (Inspired by David Howells' find_next_bit implementation)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/bitops.h"
+
+/*
+ * Find the next set bit in a memory region.
+ */
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ const unsigned long *p = addr + BIT_WORD(offset);
+ unsigned long result = offset & ~(BITS_PER_LONG-1);
+ unsigned long tmp;
+
+ if (offset >= size) {
+ return size;
+ }
+ size -= result;
+ offset %= BITS_PER_LONG;
+ if (offset) {
+ tmp = *(p++);
+ tmp &= (~0UL << offset);
+ if (size < BITS_PER_LONG) {
+ goto found_first;
+ }
+ if (tmp) {
+ goto found_middle;
+ }
+ size -= BITS_PER_LONG;
+ result += BITS_PER_LONG;
+ }
+ while (size >= 4*BITS_PER_LONG) {
+ unsigned long d1, d2, d3;
+ tmp = *p;
+ d1 = *(p+1);
+ d2 = *(p+2);
+ d3 = *(p+3);
+ if (tmp) {
+ goto found_middle;
+ }
+ if (d1 | d2 | d3) {
+ break;
+ }
+ p += 4;
+ result += 4*BITS_PER_LONG;
+ size -= 4*BITS_PER_LONG;
+ }
+ while (size >= BITS_PER_LONG) {
+ if ((tmp = *(p++))) {
+ goto found_middle;
+ }
+ result += BITS_PER_LONG;
+ size -= BITS_PER_LONG;
+ }
+ if (!size) {
+ return result;
+ }
+ tmp = *p;
+
+found_first:
+ tmp &= (~0UL >> (BITS_PER_LONG - size));
+ if (tmp == 0UL) { /* Are any bits set? */
+ return result + size; /* Nope. */
+ }
+found_middle:
+ return result + ctzl(tmp);
+}
+
+/*
+ * This implementation of find_{first,next}_zero_bit was stolen from
+ * Linus' asm-alpha/bitops.h.
+ */
+unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ const unsigned long *p = addr + BIT_WORD(offset);
+ unsigned long result = offset & ~(BITS_PER_LONG-1);
+ unsigned long tmp;
+
+ if (offset >= size) {
+ return size;
+ }
+ size -= result;
+ offset %= BITS_PER_LONG;
+ if (offset) {
+ tmp = *(p++);
+ tmp |= ~0UL >> (BITS_PER_LONG - offset);
+ if (size < BITS_PER_LONG) {
+ goto found_first;
+ }
+ if (~tmp) {
+ goto found_middle;
+ }
+ size -= BITS_PER_LONG;
+ result += BITS_PER_LONG;
+ }
+ while (size & ~(BITS_PER_LONG-1)) {
+ if (~(tmp = *(p++))) {
+ goto found_middle;
+ }
+ result += BITS_PER_LONG;
+ size -= BITS_PER_LONG;
+ }
+ if (!size) {
+ return result;
+ }
+ tmp = *p;
+
+found_first:
+ tmp |= ~0UL << size;
+ if (tmp == ~0UL) { /* Are any bits zero? */
+ return result + size; /* Nope. */
+ }
+found_middle:
+ return result + ctzl(~tmp);
+}
+
+unsigned long find_last_bit(const unsigned long *addr, unsigned long size)
+{
+ unsigned long words;
+ unsigned long tmp;
+
+ /* Start at final word. */
+ words = size / BITS_PER_LONG;
+
+ /* Partial final word? */
+ if (size & (BITS_PER_LONG-1)) {
+ tmp = (addr[words] & (~0UL >> (BITS_PER_LONG
+ - (size & (BITS_PER_LONG-1)))));
+ if (tmp) {
+ goto found;
+ }
+ }
+
+ while (words) {
+ tmp = addr[--words];
+ if (tmp) {
+ found:
+ return words * BITS_PER_LONG + BITS_PER_LONG - 1 - clzl(tmp);
+ }
+ }
+
+ /* Not found */
+ return size;
+}
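
The return convention of these scanners (the size is returned when nothing is found) makes iteration loops straightforward. A minimal sketch:

    #include "qemu/osdep.h"
    #include "qemu/bitops.h"

    static void dump_set_bits(const unsigned long *map, unsigned long nbits)
    {
        unsigned long bit;

        /* find_next_bit() returns nbits once no set bit remains. */
        for (bit = find_next_bit(map, nbits, 0);
             bit < nbits;
             bit = find_next_bit(map, nbits, bit + 1)) {
            printf("bit %lu is set\n", bit);
        }
    }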
diff --git a/util/block-helpers.c b/util/block-helpers.c
new file mode 100644
index 000000000..c4851432f
--- /dev/null
+++ b/util/block-helpers.c
@@ -0,0 +1,46 @@
+/*
+ * Block utility functions
+ *
+ * Copyright IBM, Corp. 2011
+ * Copyright (c) 2020 Coiby Xu <coiby.xu@gmail.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "block-helpers.h"
+
+/**
+ * check_block_size:
+ * @id: The unique ID of the object
+ * @name: The name of the property being validated
+ * @value: The block size in bytes
+ * @errp: A pointer to an area to store an error
+ *
+ * This function checks that the block size meets the following conditions:
+ * 1. At least MIN_BLOCK_SIZE
+ * 2. No larger than MAX_BLOCK_SIZE
+ * 3. A power of 2
+ */
+void check_block_size(const char *id, const char *name, int64_t value,
+ Error **errp)
+{
+ /* value of 0 means "unset" */
+ if (value && (value < MIN_BLOCK_SIZE || value > MAX_BLOCK_SIZE)) {
+ error_setg(errp, QERR_PROPERTY_VALUE_OUT_OF_RANGE,
+ id, name, value, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE);
+ return;
+ }
+
+ /* We rely on power-of-2 blocksizes for bitmasks */
+ if ((value & (value - 1)) != 0) {
+ error_setg(errp,
+ "Property %s.%s doesn't take value '%" PRId64
+ "', it's not a power of 2",
+ id, name, value);
+ return;
+ }
+}
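
A sketch of the intended calling pattern, with hypothetical id/property names; note that a value of 0 passes both checks (it means "unset"), while any non-zero value must be in range and a power of 2:

    #include "qemu/osdep.h"
    #include "qapi/error.h"
    #include "block-helpers.h"

    static bool set_logical_block_size(int64_t value, Error **errp)
    {
        Error *local_err = NULL;

        check_block_size("blk0", "logical_block_size", value, &local_err);
        if (local_err) {
            error_propagate(errp, local_err);
            return false;
        }
        return true;
    }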
diff --git a/util/block-helpers.h b/util/block-helpers.h
new file mode 100644
index 000000000..b53295a52
--- /dev/null
+++ b/util/block-helpers.h
@@ -0,0 +1,19 @@
+#ifndef BLOCK_HELPERS_H
+#define BLOCK_HELPERS_H
+
+#include "qemu/units.h"
+
+/* lower limit is sector size */
+#define MIN_BLOCK_SIZE INT64_C(512)
+#define MIN_BLOCK_SIZE_STR "512 B"
+/*
+ * upper limit is arbitrary, 2 MiB looks sufficient for all sensible uses, and
+ * matches qcow2 cluster size limit
+ */
+#define MAX_BLOCK_SIZE (2 * MiB)
+#define MAX_BLOCK_SIZE_STR "2 MiB"
+
+void check_block_size(const char *id, const char *name, int64_t value,
+ Error **errp);
+
+#endif /* BLOCK_HELPERS_H */
diff --git a/util/buffer.c b/util/buffer.c
new file mode 100644
index 000000000..743eaa930
--- /dev/null
+++ b/util/buffer.c
@@ -0,0 +1,173 @@
+/*
+ * QEMU generic buffers
+ *
+ * Copyright (c) 2015 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "qemu/buffer.h"
+#include "trace.h"
+
+#define BUFFER_MIN_INIT_SIZE 4096
+#define BUFFER_MIN_SHRINK_SIZE 65536
+
+/* Define the factor alpha for the exponential smoothing
+ * that is used in the average size calculation. A shift
+ * of 7 results in an alpha of 1/2^7. */
+
+static size_t buffer_req_size(Buffer *buffer, size_t len)
+{
+ return MAX(BUFFER_MIN_INIT_SIZE,
+ pow2ceil(buffer->offset + len));
+}
+
+static void buffer_adj_size(Buffer *buffer, size_t len)
+{
+ size_t old = buffer->capacity;
+ buffer->capacity = buffer_req_size(buffer, len);
+ buffer->buffer = g_realloc(buffer->buffer, buffer->capacity);
+ trace_buffer_resize(buffer->name ?: "unnamed",
+ old, buffer->capacity);
+
+ /* make it even harder for the buffer to shrink, reset average size
+ * to current capacity if it is larger than the average. */
+ buffer->avg_size = MAX(buffer->avg_size,
+ buffer->capacity << BUFFER_AVG_SIZE_SHIFT);
+}
+
+void buffer_init(Buffer *buffer, const char *name, ...)
+{
+ va_list ap;
+
+ va_start(ap, name);
+ buffer->name = g_strdup_vprintf(name, ap);
+ va_end(ap);
+}
+
+static uint64_t buffer_get_avg_size(Buffer *buffer)
+{
+ return buffer->avg_size >> BUFFER_AVG_SIZE_SHIFT;
+}
+
+void buffer_shrink(Buffer *buffer)
+{
+ size_t new;
+
+ /* Calculate the average size of the buffer as
+ * avg_size = avg_size * ( 1 - a ) + required_size * a
+ * where a is 1 / 2 ^ BUFFER_AVG_SIZE_SHIFT. */
+ buffer->avg_size *= (1 << BUFFER_AVG_SIZE_SHIFT) - 1;
+ buffer->avg_size >>= BUFFER_AVG_SIZE_SHIFT;
+ buffer->avg_size += buffer_req_size(buffer, 0);
+
+ /* And then only shrink if the average size of the buffer is much
+ * too big, to avoid bumping up & down the buffers all the time.
+ * realloc() isn't exactly cheap ... */
+ new = buffer_req_size(buffer, buffer_get_avg_size(buffer));
+ if (new < buffer->capacity >> 3 &&
+ new >= BUFFER_MIN_SHRINK_SIZE) {
+ buffer_adj_size(buffer, buffer_get_avg_size(buffer));
+ }
+
+ buffer_adj_size(buffer, 0);
+}
+
+void buffer_reserve(Buffer *buffer, size_t len)
+{
+ if ((buffer->capacity - buffer->offset) < len) {
+ buffer_adj_size(buffer, len);
+ }
+}
+
+gboolean buffer_empty(Buffer *buffer)
+{
+ return buffer->offset == 0;
+}
+
+uint8_t *buffer_end(Buffer *buffer)
+{
+ return buffer->buffer + buffer->offset;
+}
+
+void buffer_reset(Buffer *buffer)
+{
+ buffer->offset = 0;
+ buffer_shrink(buffer);
+}
+
+void buffer_free(Buffer *buffer)
+{
+ trace_buffer_free(buffer->name ?: "unnamed", buffer->capacity);
+ g_free(buffer->buffer);
+ g_free(buffer->name);
+ buffer->offset = 0;
+ buffer->capacity = 0;
+ buffer->buffer = NULL;
+ buffer->name = NULL;
+}
+
+void buffer_append(Buffer *buffer, const void *data, size_t len)
+{
+ memcpy(buffer->buffer + buffer->offset, data, len);
+ buffer->offset += len;
+}
+
+void buffer_advance(Buffer *buffer, size_t len)
+{
+ memmove(buffer->buffer, buffer->buffer + len,
+ (buffer->offset - len));
+ buffer->offset -= len;
+ buffer_shrink(buffer);
+}
+
+void buffer_move_empty(Buffer *to, Buffer *from)
+{
+ trace_buffer_move_empty(to->name ?: "unnamed",
+ from->offset,
+ from->name ?: "unnamed");
+ assert(to->offset == 0);
+
+ g_free(to->buffer);
+ to->offset = from->offset;
+ to->capacity = from->capacity;
+ to->buffer = from->buffer;
+
+ from->offset = 0;
+ from->capacity = 0;
+ from->buffer = NULL;
+}
+
+void buffer_move(Buffer *to, Buffer *from)
+{
+ if (to->offset == 0) {
+ buffer_move_empty(to, from);
+ return;
+ }
+
+ trace_buffer_move(to->name ?: "unnamed",
+ from->offset,
+ from->name ?: "unnamed");
+ buffer_reserve(to, from->offset);
+ buffer_append(to, from->buffer, from->offset);
+
+ g_free(from->buffer);
+ from->offset = 0;
+ from->capacity = 0;
+ from->buffer = NULL;
+}
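
The lifecycle implied above, as a short sketch assuming the Buffer type from "qemu/buffer.h": callers reserve before appending, since buffer_append() itself never grows the allocation:

    #include "qemu/osdep.h"
    #include "qemu/buffer.h"

    static void buffer_demo(void)
    {
        Buffer buf = { 0 };

        buffer_init(&buf, "demo-%d", 1);   /* printf-style debug name */
        buffer_reserve(&buf, 5);           /* grow capacity first ...  */
        buffer_append(&buf, "hello", 5);   /* ... append assumes room  */

        while (!buffer_empty(&buf)) {
            buffer_advance(&buf, 1);       /* consume from the front   */
        }
        buffer_free(&buf);
    }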
diff --git a/util/bufferiszero.c b/util/bufferiszero.c
new file mode 100644
index 000000000..695bb4ce2
--- /dev/null
+++ b/util/bufferiszero.c
@@ -0,0 +1,355 @@
+/*
+ * Simple C functions to supplement the C library
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "qemu/bswap.h"
+
+static bool
+buffer_zero_int(const void *buf, size_t len)
+{
+ if (unlikely(len < 8)) {
+ /* For a very small buffer, simply accumulate all the bytes. */
+ const unsigned char *p = buf;
+ const unsigned char *e = buf + len;
+ unsigned char t = 0;
+
+ do {
+ t |= *p++;
+ } while (p < e);
+
+ return t == 0;
+ } else {
+ /* Otherwise, use the unaligned memory access functions to
+ handle the beginning and end of the buffer, with a couple
+ of loops handling the middle aligned section. */
+ uint64_t t = ldq_he_p(buf);
+ const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8);
+ const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8);
+
+ for (; p + 8 <= e; p += 8) {
+ __builtin_prefetch(p + 8);
+ if (t) {
+ return false;
+ }
+ t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7];
+ }
+ while (p < e) {
+ t |= *p++;
+ }
+ t |= ldq_he_p(buf + len - 8);
+
+ return t == 0;
+ }
+}
+
+#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__)
+/* Do not use push_options pragmas unnecessarily, because clang
+ * does not support them.
+ */
+#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#endif
+#include <emmintrin.h>
+
+/* Note that each of these vectorized functions require len >= 64. */
+
+static bool
+buffer_zero_sse2(const void *buf, size_t len)
+{
+ __m128i t = _mm_loadu_si128(buf);
+ __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
+ __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
+ __m128i zero = _mm_setzero_si128();
+
+ /* Loop over 16-byte aligned blocks of 64. */
+ while (likely(p <= e)) {
+ __builtin_prefetch(p);
+ t = _mm_cmpeq_epi8(t, zero);
+ if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) {
+ return false;
+ }
+ t = p[-4] | p[-3] | p[-2] | p[-1];
+ p += 4;
+ }
+
+ /* Finish the aligned tail. */
+ t |= e[-3];
+ t |= e[-2];
+ t |= e[-1];
+
+ /* Finish the unaligned tail. */
+ t |= _mm_loadu_si128(buf + len - 16);
+
+ return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF;
+}
+#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
+#pragma GCC pop_options
+#endif
+
+#ifdef CONFIG_AVX2_OPT
+/* Note that due to restrictions/bugs wrt __builtin functions in gcc <= 4.8,
+ * the includes have to be within the corresponding push_options region, and
+ * therefore the regions themselves have to be ordered with increasing ISA.
+ */
+#pragma GCC push_options
+#pragma GCC target("sse4")
+#include <smmintrin.h>
+
+static bool
+buffer_zero_sse4(const void *buf, size_t len)
+{
+ __m128i t = _mm_loadu_si128(buf);
+ __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16);
+ __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16);
+
+ /* Loop over 16-byte aligned blocks of 64. */
+ while (likely(p <= e)) {
+ __builtin_prefetch(p);
+ if (unlikely(!_mm_testz_si128(t, t))) {
+ return false;
+ }
+ t = p[-4] | p[-3] | p[-2] | p[-1];
+ p += 4;
+ }
+
+ /* Finish the aligned tail. */
+ t |= e[-3];
+ t |= e[-2];
+ t |= e[-1];
+
+ /* Finish the unaligned tail. */
+ t |= _mm_loadu_si128(buf + len - 16);
+
+ return _mm_testz_si128(t, t);
+}
+
+#pragma GCC pop_options
+#pragma GCC push_options
+#pragma GCC target("avx2")
+#include <immintrin.h>
+
+static bool
+buffer_zero_avx2(const void *buf, size_t len)
+{
+ /* Begin with an unaligned head of 32 bytes. */
+ __m256i t = _mm256_loadu_si256(buf);
+ __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32);
+ __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32);
+
+ /* Loop over 32-byte aligned blocks of 128. */
+ while (p <= e) {
+ __builtin_prefetch(p);
+ if (unlikely(!_mm256_testz_si256(t, t))) {
+ return false;
+ }
+ t = p[-4] | p[-3] | p[-2] | p[-1];
+ p += 4;
+ }
+
+ /* Finish the last block of 128 unaligned. */
+ t |= _mm256_loadu_si256(buf + len - 4 * 32);
+ t |= _mm256_loadu_si256(buf + len - 3 * 32);
+ t |= _mm256_loadu_si256(buf + len - 2 * 32);
+ t |= _mm256_loadu_si256(buf + len - 1 * 32);
+
+ return _mm256_testz_si256(t, t);
+}
+#pragma GCC pop_options
+#endif /* CONFIG_AVX2_OPT */
+
+#ifdef CONFIG_AVX512F_OPT
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#include <immintrin.h>
+
+static bool
+buffer_zero_avx512(const void *buf, size_t len)
+{
+ /* Begin with an unaligned head of 64 bytes. */
+ __m512i t = _mm512_loadu_si512(buf);
+ __m512i *p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64);
+ __m512i *e = (__m512i *)(((uintptr_t)buf + len) & -64);
+
+ /* Loop over 64-byte aligned blocks of 256. */
+ while (p <= e) {
+ __builtin_prefetch(p);
+ if (unlikely(_mm512_test_epi64_mask(t, t))) {
+ return false;
+ }
+ t = p[-4] | p[-3] | p[-2] | p[-1];
+ p += 4;
+ }
+
+ t |= _mm512_loadu_si512(buf + len - 4 * 64);
+ t |= _mm512_loadu_si512(buf + len - 3 * 64);
+ t |= _mm512_loadu_si512(buf + len - 2 * 64);
+ t |= _mm512_loadu_si512(buf + len - 1 * 64);
+
+ return !_mm512_test_epi64_mask(t, t);
+
+}
+#pragma GCC pop_options
+#endif
+
+
+/* Note that for test_buffer_is_zero_next_accel, the most preferred
+ * ISA must occupy the least significant bit.
+ */
+#define CACHE_AVX512F 1
+#define CACHE_AVX2 2
+#define CACHE_SSE4 4
+#define CACHE_SSE2 8
+
+/* Make sure that these variables are appropriately initialized when
+ * SSE2 is enabled on the compiler command-line, but the compiler is
+ * too old to support CONFIG_AVX2_OPT.
+ */
+#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
+# define INIT_CACHE 0
+# define INIT_ACCEL buffer_zero_int
+#else
+# ifndef __SSE2__
+# error "ISA selection confusion"
+# endif
+# define INIT_CACHE CACHE_SSE2
+# define INIT_ACCEL buffer_zero_sse2
+#endif
+
+static unsigned cpuid_cache = INIT_CACHE;
+static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL;
+static int length_to_accel = 64;
+
+static void init_accel(unsigned cache)
+{
+ bool (*fn)(const void *, size_t) = buffer_zero_int;
+ if (cache & CACHE_SSE2) {
+ fn = buffer_zero_sse2;
+ length_to_accel = 64;
+ }
+#ifdef CONFIG_AVX2_OPT
+ if (cache & CACHE_SSE4) {
+ fn = buffer_zero_sse4;
+ length_to_accel = 64;
+ }
+ if (cache & CACHE_AVX2) {
+ fn = buffer_zero_avx2;
+ length_to_accel = 128;
+ }
+#endif
+#ifdef CONFIG_AVX512F_OPT
+ if (cache & CACHE_AVX512F) {
+ fn = buffer_zero_avx512;
+ length_to_accel = 256;
+ }
+#endif
+ buffer_accel = fn;
+}
+
+#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT)
+#include "qemu/cpuid.h"
+
+static void __attribute__((constructor)) init_cpuid_cache(void)
+{
+ int max = __get_cpuid_max(0, NULL);
+ int a, b, c, d;
+ unsigned cache = 0;
+
+ if (max >= 1) {
+ __cpuid(1, a, b, c, d);
+ if (d & bit_SSE2) {
+ cache |= CACHE_SSE2;
+ }
+ if (c & bit_SSE4_1) {
+ cache |= CACHE_SSE4;
+ }
+
+ /* We must check that AVX is not just available, but usable. */
+ if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+ int bv;
+ __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+ __cpuid_count(7, 0, a, b, c, d);
+ if ((bv & 0x6) == 0x6 && (b & bit_AVX2)) {
+ cache |= CACHE_AVX2;
+ }
+ /* 0xe6:
+ * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+ * and ZMM16-ZMM31 state are enabled by OS)
+ * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+ */
+ if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512F)) {
+ cache |= CACHE_AVX512F;
+ }
+ }
+ }
+ cpuid_cache = cache;
+ init_accel(cache);
+}
+#endif /* CONFIG_AVX512F_OPT || CONFIG_AVX2_OPT */
+
+bool test_buffer_is_zero_next_accel(void)
+{
+ /* If no bits are set, we just tested buffer_zero_int, and there
+ are no more acceleration options to test. */
+ if (cpuid_cache == 0) {
+ return false;
+ }
+ /* Disable the accelerator we used before and select a new one. */
+ cpuid_cache &= cpuid_cache - 1;
+ init_accel(cpuid_cache);
+ return true;
+}
+
+static bool select_accel_fn(const void *buf, size_t len)
+{
+ if (likely(len >= length_to_accel)) {
+ return buffer_accel(buf, len);
+ }
+ return buffer_zero_int(buf, len);
+}
+
+#else
+#define select_accel_fn buffer_zero_int
+bool test_buffer_is_zero_next_accel(void)
+{
+ return false;
+}
+#endif
+
+/*
+ * Checks if a buffer is all zeroes
+ */
+bool buffer_is_zero(const void *buf, size_t len)
+{
+ if (unlikely(len == 0)) {
+ return true;
+ }
+
+ /* Fetch the beginning of the buffer while we select the accelerator. */
+ __builtin_prefetch(buf);
+
+ /* Use an optimized zero check if possible. Note that this also
+ includes a check for an unrolled loop over 64-bit integers. */
+ return select_accel_fn(buf, len);
+}
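
The unit test for this file (not shown in this hunk) cycles through every accelerator the host supports using the hook above; the pattern looks roughly like this:

    #include "qemu/osdep.h"
    #include "qemu/cutils.h"

    static void check_with_all_accels(const void *buf, size_t len,
                                      bool expect_zero)
    {
        do {
            g_assert(buffer_is_zero(buf, len) == expect_zero);
            /* Knock out the current accelerator and retry with the next. */
        } while (test_buffer_is_zero_next_accel());
    }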
diff --git a/util/cacheflush.c b/util/cacheflush.c
new file mode 100644
index 000000000..933355b0c
--- /dev/null
+++ b/util/cacheflush.c
@@ -0,0 +1,146 @@
+/*
+ * Flush the host cpu caches.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cacheflush.h"
+#include "qemu/bitops.h"
+
+
+#if defined(__i386__) || defined(__x86_64__) || defined(__s390__)
+
+/* Caches are coherent and do not require flushing; the symbol is provided
+ * inline in the header. */
+
+#elif defined(__aarch64__)
+
+#ifdef CONFIG_DARWIN
+/* Apple does not expose CTR_EL0, so we must use system interfaces. */
+extern void sys_icache_invalidate(void *start, size_t len);
+extern void sys_dcache_flush(void *start, size_t len);
+void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len)
+{
+ sys_dcache_flush((void *)rw, len);
+ sys_icache_invalidate((void *)rx, len);
+}
+#else
+
+/*
+ * TODO: unify this with cacheinfo.c.
+ * We want to save the whole contents of CTR_EL0, so that we
+ * have more than the linesize, but also IDC and DIC.
+ */
+static uint64_t save_ctr_el0;
+static void __attribute__((constructor)) init_ctr_el0(void)
+{
+ asm volatile("mrs\t%0, ctr_el0" : "=r"(save_ctr_el0));
+}
+
+/*
+ * This is a copy of gcc's __aarch64_sync_cache_range, modified
+ * to fit this three-operand interface.
+ */
+void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len)
+{
+ const unsigned CTR_IDC = 1u << 28;
+ const unsigned CTR_DIC = 1u << 29;
+ const uint64_t ctr_el0 = save_ctr_el0;
+ const uintptr_t icache_lsize = 4 << extract64(ctr_el0, 0, 4);
+ const uintptr_t dcache_lsize = 4 << extract64(ctr_el0, 16, 4);
+ uintptr_t p;
+
+ /*
+ * If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification
+ * is not required for instruction to data coherence.
+ */
+ if (!(ctr_el0 & CTR_IDC)) {
+ /*
+ * Loop over the address range, clearing one cache line at once.
+ * Data cache must be flushed to unification first to make sure
+ * the instruction cache fetches the updated data.
+ */
+ for (p = rw & -dcache_lsize; p < rw + len; p += dcache_lsize) {
+ asm volatile("dc\tcvau, %0" : : "r" (p) : "memory");
+ }
+ asm volatile("dsb\tish" : : : "memory");
+ }
+
+ /*
+ * If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point
+ * of Unification is not required for instruction to data coherence.
+ */
+ if (!(ctr_el0 & CTR_DIC)) {
+ for (p = rx & -icache_lsize; p < rx + len; p += icache_lsize) {
+ asm volatile("ic\tivau, %0" : : "r"(p) : "memory");
+ }
+ asm volatile ("dsb\tish" : : : "memory");
+ }
+
+ asm volatile("isb" : : : "memory");
+}
+#endif /* CONFIG_DARWIN */
+
+#elif defined(__mips__)
+
+#ifdef __OpenBSD__
+#include <machine/sysarch.h>
+#else
+#include <sys/cachectl.h>
+#endif
+
+void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len)
+{
+ if (rx != rw) {
+ cacheflush((void *)rw, len, DCACHE);
+ }
+ cacheflush((void *)rx, len, ICACHE);
+}
+
+#elif defined(__powerpc__)
+
+void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len)
+{
+ uintptr_t p, b, e;
+ size_t dsize = qemu_dcache_linesize;
+ size_t isize = qemu_icache_linesize;
+
+ b = rw & ~(dsize - 1);
+ e = (rw + len + dsize - 1) & ~(dsize - 1);
+ for (p = b; p < e; p += dsize) {
+ asm volatile ("dcbst 0,%0" : : "r"(p) : "memory");
+ }
+ asm volatile ("sync" : : : "memory");
+
+ b = rx & ~(isize - 1);
+ e = (rx + len + isize - 1) & ~(isize - 1);
+ for (p = b; p < e; p += isize) {
+ asm volatile ("icbi 0,%0" : : "r"(p) : "memory");
+ }
+ asm volatile ("sync" : : : "memory");
+ asm volatile ("isync" : : : "memory");
+}
+
+#elif defined(__sparc__)
+
+void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len)
+{
+ /* No additional data flush to the RW virtual address required. */
+ uintptr_t p, end = (rx + len + 7) & -8;
+ for (p = rx & -8; p < end; p += 8) {
+ __asm__ __volatile__("flush\t%0" : : "r" (p));
+ }
+}
+
+#else
+
+void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len)
+{
+ if (rw != rx) {
+ __builtin___clear_cache((char *)rw, (char *)rw + len);
+ }
+ __builtin___clear_cache((char *)rx, (char *)rx + len);
+}
+
+#endif
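
The three-operand interface exists for JIT-style users that write code through one mapping and execute it through another. A hedged sketch of such a call site (hypothetical helper, not from this patch):

    #include "qemu/osdep.h"
    #include "qemu/cacheflush.h"

    /* @rw and @rx alias the same physical pages; only @rw is writable. */
    static void publish_code(uintptr_t rx, uintptr_t rw,
                             const void *code, size_t len)
    {
        memcpy((void *)rw, code, len);
        /* Clean the dcache via @rw, invalidate the icache via @rx. */
        flush_idcache_range(rx, rw, len);
    }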
diff --git a/util/cacheinfo.c b/util/cacheinfo.c
new file mode 100644
index 000000000..b182f0b69
--- /dev/null
+++ b/util/cacheinfo.c
@@ -0,0 +1,199 @@
+/*
+ * cacheinfo.c - helpers to query the host about its caches
+ *
+ * Copyright (C) 2017, Emilio G. Cota <cota@braap.org>
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include "qemu/atomic.h"
+
+int qemu_icache_linesize = 0;
+int qemu_icache_linesize_log;
+int qemu_dcache_linesize = 0;
+int qemu_dcache_linesize_log;
+
+/*
+ * Operating system specific detection mechanisms.
+ */
+
+#if defined(_WIN32)
+
+static void sys_cache_info(int *isize, int *dsize)
+{
+ SYSTEM_LOGICAL_PROCESSOR_INFORMATION *buf;
+ DWORD size = 0;
+ BOOL success;
+ size_t i, n;
+
+ /* Check for the required buffer size first. Note that if the zero
+ size we use for the probe results in success, then there is no
+ data available; fail in that case. */
+ success = GetLogicalProcessorInformation(0, &size);
+ if (success || GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+ return;
+ }
+
+ n = size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+ size = n * sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
+ buf = g_new0(SYSTEM_LOGICAL_PROCESSOR_INFORMATION, n);
+ if (!GetLogicalProcessorInformation(buf, &size)) {
+ goto fail;
+ }
+
+ for (i = 0; i < n; i++) {
+ if (buf[i].Relationship == RelationCache
+ && buf[i].Cache.Level == 1) {
+ switch (buf[i].Cache.Type) {
+ case CacheUnified:
+ *isize = *dsize = buf[i].Cache.LineSize;
+ break;
+ case CacheInstruction:
+ *isize = buf[i].Cache.LineSize;
+ break;
+ case CacheData:
+ *dsize = buf[i].Cache.LineSize;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ fail:
+ g_free(buf);
+}
+
+#elif defined(__APPLE__)
+# include <sys/sysctl.h>
+static void sys_cache_info(int *isize, int *dsize)
+{
+ /* There's only a single sysctl for both I/D cache line sizes. */
+ long size;
+ size_t len = sizeof(size);
+ if (!sysctlbyname("hw.cachelinesize", &size, &len, NULL, 0)) {
+ *isize = *dsize = size;
+ }
+}
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+# include <sys/sysctl.h>
+static void sys_cache_info(int *isize, int *dsize)
+{
+ /* There's only a single sysctl for both I/D cache line sizes. */
+ int size;
+ size_t len = sizeof(size);
+ if (!sysctlbyname("machdep.cacheline_size", &size, &len, NULL, 0)) {
+ *isize = *dsize = size;
+ }
+}
+#else
+/* POSIX */
+
+static void sys_cache_info(int *isize, int *dsize)
+{
+# ifdef _SC_LEVEL1_ICACHE_LINESIZE
+ int tmp_isize = (int) sysconf(_SC_LEVEL1_ICACHE_LINESIZE);
+ if (tmp_isize > 0) {
+ *isize = tmp_isize;
+ }
+# endif
+# ifdef _SC_LEVEL1_DCACHE_LINESIZE
+ int tmp_dsize = (int) sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+ if (tmp_dsize > 0) {
+ *dsize = tmp_dsize;
+ }
+# endif
+}
+#endif /* sys_cache_info */
+
+/*
+ * Architecture (+ OS) specific detection mechanisms.
+ */
+
+#if defined(__aarch64__)
+
+static void arch_cache_info(int *isize, int *dsize)
+{
+ if (*isize == 0 || *dsize == 0) {
+ uint64_t ctr;
+
+ /* The real cache geometry is in CCSIDR_EL1/CLIDR_EL1/CSSELR_EL1,
+ but (at least under Linux) these are marked protected by the
+ kernel. However, CTR_EL0 contains the minimum linesize in the
+ entire hierarchy, and is used by userspace cache flushing. */
+ asm volatile("mrs\t%0, ctr_el0" : "=r"(ctr));
+ if (*isize == 0) {
+ *isize = 4 << (ctr & 0xf);
+ }
+ if (*dsize == 0) {
+ *dsize = 4 << ((ctr >> 16) & 0xf);
+ }
+ }
+}
+
+#elif defined(_ARCH_PPC) && defined(__linux__)
+# include "elf.h"
+
+static void arch_cache_info(int *isize, int *dsize)
+{
+ if (*isize == 0) {
+ *isize = qemu_getauxval(AT_ICACHEBSIZE);
+ }
+ if (*dsize == 0) {
+ *dsize = qemu_getauxval(AT_DCACHEBSIZE);
+ }
+}
+
+#else
+static void arch_cache_info(int *isize, int *dsize) { }
+#endif /* arch_cache_info */
+
+/*
+ * ... and if all else fails ...
+ */
+
+static void fallback_cache_info(int *isize, int *dsize)
+{
+ /* If we can only find one of the two, assume they're the same. */
+ if (*isize) {
+ if (*dsize) {
+ /* Success! */
+ } else {
+ *dsize = *isize;
+ }
+ } else if (*dsize) {
+ *isize = *dsize;
+ } else {
+#if defined(_ARCH_PPC)
+ /*
+ * For PPC, we're going to use the cache sizes computed for
+ * flush_idcache_range. Which means that we must use the
+ * architecture minimum.
+ */
+ *isize = *dsize = 16;
+#else
+ /* Otherwise, 64 bytes is not uncommon. */
+ *isize = *dsize = 64;
+#endif
+ }
+}
+
+static void __attribute__((constructor)) init_cache_info(void)
+{
+ int isize = 0, dsize = 0;
+
+ sys_cache_info(&isize, &dsize);
+ arch_cache_info(&isize, &dsize);
+ fallback_cache_info(&isize, &dsize);
+
+ assert((isize & (isize - 1)) == 0);
+ assert((dsize & (dsize - 1)) == 0);
+
+ qemu_icache_linesize = isize;
+ qemu_icache_linesize_log = ctz32(isize);
+ qemu_dcache_linesize = dsize;
+ qemu_dcache_linesize_log = ctz32(dsize);
+
+ qatomic64_init();
+}
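
The exported line sizes are meant for exactly the kind of padding done in util/atomic64.c above. A sketch spacing per-thread counters one dcache line apart to avoid false sharing (hypothetical helper, not from this patch):

    #include "qemu/osdep.h"

    static void *counters;

    static void counters_init(unsigned nthreads)
    {
        /* qemu_dcache_linesize is set by the constructor above,
         * which runs before main(). */
        size_t stride = ROUND_UP(sizeof(unsigned long),
                                 qemu_dcache_linesize);

        counters = qemu_memalign(qemu_dcache_linesize, stride * nthreads);
        memset(counters, 0, stride * nthreads);
    }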
diff --git a/util/compatfd.c b/util/compatfd.c
new file mode 100644
index 000000000..ab810c42a
--- /dev/null
+++ b/util/compatfd.c
@@ -0,0 +1,106 @@
+/*
+ * signalfd/eventfd compatibility
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/thread.h"
+
+#if defined(CONFIG_SIGNALFD)
+#include <sys/signalfd.h>
+#endif
+
+struct sigfd_compat_info {
+ sigset_t mask;
+ int fd;
+};
+
+static void *sigwait_compat(void *opaque)
+{
+ struct sigfd_compat_info *info = opaque;
+
+ while (1) {
+ int sig;
+ int err;
+
+ err = sigwait(&info->mask, &sig);
+ if (err != 0) {
+ if (errno == EINTR) {
+ continue;
+ } else {
+ return NULL;
+ }
+ } else {
+ struct qemu_signalfd_siginfo buffer;
+ size_t offset = 0;
+
+ memset(&buffer, 0, sizeof(buffer));
+ buffer.ssi_signo = sig;
+
+ while (offset < sizeof(buffer)) {
+ ssize_t len;
+
+ len = write(info->fd, (char *)&buffer + offset,
+ sizeof(buffer) - offset);
+ if (len == -1 && errno == EINTR) {
+ continue;
+ }
+
+ if (len <= 0) {
+ return NULL;
+ }
+
+ offset += len;
+ }
+ }
+ }
+}
+
+static int qemu_signalfd_compat(const sigset_t *mask)
+{
+ struct sigfd_compat_info *info;
+ QemuThread thread;
+ int fds[2];
+
+ info = g_malloc(sizeof(*info));
+
+ if (pipe(fds) == -1) {
+ g_free(info);
+ return -1;
+ }
+
+ qemu_set_cloexec(fds[0]);
+ qemu_set_cloexec(fds[1]);
+
+ memcpy(&info->mask, mask, sizeof(*mask));
+ info->fd = fds[1];
+
+ qemu_thread_create(&thread, "signalfd_compat", sigwait_compat, info,
+ QEMU_THREAD_DETACHED);
+
+ return fds[0];
+}
+
+int qemu_signalfd(const sigset_t *mask)
+{
+#if defined(CONFIG_SIGNALFD)
+ int ret;
+
+ ret = signalfd(-1, mask, SFD_CLOEXEC);
+ if (ret != -1) {
+ return ret;
+ }
+#endif
+
+ return qemu_signalfd_compat(mask);
+}
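
A sketch of the intended use: the caller blocks the signal first, so it is only consumed via the returned descriptor (or, on hosts without signalfd, by the compat thread above); the qemu_signalfd() declaration is assumed to come from the matching header:

    #include "qemu/osdep.h"
    #include "qemu/thread.h"

    static int open_sigterm_fd(void)
    {
        sigset_t mask;

        sigemptyset(&mask);
        sigaddset(&mask, SIGTERM);
        pthread_sigmask(SIG_BLOCK, &mask, NULL);

        /* Each read() yields a qemu_signalfd_siginfo record. */
        return qemu_signalfd(&mask);
    }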
diff --git a/util/coroutine-sigaltstack.c b/util/coroutine-sigaltstack.c
new file mode 100644
index 000000000..e99b8a4f9
--- /dev/null
+++ b/util/coroutine-sigaltstack.c
@@ -0,0 +1,304 @@
+/*
+ * sigaltstack coroutine initialization code
+ *
+ * Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws>
+ * Copyright (C) 2011 Kevin Wolf <kwolf@redhat.com>
+ * Copyright (C) 2012 Alex Barcelo <abarcelo@ac.upc.edu>
+ * This file is partly based on pth_mctx.c, from the GNU Portable Threads
+ * Copyright (c) 1999-2006 Ralf S. Engelschall <rse@engelschall.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */
+#ifdef _FORTIFY_SOURCE
+#undef _FORTIFY_SOURCE
+#endif
+#include "qemu/osdep.h"
+#include <pthread.h>
+#include "qemu-common.h"
+#include "qemu/coroutine_int.h"
+
+#ifdef CONFIG_SAFESTACK
+#error "SafeStack is not compatible with code run in alternate signal stacks"
+#endif
+
+typedef struct {
+ Coroutine base;
+ void *stack;
+ size_t stack_size;
+ sigjmp_buf env;
+} CoroutineSigAltStack;
+
+/**
+ * Per-thread coroutine bookkeeping
+ */
+typedef struct {
+ /** Currently executing coroutine */
+ Coroutine *current;
+
+ /** The default coroutine */
+ CoroutineSigAltStack leader;
+
+ /** Information for the signal handler (trampoline) */
+ sigjmp_buf tr_reenter;
+ volatile sig_atomic_t tr_called;
+ void *tr_handler;
+} CoroutineThreadState;
+
+static pthread_key_t thread_state_key;
+
+static CoroutineThreadState *coroutine_get_thread_state(void)
+{
+ CoroutineThreadState *s = pthread_getspecific(thread_state_key);
+
+ if (!s) {
+ s = g_malloc0(sizeof(*s));
+ s->current = &s->leader.base;
+ pthread_setspecific(thread_state_key, s);
+ }
+ return s;
+}
+
+static void qemu_coroutine_thread_cleanup(void *opaque)
+{
+ CoroutineThreadState *s = opaque;
+
+ g_free(s);
+}
+
+static void __attribute__((constructor)) coroutine_init(void)
+{
+ int ret;
+
+ ret = pthread_key_create(&thread_state_key, qemu_coroutine_thread_cleanup);
+ if (ret != 0) {
+ fprintf(stderr, "unable to create leader key: %s\n", strerror(errno));
+ abort();
+ }
+}
+
+/* "boot" function
+ * This is what starts the coroutine, is called from the trampoline
+ * (from the signal handler when it is not signal handling, read ahead
+ * for more information).
+ */
+static void coroutine_bootstrap(CoroutineSigAltStack *self, Coroutine *co)
+{
+ /* Initialize longjmp environment and switch back the caller */
+ if (!sigsetjmp(self->env, 0)) {
+ siglongjmp(*(sigjmp_buf *)co->entry_arg, 1);
+ }
+
+ while (true) {
+ co->entry(co->entry_arg);
+ qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
+ }
+}
+
+/*
+ * This is used as the signal handler. This is called with the brand new stack
+ * (thanks to sigaltstack). We have to return, given that this is a signal
+ * handler and the sigmask and some other things are changed.
+ */
+static void coroutine_trampoline(int signal)
+{
+ CoroutineSigAltStack *self;
+ Coroutine *co;
+ CoroutineThreadState *coTS;
+
+ /* Get the thread specific information */
+ coTS = coroutine_get_thread_state();
+ self = coTS->tr_handler;
+ coTS->tr_called = 1;
+ co = &self->base;
+
+ /*
+ * Here we have to do a bit of a ping pong with the caller, given that
+ * this is a signal handler and we have to do a return "soon". Then the
+ * caller can reestablish everything and do a siglongjmp here again.
+ */
+ if (!sigsetjmp(coTS->tr_reenter, 0)) {
+ return;
+ }
+
+ /*
+ * Ok, the caller has siglongjmp'ed back to us, so now prepare
+ * us for the real machine state switching. We have to jump
+ * into another function here to get a new stack context for
+ * the auto variables (which have to be auto-variables
+ * because the start of the thread happens later). Otherwise, with
+ * PIC (i.e. Position Independent Code, which is used when PTH
+ * is built as a shared library), most platforms would
+ * core dump horribly, as experience showed.
+ */
+ coroutine_bootstrap(self, co);
+}
+
+Coroutine *qemu_coroutine_new(void)
+{
+ CoroutineSigAltStack *co;
+ CoroutineThreadState *coTS;
+ struct sigaction sa;
+ struct sigaction osa;
+ stack_t ss;
+ stack_t oss;
+ sigset_t sigs;
+ sigset_t osigs;
+ sigjmp_buf old_env;
+ static pthread_mutex_t sigusr2_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+ /* The way to manipulate the stack is with the sigaltstack function: we
+ * prepare a stack, deliver a signal to ourselves so that it runs on that
+ * stack, and then put sigsetjmp/siglongjmp where needed.
+ * This has been done keeping coroutine-ucontext as a model and with the
+ * pth ideas (GNU Portable Threads). See coroutine-ucontext for the basics
+ * of the coroutines and see pth_mctx.c (from the pth project) for the
+ * sigaltstack way of manipulating stacks.
+ */
+
+ co = g_malloc0(sizeof(*co));
+ co->stack_size = COROUTINE_STACK_SIZE;
+ co->stack = qemu_alloc_stack(&co->stack_size);
+ co->base.entry_arg = &old_env; /* stash away our jmp_buf */
+
+ coTS = coroutine_get_thread_state();
+ coTS->tr_handler = co;
+
+ /*
+ * Preserve the SIGUSR2 signal state, block SIGUSR2,
+ * and establish our signal handler. The signal will
+ * later transfer control onto the signal stack.
+ */
+ sigemptyset(&sigs);
+ sigaddset(&sigs, SIGUSR2);
+ pthread_sigmask(SIG_BLOCK, &sigs, &osigs);
+ sa.sa_handler = coroutine_trampoline;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_ONSTACK;
+
+ /*
+ * sigaction() is a process-global operation. We must not run
+ * this code in multiple threads at once.
+ */
+ pthread_mutex_lock(&sigusr2_mutex);
+ if (sigaction(SIGUSR2, &sa, &osa) != 0) {
+ abort();
+ }
+
+ /*
+ * Set the new stack.
+ */
+ ss.ss_sp = co->stack;
+ ss.ss_size = co->stack_size;
+ ss.ss_flags = 0;
+ if (sigaltstack(&ss, &oss) < 0) {
+ abort();
+ }
+
+ /*
+ * Now transfer control onto the signal stack and set it up.
+ * It will return immediately via "return" after the sigsetjmp()
+ * was performed. Be careful here with race conditions. The
+ * signal can be delivered the first time sigsuspend() is
+ * called.
+ */
+ coTS->tr_called = 0;
+ pthread_kill(pthread_self(), SIGUSR2);
+ sigfillset(&sigs);
+ sigdelset(&sigs, SIGUSR2);
+ while (!coTS->tr_called) {
+ sigsuspend(&sigs);
+ }
+
+ /*
+ * Inform the system that we are back off the signal stack by
+ * removing the alternative signal stack. Be careful here: It
+ * first has to be disabled, before it can be removed.
+ */
+ sigaltstack(NULL, &ss);
+ ss.ss_flags = SS_DISABLE;
+ if (sigaltstack(&ss, NULL) < 0) {
+ abort();
+ }
+ sigaltstack(NULL, &ss);
+ if (!(oss.ss_flags & SS_DISABLE)) {
+ sigaltstack(&oss, NULL);
+ }
+
+ /*
+ * Restore the old SIGUSR2 signal handler and mask
+ */
+ sigaction(SIGUSR2, &osa, NULL);
+ pthread_mutex_unlock(&sigusr2_mutex);
+
+ pthread_sigmask(SIG_SETMASK, &osigs, NULL);
+
+ /*
+ * Now enter the trampoline again, but this time not as a signal
+ * handler. Instead we jump into it directly. The functionally
+ * redundant ping-pong pointer arithmetic is necessary to avoid
+ * type-conversion warnings related to the `volatile' qualifier and
+ * the fact that `jmp_buf' usually is an array type.
+ */
+ if (!sigsetjmp(old_env, 0)) {
+ siglongjmp(coTS->tr_reenter, 1);
+ }
+
+ /*
+ * Ok, we returned again, so now we're finished
+ */
+
+ return &co->base;
+}
+
+void qemu_coroutine_delete(Coroutine *co_)
+{
+ CoroutineSigAltStack *co = DO_UPCAST(CoroutineSigAltStack, base, co_);
+
+ qemu_free_stack(co->stack, co->stack_size);
+ g_free(co);
+}
+
+CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+ CoroutineAction action)
+{
+ CoroutineSigAltStack *from = DO_UPCAST(CoroutineSigAltStack, base, from_);
+ CoroutineSigAltStack *to = DO_UPCAST(CoroutineSigAltStack, base, to_);
+ CoroutineThreadState *s = coroutine_get_thread_state();
+ int ret;
+
+ s->current = to_;
+
+ ret = sigsetjmp(from->env, 0);
+ if (ret == 0) {
+ siglongjmp(to->env, action);
+ }
+ return ret;
+}
+
+Coroutine *qemu_coroutine_self(void)
+{
+ CoroutineThreadState *s = coroutine_get_thread_state();
+
+ return s->current;
+}
+
+bool qemu_in_coroutine(void)
+{
+ CoroutineThreadState *s = pthread_getspecific(thread_state_key);
+
+ return s && s->current->caller;
+}
+
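
For orientation, the generic layer in util/qemu-coroutine.c (not part of this hunk) drives the three primitives defined here roughly as follows; a simplified sketch that ignores the real code's coroutine pool:

    #include "qemu/osdep.h"
    #include "qemu/coroutine_int.h"

    static void enter_once(Coroutine *caller, Coroutine *co)
    {
        co->caller = caller;

        /* Control transfers to @co; the action passed to the matching
         * switch back out of @co becomes our return value. */
        if (qemu_coroutine_switch(caller, co, COROUTINE_ENTER)
            == COROUTINE_TERMINATE) {
            qemu_coroutine_delete(co);
        }
    }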
diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c
new file mode 100644
index 000000000..904b37519
--- /dev/null
+++ b/util/coroutine-ucontext.c
@@ -0,0 +1,332 @@
+/*
+ * ucontext coroutine initialization code
+ *
+ * Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws>
+ * Copyright (C) 2011 Kevin Wolf <kwolf@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.0 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */
+#ifdef _FORTIFY_SOURCE
+#undef _FORTIFY_SOURCE
+#endif
+#include "qemu/osdep.h"
+#include <ucontext.h>
+#include "qemu/coroutine_int.h"
+
+#ifdef CONFIG_VALGRIND_H
+#include <valgrind/valgrind.h>
+#endif
+
+#if defined(__SANITIZE_ADDRESS__) || __has_feature(address_sanitizer)
+#ifdef CONFIG_ASAN_IFACE_FIBER
+#define CONFIG_ASAN 1
+#include <sanitizer/asan_interface.h>
+#endif
+#endif
+
+#ifdef CONFIG_TSAN
+#include <sanitizer/tsan_interface.h>
+#endif
+
+typedef struct {
+ Coroutine base;
+ void *stack;
+ size_t stack_size;
+#ifdef CONFIG_SAFESTACK
+ /* Need an unsafe stack for each coroutine */
+ void *unsafe_stack;
+ size_t unsafe_stack_size;
+#endif
+ sigjmp_buf env;
+
+#ifdef CONFIG_TSAN
+ void *tsan_co_fiber;
+ void *tsan_caller_fiber;
+#endif
+
+#ifdef CONFIG_VALGRIND_H
+ unsigned int valgrind_stack_id;
+#endif
+
+} CoroutineUContext;
+
+/**
+ * Per-thread coroutine bookkeeping
+ */
+static __thread CoroutineUContext leader;
+static __thread Coroutine *current;
+
+/*
+ * va_args to makecontext() must be type 'int', so passing
+ * the pointer we need may require several int args. This
+ * union is a quick hack to let us do that
+ */
+union cc_arg {
+ void *p;
+ int i[2];
+};
+
+/*
+ * QEMU_ALWAYS_INLINE only forces inlining when __OPTIMIZE__ is defined,
+ * so we cannot use it here.
+ * always_inline is required to avoid TSan runtime fatal errors.
+ */
+static inline __attribute__((always_inline))
+void on_new_fiber(CoroutineUContext *co)
+{
+#ifdef CONFIG_TSAN
+ co->tsan_co_fiber = __tsan_create_fiber(0); /* flags: sync on switch */
+ co->tsan_caller_fiber = __tsan_get_current_fiber();
+#endif
+}
+
+/* always_inline is required to avoid TSan runtime fatal errors. */
+static inline __attribute__((always_inline))
+void finish_switch_fiber(void *fake_stack_save)
+{
+#ifdef CONFIG_ASAN
+ const void *bottom_old;
+ size_t size_old;
+
+ __sanitizer_finish_switch_fiber(fake_stack_save, &bottom_old, &size_old);
+
+ if (!leader.stack) {
+ leader.stack = (void *)bottom_old;
+ leader.stack_size = size_old;
+ }
+#endif
+#ifdef CONFIG_TSAN
+ if (fake_stack_save) {
+ __tsan_release(fake_stack_save);
+ __tsan_switch_to_fiber(fake_stack_save, 0); /* 0=synchronize */
+ }
+#endif
+}
+
+/* always_inline is required to avoid TSan runtime fatal errors. */
+static inline __attribute__((always_inline))
+void start_switch_fiber_asan(CoroutineAction action, void **fake_stack_save,
+ const void *bottom, size_t size)
+{
+#ifdef CONFIG_ASAN
+ __sanitizer_start_switch_fiber(
+ action == COROUTINE_TERMINATE ? NULL : fake_stack_save,
+ bottom, size);
+#endif
+}
+
+/* always_inline is required to avoid TSan runtime fatal errors. */
+static inline __attribute__((always_inline))
+void start_switch_fiber_tsan(void **fake_stack_save,
+ CoroutineUContext *co,
+ bool caller)
+{
+#ifdef CONFIG_TSAN
+ void *new_fiber = caller ?
+ co->tsan_caller_fiber :
+ co->tsan_co_fiber;
+ void *curr_fiber = __tsan_get_current_fiber();
+ __tsan_acquire(curr_fiber);
+
+ *fake_stack_save = curr_fiber;
+ __tsan_switch_to_fiber(new_fiber, 0); /* 0=synchronize */
+#endif
+}
+
+static void coroutine_trampoline(int i0, int i1)
+{
+ union cc_arg arg;
+ CoroutineUContext *self;
+ Coroutine *co;
+ void *fake_stack_save = NULL;
+
+ finish_switch_fiber(NULL);
+
+ arg.i[0] = i0;
+ arg.i[1] = i1;
+ self = arg.p;
+ co = &self->base;
+
+ /* Initialize longjmp environment and switch back the caller */
+ if (!sigsetjmp(self->env, 0)) {
+ start_switch_fiber_asan(COROUTINE_YIELD, &fake_stack_save, leader.stack,
+ leader.stack_size);
+ start_switch_fiber_tsan(&fake_stack_save, self, true); /* true=caller */
+ siglongjmp(*(sigjmp_buf *)co->entry_arg, 1);
+ }
+
+ finish_switch_fiber(fake_stack_save);
+
+ while (true) {
+ co->entry(co->entry_arg);
+ qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
+ }
+}
+
+Coroutine *qemu_coroutine_new(void)
+{
+ CoroutineUContext *co;
+ ucontext_t old_uc, uc;
+ sigjmp_buf old_env;
+ union cc_arg arg = {0};
+ void *fake_stack_save = NULL;
+
+ /* The ucontext functions preserve signal masks which incurs a
+ * system call overhead. sigsetjmp(buf, 0)/siglongjmp() does not
+ * preserve signal masks but only works on the current stack.
+ * Since we need a way to create and switch to a new stack, use
+ * the ucontext functions for that but sigsetjmp()/siglongjmp() for
+ * everything else.
+ */
+
+ if (getcontext(&uc) == -1) {
+ abort();
+ }
+
+ co = g_malloc0(sizeof(*co));
+ co->stack_size = COROUTINE_STACK_SIZE;
+ co->stack = qemu_alloc_stack(&co->stack_size);
+#ifdef CONFIG_SAFESTACK
+ co->unsafe_stack_size = COROUTINE_STACK_SIZE;
+ co->unsafe_stack = qemu_alloc_stack(&co->unsafe_stack_size);
+#endif
+ co->base.entry_arg = &old_env; /* stash away our jmp_buf */
+
+ uc.uc_link = &old_uc;
+ uc.uc_stack.ss_sp = co->stack;
+ uc.uc_stack.ss_size = co->stack_size;
+ uc.uc_stack.ss_flags = 0;
+
+#ifdef CONFIG_VALGRIND_H
+ co->valgrind_stack_id =
+ VALGRIND_STACK_REGISTER(co->stack, co->stack + co->stack_size);
+#endif
+
+ arg.p = co;
+
+ on_new_fiber(co);
+ makecontext(&uc, (void (*)(void))coroutine_trampoline,
+ 2, arg.i[0], arg.i[1]);
+
+ /* swapcontext() in, siglongjmp() back out */
+ if (!sigsetjmp(old_env, 0)) {
+ start_switch_fiber_asan(COROUTINE_YIELD, &fake_stack_save, co->stack,
+ co->stack_size);
+ start_switch_fiber_tsan(&fake_stack_save,
+ co, false); /* false=not caller */
+
+#ifdef CONFIG_SAFESTACK
+ /*
+ * Before we swap the context, set the new unsafe stack
+ * The unsafe stack grows just like the normal stack, so start from
+ * the last usable location of the memory area.
+ * NOTE: we don't have to re-set the usp afterwards because we are
+ * coming back to this context through a siglongjmp.
+ * The compiler already wrapped the corresponding sigsetjmp call with
+ * code that saves the usp on the (safe) stack before the call, and
+ * restores it right after (which is where we return with siglongjmp).
+ */
+ void *usp = co->unsafe_stack + co->unsafe_stack_size;
+ __safestack_unsafe_stack_ptr = usp;
+#endif
+
+ swapcontext(&old_uc, &uc);
+ }
+
+ finish_switch_fiber(fake_stack_save);
+
+ return &co->base;
+}
+
+#ifdef CONFIG_VALGRIND_H
+/* Work around an unused variable in the valgrind.h macro... */
+#if !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#endif
+static inline void valgrind_stack_deregister(CoroutineUContext *co)
+{
+ VALGRIND_STACK_DEREGISTER(co->valgrind_stack_id);
+}
+#if !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+#endif
+
+void qemu_coroutine_delete(Coroutine *co_)
+{
+ CoroutineUContext *co = DO_UPCAST(CoroutineUContext, base, co_);
+
+#ifdef CONFIG_VALGRIND_H
+ valgrind_stack_deregister(co);
+#endif
+
+ qemu_free_stack(co->stack, co->stack_size);
+#ifdef CONFIG_SAFESTACK
+ qemu_free_stack(co->unsafe_stack, co->unsafe_stack_size);
+#endif
+ g_free(co);
+}
+
+/* This function is marked noinline to prevent GCC from inlining it
+ * into coroutine_trampoline(). If we allow it to do that then it
+ * hoists the code to get the address of the TLS variable "current"
+ * out of the while() loop. This is an invalid transformation because
+ * the sigsetjmp() call may be called when running thread A but
+ * return in thread B, and so we might be in a different thread
+ * context each time round the loop.
+ */
+CoroutineAction __attribute__((noinline))
+qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+ CoroutineAction action)
+{
+ CoroutineUContext *from = DO_UPCAST(CoroutineUContext, base, from_);
+ CoroutineUContext *to = DO_UPCAST(CoroutineUContext, base, to_);
+ int ret;
+ void *fake_stack_save = NULL;
+
+ current = to_;
+
+ ret = sigsetjmp(from->env, 0);
+ if (ret == 0) {
+ start_switch_fiber_asan(action, &fake_stack_save, to->stack,
+ to->stack_size);
+ start_switch_fiber_tsan(&fake_stack_save,
+ to, false); /* false=not caller */
+ siglongjmp(to->env, action);
+ }
+
+ finish_switch_fiber(fake_stack_save);
+
+ return ret;
+}
+
+Coroutine *qemu_coroutine_self(void)
+{
+ if (!current) {
+ current = &leader.base;
+ }
+#ifdef CONFIG_TSAN
+ if (!leader.tsan_co_fiber) {
+ leader.tsan_co_fiber = __tsan_get_current_fiber();
+ }
+#endif
+ return current;
+}
+
+bool qemu_in_coroutine(void)
+{
+ return current && current->caller;
+}
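
The heart of the file above is the makecontext()/swapcontext() pattern plus the cc_arg union that smuggles a pointer through makecontext()'s 'int' varargs. A minimal self-contained sketch (illustrative only, names invented):

    #include <stdio.h>
    #include <stdlib.h>
    #include <ucontext.h>

    union cc_arg { void *p; int i[2]; };

    static ucontext_t main_uc, co_uc;

    static void entry(int i0, int i1)
    {
        union cc_arg arg;

        arg.i[0] = i0;
        arg.i[1] = i1;              /* reassemble the pointer */
        printf("coroutine arg: %s\n", (char *)arg.p);
        /* Returning resumes main_uc via uc_link. */
    }

    int main(void)
    {
        char *stack = malloc(64 * 1024);
        union cc_arg arg = {0};

        arg.p = "hello";
        getcontext(&co_uc);
        co_uc.uc_link = &main_uc;
        co_uc.uc_stack.ss_sp = stack;
        co_uc.uc_stack.ss_size = 64 * 1024;
        makecontext(&co_uc, (void (*)(void))entry, 2, arg.i[0], arg.i[1]);

        swapcontext(&main_uc, &co_uc);  /* run entry() on the new stack */
        printf("back in main\n");
        free(stack);
        return 0;
    }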
diff --git a/util/coroutine-win32.c b/util/coroutine-win32.c
new file mode 100644
index 000000000..de6bd4fd3
--- /dev/null
+++ b/util/coroutine-win32.c
@@ -0,0 +1,102 @@
+/*
+ * Win32 coroutine initialization code
+ *
+ * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/coroutine_int.h"
+
+typedef struct
+{
+ Coroutine base;
+
+ LPVOID fiber;
+ CoroutineAction action;
+} CoroutineWin32;
+
+static __thread CoroutineWin32 leader;
+static __thread Coroutine *current;
+
+/* This function is marked noinline to prevent GCC from inlining it
+ * into coroutine_trampoline(). If we allow it to do that then it
+ * hoists the code to get the address of the TLS variable "current"
+ * out of the while() loop. This is an invalid transformation because
+ * the SwitchToFiber() call may be called when running thread A but
+ * return in thread B, and so we might be in a different thread
+ * context each time round the loop.
+ */
+CoroutineAction __attribute__((noinline))
+qemu_coroutine_switch(Coroutine *from_, Coroutine *to_,
+ CoroutineAction action)
+{
+ CoroutineWin32 *from = DO_UPCAST(CoroutineWin32, base, from_);
+ CoroutineWin32 *to = DO_UPCAST(CoroutineWin32, base, to_);
+
+ current = to_;
+
+ to->action = action;
+ SwitchToFiber(to->fiber);
+ return from->action;
+}
+
+static void CALLBACK coroutine_trampoline(void *co_)
+{
+ Coroutine *co = co_;
+
+ while (true) {
+ co->entry(co->entry_arg);
+ qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE);
+ }
+}
+
+Coroutine *qemu_coroutine_new(void)
+{
+ const size_t stack_size = COROUTINE_STACK_SIZE;
+ CoroutineWin32 *co;
+
+ co = g_malloc0(sizeof(*co));
+ co->fiber = CreateFiber(stack_size, coroutine_trampoline, &co->base);
+ return &co->base;
+}
+
+void qemu_coroutine_delete(Coroutine *co_)
+{
+ CoroutineWin32 *co = DO_UPCAST(CoroutineWin32, base, co_);
+
+ DeleteFiber(co->fiber);
+ g_free(co);
+}
+
+Coroutine *qemu_coroutine_self(void)
+{
+ if (!current) {
+ current = &leader.base;
+ leader.fiber = ConvertThreadToFiber(NULL);
+ }
+ return current;
+}
+
+bool qemu_in_coroutine(void)
+{
+ return current && current->caller;
+}
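
For comparison, the Win32 fiber API used above reduces to a few calls; a minimal sketch (illustrative only):

    #include <stdio.h>
    #include <windows.h>

    static LPVOID main_fiber;

    static void CALLBACK fiber_entry(void *arg)
    {
        printf("in fiber: %s\n", (char *)arg);
        SwitchToFiber(main_fiber);  /* fibers never return; always switch */
    }

    int main(void)
    {
        LPVOID co;

        main_fiber = ConvertThreadToFiber(NULL);
        co = CreateFiber(0, fiber_entry, "hello");  /* 0 = default stack */
        SwitchToFiber(co);
        printf("back in the main fiber\n");
        DeleteFiber(co);
        return 0;
    }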
diff --git a/util/crc-ccitt.c b/util/crc-ccitt.c
new file mode 100644
index 000000000..b981d8ac5
--- /dev/null
+++ b/util/crc-ccitt.c
@@ -0,0 +1,127 @@
+/*
+ * CRC16 (CCITT) Checksum Algorithm
+ *
+ * Copyright (c) 2021 Wind River Systems, Inc.
+ *
+ * Author:
+ * Bin Meng <bin.meng@windriver.com>
+ *
+ * From Linux kernel v5.10 lib/crc-ccitt.c
+ *
+ * SPDX-License-Identifier: GPL-2.0-only
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/crc-ccitt.h"
+
+/*
+ * This mysterious table is just the CRC of each possible byte. It can be
+ * computed using the standard bit-at-a-time methods. The polynomial can
+ * be seen in entry 128, 0x8408. This corresponds to x^0 + x^5 + x^12.
+ * Add the implicit x^16, and you have the standard CRC-CCITT.
+ */
+uint16_t const crc_ccitt_table[256] = {
+ 0x0000, 0x1189, 0x2312, 0x329b, 0x4624, 0x57ad, 0x6536, 0x74bf,
+ 0x8c48, 0x9dc1, 0xaf5a, 0xbed3, 0xca6c, 0xdbe5, 0xe97e, 0xf8f7,
+ 0x1081, 0x0108, 0x3393, 0x221a, 0x56a5, 0x472c, 0x75b7, 0x643e,
+ 0x9cc9, 0x8d40, 0xbfdb, 0xae52, 0xdaed, 0xcb64, 0xf9ff, 0xe876,
+ 0x2102, 0x308b, 0x0210, 0x1399, 0x6726, 0x76af, 0x4434, 0x55bd,
+ 0xad4a, 0xbcc3, 0x8e58, 0x9fd1, 0xeb6e, 0xfae7, 0xc87c, 0xd9f5,
+ 0x3183, 0x200a, 0x1291, 0x0318, 0x77a7, 0x662e, 0x54b5, 0x453c,
+ 0xbdcb, 0xac42, 0x9ed9, 0x8f50, 0xfbef, 0xea66, 0xd8fd, 0xc974,
+ 0x4204, 0x538d, 0x6116, 0x709f, 0x0420, 0x15a9, 0x2732, 0x36bb,
+ 0xce4c, 0xdfc5, 0xed5e, 0xfcd7, 0x8868, 0x99e1, 0xab7a, 0xbaf3,
+ 0x5285, 0x430c, 0x7197, 0x601e, 0x14a1, 0x0528, 0x37b3, 0x263a,
+ 0xdecd, 0xcf44, 0xfddf, 0xec56, 0x98e9, 0x8960, 0xbbfb, 0xaa72,
+ 0x6306, 0x728f, 0x4014, 0x519d, 0x2522, 0x34ab, 0x0630, 0x17b9,
+ 0xef4e, 0xfec7, 0xcc5c, 0xddd5, 0xa96a, 0xb8e3, 0x8a78, 0x9bf1,
+ 0x7387, 0x620e, 0x5095, 0x411c, 0x35a3, 0x242a, 0x16b1, 0x0738,
+ 0xffcf, 0xee46, 0xdcdd, 0xcd54, 0xb9eb, 0xa862, 0x9af9, 0x8b70,
+ 0x8408, 0x9581, 0xa71a, 0xb693, 0xc22c, 0xd3a5, 0xe13e, 0xf0b7,
+ 0x0840, 0x19c9, 0x2b52, 0x3adb, 0x4e64, 0x5fed, 0x6d76, 0x7cff,
+ 0x9489, 0x8500, 0xb79b, 0xa612, 0xd2ad, 0xc324, 0xf1bf, 0xe036,
+ 0x18c1, 0x0948, 0x3bd3, 0x2a5a, 0x5ee5, 0x4f6c, 0x7df7, 0x6c7e,
+ 0xa50a, 0xb483, 0x8618, 0x9791, 0xe32e, 0xf2a7, 0xc03c, 0xd1b5,
+ 0x2942, 0x38cb, 0x0a50, 0x1bd9, 0x6f66, 0x7eef, 0x4c74, 0x5dfd,
+ 0xb58b, 0xa402, 0x9699, 0x8710, 0xf3af, 0xe226, 0xd0bd, 0xc134,
+ 0x39c3, 0x284a, 0x1ad1, 0x0b58, 0x7fe7, 0x6e6e, 0x5cf5, 0x4d7c,
+ 0xc60c, 0xd785, 0xe51e, 0xf497, 0x8028, 0x91a1, 0xa33a, 0xb2b3,
+ 0x4a44, 0x5bcd, 0x6956, 0x78df, 0x0c60, 0x1de9, 0x2f72, 0x3efb,
+ 0xd68d, 0xc704, 0xf59f, 0xe416, 0x90a9, 0x8120, 0xb3bb, 0xa232,
+ 0x5ac5, 0x4b4c, 0x79d7, 0x685e, 0x1ce1, 0x0d68, 0x3ff3, 0x2e7a,
+ 0xe70e, 0xf687, 0xc41c, 0xd595, 0xa12a, 0xb0a3, 0x8238, 0x93b1,
+ 0x6b46, 0x7acf, 0x4854, 0x59dd, 0x2d62, 0x3ceb, 0x0e70, 0x1ff9,
+ 0xf78f, 0xe606, 0xd49d, 0xc514, 0xb1ab, 0xa022, 0x92b9, 0x8330,
+ 0x7bc7, 0x6a4e, 0x58d5, 0x495c, 0x3de3, 0x2c6a, 0x1ef1, 0x0f78
+};
+
+/*
+ * A similar table for the CRC16 variant known as CRC-CCITT-FALSE.
+ * Reflected bit order; does not augment the final value.
+ */
+uint16_t const crc_ccitt_false_table[256] = {
+ 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7,
+ 0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF,
+ 0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6,
+ 0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE,
+ 0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485,
+ 0xA56A, 0xB54B, 0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D,
+ 0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4,
+ 0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC,
+ 0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823,
+ 0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B,
+ 0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12,
+ 0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A,
+ 0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41,
+ 0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49,
+ 0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 0x3E13, 0x2E32, 0x1E51, 0x0E70,
+ 0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78,
+ 0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F,
+ 0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067,
+ 0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E,
+ 0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256,
+ 0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D,
+ 0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405,
+ 0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C,
+ 0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634,
+ 0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB,
+ 0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3,
+ 0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A,
+ 0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92,
+ 0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9,
+ 0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1,
+ 0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8,
+ 0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0
+};
+
+/**
+ * crc_ccitt - recompute the CRC (CRC-CCITT variant)
+ * for the data buffer
+ *
+ * @crc: previous CRC value
+ * @buffer: data pointer
+ * @len: number of bytes in the buffer
+ */
+uint16_t crc_ccitt(uint16_t crc, uint8_t const *buffer, size_t len)
+{
+ while (len--) {
+ crc = crc_ccitt_byte(crc, *buffer++);
+ }
+ return crc;
+}
+
+/**
+ * crc_ccitt_false - recompute the CRC (CRC-CCITT-FALSE variant)
+ * for the data buffer
+ *
+ * @crc: previous CRC value
+ * @buffer: data pointer
+ * @len: number of bytes in the buffer
+ */
+uint16_t crc_ccitt_false(uint16_t crc, uint8_t const *buffer, size_t len)
+{
+ while (len--) {
+ crc = crc_ccitt_false_byte(crc, *buffer++);
+ }
+ return crc;
+}
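
A usage sketch for the helpers above. The starting value is protocol-dependent and is an assumption of this example, not something the code enforces: 0xffff is the common X.25/HDLC convention for this reflected variant, while other users seed with 0.

    static uint16_t checksum_two_parts(const uint8_t *a, size_t alen,
                                       const uint8_t *b, size_t blen)
    {
        uint16_t crc = 0xffff;

        crc = crc_ccitt(crc, a, alen);
        crc = crc_ccitt(crc, b, blen);  /* same result as one pass */
        return crc;
    }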
diff --git a/util/crc32c.c b/util/crc32c.c
new file mode 100644
index 000000000..762657d85
--- /dev/null
+++ b/util/crc32c.c
@@ -0,0 +1,115 @@
+/*
+ * Castagnoli CRC32C Checksum Algorithm
+ *
+ * Polynomial: 0x11EDC6F41
+ *
+ * Castagnoli93: Guy Castagnoli and Stefan Braeuer and Martin Herrman
+ * "Optimization of Cyclic Redundancy-Check Codes with 24
+ * and 32 Parity Bits",IEEE Transactions on Communication,
+ * Volume 41, Number 6, June 1993
+ *
+ * Copyright (c) 2013 Red Hat, Inc.,
+ *
+ * Authors:
+ * Jeff Cody <jcody@redhat.com>
+ *
+ * Based on the Linux kernel cryptographic crc32c module,
+ *
+ * Copyright (c) 2004 Cisco Systems, Inc.
+ * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/crc32c.h"
+
+/*
+ * This is the CRC-32C table
+ * Generated with:
+ * width = 32 bits
+ * poly = 0x1EDC6F41
+ * reflect input bytes = true
+ * reflect output bytes = true
+ */
+
+static const uint32_t crc32c_table[256] = {
+ 0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
+ 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
+ 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
+ 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
+ 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
+ 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
+ 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
+ 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
+ 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
+ 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
+ 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
+ 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
+ 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
+ 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
+ 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
+ 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
+ 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
+ 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
+ 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
+ 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
+ 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
+ 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
+ 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
+ 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
+ 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
+ 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
+ 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
+ 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
+ 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
+ 0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
+ 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
+ 0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
+ 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
+ 0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
+ 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
+ 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
+ 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
+ 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
+ 0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
+ 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
+ 0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
+ 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
+ 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
+ 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
+ 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
+ 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
+ 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
+ 0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
+ 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
+ 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
+ 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
+ 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
+ 0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
+ 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
+ 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
+ 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
+ 0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
+ 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
+ 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
+ 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
+ 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
+ 0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
+ 0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+ 0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
+};
+
+
+uint32_t crc32c(uint32_t crc, const uint8_t *data, unsigned int length)
+{
+ while (length--) {
+ crc = crc32c_table[(crc ^ *data++) & 0xFFL] ^ (crc >> 8);
+ }
+    return crc ^ 0xffffffff;
+}
+
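
Unlike the CRC-CCITT helpers, crc32c() applies the final inversion itself, so callers conventionally seed it with 0xffffffff; chaining calls therefore requires re-inverting in between. A usage sketch:

    static uint32_t checksum_buffer(const uint8_t *buf, unsigned int len)
    {
        return crc32c(0xffffffff, buf, len);
    }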
diff --git a/util/cutils.c b/util/cutils.c
new file mode 100644
index 000000000..c9b91e753
--- /dev/null
+++ b/util/cutils.c
@@ -0,0 +1,1059 @@
+/*
+ * Simple C functions to supplement the C library
+ *
+ * Copyright (c) 2006 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+#include <math.h>
+
+#include "qemu-common.h"
+#include "qemu/sockets.h"
+#include "qemu/iov.h"
+#include "net/net.h"
+#include "qemu/ctype.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+
+void strpadcpy(char *buf, int buf_size, const char *str, char pad)
+{
+ int len = qemu_strnlen(str, buf_size);
+ memcpy(buf, str, len);
+ memset(buf + len, pad, buf_size - len);
+}
+
+void pstrcpy(char *buf, int buf_size, const char *str)
+{
+ int c;
+ char *q = buf;
+
+ if (buf_size <= 0)
+ return;
+
+ for(;;) {
+ c = *str++;
+ if (c == 0 || q >= buf + buf_size - 1)
+ break;
+ *q++ = c;
+ }
+ *q = '\0';
+}
+
+/* strcat and truncate. */
+char *pstrcat(char *buf, int buf_size, const char *s)
+{
+ int len;
+ len = strlen(buf);
+ if (len < buf_size)
+ pstrcpy(buf + len, buf_size - len, s);
+ return buf;
+}
+
+int strstart(const char *str, const char *val, const char **ptr)
+{
+ const char *p, *q;
+ p = str;
+ q = val;
+ while (*q != '\0') {
+ if (*p != *q)
+ return 0;
+ p++;
+ q++;
+ }
+ if (ptr)
+ *ptr = p;
+ return 1;
+}
+
+int stristart(const char *str, const char *val, const char **ptr)
+{
+ const char *p, *q;
+ p = str;
+ q = val;
+ while (*q != '\0') {
+ if (qemu_toupper(*p) != qemu_toupper(*q))
+ return 0;
+ p++;
+ q++;
+ }
+ if (ptr)
+ *ptr = p;
+ return 1;
+}
+
+/* XXX: use host strnlen if available ? */
+int qemu_strnlen(const char *s, int max_len)
+{
+ int i;
+
+ for(i = 0; i < max_len; i++) {
+ if (s[i] == '\0') {
+ break;
+ }
+ }
+ return i;
+}
+
+char *qemu_strsep(char **input, const char *delim)
+{
+ char *result = *input;
+ if (result != NULL) {
+ char *p;
+
+ for (p = result; *p != '\0'; p++) {
+ if (strchr(delim, *p)) {
+ break;
+ }
+ }
+ if (*p == '\0') {
+ *input = NULL;
+ } else {
+ *p = '\0';
+ *input = p + 1;
+ }
+ }
+ return result;
+}
+
+time_t mktimegm(struct tm *tm)
+{
+ time_t t;
+ int y = tm->tm_year + 1900, m = tm->tm_mon + 1, d = tm->tm_mday;
+ if (m < 3) {
+ m += 12;
+ y--;
+ }
+ t = 86400ULL * (d + (153 * m - 457) / 5 + 365 * y + y / 4 - y / 100 +
+ y / 400 - 719469);
+ t += 3600 * tm->tm_hour + 60 * tm->tm_min + tm->tm_sec;
+ return t;
+}
+
+/*
+ * Make sure data goes on disk, but if possible do not bother to
+ * write out the inode just for timestamp updates.
+ *
+ * Unfortunately, even in 2009 many operating systems do not support
+ * fdatasync, so we have to fall back to fsync.
+ */
+int qemu_fdatasync(int fd)
+{
+#ifdef CONFIG_FDATASYNC
+ return fdatasync(fd);
+#else
+ return fsync(fd);
+#endif
+}
+
+/**
+ * Sync changes made to the memory-mapped file back to the backing
+ * storage. On POSIX-compliant systems this falls back to a regular
+ * msync() call. Otherwise it triggers a whole-file sync (including
+ * the metadata, since there is no way to skip it otherwise).
+ *
+ * @addr - start of the memory area to be synced
+ * @length - length of the area to be synced
+ * @fd - file descriptor for the file to be synced
+ * (mandatory only for POSIX non-compliant systems)
+ */
+int qemu_msync(void *addr, size_t length, int fd)
+{
+#ifdef CONFIG_POSIX
+ size_t align_mask = ~(qemu_real_host_page_size - 1);
+
+    /**
+     * There are no strict requirements on the length of the mapping
+     * to be synced. Still, the length needs to follow the address
+     * alignment changes. Additionally, round the size up to a
+     * multiple of the host page size.
+     */
+ length += ((uintptr_t)addr & (qemu_real_host_page_size - 1));
+ length = (length + ~align_mask) & align_mask;
+
+ addr = (void *)((uintptr_t)addr & align_mask);
+
+ return msync(addr, length, MS_SYNC);
+#else /* CONFIG_POSIX */
+ /**
+ * Perform the sync based on the file descriptor
+ * The sync range will most probably be wider than the one
+ * requested - but it will still get the job done
+ */
+ return qemu_fdatasync(fd);
+#endif /* CONFIG_POSIX */
+}
+
+#ifndef _WIN32
+/* Sets a specific flag */
+int fcntl_setfl(int fd, int flag)
+{
+ int flags;
+
+ flags = fcntl(fd, F_GETFL);
+ if (flags == -1)
+ return -errno;
+
+ if (fcntl(fd, F_SETFL, flags | flag) == -1)
+ return -errno;
+
+ return 0;
+}
+#endif
+
+static int64_t suffix_mul(char suffix, int64_t unit)
+{
+ switch (qemu_toupper(suffix)) {
+ case 'B':
+ return 1;
+ case 'K':
+ return unit;
+ case 'M':
+ return unit * unit;
+ case 'G':
+ return unit * unit * unit;
+ case 'T':
+ return unit * unit * unit * unit;
+ case 'P':
+ return unit * unit * unit * unit * unit;
+ case 'E':
+ return unit * unit * unit * unit * unit * unit;
+ }
+ return -1;
+}
+
+/*
+ * Convert size string to bytes.
+ *
+ * The size parsing supports the following syntaxes
+ * - 12345 - decimal, scale determined by @default_suffix and @unit
+ * - 12345{bBkKmMgGtTpPeE} - decimal, scale determined by suffix and @unit
+ * - 12345.678{kKmMgGtTpPeE} - decimal, scale determined by suffix, and
+ * fractional portion is truncated to byte
+ * - 0x7fEE - hexadecimal, unit determined by @default_suffix
+ *
+ * The following cause a deprecation warning, and may be removed in the future
+ * - 0xabc{kKmMgGtTpP} - hex with scaling suffix
+ *
+ * The following are intentionally not supported
+ * - octal, such as 08
+ * - fractional hex, such as 0x1.8
+ * - floating point exponents, such as 1e3
+ *
+ * The end pointer will be returned in *end, if not NULL. If there is
+ * no fraction, the input can be decimal or hexadecimal; if there is a
+ * fraction, then the input must be decimal and there must be a suffix
+ * (possibly supplied by @default_suffix) larger than Byte, and the fractional
+ * portion may suffer from precision loss or rounding. The input must
+ * be positive.
+ *
+ * Return -ERANGE on overflow (with *@end advanced), and -EINVAL on
+ * other error (with *@end left unchanged).
+ */
+static int do_strtosz(const char *nptr, const char **end,
+ const char default_suffix, int64_t unit,
+ uint64_t *result)
+{
+ int retval;
+ const char *endptr, *f;
+ unsigned char c;
+ bool hex = false;
+ uint64_t val, valf = 0;
+ int64_t mul;
+
+ /* Parse integral portion as decimal. */
+ retval = qemu_strtou64(nptr, &endptr, 10, &val);
+ if (retval) {
+ goto out;
+ }
+ if (memchr(nptr, '-', endptr - nptr) != NULL) {
+ endptr = nptr;
+ retval = -EINVAL;
+ goto out;
+ }
+ if (val == 0 && (*endptr == 'x' || *endptr == 'X')) {
+ /* Input looks like hex, reparse, and insist on no fraction. */
+ retval = qemu_strtou64(nptr, &endptr, 16, &val);
+ if (retval) {
+ goto out;
+ }
+ if (*endptr == '.') {
+ endptr = nptr;
+ retval = -EINVAL;
+ goto out;
+ }
+ hex = true;
+ } else if (*endptr == '.') {
+ /*
+ * Input looks like a fraction. Make sure even 1.k works
+ * without fractional digits. If we see an exponent, treat
+ * the entire input as invalid instead.
+ */
+ double fraction;
+
+ f = endptr;
+ retval = qemu_strtod_finite(f, &endptr, &fraction);
+ if (retval) {
+ endptr++;
+ } else if (memchr(f, 'e', endptr - f) || memchr(f, 'E', endptr - f)) {
+ endptr = nptr;
+ retval = -EINVAL;
+ goto out;
+ } else {
+ /* Extract into a 64-bit fixed-point fraction. */
+ valf = (uint64_t)(fraction * 0x1p64);
+ }
+ }
+ c = *endptr;
+ mul = suffix_mul(c, unit);
+ if (mul > 0) {
+ if (hex) {
+ warn_report("Using a multiplier suffix on hex numbers "
+ "is deprecated: %s", nptr);
+ }
+ endptr++;
+ } else {
+ mul = suffix_mul(default_suffix, unit);
+ assert(mul > 0);
+ }
+ if (mul == 1) {
+ /* When a fraction is present, a scale is required. */
+ if (valf != 0) {
+ endptr = nptr;
+ retval = -EINVAL;
+ goto out;
+ }
+ } else {
+ uint64_t valh, tmp;
+
+ /* Compute exact result: 64.64 x 64.0 -> 128.64 fixed point */
+ mulu64(&val, &valh, val, mul);
+ mulu64(&valf, &tmp, valf, mul);
+ val += tmp;
+ valh += val < tmp;
+
+ /* Round 0.5 upward. */
+ tmp = valf >> 63;
+ val += tmp;
+ valh += val < tmp;
+
+ /* Report overflow. */
+ if (valh != 0) {
+ retval = -ERANGE;
+ goto out;
+ }
+ }
+
+ retval = 0;
+
+out:
+ if (end) {
+ *end = endptr;
+ } else if (*endptr) {
+ retval = -EINVAL;
+ }
+ if (retval == 0) {
+ *result = val;
+ }
+
+ return retval;
+}
+
+int qemu_strtosz(const char *nptr, const char **end, uint64_t *result)
+{
+ return do_strtosz(nptr, end, 'B', 1024, result);
+}
+
+int qemu_strtosz_MiB(const char *nptr, const char **end, uint64_t *result)
+{
+ return do_strtosz(nptr, end, 'M', 1024, result);
+}
+
+int qemu_strtosz_metric(const char *nptr, const char **end, uint64_t *result)
+{
+ return do_strtosz(nptr, end, 'B', 1000, result);
+}
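
A usage sketch for the size parsers, with results traced from the code above (binary units, so suffixes scale by powers of 1024):

    static void strtosz_example(void)
    {
        uint64_t bytes;
        const char *end;

        if (qemu_strtosz("1.5G", NULL, &bytes) == 0) {
            /* bytes == 1610612736, i.e. 1.5 * 1024 * 1024 * 1024 */
        }
        if (qemu_strtosz("4096x", &end, &bytes) == 0) {
            /* bytes == 4096 and end points at 'x'; the caller decides
             * what to do with the unconsumed tail */
        }
    }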
+
+/**
+ * Helper function for error checking after strtol() and the like
+ */
+static int check_strtox_error(const char *nptr, char *ep,
+ const char **endptr, bool check_zero,
+ int libc_errno)
+{
+ assert(ep >= nptr);
+
+ /* Windows has a bug in that it fails to parse 0 from "0x" in base 16 */
+ if (check_zero && ep == nptr && libc_errno == 0) {
+ char *tmp;
+
+ errno = 0;
+ if (strtol(nptr, &tmp, 10) == 0 && errno == 0 &&
+ (*tmp == 'x' || *tmp == 'X')) {
+ ep = tmp;
+ }
+ }
+
+ if (endptr) {
+ *endptr = ep;
+ }
+
+ /* Turn "no conversion" into an error */
+ if (libc_errno == 0 && ep == nptr) {
+ return -EINVAL;
+ }
+
+ /* Fail when we're expected to consume the string, but didn't */
+ if (!endptr && *ep) {
+ return -EINVAL;
+ }
+
+ return -libc_errno;
+}
+
+/**
+ * Convert string @nptr to an integer, and store it in @result.
+ *
+ * This is a wrapper around strtol() that is harder to misuse.
+ * Semantics of @nptr, @endptr, @base match strtol() with differences
+ * noted below.
+ *
+ * @nptr may be null, and no conversion is performed then.
+ *
+ * If no conversion is performed, store @nptr in *@endptr and return
+ * -EINVAL.
+ *
+ * If @endptr is null, and the string isn't fully converted, return
+ * -EINVAL. This is the case when the pointer that would be stored in
+ * a non-null @endptr points to a character other than '\0'.
+ *
+ * If the conversion overflows @result, store INT_MAX in @result,
+ * and return -ERANGE.
+ *
+ * If the conversion underflows @result, store INT_MIN in @result,
+ * and return -ERANGE.
+ *
+ * Else store the converted value in @result, and return zero.
+ */
+int qemu_strtoi(const char *nptr, const char **endptr, int base,
+ int *result)
+{
+ char *ep;
+ long long lresult;
+
+ assert((unsigned) base <= 36 && base != 1);
+ if (!nptr) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ return -EINVAL;
+ }
+
+ errno = 0;
+ lresult = strtoll(nptr, &ep, base);
+ if (lresult < INT_MIN) {
+ *result = INT_MIN;
+ errno = ERANGE;
+ } else if (lresult > INT_MAX) {
+ *result = INT_MAX;
+ errno = ERANGE;
+ } else {
+ *result = lresult;
+ }
+ return check_strtox_error(nptr, ep, endptr, lresult == 0, errno);
+}
+
+/**
+ * Convert string @nptr to an unsigned integer, and store it in @result.
+ *
+ * This is a wrapper around strtoul() that is harder to misuse.
+ * Semantics of @nptr, @endptr, @base match strtoul() with differences
+ * noted below.
+ *
+ * @nptr may be null, and no conversion is performed then.
+ *
+ * If no conversion is performed, store @nptr in *@endptr and return
+ * -EINVAL.
+ *
+ * If @endptr is null, and the string isn't fully converted, return
+ * -EINVAL. This is the case when the pointer that would be stored in
+ * a non-null @endptr points to a character other than '\0'.
+ *
+ * If the conversion overflows @result, store UINT_MAX in @result,
+ * and return -ERANGE.
+ *
+ * Else store the converted value in @result, and return zero.
+ *
+ * Note that a number with a leading minus sign gets converted without
+ * the minus sign, checked for overflow (see above), then negated (in
+ * @result's type). This is exactly how strtoul() works.
+ */
+int qemu_strtoui(const char *nptr, const char **endptr, int base,
+ unsigned int *result)
+{
+ char *ep;
+ long long lresult;
+
+ assert((unsigned) base <= 36 && base != 1);
+ if (!nptr) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ return -EINVAL;
+ }
+
+ errno = 0;
+ lresult = strtoull(nptr, &ep, base);
+
+ /* Windows returns 1 for negative out-of-range values. */
+ if (errno == ERANGE) {
+ *result = -1;
+ } else {
+ if (lresult > UINT_MAX) {
+ *result = UINT_MAX;
+ errno = ERANGE;
+ } else if (lresult < INT_MIN) {
+ *result = UINT_MAX;
+ errno = ERANGE;
+ } else {
+ *result = lresult;
+ }
+ }
+ return check_strtox_error(nptr, ep, endptr, lresult == 0, errno);
+}
+
+/**
+ * Convert string @nptr to a long integer, and store it in @result.
+ *
+ * This is a wrapper around strtol() that is harder to misuse.
+ * Semantics of @nptr, @endptr, @base match strtol() with differences
+ * noted below.
+ *
+ * @nptr may be null, and no conversion is performed then.
+ *
+ * If no conversion is performed, store @nptr in *@endptr and return
+ * -EINVAL.
+ *
+ * If @endptr is null, and the string isn't fully converted, return
+ * -EINVAL. This is the case when the pointer that would be stored in
+ * a non-null @endptr points to a character other than '\0'.
+ *
+ * If the conversion overflows @result, store LONG_MAX in @result,
+ * and return -ERANGE.
+ *
+ * If the conversion underflows @result, store LONG_MIN in @result,
+ * and return -ERANGE.
+ *
+ * Else store the converted value in @result, and return zero.
+ */
+int qemu_strtol(const char *nptr, const char **endptr, int base,
+ long *result)
+{
+ char *ep;
+
+ assert((unsigned) base <= 36 && base != 1);
+ if (!nptr) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ return -EINVAL;
+ }
+
+ errno = 0;
+ *result = strtol(nptr, &ep, base);
+ return check_strtox_error(nptr, ep, endptr, *result == 0, errno);
+}
+
+/**
+ * Convert string @nptr to an unsigned long, and store it in @result.
+ *
+ * This is a wrapper around strtoul() that is harder to misuse.
+ * Semantics of @nptr, @endptr, @base match strtoul() with differences
+ * noted below.
+ *
+ * @nptr may be null, and no conversion is performed then.
+ *
+ * If no conversion is performed, store @nptr in *@endptr and return
+ * -EINVAL.
+ *
+ * If @endptr is null, and the string isn't fully converted, return
+ * -EINVAL. This is the case when the pointer that would be stored in
+ * a non-null @endptr points to a character other than '\0'.
+ *
+ * If the conversion overflows @result, store ULONG_MAX in @result,
+ * and return -ERANGE.
+ *
+ * Else store the converted value in @result, and return zero.
+ *
+ * Note that a number with a leading minus sign gets converted without
+ * the minus sign, checked for overflow (see above), then negated (in
+ * @result's type). This is exactly how strtoul() works.
+ */
+int qemu_strtoul(const char *nptr, const char **endptr, int base,
+ unsigned long *result)
+{
+ char *ep;
+
+ assert((unsigned) base <= 36 && base != 1);
+ if (!nptr) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ return -EINVAL;
+ }
+
+ errno = 0;
+ *result = strtoul(nptr, &ep, base);
+ /* Windows returns 1 for negative out-of-range values. */
+ if (errno == ERANGE) {
+ *result = -1;
+ }
+ return check_strtox_error(nptr, ep, endptr, *result == 0, errno);
+}
+
+/**
+ * Convert string @nptr to an int64_t.
+ *
+ * Works like qemu_strtol(), except it stores INT64_MAX on overflow,
+ * and INT64_MIN on underflow.
+ */
+int qemu_strtoi64(const char *nptr, const char **endptr, int base,
+ int64_t *result)
+{
+ char *ep;
+
+ assert((unsigned) base <= 36 && base != 1);
+ if (!nptr) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ return -EINVAL;
+ }
+
+ /* This assumes int64_t is long long TODO relax */
+ QEMU_BUILD_BUG_ON(sizeof(int64_t) != sizeof(long long));
+ errno = 0;
+ *result = strtoll(nptr, &ep, base);
+ return check_strtox_error(nptr, ep, endptr, *result == 0, errno);
+}
+
+/**
+ * Convert string @nptr to an uint64_t.
+ *
+ * Works like qemu_strtoul(), except it stores UINT64_MAX on overflow.
+ */
+int qemu_strtou64(const char *nptr, const char **endptr, int base,
+ uint64_t *result)
+{
+ char *ep;
+
+ assert((unsigned) base <= 36 && base != 1);
+ if (!nptr) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ return -EINVAL;
+ }
+
+ /* This assumes uint64_t is unsigned long long TODO relax */
+ QEMU_BUILD_BUG_ON(sizeof(uint64_t) != sizeof(unsigned long long));
+ errno = 0;
+ *result = strtoull(nptr, &ep, base);
+ /* Windows returns 1 for negative out-of-range values. */
+ if (errno == ERANGE) {
+ *result = -1;
+ }
+ return check_strtox_error(nptr, ep, endptr, *result == 0, errno);
+}
+
+/**
+ * Convert string @nptr to a double.
+ *
+ * This is a wrapper around strtod() that is harder to misuse.
+ * Semantics of @nptr and @endptr match strtod() with differences
+ * noted below.
+ *
+ * @nptr may be null, and no conversion is performed then.
+ *
+ * If no conversion is performed, store @nptr in *@endptr and return
+ * -EINVAL.
+ *
+ * If @endptr is null, and the string isn't fully converted, return
+ * -EINVAL. This is the case when the pointer that would be stored in
+ * a non-null @endptr points to a character other than '\0'.
+ *
+ * If the conversion overflows, store +/-HUGE_VAL in @result, depending
+ * on the sign, and return -ERANGE.
+ *
+ * If the conversion underflows, store +/-0.0 in @result, depending on the
+ * sign, and return -ERANGE.
+ *
+ * Else store the converted value in @result, and return zero.
+ */
+int qemu_strtod(const char *nptr, const char **endptr, double *result)
+{
+ char *ep;
+
+ if (!nptr) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ return -EINVAL;
+ }
+
+ errno = 0;
+ *result = strtod(nptr, &ep);
+ return check_strtox_error(nptr, ep, endptr, false, errno);
+}
+
+/**
+ * Convert string @nptr to a finite double.
+ *
+ * Works like qemu_strtod(), except that "NaN" and "inf" are rejected
+ * with -EINVAL and no conversion is performed.
+ */
+int qemu_strtod_finite(const char *nptr, const char **endptr, double *result)
+{
+ double tmp;
+ int ret;
+
+ ret = qemu_strtod(nptr, endptr, &tmp);
+ if (!ret && !isfinite(tmp)) {
+ if (endptr) {
+ *endptr = nptr;
+ }
+ ret = -EINVAL;
+ }
+
+ if (ret != -EINVAL) {
+ *result = tmp;
+ }
+ return ret;
+}
+
+/**
+ * Searches for the first occurrence of 'c' in 's', and returns a pointer
+ * to the trailing null byte if none was found.
+ */
+#ifndef HAVE_STRCHRNUL
+const char *qemu_strchrnul(const char *s, int c)
+{
+ const char *e = strchr(s, c);
+ if (!e) {
+ e = s + strlen(s);
+ }
+ return e;
+}
+#endif
+
+/**
+ * parse_uint:
+ *
+ * @s: String to parse
+ * @value: Destination for parsed integer value
+ * @endptr: Destination for pointer to first character not consumed
+ * @base: integer base, between 2 and 36 inclusive, or 0
+ *
+ * Parse unsigned integer
+ *
+ * Parsed syntax is like strtoull()'s: arbitrary whitespace, a single optional
+ * '+' or '-', an optional "0x" if @base is 0 or 16, one or more digits.
+ *
+ * If @s is null, or @base is invalid, or @s doesn't start with an
+ * integer in the syntax above, set *@value to 0, *@endptr to @s, and
+ * return -EINVAL.
+ *
+ * Set *@endptr to point right beyond the parsed integer (even if the integer
+ * overflows or is negative, all digits will be parsed and *@endptr will
+ * point right beyond them).
+ *
+ * If the integer is negative, set *@value to 0, and return -ERANGE.
+ *
+ * If the integer overflows unsigned long long, set *@value to
+ * ULLONG_MAX, and return -ERANGE.
+ *
+ * Else, set *@value to the parsed integer, and return 0.
+ */
+int parse_uint(const char *s, unsigned long long *value, char **endptr,
+ int base)
+{
+ int r = 0;
+ char *endp = (char *)s;
+ unsigned long long val = 0;
+
+ assert((unsigned) base <= 36 && base != 1);
+ if (!s) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ errno = 0;
+ val = strtoull(s, &endp, base);
+ if (errno) {
+ r = -errno;
+ goto out;
+ }
+
+ if (endp == s) {
+ r = -EINVAL;
+ goto out;
+ }
+
+ /* make sure we reject negative numbers: */
+ while (qemu_isspace(*s)) {
+ s++;
+ }
+ if (*s == '-') {
+ val = 0;
+ r = -ERANGE;
+ goto out;
+ }
+
+out:
+ *value = val;
+ *endptr = endp;
+ return r;
+}
+
+/**
+ * parse_uint_full:
+ *
+ * @s: String to parse
+ * @value: Destination for parsed integer value
+ * @base: integer base, between 2 and 36 inclusive, or 0
+ *
+ * Parse unsigned integer from entire string
+ *
+ * Has the same behavior as parse_uint(), but with an additional check
+ * that no extra data follows the parsed number. If extra characters
+ * are present after the parsed number, the function returns -EINVAL,
+ * and *@value is set to 0.
+ */
+int parse_uint_full(const char *s, unsigned long long *value, int base)
+{
+ char *endp;
+ int r;
+
+ r = parse_uint(s, value, &endp, base);
+ if (r < 0) {
+ return r;
+ }
+ if (*endp) {
+ *value = 0;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int qemu_parse_fd(const char *param)
+{
+ long fd;
+ char *endptr;
+
+ errno = 0;
+ fd = strtol(param, &endptr, 10);
+ if (param == endptr /* no conversion performed */ ||
+ errno != 0 /* not representable as long; possibly others */ ||
+ *endptr != '\0' /* final string not empty */ ||
+ fd < 0 /* invalid as file descriptor */ ||
+ fd > INT_MAX /* not representable as int */) {
+ return -1;
+ }
+ return fd;
+}
+
+/*
+ * Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128)
+ * Input is limited to 14-bit numbers
+ */
+int uleb128_encode_small(uint8_t *out, uint32_t n)
+{
+ g_assert(n <= 0x3fff);
+ if (n < 0x80) {
+ *out = n;
+ return 1;
+ } else {
+ *out++ = (n & 0x7f) | 0x80;
+ *out = n >> 7;
+ return 2;
+ }
+}
+
+int uleb128_decode_small(const uint8_t *in, uint32_t *n)
+{
+ if (!(*in & 0x80)) {
+ *n = *in;
+ return 1;
+ } else {
+ *n = *in++ & 0x7f;
+        /* the value exceeds the 14-bit limit */
+ if (*in & 0x80) {
+ return -1;
+ }
+ *n |= *in << 7;
+ return 2;
+ }
+}
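
A round-trip sketch for the two-byte ULEB128 helpers, with the values worked out from the code above:

    static void uleb_example(void)
    {
        uint8_t buf[2];
        uint32_t out;
        int n;

        n = uleb128_encode_small(buf, 300);
        /* n == 2, buf[0] == 0xac (low 7 bits | 0x80), buf[1] == 0x02 */
        n = uleb128_decode_small(buf, &out);
        /* n == 2, out == 300 again */
        (void)n;
        (void)out;
    }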
+
+/*
+ * helper to parse debug environment variables
+ */
+int parse_debug_env(const char *name, int max, int initial)
+{
+ char *debug_env = getenv(name);
+ char *inv = NULL;
+ long debug;
+
+ if (!debug_env) {
+ return initial;
+ }
+ errno = 0;
+ debug = strtol(debug_env, &inv, 10);
+ if (inv == debug_env) {
+ return initial;
+ }
+ if (debug < 0 || debug > max || errno != 0) {
+ warn_report("%s not in [0, %d]", name, max);
+ return initial;
+ }
+ return debug;
+}
+
+/*
+ * Helper to print ethernet mac address
+ */
+const char *qemu_ether_ntoa(const MACAddr *mac)
+{
+ static char ret[18];
+
+ snprintf(ret, sizeof(ret), "%02x:%02x:%02x:%02x:%02x:%02x",
+ mac->a[0], mac->a[1], mac->a[2], mac->a[3], mac->a[4], mac->a[5]);
+
+ return ret;
+}
+
+/*
+ * Return human readable string for size @val.
+ * @val can be anything that uint64_t allows (no more than "16 EiB").
+ * Use IEC binary units like KiB, MiB, and so forth.
+ * Caller is responsible for passing it to g_free().
+ */
+char *size_to_str(uint64_t val)
+{
+ static const char *suffixes[] = { "", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei" };
+ uint64_t div;
+ int i;
+
+ /*
+ * The exponent (returned in i) minus one gives us
+     * floor(log2(val * 1024 / 1000)). The correction makes us
+ * switch to the higher power when the integer part is >= 1000.
+ * (see e41b509d68afb1f for more info)
+ */
+ frexp(val / (1000.0 / 1024.0), &i);
+ i = (i - 1) / 10;
+ div = 1ULL << (i * 10);
+
+ return g_strdup_printf("%0.3g %sB", (double)val / div, suffixes[i]);
+}
+
+char *freq_to_str(uint64_t freq_hz)
+{
+ static const char *const suffixes[] = { "", "K", "M", "G", "T", "P", "E" };
+ double freq = freq_hz;
+ size_t idx = 0;
+
+ while (freq >= 1000.0) {
+ freq /= 1000.0;
+ idx++;
+ }
+ assert(idx < ARRAY_SIZE(suffixes));
+
+ return g_strdup_printf("%0.3g %sHz", freq, suffixes[idx]);
+}
+
+int qemu_pstrcmp0(const char **str1, const char **str2)
+{
+ return g_strcmp0(*str1, *str2);
+}
+
+static inline bool starts_with_prefix(const char *dir)
+{
+ size_t prefix_len = strlen(CONFIG_PREFIX);
+ return !memcmp(dir, CONFIG_PREFIX, prefix_len) &&
+ (!dir[prefix_len] || G_IS_DIR_SEPARATOR(dir[prefix_len]));
+}
+
+/* Return the next path component in dir, and store its length in *p_len. */
+static inline const char *next_component(const char *dir, int *p_len)
+{
+ int len;
+ while ((*dir && G_IS_DIR_SEPARATOR(*dir)) ||
+ (*dir == '.' && (G_IS_DIR_SEPARATOR(dir[1]) || dir[1] == '\0'))) {
+ dir++;
+ }
+ len = 0;
+ while (dir[len] && !G_IS_DIR_SEPARATOR(dir[len])) {
+ len++;
+ }
+ *p_len = len;
+ return dir;
+}
+
+char *get_relocated_path(const char *dir)
+{
+ size_t prefix_len = strlen(CONFIG_PREFIX);
+ const char *bindir = CONFIG_BINDIR;
+ const char *exec_dir = qemu_get_exec_dir();
+ GString *result;
+ int len_dir, len_bindir;
+
+ /* Fail if qemu_init_exec_dir was not called. */
+ assert(exec_dir[0]);
+ if (!starts_with_prefix(dir) || !starts_with_prefix(bindir)) {
+ return g_strdup(dir);
+ }
+
+ result = g_string_new(exec_dir);
+
+ /* Advance over common components. */
+ len_dir = len_bindir = prefix_len;
+ do {
+ dir += len_dir;
+ bindir += len_bindir;
+ dir = next_component(dir, &len_dir);
+ bindir = next_component(bindir, &len_bindir);
+ } while (len_dir && len_dir == len_bindir && !memcmp(dir, bindir, len_dir));
+
+ /* Ascend from bindir to the common prefix with dir. */
+ while (len_bindir) {
+ bindir += len_bindir;
+ g_string_append(result, "/..");
+ bindir = next_component(bindir, &len_bindir);
+ }
+
+ if (*dir) {
+ assert(G_IS_DIR_SEPARATOR(dir[-1]));
+ g_string_append(result, dir - 1);
+ }
+ return g_string_free(result, false);
+}
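
A worked example with hypothetical configure values: CONFIG_PREFIX "/usr", CONFIG_BINDIR "/usr/bin", and the binary actually running from /opt/qemu/bin.

    char *p = get_relocated_path("/usr/share/qemu");
    /* Walks past the common "/usr" component, ascends one level from
     * "bin", and yields "/opt/qemu/bin/../share/qemu". */
    g_free(p);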
diff --git a/util/dbus.c b/util/dbus.c
new file mode 100644
index 000000000..9099dc5b4
--- /dev/null
+++ b/util/dbus.c
@@ -0,0 +1,57 @@
+/*
+ * Helpers for using D-Bus
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/dbus.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+
+/*
+ * qemu_dbus_get_queued_owners() - return the list of queued unique names
+ * @connection: A GDBusConnection
+ * @name: a service name
+ *
+ * Return: a GStrv of unique names, or NULL on failure.
+ */
+GStrv
+qemu_dbus_get_queued_owners(GDBusConnection *connection, const char *name,
+ Error **errp)
+{
+ g_autoptr(GDBusProxy) proxy = NULL;
+ g_autoptr(GVariant) result = NULL;
+ g_autoptr(GVariant) child = NULL;
+ g_autoptr(GError) err = NULL;
+
+ proxy = g_dbus_proxy_new_sync(connection, G_DBUS_PROXY_FLAGS_NONE, NULL,
+ "org.freedesktop.DBus",
+ "/org/freedesktop/DBus",
+ "org.freedesktop.DBus",
+ NULL, &err);
+ if (!proxy) {
+ error_setg(errp, "Failed to create DBus proxy: %s", err->message);
+ return NULL;
+ }
+
+ result = g_dbus_proxy_call_sync(proxy, "ListQueuedOwners",
+ g_variant_new("(s)", name),
+ G_DBUS_CALL_FLAGS_NO_AUTO_START,
+ -1, NULL, &err);
+ if (!result) {
+ if (g_error_matches(err,
+ G_DBUS_ERROR,
+ G_DBUS_ERROR_NAME_HAS_NO_OWNER)) {
+ return g_new0(char *, 1);
+ }
+ error_setg(errp, "Failed to call ListQueuedOwners: %s", err->message);
+ return NULL;
+ }
+
+ child = g_variant_get_child_value(result, 0);
+ return g_variant_dup_strv(child, NULL);
+}
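
A usage sketch, assuming an existing GDBusConnection 'conn'; the well-known name is a made-up placeholder:

    static void list_owners(GDBusConnection *conn)
    {
        Error *err = NULL;
        GStrv owners;

        owners = qemu_dbus_get_queued_owners(conn, "org.example.Name", &err);
        if (!owners) {
            error_report_err(err);
            return;
        }
        for (char **p = owners; *p; p++) {
            printf("queued owner: %s\n", *p);
        }
        g_strfreev(owners);
    }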
diff --git a/util/drm.c b/util/drm.c
new file mode 100644
index 000000000..dae8ffebc
--- /dev/null
+++ b/util/drm.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2015-2016 Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "qemu/osdep.h"
+#include "qemu/drm.h"
+
+#include <glob.h>
+#include <dirent.h>
+
+int qemu_drm_rendernode_open(const char *rendernode)
+{
+ DIR *dir;
+ struct dirent *e;
+ struct stat st;
+ int r, fd, ret;
+ char *p;
+
+ if (rendernode) {
+ return open(rendernode, O_RDWR | O_CLOEXEC | O_NOCTTY | O_NONBLOCK);
+ }
+
+ dir = opendir("/dev/dri");
+ if (!dir) {
+ return -1;
+ }
+
+ fd = -1;
+ while ((e = readdir(dir))) {
+ if (strncmp(e->d_name, "renderD", 7)) {
+ continue;
+ }
+
+ p = g_strdup_printf("/dev/dri/%s", e->d_name);
+
+ r = open(p, O_RDWR | O_CLOEXEC | O_NOCTTY | O_NONBLOCK);
+ if (r < 0) {
+ g_free(p);
+ continue;
+ }
+
+ /*
+ * prefer fstat() over checking e->d_type == DT_CHR for
+ * portability reasons
+ */
+ ret = fstat(r, &st);
+ if (ret < 0 || (st.st_mode & S_IFMT) != S_IFCHR) {
+ close(r);
+ g_free(p);
+ continue;
+ }
+
+ fd = r;
+ g_free(p);
+ break;
+ }
+
+ closedir(dir);
+ if (fd < 0) {
+ return -1;
+ }
+ return fd;
+}
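
A usage sketch; passing NULL asks the helper to probe /dev/dri for the first usable render node (the wrapper name is invented):

    static int open_render_node(const char *node)   /* node may be NULL */
    {
        int fd = qemu_drm_rendernode_open(node);

        if (fd < 0) {
            error_report("no usable DRM render node");
            return -1;
        }
        /* ... hand fd to the GPU backend; close(fd) when done ... */
        return fd;
    }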
diff --git a/util/envlist.c b/util/envlist.c
new file mode 100644
index 000000000..2bcc13f09
--- /dev/null
+++ b/util/envlist.c
@@ -0,0 +1,232 @@
+#include "qemu/osdep.h"
+#include "qemu/queue.h"
+#include "qemu/envlist.h"
+
+struct envlist_entry {
+ const char *ev_var; /* actual env value */
+ QLIST_ENTRY(envlist_entry) ev_link;
+};
+
+struct envlist {
+ QLIST_HEAD(, envlist_entry) el_entries; /* actual entries */
+ size_t el_count; /* number of entries */
+};
+
+static int envlist_parse(envlist_t *envlist,
+ const char *env, int (*)(envlist_t *, const char *));
+
+/*
+ * Allocates a new envlist and returns a pointer to it.
+ */
+envlist_t *
+envlist_create(void)
+{
+ envlist_t *envlist;
+
+ envlist = g_malloc(sizeof(*envlist));
+
+ QLIST_INIT(&envlist->el_entries);
+ envlist->el_count = 0;
+
+ return (envlist);
+}
+
+/*
+ * Releases given envlist and its entries.
+ */
+void
+envlist_free(envlist_t *envlist)
+{
+ struct envlist_entry *entry;
+
+ assert(envlist != NULL);
+
+ while (envlist->el_entries.lh_first != NULL) {
+ entry = envlist->el_entries.lh_first;
+ QLIST_REMOVE(entry, ev_link);
+
+ g_free((char *)entry->ev_var);
+ g_free(entry);
+ }
+ g_free(envlist);
+}
+
+/*
+ * Parses comma separated list of set/modify environment
+ * variable entries and updates the given envlist accordingly.
+ *
+ * For example:
+ * envlist_parse(el, "HOME=foo,SHELL=/bin/sh");
+ *
+ * inserts/sets environment variables HOME and SHELL.
+ *
+ * Returns 0 on success, errno otherwise.
+ */
+int
+envlist_parse_set(envlist_t *envlist, const char *env)
+{
+ return (envlist_parse(envlist, env, &envlist_setenv));
+}
+
+/*
+ * Parses comma separated list of unset environment variable
+ * entries and removes given variables from given envlist.
+ *
+ * Returns 0 on success, errno otherwise.
+ */
+int
+envlist_parse_unset(envlist_t *envlist, const char *env)
+{
+ return (envlist_parse(envlist, env, &envlist_unsetenv));
+}
+
+/*
+ * Parses comma separated list of set, modify or unset entries
+ * and calls given callback for each entry.
+ *
+ * Returns 0 in case of success, errno otherwise.
+ */
+static int
+envlist_parse(envlist_t *envlist, const char *env,
+ int (*callback)(envlist_t *, const char *))
+{
+ char *tmpenv, *envvar;
+ char *envsave = NULL;
+ int ret = 0;
+ assert(callback != NULL);
+
+ if ((envlist == NULL) || (env == NULL))
+ return (EINVAL);
+
+ tmpenv = g_strdup(env);
+ envsave = tmpenv;
+
+ do {
+ envvar = strchr(tmpenv, ',');
+ if (envvar != NULL) {
+ *envvar = '\0';
+ }
+ if ((*callback)(envlist, tmpenv) != 0) {
+ ret = errno;
+ break;
+ }
+ tmpenv = envvar + 1;
+ } while (envvar != NULL);
+
+ g_free(envsave);
+ return ret;
+}
+
+/*
+ * Sets an environment value in the envlist, in a similar manner
+ * to putenv(3).
+ *
+ * Returns 0 on success, errno otherwise.
+ */
+int
+envlist_setenv(envlist_t *envlist, const char *env)
+{
+ struct envlist_entry *entry = NULL;
+ const char *eq_sign;
+ size_t envname_len;
+
+ if ((envlist == NULL) || (env == NULL))
+ return (EINVAL);
+
+ /* find out first equals sign in given env */
+ if ((eq_sign = strchr(env, '=')) == NULL)
+ return (EINVAL);
+ envname_len = eq_sign - env + 1;
+
+ /*
+     * If a variable with the given name already exists,
+ * we remove and release it before allocating a whole
+ * new entry.
+ */
+ for (entry = envlist->el_entries.lh_first; entry != NULL;
+ entry = entry->ev_link.le_next) {
+ if (strncmp(entry->ev_var, env, envname_len) == 0)
+ break;
+ }
+
+ if (entry != NULL) {
+ QLIST_REMOVE(entry, ev_link);
+ g_free((char *)entry->ev_var);
+ g_free(entry);
+ } else {
+ envlist->el_count++;
+ }
+
+ entry = g_malloc(sizeof(*entry));
+ entry->ev_var = g_strdup(env);
+ QLIST_INSERT_HEAD(&envlist->el_entries, entry, ev_link);
+
+ return (0);
+}
+
+/*
+ * Removes the given env value from the envlist, in a similar manner
+ * to unsetenv(3). Returns 0 on success, errno otherwise.
+ */
+int
+envlist_unsetenv(envlist_t *envlist, const char *env)
+{
+ struct envlist_entry *entry;
+ size_t envname_len;
+
+ if ((envlist == NULL) || (env == NULL))
+ return (EINVAL);
+
+ /* env is not allowed to contain '=' */
+ if (strchr(env, '=') != NULL)
+ return (EINVAL);
+
+ /*
+ * Find out the requested entry and remove
+ * it from the list.
+ */
+ envname_len = strlen(env);
+ for (entry = envlist->el_entries.lh_first; entry != NULL;
+ entry = entry->ev_link.le_next) {
+ if (strncmp(entry->ev_var, env, envname_len) == 0)
+ break;
+ }
+ if (entry != NULL) {
+ QLIST_REMOVE(entry, ev_link);
+ g_free((char *)entry->ev_var);
+ g_free(entry);
+
+ envlist->el_count--;
+ }
+ return (0);
+}
+
+/*
+ * Returns the given envlist as an array of strings (in the same form
+ * as the global variable environ). The caller must free the returned
+ * memory by calling g_free() on each element and on the array itself.
+ * The returned array and the given envlist are not related (no common
+ * references).
+ *
+ * If the caller provides a count pointer, the number of items in the
+ * array is stored there.
+ */
+char **
+envlist_to_environ(const envlist_t *envlist, size_t *count)
+{
+ struct envlist_entry *entry;
+ char **env, **penv;
+
+ penv = env = g_malloc((envlist->el_count + 1) * sizeof(char *));
+
+ for (entry = envlist->el_entries.lh_first; entry != NULL;
+ entry = entry->ev_link.le_next) {
+ *(penv++) = g_strdup(entry->ev_var);
+ }
+ *penv = NULL; /* NULL terminate the list */
+
+ if (count != NULL)
+ *count = envlist->el_count;
+
+ return (env);
+}
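
A minimal usage sketch of the envlist API above, assuming the
envlist_create()/envlist_free() constructors declared in qemu/envlist.h
(defined earlier in this file, outside this excerpt):

    #include "qemu/osdep.h"
    #include "qemu/envlist.h"

    static void envlist_demo(void)
    {
        envlist_t *el = envlist_create();
        size_t count;
        char **env, **p;

        envlist_setenv(el, "FOO=bar");        /* putenv(3)-style set */
        envlist_parse_set(el, "A=1,B=2");     /* comma-separated list */
        envlist_unsetenv(el, "FOO");

        /* Flatten to an environ-style, NULL-terminated array */
        env = envlist_to_environ(el, &count);
        for (p = env; *p != NULL; p++) {
            g_free(*p);
        }
        g_free(env);
        envlist_free(el);
    }
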
diff --git a/util/error.c b/util/error.c
new file mode 100644
index 000000000..b6c89d141
--- /dev/null
+++ b/util/error.c
@@ -0,0 +1,305 @@
+/*
+ * QEMU Error Objects
+ *
+ * Copyright IBM, Corp. 2011
+ * Copyright (C) 2011-2015 Red Hat, Inc.
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Markus Armbruster <armbru@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2. See
+ * the COPYING.LIB file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+
+struct Error
+{
+ char *msg;
+ ErrorClass err_class;
+ const char *src, *func;
+ int line;
+ GString *hint;
+};
+
+Error *error_abort;
+Error *error_fatal;
+
+static void error_handle_fatal(Error **errp, Error *err)
+{
+ if (errp == &error_abort) {
+ fprintf(stderr, "Unexpected error in %s() at %s:%d:\n",
+ err->func, err->src, err->line);
+ error_report("%s", error_get_pretty(err));
+ if (err->hint) {
+ error_printf("%s", err->hint->str);
+ }
+ abort();
+ }
+ if (errp == &error_fatal) {
+ error_report_err(err);
+ exit(1);
+ }
+}
+
+static void error_setv(Error **errp,
+ const char *src, int line, const char *func,
+ ErrorClass err_class, const char *fmt, va_list ap,
+ const char *suffix)
+{
+ Error *err;
+ int saved_errno = errno;
+
+ if (errp == NULL) {
+ return;
+ }
+ assert(*errp == NULL);
+
+ err = g_malloc0(sizeof(*err));
+ err->msg = g_strdup_vprintf(fmt, ap);
+ if (suffix) {
+ char *msg = err->msg;
+ err->msg = g_strdup_printf("%s: %s", msg, suffix);
+ g_free(msg);
+ }
+ err->err_class = err_class;
+ err->src = src;
+ err->line = line;
+ err->func = func;
+
+ error_handle_fatal(errp, err);
+ *errp = err;
+
+ errno = saved_errno;
+}
+
+void error_set_internal(Error **errp,
+ const char *src, int line, const char *func,
+ ErrorClass err_class, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ error_setv(errp, src, line, func, err_class, fmt, ap, NULL);
+ va_end(ap);
+}
+
+void error_setg_internal(Error **errp,
+ const char *src, int line, const char *func,
+ const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ error_setv(errp, src, line, func, ERROR_CLASS_GENERIC_ERROR, fmt, ap, NULL);
+ va_end(ap);
+}
+
+void error_setg_errno_internal(Error **errp,
+ const char *src, int line, const char *func,
+ int os_errno, const char *fmt, ...)
+{
+ va_list ap;
+ int saved_errno = errno;
+
+ va_start(ap, fmt);
+ error_setv(errp, src, line, func, ERROR_CLASS_GENERIC_ERROR, fmt, ap,
+ os_errno != 0 ? strerror(os_errno) : NULL);
+ va_end(ap);
+
+ errno = saved_errno;
+}
+
+void error_setg_file_open_internal(Error **errp,
+ const char *src, int line, const char *func,
+ int os_errno, const char *filename)
+{
+ error_setg_errno_internal(errp, src, line, func, os_errno,
+ "Could not open '%s'", filename);
+}
+
+void error_vprepend(Error *const *errp, const char *fmt, va_list ap)
+{
+ GString *newmsg;
+
+ if (!errp) {
+ return;
+ }
+
+ newmsg = g_string_new(NULL);
+ g_string_vprintf(newmsg, fmt, ap);
+ g_string_append(newmsg, (*errp)->msg);
+ g_free((*errp)->msg);
+ (*errp)->msg = g_string_free(newmsg, 0);
+}
+
+void error_prepend(Error *const *errp, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ error_vprepend(errp, fmt, ap);
+ va_end(ap);
+}
+
+void error_append_hint(Error *const *errp, const char *fmt, ...)
+{
+ va_list ap;
+ int saved_errno = errno;
+ Error *err;
+
+ if (!errp) {
+ return;
+ }
+ err = *errp;
+ assert(err && errp != &error_abort && errp != &error_fatal);
+
+ if (!err->hint) {
+ err->hint = g_string_new(NULL);
+ }
+ va_start(ap, fmt);
+ g_string_append_vprintf(err->hint, fmt, ap);
+ va_end(ap);
+
+ errno = saved_errno;
+}
+
+#ifdef _WIN32
+
+void error_setg_win32_internal(Error **errp,
+ const char *src, int line, const char *func,
+ int win32_err, const char *fmt, ...)
+{
+ va_list ap;
+ char *suffix = NULL;
+
+ if (errp == NULL) {
+ return;
+ }
+
+ if (win32_err != 0) {
+ suffix = g_win32_error_message(win32_err);
+ }
+
+ va_start(ap, fmt);
+ error_setv(errp, src, line, func, ERROR_CLASS_GENERIC_ERROR,
+ fmt, ap, suffix);
+ va_end(ap);
+
+ g_free(suffix);
+}
+
+#endif
+
+Error *error_copy(const Error *err)
+{
+ Error *err_new;
+
+ err_new = g_malloc0(sizeof(*err));
+ err_new->msg = g_strdup(err->msg);
+ err_new->err_class = err->err_class;
+ err_new->src = err->src;
+ err_new->line = err->line;
+ err_new->func = err->func;
+ if (err->hint) {
+ err_new->hint = g_string_new(err->hint->str);
+ }
+
+ return err_new;
+}
+
+ErrorClass error_get_class(const Error *err)
+{
+ return err->err_class;
+}
+
+const char *error_get_pretty(const Error *err)
+{
+ return err->msg;
+}
+
+void error_report_err(Error *err)
+{
+ error_report("%s", error_get_pretty(err));
+ if (err->hint) {
+ error_printf("%s", err->hint->str);
+ }
+ error_free(err);
+}
+
+void warn_report_err(Error *err)
+{
+ warn_report("%s", error_get_pretty(err));
+ if (err->hint) {
+ error_printf("%s", err->hint->str);
+ }
+ error_free(err);
+}
+
+void error_reportf_err(Error *err, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ error_vprepend(&err, fmt, ap);
+ va_end(ap);
+ error_report_err(err);
+}
+
+
+void warn_reportf_err(Error *err, const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ error_vprepend(&err, fmt, ap);
+ va_end(ap);
+ warn_report_err(err);
+}
+
+void error_free(Error *err)
+{
+ if (err) {
+ g_free(err->msg);
+ if (err->hint) {
+ g_string_free(err->hint, true);
+ }
+ g_free(err);
+ }
+}
+
+void error_free_or_abort(Error **errp)
+{
+ assert(errp && *errp);
+ error_free(*errp);
+ *errp = NULL;
+}
+
+void error_propagate(Error **dst_errp, Error *local_err)
+{
+ if (!local_err) {
+ return;
+ }
+ error_handle_fatal(dst_errp, local_err);
+ if (dst_errp && !*dst_errp) {
+ *dst_errp = local_err;
+ } else {
+ error_free(local_err);
+ }
+}
+
+void error_propagate_prepend(Error **dst_errp, Error *err,
+ const char *fmt, ...)
+{
+ va_list ap;
+
+ if (dst_errp && !*dst_errp) {
+ va_start(ap, fmt);
+ error_vprepend(&err, fmt, ap);
+ va_end(ap);
+ } /* else error is being ignored, don't bother with prepending */
+ error_propagate(dst_errp, err);
+}
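
The API above is normally used through the errp-passing convention: a
callee fills in *errp, and the caller either inspects a local error or
lets it propagate. A minimal sketch under that convention (do_foo() and
foo_init() are hypothetical names):

    #include "qemu/osdep.h"
    #include "qapi/error.h"

    static int do_foo(int fd, Error **errp)
    {
        ERRP_GUARD();  /* needed when appending hints to *errp */

        if (fd < 0) {
            error_setg(errp, "invalid file descriptor %d", fd);
            error_append_hint(errp, "Pass an open file descriptor.\n");
            return -1;
        }
        return 0;
    }

    static void foo_init(Error **errp)
    {
        Error *local_err = NULL;

        if (do_foo(-1, &local_err) < 0) {
            error_propagate_prepend(errp, local_err, "foo setup failed: ");
            return;
        }
    }
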
diff --git a/util/event_notifier-posix.c b/util/event_notifier-posix.c
new file mode 100644
index 000000000..8307013c5
--- /dev/null
+++ b/util/event_notifier-posix.c
@@ -0,0 +1,140 @@
+/*
+ * event notifier support
+ *
+ * Copyright Red Hat, Inc. 2010
+ *
+ * Authors:
+ * Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/cutils.h"
+#include "qemu/event_notifier.h"
+#include "qemu/main-loop.h"
+
+#ifdef CONFIG_EVENTFD
+#include <sys/eventfd.h>
+#endif
+
+#ifdef CONFIG_EVENTFD
+/*
+ * Initialize @e with existing file descriptor @fd.
+ * @fd must be a genuine eventfd object, emulation with pipe won't do.
+ */
+void event_notifier_init_fd(EventNotifier *e, int fd)
+{
+ e->rfd = fd;
+ e->wfd = fd;
+ e->initialized = true;
+}
+#endif
+
+int event_notifier_init(EventNotifier *e, int active)
+{
+ int fds[2];
+ int ret;
+
+#ifdef CONFIG_EVENTFD
+ ret = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+#else
+ ret = -1;
+ errno = ENOSYS;
+#endif
+ if (ret >= 0) {
+ e->rfd = e->wfd = ret;
+ } else {
+ if (errno != ENOSYS) {
+ return -errno;
+ }
+ if (qemu_pipe(fds) < 0) {
+ return -errno;
+ }
+ ret = fcntl_setfl(fds[0], O_NONBLOCK);
+ if (ret < 0) {
+ ret = -errno;
+ goto fail;
+ }
+ ret = fcntl_setfl(fds[1], O_NONBLOCK);
+ if (ret < 0) {
+ ret = -errno;
+ goto fail;
+ }
+ e->rfd = fds[0];
+ e->wfd = fds[1];
+ }
+ e->initialized = true;
+ if (active) {
+ event_notifier_set(e);
+ }
+ return 0;
+
+fail:
+ close(fds[0]);
+ close(fds[1]);
+ return ret;
+}
+
+void event_notifier_cleanup(EventNotifier *e)
+{
+ if (!e->initialized) {
+ return;
+ }
+
+ if (e->rfd != e->wfd) {
+ close(e->rfd);
+ }
+
+ e->rfd = -1;
+ close(e->wfd);
+ e->wfd = -1;
+ e->initialized = false;
+}
+
+int event_notifier_get_fd(const EventNotifier *e)
+{
+ return e->rfd;
+}
+
+int event_notifier_set(EventNotifier *e)
+{
+ static const uint64_t value = 1;
+ ssize_t ret;
+
+ if (!e->initialized) {
+ return -1;
+ }
+
+ do {
+ ret = write(e->wfd, &value, sizeof(value));
+ } while (ret < 0 && errno == EINTR);
+
+ /* EAGAIN is fine, a read must be pending. */
+ if (ret < 0 && errno != EAGAIN) {
+ return -errno;
+ }
+ return 0;
+}
+
+int event_notifier_test_and_clear(EventNotifier *e)
+{
+ int value;
+ ssize_t len;
+ char buffer[512];
+
+ if (!e->initialized) {
+ return 0;
+ }
+
+ /* Drain the notify pipe. For eventfd, only 8 bytes will be read. */
+ value = 0;
+ do {
+ len = read(e->rfd, buffer, sizeof(buffer));
+ value |= (len > 0);
+ } while ((len == -1 && errno == EINTR) || len == sizeof(buffer));
+
+ return value;
+}
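
A short sketch of the notifier API implemented above (POSIX variant; the
signal would normally come from another thread or an ioeventfd):

    #include "qemu/osdep.h"
    #include "qemu/event_notifier.h"

    static void notifier_demo(void)
    {
        EventNotifier e;

        if (event_notifier_init(&e, 0) < 0) {   /* 0: start cleared */
            return;
        }

        event_notifier_set(&e);                 /* signal readiness */

        if (event_notifier_test_and_clear(&e)) {
            /* an event was pending and has now been consumed */
        }

        event_notifier_cleanup(&e);
    }
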
diff --git a/util/event_notifier-win32.c b/util/event_notifier-win32.c
new file mode 100644
index 000000000..62c53b0a9
--- /dev/null
+++ b/util/event_notifier-win32.c
@@ -0,0 +1,50 @@
+/*
+ * event notifier support
+ *
+ * Copyright Red Hat, Inc. 2010
+ *
+ * Authors:
+ * Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/event_notifier.h"
+#include "qemu/main-loop.h"
+
+int event_notifier_init(EventNotifier *e, int active)
+{
+ e->event = CreateEvent(NULL, TRUE, FALSE, NULL);
+ assert(e->event);
+ return 0;
+}
+
+void event_notifier_cleanup(EventNotifier *e)
+{
+ CloseHandle(e->event);
+ e->event = NULL;
+}
+
+HANDLE event_notifier_get_handle(EventNotifier *e)
+{
+ return e->event;
+}
+
+int event_notifier_set(EventNotifier *e)
+{
+ SetEvent(e->event);
+ return 0;
+}
+
+int event_notifier_test_and_clear(EventNotifier *e)
+{
+ int ret = WaitForSingleObject(e->event, 0);
+ if (ret == WAIT_OBJECT_0) {
+ ResetEvent(e->event);
+ return true;
+ }
+ return false;
+}
diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c
new file mode 100644
index 000000000..e11a8a022
--- /dev/null
+++ b/util/fdmon-epoll.c
@@ -0,0 +1,155 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * epoll(7) file descriptor monitoring
+ */
+
+#include "qemu/osdep.h"
+#include <sys/epoll.h>
+#include "qemu/rcu_queue.h"
+#include "aio-posix.h"
+
+/* The fd number threshold to switch to epoll */
+#define EPOLL_ENABLE_THRESHOLD 64
+
+void fdmon_epoll_disable(AioContext *ctx)
+{
+ if (ctx->epollfd >= 0) {
+ close(ctx->epollfd);
+ ctx->epollfd = -1;
+ }
+
+ /* Switch back */
+ ctx->fdmon_ops = &fdmon_poll_ops;
+}
+
+static inline int epoll_events_from_pfd(int pfd_events)
+{
+ return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
+ (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
+ (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
+ (pfd_events & G_IO_ERR ? EPOLLERR : 0);
+}
+
+static void fdmon_epoll_update(AioContext *ctx,
+ AioHandler *old_node,
+ AioHandler *new_node)
+{
+ struct epoll_event event = {
+ .data.ptr = new_node,
+ .events = new_node ? epoll_events_from_pfd(new_node->pfd.events) : 0,
+ };
+ int r;
+
+ if (!new_node) {
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, &event);
+ } else if (!old_node) {
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, new_node->pfd.fd, &event);
+ } else {
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, new_node->pfd.fd, &event);
+ }
+
+ if (r) {
+ fdmon_epoll_disable(ctx);
+ }
+}
+
+static int fdmon_epoll_wait(AioContext *ctx, AioHandlerList *ready_list,
+ int64_t timeout)
+{
+ GPollFD pfd = {
+ .fd = ctx->epollfd,
+ .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR,
+ };
+ AioHandler *node;
+ int i, ret = 0;
+ struct epoll_event events[128];
+
+ /* Fall back while external clients are disabled */
+ if (qatomic_read(&ctx->external_disable_cnt)) {
+ return fdmon_poll_ops.wait(ctx, ready_list, timeout);
+ }
+
+ if (timeout > 0) {
+ ret = qemu_poll_ns(&pfd, 1, timeout);
+ if (ret > 0) {
+ timeout = 0;
+ }
+ }
+ if (timeout <= 0 || ret > 0) {
+ ret = epoll_wait(ctx->epollfd, events,
+ ARRAY_SIZE(events),
+ timeout);
+ if (ret <= 0) {
+ goto out;
+ }
+ for (i = 0; i < ret; i++) {
+ int ev = events[i].events;
+ int revents = (ev & EPOLLIN ? G_IO_IN : 0) |
+ (ev & EPOLLOUT ? G_IO_OUT : 0) |
+ (ev & EPOLLHUP ? G_IO_HUP : 0) |
+ (ev & EPOLLERR ? G_IO_ERR : 0);
+
+ node = events[i].data.ptr;
+ aio_add_ready_handler(ready_list, node, revents);
+ }
+ }
+out:
+ return ret;
+}
+
+static const FDMonOps fdmon_epoll_ops = {
+ .update = fdmon_epoll_update,
+ .wait = fdmon_epoll_wait,
+ .need_wait = aio_poll_disabled,
+};
+
+static bool fdmon_epoll_try_enable(AioContext *ctx)
+{
+ AioHandler *node;
+ struct epoll_event event;
+
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ int r;
+ if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) {
+ continue;
+ }
+ event.events = epoll_events_from_pfd(node->pfd.events);
+ event.data.ptr = node;
+ r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+ if (r) {
+ return false;
+ }
+ }
+
+ ctx->fdmon_ops = &fdmon_epoll_ops;
+ return true;
+}
+
+bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd)
+{
+ if (ctx->epollfd < 0) {
+ return false;
+ }
+
+ /* Do not upgrade while external clients are disabled */
+ if (qatomic_read(&ctx->external_disable_cnt)) {
+ return false;
+ }
+
+ if (npfd >= EPOLL_ENABLE_THRESHOLD) {
+ if (fdmon_epoll_try_enable(ctx)) {
+ return true;
+ } else {
+ fdmon_epoll_disable(ctx);
+ }
+ }
+ return false;
+}
+
+void fdmon_epoll_setup(AioContext *ctx)
+{
+ ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+ if (ctx->epollfd == -1) {
+ fprintf(stderr, "Failed to create epoll instance: %s\n", strerror(errno));
+ }
+}
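
Stripped of the AioContext plumbing, the kernel-facing pattern used by
fdmon_epoll_wait() reduces to plain epoll(7), roughly as in the
following standalone sketch (single fd, error paths shortened):

    #include <sys/epoll.h>
    #include <unistd.h>

    /* Wait up to timeout_ms for fd to become readable. */
    static int epoll_wait_once(int fd, int timeout_ms)
    {
        struct epoll_event ev = { .events = EPOLLIN, .data.fd = fd };
        struct epoll_event ready[8];
        int n, epfd = epoll_create1(EPOLL_CLOEXEC);

        if (epfd < 0 || epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev) < 0) {
            if (epfd >= 0) {
                close(epfd);
            }
            return -1;
        }
        n = epoll_wait(epfd, ready, 8, timeout_ms);
        close(epfd);
        return n;   /* number of ready events, 0 on timeout */
    }
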
diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c
new file mode 100644
index 000000000..1461dfa40
--- /dev/null
+++ b/util/fdmon-io_uring.c
@@ -0,0 +1,361 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Linux io_uring file descriptor monitoring
+ *
+ * The Linux io_uring API supports file descriptor monitoring with a few
+ * advantages over existing APIs like poll(2) and epoll(7):
+ *
+ * 1. Userspace polling of events is possible because the completion queue (cq
+ * ring) is shared between the kernel and userspace. This allows
+ * applications that rely on userspace polling to also monitor file
+ * descriptors in the same userspace polling loop.
+ *
+ * 2. Submission and completion is batched and done together in a single system
+ * call. This minimizes the number of system calls.
+ *
+ * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than
+ * poll(2).
+ *
+ * 4. Nanosecond timeouts are supported so it requires fewer syscalls than
+ * epoll(7).
+ *
+ * This code only monitors file descriptors and does not do asynchronous disk
+ * I/O. Implementing disk I/O efficiently has other requirements and should
+ * use a separate io_uring so it does not make sense to unify the code.
+ *
+ * File descriptor monitoring is implemented using the following operations:
+ *
+ * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored.
+ * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored. When
+ * the poll mask changes for a file descriptor it is first removed and then
+ * re-added with the new poll mask, so this operation is also used as part
+ * of modifying an existing monitored file descriptor.
+ * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait
+ * for events. This operation self-cancels if another event completes
+ * before the timeout.
+ *
+ * io_uring calls the submission queue the "sq ring" and the completion queue
+ * the "cq ring". Ring entries are called "sqe" and "cqe", respectively.
+ *
+ * The code is structured so that sq/cq rings are only modified within
+ * fdmon_io_uring_wait(). Changes to AioHandlers are made by enqueuing them on
+ * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD
+ * and/or IORING_OP_POLL_REMOVE sqes for them.
+ */
+
+#include "qemu/osdep.h"
+#include <poll.h>
+#include "qemu/rcu_queue.h"
+#include "aio-posix.h"
+
+enum {
+ FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */
+
+ /* AioHandler::flags */
+ FDMON_IO_URING_PENDING = (1 << 0),
+ FDMON_IO_URING_ADD = (1 << 1),
+ FDMON_IO_URING_REMOVE = (1 << 2),
+};
+
+static inline int poll_events_from_pfd(int pfd_events)
+{
+ return (pfd_events & G_IO_IN ? POLLIN : 0) |
+ (pfd_events & G_IO_OUT ? POLLOUT : 0) |
+ (pfd_events & G_IO_HUP ? POLLHUP : 0) |
+ (pfd_events & G_IO_ERR ? POLLERR : 0);
+}
+
+static inline int pfd_events_from_poll(int poll_events)
+{
+ return (poll_events & POLLIN ? G_IO_IN : 0) |
+ (poll_events & POLLOUT ? G_IO_OUT : 0) |
+ (poll_events & POLLHUP ? G_IO_HUP : 0) |
+ (poll_events & POLLERR ? G_IO_ERR : 0);
+}
+
+/*
+ * Returns an sqe for submitting a request. May only be called within
+ * fdmon_io_uring_wait().
+ */
+static struct io_uring_sqe *get_sqe(AioContext *ctx)
+{
+ struct io_uring *ring = &ctx->fdmon_io_uring;
+ struct io_uring_sqe *sqe = io_uring_get_sqe(ring);
+ int ret;
+
+ if (likely(sqe)) {
+ return sqe;
+ }
+
+ /* No free sqes left, submit pending sqes first */
+ do {
+ ret = io_uring_submit(ring);
+ } while (ret == -EINTR);
+
+ assert(ret > 1);
+ sqe = io_uring_get_sqe(ring);
+ assert(sqe);
+ return sqe;
+}
+
+/* Atomically enqueue an AioHandler for sq ring submission */
+static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags)
+{
+ unsigned old_flags;
+
+ old_flags = qatomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags);
+ if (!(old_flags & FDMON_IO_URING_PENDING)) {
+ QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted);
+ }
+}
+
+/* Dequeue an AioHandler for sq ring submission. Called by fill_sq_ring(). */
+static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags)
+{
+ AioHandler *node = QSLIST_FIRST(head);
+
+ if (!node) {
+ return NULL;
+ }
+
+ /* Doesn't need to be atomic since fill_sq_ring() moves the list */
+ QSLIST_REMOVE_HEAD(head, node_submitted);
+
+ /*
+ * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two
+ * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and
+ * telling process_cqe() to delete the AioHandler when its
+ * IORING_OP_POLL_ADD completes.
+ */
+ *flags = qatomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING |
+ FDMON_IO_URING_ADD));
+ return node;
+}
+
+static void fdmon_io_uring_update(AioContext *ctx,
+ AioHandler *old_node,
+ AioHandler *new_node)
+{
+ if (new_node) {
+ enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD);
+ }
+
+ if (old_node) {
+ /*
+ * Deletion is tricky because IORING_OP_POLL_ADD and
+ * IORING_OP_POLL_REMOVE are async. We need to wait for the original
+ * IORING_OP_POLL_ADD to complete before this handler can be freed
+ * safely.
+ *
+ * It's possible that the file descriptor becomes ready and the
+ * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is
+ * submitted, too.
+ *
+ * Mark this handler deleted right now but don't place it on
+ * ctx->deleted_aio_handlers yet. Instead, manually fudge the list
+ * entry to make QLIST_IS_INSERTED() think this handler has been
+ * inserted and other code recognizes this AioHandler as deleted.
+ *
+ * Once the original IORING_OP_POLL_ADD completes we enqueue the
+ * handler on the real ctx->deleted_aio_handlers list to be freed.
+ */
+ assert(!QLIST_IS_INSERTED(old_node, node_deleted));
+ old_node->node_deleted.le_prev = &old_node->node_deleted.le_next;
+
+ enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE);
+ }
+}
+
+static void add_poll_add_sqe(AioContext *ctx, AioHandler *node)
+{
+ struct io_uring_sqe *sqe = get_sqe(ctx);
+ int events = poll_events_from_pfd(node->pfd.events);
+
+ io_uring_prep_poll_add(sqe, node->pfd.fd, events);
+ io_uring_sqe_set_data(sqe, node);
+}
+
+static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node)
+{
+ struct io_uring_sqe *sqe = get_sqe(ctx);
+
+ io_uring_prep_poll_remove(sqe, node);
+}
+
+/* Add a timeout that self-cancels when another cqe becomes ready */
+static void add_timeout_sqe(AioContext *ctx, int64_t ns)
+{
+ struct io_uring_sqe *sqe;
+ struct __kernel_timespec ts = {
+ .tv_sec = ns / NANOSECONDS_PER_SECOND,
+ .tv_nsec = ns % NANOSECONDS_PER_SECOND,
+ };
+
+ sqe = get_sqe(ctx);
+ io_uring_prep_timeout(sqe, &ts, 1, 0);
+}
+
+/* Add sqes from ctx->submit_list for submission */
+static void fill_sq_ring(AioContext *ctx)
+{
+ AioHandlerSList submit_list;
+ AioHandler *node;
+ unsigned flags;
+
+ QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list);
+
+ while ((node = dequeue(&submit_list, &flags))) {
+ /* Order matters, just in case both flags were set */
+ if (flags & FDMON_IO_URING_ADD) {
+ add_poll_add_sqe(ctx, node);
+ }
+ if (flags & FDMON_IO_URING_REMOVE) {
+ add_poll_remove_sqe(ctx, node);
+ }
+ }
+}
+
+/* Returns true if a handler became ready */
+static bool process_cqe(AioContext *ctx,
+ AioHandlerList *ready_list,
+ struct io_uring_cqe *cqe)
+{
+ AioHandler *node = io_uring_cqe_get_data(cqe);
+ unsigned flags;
+
+ /* poll_timeout and poll_remove have a zero user_data field */
+ if (!node) {
+ return false;
+ }
+
+ /*
+ * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race
+ * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE
+ * bit before IORING_OP_POLL_REMOVE is submitted.
+ */
+ flags = qatomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE);
+ if (flags & FDMON_IO_URING_REMOVE) {
+ QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
+ return false;
+ }
+
+ aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res));
+
+ /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */
+ add_poll_add_sqe(ctx, node);
+ return true;
+}
+
+static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list)
+{
+ struct io_uring *ring = &ctx->fdmon_io_uring;
+ struct io_uring_cqe *cqe;
+ unsigned num_cqes = 0;
+ unsigned num_ready = 0;
+ unsigned head;
+
+ io_uring_for_each_cqe(ring, head, cqe) {
+ if (process_cqe(ctx, ready_list, cqe)) {
+ num_ready++;
+ }
+
+ num_cqes++;
+ }
+
+ io_uring_cq_advance(ring, num_cqes);
+ return num_ready;
+}
+
+static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list,
+ int64_t timeout)
+{
+ unsigned wait_nr = 1; /* block until at least one cqe is ready */
+ int ret;
+
+ /* Fall back while external clients are disabled */
+ if (qatomic_read(&ctx->external_disable_cnt)) {
+ return fdmon_poll_ops.wait(ctx, ready_list, timeout);
+ }
+
+ if (timeout == 0) {
+ wait_nr = 0; /* non-blocking */
+ } else if (timeout > 0) {
+ add_timeout_sqe(ctx, timeout);
+ }
+
+ fill_sq_ring(ctx);
+
+ do {
+ ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr);
+ } while (ret == -EINTR);
+
+ assert(ret >= 0);
+
+ return process_cq_ring(ctx, ready_list);
+}
+
+static bool fdmon_io_uring_need_wait(AioContext *ctx)
+{
+ /* Have io_uring events completed? */
+ if (io_uring_cq_ready(&ctx->fdmon_io_uring)) {
+ return true;
+ }
+
+ /* Are there pending sqes to submit? */
+ if (io_uring_sq_ready(&ctx->fdmon_io_uring)) {
+ return true;
+ }
+
+ /* Do we need to process AioHandlers for io_uring changes? */
+ if (!QSLIST_EMPTY_RCU(&ctx->submit_list)) {
+ return true;
+ }
+
+ /* Are we falling back to fdmon-poll? */
+ return qatomic_read(&ctx->external_disable_cnt);
+}
+
+static const FDMonOps fdmon_io_uring_ops = {
+ .update = fdmon_io_uring_update,
+ .wait = fdmon_io_uring_wait,
+ .need_wait = fdmon_io_uring_need_wait,
+};
+
+bool fdmon_io_uring_setup(AioContext *ctx)
+{
+ int ret;
+
+ ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0);
+ if (ret != 0) {
+ return false;
+ }
+
+ QSLIST_INIT(&ctx->submit_list);
+ ctx->fdmon_ops = &fdmon_io_uring_ops;
+ return true;
+}
+
+void fdmon_io_uring_destroy(AioContext *ctx)
+{
+ if (ctx->fdmon_ops == &fdmon_io_uring_ops) {
+ AioHandler *node;
+
+ io_uring_queue_exit(&ctx->fdmon_io_uring);
+
+ /* Move handlers due to be removed onto the deleted list */
+ while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) {
+ unsigned flags = qatomic_fetch_and(&node->flags,
+ ~(FDMON_IO_URING_PENDING |
+ FDMON_IO_URING_ADD |
+ FDMON_IO_URING_REMOVE));
+
+ if (flags & FDMON_IO_URING_REMOVE) {
+ QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted);
+ }
+
+ QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted);
+ }
+
+ ctx->fdmon_ops = &fdmon_poll_ops;
+ }
+}
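
The IORING_OP_POLL_ADD flow described in the header comment, reduced to
a standalone liburing sketch (one-shot poll of a single fd; error
handling elided for brevity):

    #include <poll.h>
    #include <liburing.h>

    /* Block until fd is readable; returns revents or a negative errno. */
    static int uring_poll_once(int fd)
    {
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        int res;

        if (io_uring_queue_init(8, &ring, 0) < 0) {
            return -1;
        }

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_poll_add(sqe, fd, POLLIN);  /* one-shot, as above */

        io_uring_submit_and_wait(&ring, 1);       /* batch submit + wait */

        io_uring_wait_cqe(&ring, &cqe);
        res = cqe->res;                           /* revents on success */
        io_uring_cqe_seen(&ring, cqe);

        io_uring_queue_exit(&ring);
        return res;
    }
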
diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c
new file mode 100644
index 000000000..5fe3b4786
--- /dev/null
+++ b/util/fdmon-poll.c
@@ -0,0 +1,108 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * poll(2) file descriptor monitoring
+ *
+ * Uses ppoll(2) when available, g_poll() otherwise.
+ */
+
+#include "qemu/osdep.h"
+#include "aio-posix.h"
+#include "qemu/rcu_queue.h"
+
+/*
+ * These thread-local variables are used only in fdmon_poll_wait() around the
+ * call to the poll() system call. In particular they are not used while
+ * aio_poll is performing callbacks, which makes it much easier to think about
+ * reentrancy!
+ *
+ * Stack-allocated arrays would be perfect but they have size limitations;
+ * heap allocation is expensive enough that we want to reuse arrays across
+ * calls to aio_poll(). And because poll() has to be called without holding
+ * any lock, the arrays cannot be stored in AioContext. Thread-local data
+ * has none of the disadvantages of these three options.
+ */
+static __thread GPollFD *pollfds;
+static __thread AioHandler **nodes;
+static __thread unsigned npfd, nalloc;
+static __thread Notifier pollfds_cleanup_notifier;
+
+static void pollfds_cleanup(Notifier *n, void *unused)
+{
+ g_assert(npfd == 0);
+ g_free(pollfds);
+ g_free(nodes);
+ nalloc = 0;
+}
+
+static void add_pollfd(AioHandler *node)
+{
+ if (npfd == nalloc) {
+ if (nalloc == 0) {
+ pollfds_cleanup_notifier.notify = pollfds_cleanup;
+ qemu_thread_atexit_add(&pollfds_cleanup_notifier);
+ nalloc = 8;
+ } else {
+ g_assert(nalloc <= INT_MAX);
+ nalloc *= 2;
+ }
+ pollfds = g_renew(GPollFD, pollfds, nalloc);
+ nodes = g_renew(AioHandler *, nodes, nalloc);
+ }
+ nodes[npfd] = node;
+ pollfds[npfd] = (GPollFD) {
+ .fd = node->pfd.fd,
+ .events = node->pfd.events,
+ };
+ npfd++;
+}
+
+static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list,
+ int64_t timeout)
+{
+ AioHandler *node;
+ int ret;
+
+ assert(npfd == 0);
+
+ QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) {
+ if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events
+ && aio_node_check(ctx, node->is_external)) {
+ add_pollfd(node);
+ }
+ }
+
+ /* epoll(7) is faster above a certain number of fds */
+ if (fdmon_epoll_try_upgrade(ctx, npfd)) {
+ npfd = 0; /* we won't need pollfds[], reset npfd */
+ return ctx->fdmon_ops->wait(ctx, ready_list, timeout);
+ }
+
+ ret = qemu_poll_ns(pollfds, npfd, timeout);
+ if (ret > 0) {
+ int i;
+
+ for (i = 0; i < npfd; i++) {
+ int revents = pollfds[i].revents;
+
+ if (revents) {
+ aio_add_ready_handler(ready_list, nodes[i], revents);
+ }
+ }
+ }
+
+ npfd = 0;
+ return ret;
+}
+
+static void fdmon_poll_update(AioContext *ctx,
+ AioHandler *old_node,
+ AioHandler *new_node)
+{
+ /* Do nothing, AioHandler already contains the state we'll need */
+}
+
+const FDMonOps fdmon_poll_ops = {
+ .update = fdmon_poll_update,
+ .wait = fdmon_poll_wait,
+ .need_wait = aio_poll_disabled,
+};
diff --git a/util/fifo8.c b/util/fifo8.c
new file mode 100644
index 000000000..d4d1c135e
--- /dev/null
+++ b/util/fifo8.c
@@ -0,0 +1,118 @@
+/*
+ * Generic FIFO component, implemented as a circular buffer.
+ *
+ * Copyright (c) 2012 Peter A. G. Crosthwaite
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "migration/vmstate.h"
+#include "qemu/fifo8.h"
+
+void fifo8_create(Fifo8 *fifo, uint32_t capacity)
+{
+ fifo->data = g_new(uint8_t, capacity);
+ fifo->capacity = capacity;
+ fifo->head = 0;
+ fifo->num = 0;
+}
+
+void fifo8_destroy(Fifo8 *fifo)
+{
+ g_free(fifo->data);
+}
+
+void fifo8_push(Fifo8 *fifo, uint8_t data)
+{
+ assert(fifo->num < fifo->capacity);
+ fifo->data[(fifo->head + fifo->num) % fifo->capacity] = data;
+ fifo->num++;
+}
+
+void fifo8_push_all(Fifo8 *fifo, const uint8_t *data, uint32_t num)
+{
+ uint32_t start, avail;
+
+ assert(fifo->num + num <= fifo->capacity);
+
+ start = (fifo->head + fifo->num) % fifo->capacity;
+
+ if (start + num <= fifo->capacity) {
+ memcpy(&fifo->data[start], data, num);
+ } else {
+ avail = fifo->capacity - start;
+ memcpy(&fifo->data[start], data, avail);
+ memcpy(&fifo->data[0], &data[avail], num - avail);
+ }
+
+ fifo->num += num;
+}
+
+uint8_t fifo8_pop(Fifo8 *fifo)
+{
+ uint8_t ret;
+
+ assert(fifo->num > 0);
+ ret = fifo->data[fifo->head++];
+ fifo->head %= fifo->capacity;
+ fifo->num--;
+ return ret;
+}
+
+const uint8_t *fifo8_pop_buf(Fifo8 *fifo, uint32_t max, uint32_t *num)
+{
+ uint8_t *ret;
+
+ assert(max > 0 && max <= fifo->num);
+ *num = MIN(fifo->capacity - fifo->head, max);
+ ret = &fifo->data[fifo->head];
+ fifo->head += *num;
+ fifo->head %= fifo->capacity;
+ fifo->num -= *num;
+ return ret;
+}
+
+void fifo8_reset(Fifo8 *fifo)
+{
+ fifo->num = 0;
+ fifo->head = 0;
+}
+
+bool fifo8_is_empty(Fifo8 *fifo)
+{
+ return (fifo->num == 0);
+}
+
+bool fifo8_is_full(Fifo8 *fifo)
+{
+ return (fifo->num == fifo->capacity);
+}
+
+uint32_t fifo8_num_free(Fifo8 *fifo)
+{
+ return fifo->capacity - fifo->num;
+}
+
+uint32_t fifo8_num_used(Fifo8 *fifo)
+{
+ return fifo->num;
+}
+
+const VMStateDescription vmstate_fifo8 = {
+ .name = "Fifo8",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_VBUFFER_UINT32(data, Fifo8, 1, NULL, capacity),
+ VMSTATE_UINT32(head, Fifo8),
+ VMSTATE_UINT32(num, Fifo8),
+ VMSTATE_END_OF_LIST()
+ }
+};
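
A short usage sketch of the FIFO above. Note that fifo8_pop_buf()
returns a pointer into the ring buffer, so a region that wraps around
the end of the buffer is delivered across two calls:

    #include "qemu/osdep.h"
    #include "qemu/fifo8.h"

    static void fifo8_demo(void)
    {
        Fifo8 fifo;
        const uint8_t in[4] = { 1, 2, 3, 4 };
        uint32_t n;

        fifo8_create(&fifo, 8);
        fifo8_push_all(&fifo, in, sizeof(in));

        while (!fifo8_is_empty(&fifo)) {
            /* may return fewer than requested bytes at the wrap point */
            const uint8_t *buf = fifo8_pop_buf(&fifo,
                                               fifo8_num_used(&fifo), &n);
            (void)buf;  /* consume n bytes starting at buf */
        }

        fifo8_destroy(&fifo);
    }
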
diff --git a/util/filemonitor-inotify.c b/util/filemonitor-inotify.c
new file mode 100644
index 000000000..2c45f7f17
--- /dev/null
+++ b/util/filemonitor-inotify.c
@@ -0,0 +1,339 @@
+/*
+ * QEMU file monitor Linux inotify impl
+ *
+ * Copyright (c) 2018 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/filemonitor.h"
+#include "qemu/main-loop.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "trace.h"
+
+#include <sys/inotify.h>
+
+struct QFileMonitor {
+ int fd;
+ QemuMutex lock; /* protects dirs & idmap */
+ GHashTable *dirs; /* dirname => QFileMonitorDir */
+ GHashTable *idmap; /* inotify ID => dirname */
+};
+
+
+typedef struct {
+ int64_t id; /* watch ID */
+ char *filename; /* optional filter */
+ QFileMonitorHandler cb;
+ void *opaque;
+} QFileMonitorWatch;
+
+
+typedef struct {
+ char *path;
+ int inotify_id; /* inotify ID */
+ int next_file_id; /* file ID counter */
+ GArray *watches; /* QFileMonitorWatch elements */
+} QFileMonitorDir;
+
+
+static void qemu_file_monitor_watch(void *arg)
+{
+ QFileMonitor *mon = arg;
+ char buf[4096]
+ __attribute__ ((aligned(__alignof__(struct inotify_event))));
+ int used = 0;
+ int len;
+
+ qemu_mutex_lock(&mon->lock);
+
+ if (mon->fd == -1) {
+ qemu_mutex_unlock(&mon->lock);
+ return;
+ }
+
+ len = read(mon->fd, buf, sizeof(buf));
+
+ if (len < 0) {
+ if (errno != EAGAIN) {
+ error_report("Failure monitoring inotify FD '%s',"
+ "disabling events", strerror(errno));
+ goto cleanup;
+ }
+
+ /* no more events right now */
+ goto cleanup;
+ }
+
+ /* Loop over all events in the buffer */
+ while (used < len) {
+ struct inotify_event *ev =
+ (struct inotify_event *)(buf + used);
+ const char *name = ev->len ? ev->name : "";
+ QFileMonitorDir *dir = g_hash_table_lookup(mon->idmap,
+ GINT_TO_POINTER(ev->wd));
+ uint32_t iev = ev->mask &
+ (IN_CREATE | IN_MODIFY | IN_DELETE | IN_IGNORED |
+ IN_MOVED_TO | IN_MOVED_FROM | IN_ATTRIB);
+ int qev;
+ gsize i;
+
+ used += sizeof(struct inotify_event) + ev->len;
+
+ if (!dir) {
+ continue;
+ }
+
+ /*
+ * During a rename operation, the old name gets
+ * IN_MOVED_FROM and the new name gets IN_MOVED_TO.
+ * To simplify life for callers, we turn these into
+ * DELETED and CREATED events
+ */
+ switch (iev) {
+ case IN_CREATE:
+ case IN_MOVED_TO:
+ qev = QFILE_MONITOR_EVENT_CREATED;
+ break;
+ case IN_MODIFY:
+ qev = QFILE_MONITOR_EVENT_MODIFIED;
+ break;
+ case IN_DELETE:
+ case IN_MOVED_FROM:
+ qev = QFILE_MONITOR_EVENT_DELETED;
+ break;
+ case IN_ATTRIB:
+ qev = QFILE_MONITOR_EVENT_ATTRIBUTES;
+ break;
+ case IN_IGNORED:
+ qev = QFILE_MONITOR_EVENT_IGNORED;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+
+ trace_qemu_file_monitor_event(mon, dir->path, name, ev->mask,
+ dir->inotify_id);
+ for (i = 0; i < dir->watches->len; i++) {
+ QFileMonitorWatch *watch = &g_array_index(dir->watches,
+ QFileMonitorWatch,
+ i);
+
+ if (watch->filename == NULL ||
+ (name && g_str_equal(watch->filename, name))) {
+ trace_qemu_file_monitor_dispatch(mon, dir->path, name,
+ qev, watch->cb,
+ watch->opaque, watch->id);
+ watch->cb(watch->id, qev, name, watch->opaque);
+ }
+ }
+ }
+
+ cleanup:
+ qemu_mutex_unlock(&mon->lock);
+}
+
+
+static void
+qemu_file_monitor_dir_free(void *data)
+{
+ QFileMonitorDir *dir = data;
+ gsize i;
+
+ for (i = 0; i < dir->watches->len; i++) {
+ QFileMonitorWatch *watch = &g_array_index(dir->watches,
+ QFileMonitorWatch, i);
+ g_free(watch->filename);
+ }
+ g_array_unref(dir->watches);
+ g_free(dir->path);
+ g_free(dir);
+}
+
+
+QFileMonitor *
+qemu_file_monitor_new(Error **errp)
+{
+ int fd;
+ QFileMonitor *mon;
+
+ fd = inotify_init1(IN_NONBLOCK);
+ if (fd < 0) {
+ error_setg_errno(errp, errno,
+ "Unable to initialize inotify");
+ return NULL;
+ }
+
+ mon = g_new0(QFileMonitor, 1);
+ qemu_mutex_init(&mon->lock);
+ mon->fd = fd;
+
+ mon->dirs = g_hash_table_new_full(g_str_hash, g_str_equal, NULL,
+ qemu_file_monitor_dir_free);
+ mon->idmap = g_hash_table_new(g_direct_hash, g_direct_equal);
+
+ trace_qemu_file_monitor_new(mon, mon->fd);
+
+ return mon;
+}
+
+static gboolean
+qemu_file_monitor_free_idle(void *opaque)
+{
+ QFileMonitor *mon = opaque;
+
+ if (!mon) {
+ return G_SOURCE_REMOVE;
+ }
+
+ qemu_mutex_lock(&mon->lock);
+
+ g_hash_table_unref(mon->idmap);
+ g_hash_table_unref(mon->dirs);
+
+ qemu_mutex_unlock(&mon->lock);
+
+ qemu_mutex_destroy(&mon->lock);
+ g_free(mon);
+
+ return G_SOURCE_REMOVE;
+}
+
+void
+qemu_file_monitor_free(QFileMonitor *mon)
+{
+ if (!mon) {
+ return;
+ }
+
+ qemu_mutex_lock(&mon->lock);
+ if (mon->fd != -1) {
+ qemu_set_fd_handler(mon->fd, NULL, NULL, NULL);
+ close(mon->fd);
+ mon->fd = -1;
+ }
+ qemu_mutex_unlock(&mon->lock);
+
+ /*
+ * Can't free it yet, because another thread
+ * may be running event loop, so the inotify
+ * callback might be pending. Using an idle
+ * source ensures we'll only free after the
+ * pending callback is done
+ */
+ g_idle_add((GSourceFunc)qemu_file_monitor_free_idle, mon);
+}
+
+int64_t
+qemu_file_monitor_add_watch(QFileMonitor *mon,
+ const char *dirpath,
+ const char *filename,
+ QFileMonitorHandler cb,
+ void *opaque,
+ Error **errp)
+{
+ QFileMonitorDir *dir;
+ QFileMonitorWatch watch;
+ int64_t ret = -1;
+
+ qemu_mutex_lock(&mon->lock);
+ dir = g_hash_table_lookup(mon->dirs, dirpath);
+ if (!dir) {
+ int rv = inotify_add_watch(mon->fd, dirpath,
+ IN_CREATE | IN_DELETE | IN_MODIFY |
+ IN_MOVED_TO | IN_MOVED_FROM | IN_ATTRIB);
+
+ if (rv < 0) {
+ error_setg_errno(errp, errno, "Unable to watch '%s'", dirpath);
+ goto cleanup;
+ }
+
+ trace_qemu_file_monitor_enable_watch(mon, dirpath, rv);
+
+ dir = g_new0(QFileMonitorDir, 1);
+ dir->path = g_strdup(dirpath);
+ dir->inotify_id = rv;
+ dir->watches = g_array_new(FALSE, TRUE, sizeof(QFileMonitorWatch));
+
+ g_hash_table_insert(mon->dirs, dir->path, dir);
+ g_hash_table_insert(mon->idmap, GINT_TO_POINTER(rv), dir);
+
+ if (g_hash_table_size(mon->dirs) == 1) {
+ qemu_set_fd_handler(mon->fd, qemu_file_monitor_watch, NULL, mon);
+ }
+ }
+
+ watch.id = (((int64_t)dir->inotify_id) << 32) | dir->next_file_id++;
+ watch.filename = g_strdup(filename);
+ watch.cb = cb;
+ watch.opaque = opaque;
+
+ g_array_append_val(dir->watches, watch);
+
+ trace_qemu_file_monitor_add_watch(mon, dirpath,
+ filename ? filename : "<none>",
+ cb, opaque, watch.id);
+
+ ret = watch.id;
+
+ cleanup:
+ qemu_mutex_unlock(&mon->lock);
+ return ret;
+}
+
+
+void qemu_file_monitor_remove_watch(QFileMonitor *mon,
+ const char *dirpath,
+ int64_t id)
+{
+ QFileMonitorDir *dir;
+ gsize i;
+
+ qemu_mutex_lock(&mon->lock);
+
+ trace_qemu_file_monitor_remove_watch(mon, dirpath, id);
+
+ dir = g_hash_table_lookup(mon->dirs, dirpath);
+ if (!dir) {
+ goto cleanup;
+ }
+
+ for (i = 0; i < dir->watches->len; i++) {
+ QFileMonitorWatch *watch = &g_array_index(dir->watches,
+ QFileMonitorWatch, i);
+ if (watch->id == id) {
+ g_free(watch->filename);
+ g_array_remove_index(dir->watches, i);
+ break;
+ }
+ }
+
+ if (dir->watches->len == 0) {
+ inotify_rm_watch(mon->fd, dir->inotify_id);
+ trace_qemu_file_monitor_disable_watch(mon, dir->path, dir->inotify_id);
+
+ g_hash_table_remove(mon->idmap, GINT_TO_POINTER(dir->inotify_id));
+ g_hash_table_remove(mon->dirs, dir->path);
+
+ if (g_hash_table_size(mon->dirs) == 0) {
+ qemu_set_fd_handler(mon->fd, NULL, NULL, NULL);
+ }
+ }
+
+ cleanup:
+ qemu_mutex_unlock(&mon->lock);
+}
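
A sketch of how a caller might drive the monitor above (the handler
signature follows the dispatch in qemu_file_monitor_watch(); the path
and file names here are placeholders):

    #include "qemu/osdep.h"
    #include "qemu/filemonitor.h"
    #include "qapi/error.h"

    static void on_change(int64_t id, QFileMonitorEvent ev,
                          const char *name, void *opaque)
    {
        /* e.g. reload when ev == QFILE_MONITOR_EVENT_MODIFIED */
    }

    static void monitor_demo(Error **errp)
    {
        QFileMonitor *mon = qemu_file_monitor_new(errp);
        int64_t id;

        if (!mon) {
            return;
        }

        /* a NULL filename would watch every file in the directory */
        id = qemu_file_monitor_add_watch(mon, "/tmp", "demo.conf",
                                         on_change, NULL, errp);
        if (id < 0) {
            qemu_file_monitor_free(mon);
            return;
        }

        /* ... run the event loop; events arrive via on_change() ... */

        qemu_file_monitor_remove_watch(mon, "/tmp", id);
        qemu_file_monitor_free(mon);
    }
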
diff --git a/util/filemonitor-stub.c b/util/filemonitor-stub.c
new file mode 100644
index 000000000..93fef6534
--- /dev/null
+++ b/util/filemonitor-stub.c
@@ -0,0 +1,59 @@
+/*
+ * QEMU file monitor stub impl
+ *
+ * Copyright (c) 2018 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/filemonitor.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+
+
+QFileMonitor *
+qemu_file_monitor_new(Error **errp)
+{
+ error_setg(errp, "File monitoring not available on this platform");
+ return NULL;
+}
+
+
+void
+qemu_file_monitor_free(QFileMonitor *mon G_GNUC_UNUSED)
+{
+}
+
+
+int64_t
+qemu_file_monitor_add_watch(QFileMonitor *mon G_GNUC_UNUSED,
+ const char *dirpath G_GNUC_UNUSED,
+ const char *filename G_GNUC_UNUSED,
+ QFileMonitorHandler cb G_GNUC_UNUSED,
+ void *opaque G_GNUC_UNUSED,
+ Error **errp)
+{
+ error_setg(errp, "File monitoring not available on this platform");
+ return -1;
+}
+
+
+void
+qemu_file_monitor_remove_watch(QFileMonitor *mon G_GNUC_UNUSED,
+ const char *dirpath G_GNUC_UNUSED,
+ int64_t id G_GNUC_UNUSED)
+{
+}
diff --git a/util/getauxval.c b/util/getauxval.c
new file mode 100644
index 000000000..b124107d6
--- /dev/null
+++ b/util/getauxval.c
@@ -0,0 +1,118 @@
+/*
+ * QEMU access to the auxiliary vector
+ *
+ * Copyright (C) 2013 Red Hat, Inc
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+
+#ifdef CONFIG_GETAUXVAL
+/* Don't inline this in qemu/osdep.h, because pulling in <sys/auxv.h> for
+ the system declaration of getauxval pulls in the system <elf.h>, which
+ conflicts with qemu's version. */
+
+#include <sys/auxv.h>
+
+unsigned long qemu_getauxval(unsigned long key)
+{
+ return getauxval(key);
+}
+#elif defined(__linux__)
+#include "elf.h"
+
+/* Our elf.h doesn't contain Elf32_auxv_t and Elf64_auxv_t, which is ok because
+ that just makes it easier to define it properly for the host here. */
+typedef struct {
+ unsigned long a_type;
+ unsigned long a_val;
+} ElfW_auxv_t;
+
+static const ElfW_auxv_t *auxv;
+
+static const ElfW_auxv_t *qemu_init_auxval(void)
+{
+ ElfW_auxv_t *a;
+ ssize_t size = 512, r, ofs;
+ int fd;
+
+ /* Allocate some initial storage. Make sure the first entry is set
+ to end-of-list, so that we've got a valid list in case of error. */
+ auxv = a = g_malloc(size);
+ a[0].a_type = 0;
+ a[0].a_val = 0;
+
+ fd = open("/proc/self/auxv", O_RDONLY);
+ if (fd < 0) {
+ return a;
+ }
+
+ /* Read the first SIZE bytes. Hopefully, this covers everything. */
+ r = read(fd, a, size);
+
+ if (r == size) {
+ /* Continue to expand until we do get a partial read. */
+ do {
+ ofs = size;
+ size *= 2;
+ auxv = a = g_realloc(a, size);
+ r = read(fd, (char *)a + ofs, ofs);
+ } while (r == ofs);
+ }
+
+ close(fd);
+ return a;
+}
+
+unsigned long qemu_getauxval(unsigned long type)
+{
+ const ElfW_auxv_t *a = auxv;
+
+ if (unlikely(a == NULL)) {
+ a = qemu_init_auxval();
+ }
+
+ for (; a->a_type != 0; a++) {
+ if (a->a_type == type) {
+ return a->a_val;
+ }
+ }
+
+ return 0;
+}
+
+#elif defined(__FreeBSD__)
+#include <sys/auxv.h>
+
+unsigned long qemu_getauxval(unsigned long type)
+{
+ unsigned long aux = 0;
+ elf_aux_info(type, &aux, sizeof(aux));
+ return aux;
+}
+
+#else
+
+unsigned long qemu_getauxval(unsigned long type)
+{
+ return 0;
+}
+
+#endif
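
On hosts where the libc helper exists, the whole lookup is a single
call; a minimal standalone example using getauxval(3):

    #include <sys/auxv.h>
    #include <stdio.h>

    int main(void)
    {
        /* AT_PAGESZ is present on all Linux systems; 0 means not found */
        unsigned long pagesz = getauxval(AT_PAGESZ);

        printf("page size from auxv: %lu\n", pagesz);
        return 0;
    }
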
diff --git a/util/guest-random.c b/util/guest-random.c
new file mode 100644
index 000000000..23643f86c
--- /dev/null
+++ b/util/guest-random.c
@@ -0,0 +1,101 @@
+/*
+ * QEMU guest-visible random functions
+ *
+ * Copyright 2019 Linaro, Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+#include "qemu/guest-random.h"
+#include "crypto/random.h"
+#include "sysemu/replay.h"
+
+
+static __thread GRand *thread_rand;
+static bool deterministic;
+
+
+static int glib_random_bytes(void *buf, size_t len)
+{
+ GRand *rand = thread_rand;
+ size_t i;
+ uint32_t x;
+
+ if (unlikely(rand == NULL)) {
+ /* Thread not initialized for a cpu, or main w/o -seed. */
+ thread_rand = rand = g_rand_new();
+ }
+
+ for (i = 0; i + 4 <= len; i += 4) {
+ x = g_rand_int(rand);
+ __builtin_memcpy(buf + i, &x, 4);
+ }
+ if (i < len) {
+ x = g_rand_int(rand);
+ __builtin_memcpy(buf + i, &x, len - i);
+ }
+ return 0;
+}
+
+int qemu_guest_getrandom(void *buf, size_t len, Error **errp)
+{
+ int ret;
+ if (replay_mode == REPLAY_MODE_PLAY) {
+ return replay_read_random(buf, len);
+ }
+ if (unlikely(deterministic)) {
+ /* Deterministic implementation using Glib's Mersenne Twister. */
+ ret = glib_random_bytes(buf, len);
+ } else {
+ /* Non-deterministic implementation using crypto routines. */
+ ret = qcrypto_random_bytes(buf, len, errp);
+ }
+ if (replay_mode == REPLAY_MODE_RECORD) {
+ replay_save_random(ret, buf, len);
+ }
+ return ret;
+}
+
+void qemu_guest_getrandom_nofail(void *buf, size_t len)
+{
+ (void)qemu_guest_getrandom(buf, len, &error_fatal);
+}
+
+uint64_t qemu_guest_random_seed_thread_part1(void)
+{
+ if (deterministic) {
+ uint64_t ret;
+ glib_random_bytes(&ret, sizeof(ret));
+ return ret;
+ }
+ return 0;
+}
+
+void qemu_guest_random_seed_thread_part2(uint64_t seed)
+{
+ g_assert(thread_rand == NULL);
+ if (deterministic) {
+ thread_rand =
+ g_rand_new_with_seed_array((const guint32 *)&seed,
+ sizeof(seed) / sizeof(guint32));
+ }
+}
+
+int qemu_guest_random_seed_main(const char *optarg, Error **errp)
+{
+ unsigned long long seed;
+ if (parse_uint_full(optarg, &seed, 0)) {
+ error_setg(errp, "Invalid seed number: %s", optarg);
+ return -1;
+ } else {
+ deterministic = true;
+ qemu_guest_random_seed_thread_part2(seed);
+ return 0;
+ }
+}
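
The deterministic path above leans on GLib's Mersenne Twister; the
seeding and byte-filling pattern looks like this in isolation
(standalone sketch, mirroring glib_random_bytes()):

    #include <glib.h>
    #include <string.h>

    /* Fill buf with len pseudo-random bytes from a fixed 64-bit seed. */
    static void seeded_bytes(guint64 seed, void *buf, gsize len)
    {
        GRand *rand = g_rand_new_with_seed_array((const guint32 *)&seed,
                                                 sizeof(seed) /
                                                 sizeof(guint32));
        gsize i;

        for (i = 0; i + 4 <= len; i += 4) {
            guint32 x = g_rand_int(rand);
            memcpy((char *)buf + i, &x, 4);
        }
        if (i < len) {
            guint32 x = g_rand_int(rand);
            memcpy((char *)buf + i, &x, len - i);
        }
        g_rand_free(rand);
    }
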
diff --git a/util/hbitmap.c b/util/hbitmap.c
new file mode 100644
index 000000000..305b894a6
--- /dev/null
+++ b/util/hbitmap.c
@@ -0,0 +1,933 @@
+/*
+ * Hierarchical Bitmap Data Type
+ *
+ * Copyright Red Hat, Inc., 2012
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/hbitmap.h"
+#include "qemu/host-utils.h"
+#include "trace.h"
+#include "crypto/hash.h"
+
+/* HBitmap provides an array of bits. The bits are stored as usual in an
+ * array of unsigned longs, but HBitmap is also optimized to provide fast
+ * iteration over set bits; going from one bit to the next is O(logB n)
+ * worst case, with B = sizeof(long) * CHAR_BIT: the result is low enough
+ * that the number of levels is in fact fixed.
+ *
+ * In order to do this, it stacks multiple bitmaps with progressively coarser
+ * granularity; in all levels except the last, bit N is set iff the N-th
+ * unsigned long is nonzero in the immediately next level. When iteration
+ * completes on the last level it can examine the 2nd-last level to quickly
+ * skip entire words, and even do so recursively to skip blocks of 64 words or
+ * powers thereof (32 on 32-bit machines).
+ *
+ * Given an index in the bitmap, it can be split in group of bits like
+ * this (for the 64-bit case):
+ *
+ * bits 0-57 => word in the last bitmap | bits 58-63 => bit in the word
+ * bits 0-51 => word in the 2nd-last bitmap | bits 52-57 => bit in the word
+ * bits 0-45 => word in the 3rd-last bitmap | bits 46-51 => bit in the word
+ *
+ * So it is easy to move up simply by shifting the index right by
+ * log2(BITS_PER_LONG) bits. To move down, you shift the index left
+ * similarly, and add the word index within the group. Iteration uses
+ * ffs (find first set bit) to find the next word to examine; this
+ * operation can be done in constant time in most current architectures.
+ *
+ * When setting or clearing a range of m bits on all levels, the work to perform
+ * is O(m + m/W + m/W^2 + ...), which is O(m) like on a regular bitmap.
+ *
+ * When iterating on a bitmap, each bit (on any level) is only visited
+ * once. Hence, the total cost of visiting a bitmap with m bits in it is
+ * the number of bits that are set in all bitmaps. Unless the bitmap is
+ * extremely sparse, this is also O(m + m/W + m/W^2 + ...), so the amortized
+ * cost of advancing from one bit to the next is usually constant (worst case
+ * O(logB n) as in the non-amortized complexity).
+ */
+
+struct HBitmap {
+ /*
+ * Size of the bitmap, as requested in hbitmap_alloc or in hbitmap_truncate.
+ */
+ uint64_t orig_size;
+
+ /* Number of total bits in the bottom level. */
+ uint64_t size;
+
+ /* Number of set bits in the bottom level. */
+ uint64_t count;
+
+ /* A scaling factor. Given a granularity of G, each bit in the bitmap
+ * actually represents a group of 2^G elements. Each operation on a
+ * range of bits first rounds the bits to determine which group they land
+ * in, and then affects the entire group; iteration will only visit the first
+ * bit of each group. Here is an example of operations in a size-16,
+ * granularity-1 HBitmap:
+ *
+ * initial state 00000000
+ * set(start=0, count=9) 11111000 (iter: 0, 2, 4, 6, 8)
+ * reset(start=1, count=3) 00111000 (iter: 4, 6, 8)
+ * set(start=9, count=2) 00111100 (iter: 4, 6, 8, 10)
+ * reset(start=5, count=5) 00000000
+ *
+ * From an implementation point of view, when setting or resetting bits,
+ * the bitmap will scale bit numbers right by this amount of bits. When
+ * iterating, the bitmap will scale bit numbers left by this amount of
+ * bits.
+ */
+ int granularity;
+
+ /* A meta dirty bitmap to track the dirtiness of bits in this HBitmap. */
+ HBitmap *meta;
+
+ /* A number of progressively less coarse bitmaps (i.e. level 0 is the
+ * coarsest). Each bit in level N represents a word in level N+1 that
+ * has a set bit, except the last level where each bit represents the
+ * actual bitmap.
+ *
+ * Note that all bitmaps have the same number of levels. Even a 1-bit
+ * bitmap will still allocate HBITMAP_LEVELS arrays.
+ */
+ unsigned long *levels[HBITMAP_LEVELS];
+
+ /* The length of each levels[] array. */
+ uint64_t sizes[HBITMAP_LEVELS];
+};
+
+/* Advance hbi to the next nonzero word and return it. hbi->pos
+ * is updated. Returns zero if we reach the end of the bitmap.
+ */
+static unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi)
+{
+ size_t pos = hbi->pos;
+ const HBitmap *hb = hbi->hb;
+ unsigned i = HBITMAP_LEVELS - 1;
+
+ unsigned long cur;
+ do {
+ i--;
+ pos >>= BITS_PER_LEVEL;
+ cur = hbi->cur[i] & hb->levels[i][pos];
+ } while (cur == 0);
+
+ /* Check for end of iteration. We always use fewer than BITS_PER_LONG
+ * bits in the level 0 bitmap; thus we can repurpose the most significant
+ * bit as a sentinel. The sentinel is set in hbitmap_alloc and ensures
+ * that the above loop ends even without an explicit check on i.
+ */
+
+ if (i == 0 && cur == (1UL << (BITS_PER_LONG - 1))) {
+ return 0;
+ }
+ for (; i < HBITMAP_LEVELS - 1; i++) {
+ /* Shift back pos to the left, matching the right shifts above.
+ * The index of this word's least significant set bit provides
+ * the low-order bits.
+ */
+ assert(cur);
+ pos = (pos << BITS_PER_LEVEL) + ctzl(cur);
+ hbi->cur[i] = cur & (cur - 1);
+
+ /* Set up next level for iteration. */
+ cur = hb->levels[i + 1][pos];
+ }
+
+ hbi->pos = pos;
+ trace_hbitmap_iter_skip_words(hbi->hb, hbi, pos, cur);
+
+ assert(cur);
+ return cur;
+}
+
+int64_t hbitmap_iter_next(HBitmapIter *hbi)
+{
+ unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1] &
+ hbi->hb->levels[HBITMAP_LEVELS - 1][hbi->pos];
+ int64_t item;
+
+ if (cur == 0) {
+ cur = hbitmap_iter_skip_words(hbi);
+ if (cur == 0) {
+ return -1;
+ }
+ }
+
+ /* The next call will resume work from the next bit. */
+ hbi->cur[HBITMAP_LEVELS - 1] = cur & (cur - 1);
+ item = ((uint64_t)hbi->pos << BITS_PER_LEVEL) + ctzl(cur);
+
+ return item << hbi->granularity;
+}
+
+void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first)
+{
+ unsigned i, bit;
+ uint64_t pos;
+
+ hbi->hb = hb;
+ pos = first >> hb->granularity;
+ assert(pos < hb->size);
+ hbi->pos = pos >> BITS_PER_LEVEL;
+ hbi->granularity = hb->granularity;
+
+ for (i = HBITMAP_LEVELS; i-- > 0; ) {
+ bit = pos & (BITS_PER_LONG - 1);
+ pos >>= BITS_PER_LEVEL;
+
+ /* Drop bits representing items before first. */
+ hbi->cur[i] = hb->levels[i][pos] & ~((1UL << bit) - 1);
+
+ /* We have already added level i+1, so the lowest set bit has
+ * been processed. Clear it.
+ */
+ if (i != HBITMAP_LEVELS - 1) {
+ hbi->cur[i] &= ~(1UL << bit);
+ }
+ }
+}
+
+int64_t hbitmap_next_dirty(const HBitmap *hb, int64_t start, int64_t count)
+{
+ HBitmapIter hbi;
+ int64_t first_dirty_off;
+ uint64_t end;
+
+ assert(start >= 0 && count >= 0);
+
+ if (start >= hb->orig_size || count == 0) {
+ return -1;
+ }
+
+ end = count > hb->orig_size - start ? hb->orig_size : start + count;
+
+ hbitmap_iter_init(&hbi, hb, start);
+ first_dirty_off = hbitmap_iter_next(&hbi);
+
+ if (first_dirty_off < 0 || first_dirty_off >= end) {
+ return -1;
+ }
+
+ return MAX(start, first_dirty_off);
+}
+
+int64_t hbitmap_next_zero(const HBitmap *hb, int64_t start, int64_t count)
+{
+ size_t pos = (start >> hb->granularity) >> BITS_PER_LEVEL;
+ unsigned long *last_lev = hb->levels[HBITMAP_LEVELS - 1];
+ unsigned long cur = last_lev[pos];
+ unsigned start_bit_offset;
+ uint64_t end_bit, sz;
+ int64_t res;
+
+ assert(start >= 0 && count >= 0);
+
+ if (start >= hb->orig_size || count == 0) {
+ return -1;
+ }
+
+ end_bit = count > hb->orig_size - start ?
+ hb->size :
+ ((start + count - 1) >> hb->granularity) + 1;
+ sz = (end_bit + BITS_PER_LONG - 1) >> BITS_PER_LEVEL;
+
+ /* There may be some zero bits in @cur before @start. We are not interested
+ * in them, so set them.
+ */
+ start_bit_offset = (start >> hb->granularity) & (BITS_PER_LONG - 1);
+ cur |= (1UL << start_bit_offset) - 1;
+ assert((start >> hb->granularity) < hb->size);
+
+ if (cur == (unsigned long)-1) {
+ do {
+ pos++;
+ } while (pos < sz && last_lev[pos] == (unsigned long)-1);
+
+ if (pos >= sz) {
+ return -1;
+ }
+
+ cur = last_lev[pos];
+ }
+
+ res = (pos << BITS_PER_LEVEL) + ctzl(cur);
+ if (res >= end_bit) {
+ return -1;
+ }
+
+ res = res << hb->granularity;
+ if (res < start) {
+ assert(((start - res) >> hb->granularity) == 0);
+ return start;
+ }
+
+ return res;
+}
+
+bool hbitmap_next_dirty_area(const HBitmap *hb, int64_t start, int64_t end,
+ int64_t max_dirty_count,
+ int64_t *dirty_start, int64_t *dirty_count)
+{
+ int64_t next_zero;
+
+ assert(start >= 0 && end >= 0 && max_dirty_count > 0);
+
+ end = MIN(end, hb->orig_size);
+ if (start >= end) {
+ return false;
+ }
+
+ start = hbitmap_next_dirty(hb, start, end - start);
+ if (start < 0) {
+ return false;
+ }
+
+ end = start + MIN(end - start, max_dirty_count);
+
+ next_zero = hbitmap_next_zero(hb, start, end - start);
+ if (next_zero >= 0) {
+ end = next_zero;
+ }
+
+ *dirty_start = start;
+ *dirty_count = end - start;
+
+ return true;
+}
+
+bool hbitmap_empty(const HBitmap *hb)
+{
+ return hb->count == 0;
+}
+
+int hbitmap_granularity(const HBitmap *hb)
+{
+ return hb->granularity;
+}
+
+uint64_t hbitmap_count(const HBitmap *hb)
+{
+ return hb->count << hb->granularity;
+}
+
+/**
+ * hbitmap_iter_next_word:
+ * @hbi: HBitmapIter to operate on.
+ * @p_cur: Location where to store the next non-zero word.
+ *
+ * Return the index of the next nonzero word that is set in @hbi's
+ * associated HBitmap, and set *p_cur to the content of that word
+ * (bits before the index that was passed to hbitmap_iter_init are
+ * trimmed on the first call). Return -1, and set *p_cur to zero,
+ * if all remaining words are zero.
+ */
+static size_t hbitmap_iter_next_word(HBitmapIter *hbi, unsigned long *p_cur)
+{
+ unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1];
+
+ if (cur == 0) {
+ cur = hbitmap_iter_skip_words(hbi);
+ if (cur == 0) {
+ *p_cur = 0;
+ return -1;
+ }
+ }
+
+ /* The next call will resume work from the next word. */
+ hbi->cur[HBITMAP_LEVELS - 1] = 0;
+ *p_cur = cur;
+ return hbi->pos;
+}
+
+/* Count the number of set bits between start and end, not accounting for
+ * the granularity. Also an example of how to use hbitmap_iter_next_word.
+ */
+static uint64_t hb_count_between(HBitmap *hb, uint64_t start, uint64_t last)
+{
+ HBitmapIter hbi;
+ uint64_t count = 0;
+ uint64_t end = last + 1;
+ unsigned long cur;
+ size_t pos;
+
+ hbitmap_iter_init(&hbi, hb, start << hb->granularity);
+ for (;;) {
+ pos = hbitmap_iter_next_word(&hbi, &cur);
+ if (pos >= (end >> BITS_PER_LEVEL)) {
+ break;
+ }
+ count += ctpopl(cur);
+ }
+
+ if (pos == (end >> BITS_PER_LEVEL)) {
+ /* Drop bits representing the END-th and subsequent items. */
+ int bit = end & (BITS_PER_LONG - 1);
+ cur &= (1UL << bit) - 1;
+ count += ctpopl(cur);
+ }
+
+ return count;
+}
+
+/* Setting starts at the last layer and propagates up if an element
+ * changes.
+ */
+static inline bool hb_set_elem(unsigned long *elem, uint64_t start, uint64_t last)
+{
+ unsigned long mask;
+ unsigned long old;
+
+ assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL));
+ assert(start <= last);
+
+ mask = 2UL << (last & (BITS_PER_LONG - 1));
+ mask -= 1UL << (start & (BITS_PER_LONG - 1));
+ old = *elem;
+ *elem |= mask;
+ return old != *elem;
+}
+
+/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)...
+ * Returns true if at least one bit is changed. */
+static bool hb_set_between(HBitmap *hb, int level, uint64_t start,
+ uint64_t last)
+{
+ size_t pos = start >> BITS_PER_LEVEL;
+ size_t lastpos = last >> BITS_PER_LEVEL;
+ bool changed = false;
+ size_t i;
+
+ i = pos;
+ if (i < lastpos) {
+ uint64_t next = (start | (BITS_PER_LONG - 1)) + 1;
+ changed |= hb_set_elem(&hb->levels[level][i], start, next - 1);
+ for (;;) {
+ start = next;
+ next += BITS_PER_LONG;
+ if (++i == lastpos) {
+ break;
+ }
+ changed |= (hb->levels[level][i] == 0);
+ hb->levels[level][i] = ~0UL;
+ }
+ }
+ changed |= hb_set_elem(&hb->levels[level][i], start, last);
+
+ /* If there was any change in this layer, we may have to update
+ * the one above.
+ */
+ if (level > 0 && changed) {
+ hb_set_between(hb, level - 1, pos, lastpos);
+ }
+ return changed;
+}
+
+void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count)
+{
+ /* Compute range in the last layer. */
+ uint64_t first, n;
+ uint64_t last = start + count - 1;
+
+ if (count == 0) {
+ return;
+ }
+
+ trace_hbitmap_set(hb, start, count,
+ start >> hb->granularity, last >> hb->granularity);
+
+ first = start >> hb->granularity;
+ last >>= hb->granularity;
+ assert(last < hb->size);
+ n = last - first + 1;
+
+ hb->count += n - hb_count_between(hb, first, last);
+ if (hb_set_between(hb, HBITMAP_LEVELS - 1, first, last) &&
+ hb->meta) {
+ hbitmap_set(hb->meta, start, count);
+ }
+}
+
+/* Resetting works the other way round: propagate up if the new
+ * value is zero.
+ */
+static inline bool hb_reset_elem(unsigned long *elem, uint64_t start, uint64_t last)
+{
+ unsigned long mask;
+ bool blanked;
+
+ assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL));
+ assert(start <= last);
+
+ mask = 2UL << (last & (BITS_PER_LONG - 1));
+ mask -= 1UL << (start & (BITS_PER_LONG - 1));
+ blanked = *elem != 0 && ((*elem & ~mask) == 0);
+ *elem &= ~mask;
+ return blanked;
+}
+
+/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)...
+ * Returns true if at least one bit is changed. */
+static bool hb_reset_between(HBitmap *hb, int level, uint64_t start,
+ uint64_t last)
+{
+ size_t pos = start >> BITS_PER_LEVEL;
+ size_t lastpos = last >> BITS_PER_LEVEL;
+ bool changed = false;
+ size_t i;
+
+ i = pos;
+ if (i < lastpos) {
+ uint64_t next = (start | (BITS_PER_LONG - 1)) + 1;
+
+ /* Here we need a more complex test than when setting bits. Even if
+ * something was changed, we must not blank bits in the upper level
+ * unless the lower-level word became entirely zero. So, remove pos
+ * from the upper-level range if bits remain set.
+ */
+ if (hb_reset_elem(&hb->levels[level][i], start, next - 1)) {
+ changed = true;
+ } else {
+ pos++;
+ }
+
+ for (;;) {
+ start = next;
+ next += BITS_PER_LONG;
+ if (++i == lastpos) {
+ break;
+ }
+ changed |= (hb->levels[level][i] != 0);
+ hb->levels[level][i] = 0UL;
+ }
+ }
+
+ /* Same as above, this time for lastpos. */
+ if (hb_reset_elem(&hb->levels[level][i], start, last)) {
+ changed = true;
+ } else {
+ lastpos--;
+ }
+
+ if (level > 0 && changed) {
+ hb_reset_between(hb, level - 1, pos, lastpos);
+ }
+
+ return changed;
+}
+
+void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count)
+{
+ /* Compute range in the last layer. */
+ uint64_t first;
+ uint64_t last = start + count - 1;
+ uint64_t gran = 1ULL << hb->granularity;
+
+ if (count == 0) {
+ return;
+ }
+
+ assert(QEMU_IS_ALIGNED(start, gran));
+ assert(QEMU_IS_ALIGNED(count, gran) || (start + count == hb->orig_size));
+
+ trace_hbitmap_reset(hb, start, count,
+ start >> hb->granularity, last >> hb->granularity);
+
+ first = start >> hb->granularity;
+ last >>= hb->granularity;
+ assert(last < hb->size);
+
+ hb->count -= hb_count_between(hb, first, last);
+ if (hb_reset_between(hb, HBITMAP_LEVELS - 1, first, last) &&
+ hb->meta) {
+ hbitmap_set(hb->meta, start, count);
+ }
+}
+
+void hbitmap_reset_all(HBitmap *hb)
+{
+ unsigned int i;
+
+ /* Same as hbitmap_alloc() except for memset() instead of malloc() */
+ for (i = HBITMAP_LEVELS; --i >= 1; ) {
+ memset(hb->levels[i], 0, hb->sizes[i] * sizeof(unsigned long));
+ }
+
+ hb->levels[0][0] = 1UL << (BITS_PER_LONG - 1);
+ hb->count = 0;
+}
+
+bool hbitmap_is_serializable(const HBitmap *hb)
+{
+ /* Every serialized chunk must be aligned to 64 bits so that endianness
+ * requirements can be fulfilled on both 64 bit and 32 bit hosts.
+ * We have hbitmap_serialization_align() which converts this
+ * alignment requirement from bitmap bits to items covered (e.g. sectors).
+ * That value is:
+ * 64 << hb->granularity
+ * Since this value must not exceed UINT64_MAX, hb->granularity must be
+ * less than 58 (== 64 - 6, where 6 is log2(64), i.e. 1 << 6 == 64).
+ *
+ * In order for hbitmap_serialization_align() to always return a
+ * meaningful value, bitmaps that are to be serialized must have a
+ * granularity of less than 58. */
+
+ return hb->granularity < 58;
+}
+
+bool hbitmap_get(const HBitmap *hb, uint64_t item)
+{
+ /* Compute position and bit in the last layer. */
+ uint64_t pos = item >> hb->granularity;
+ unsigned long bit = 1UL << (pos & (BITS_PER_LONG - 1));
+ assert(pos < hb->size);
+
+ return (hb->levels[HBITMAP_LEVELS - 1][pos >> BITS_PER_LEVEL] & bit) != 0;
+}
+
+uint64_t hbitmap_serialization_align(const HBitmap *hb)
+{
+ assert(hbitmap_is_serializable(hb));
+
+ /* Require at least 64 bit granularity to be safe on both 64 bit and 32 bit
+ * hosts. */
+ return UINT64_C(64) << hb->granularity;
+}
+
+/* Start should be aligned to serialization granularity, chunk size should be
+ * aligned to serialization granularity too, except for last chunk.
+ */
+static void serialization_chunk(const HBitmap *hb,
+ uint64_t start, uint64_t count,
+ unsigned long **first_el, uint64_t *el_count)
+{
+ uint64_t last = start + count - 1;
+ uint64_t gran = hbitmap_serialization_align(hb);
+
+ assert((start & (gran - 1)) == 0);
+ assert((last >> hb->granularity) < hb->size);
+ if ((last >> hb->granularity) != hb->size - 1) {
+ assert((count & (gran - 1)) == 0);
+ }
+
+ start = (start >> hb->granularity) >> BITS_PER_LEVEL;
+ last = (last >> hb->granularity) >> BITS_PER_LEVEL;
+
+ *first_el = &hb->levels[HBITMAP_LEVELS - 1][start];
+ *el_count = last - start + 1;
+}
+
+uint64_t hbitmap_serialization_size(const HBitmap *hb,
+ uint64_t start, uint64_t count)
+{
+ uint64_t el_count;
+ unsigned long *cur;
+
+ if (!count) {
+ return 0;
+ }
+ serialization_chunk(hb, start, count, &cur, &el_count);
+
+ return el_count * sizeof(unsigned long);
+}
+
+void hbitmap_serialize_part(const HBitmap *hb, uint8_t *buf,
+ uint64_t start, uint64_t count)
+{
+ uint64_t el_count;
+ unsigned long *cur, *end;
+
+ if (!count) {
+ return;
+ }
+ serialization_chunk(hb, start, count, &cur, &el_count);
+ end = cur + el_count;
+
+ while (cur != end) {
+ unsigned long el =
+ (BITS_PER_LONG == 32 ? cpu_to_le32(*cur) : cpu_to_le64(*cur));
+
+ memcpy(buf, &el, sizeof(el));
+ buf += sizeof(el);
+ cur++;
+ }
+}
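+
+/* Illustrative sketch (not built): serializing a whole bitmap into one
+ * chunk, assuming @nb_items covers the full bitmap (so the start, 0, is
+ * trivially aligned and the chunk is allowed to be the unaligned last
+ * one). example_serialize_all is a hypothetical name.
+ */
+#if 0
+static uint8_t *example_serialize_all(const HBitmap *hb, uint64_t nb_items,
+                                      uint64_t *size_out)
+{
+    uint64_t size = hbitmap_serialization_size(hb, 0, nb_items);
+    uint8_t *buf = g_malloc(size);
+
+    hbitmap_serialize_part(hb, buf, 0, nb_items);
+    *size_out = size;
+    return buf;
+}
+#endif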
+
+void hbitmap_deserialize_part(HBitmap *hb, uint8_t *buf,
+ uint64_t start, uint64_t count,
+ bool finish)
+{
+ uint64_t el_count;
+ unsigned long *cur, *end;
+
+ if (!count) {
+ return;
+ }
+ serialization_chunk(hb, start, count, &cur, &el_count);
+ end = cur + el_count;
+
+ while (cur != end) {
+ memcpy(cur, buf, sizeof(*cur));
+
+ if (BITS_PER_LONG == 32) {
+ le32_to_cpus((uint32_t *)cur);
+ } else {
+ le64_to_cpus((uint64_t *)cur);
+ }
+
+ buf += sizeof(unsigned long);
+ cur++;
+ }
+ if (finish) {
+ hbitmap_deserialize_finish(hb);
+ }
+}
+
+void hbitmap_deserialize_zeroes(HBitmap *hb, uint64_t start, uint64_t count,
+ bool finish)
+{
+ uint64_t el_count;
+ unsigned long *first;
+
+ if (!count) {
+ return;
+ }
+ serialization_chunk(hb, start, count, &first, &el_count);
+
+ memset(first, 0, el_count * sizeof(unsigned long));
+ if (finish) {
+ hbitmap_deserialize_finish(hb);
+ }
+}
+
+void hbitmap_deserialize_ones(HBitmap *hb, uint64_t start, uint64_t count,
+ bool finish)
+{
+ uint64_t el_count;
+ unsigned long *first;
+
+ if (!count) {
+ return;
+ }
+ serialization_chunk(hb, start, count, &first, &el_count);
+
+ memset(first, 0xff, el_count * sizeof(unsigned long));
+ if (finish) {
+ hbitmap_deserialize_finish(hb);
+ }
+}
+
+void hbitmap_deserialize_finish(HBitmap *bitmap)
+{
+ int64_t i, size, prev_size;
+ int lev;
+
+ /* restore levels starting from penultimate to zero level, assuming
+ * that the last level is ok */
+ size = MAX((bitmap->size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1);
+ for (lev = HBITMAP_LEVELS - 1; lev-- > 0; ) {
+ prev_size = size;
+ size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1);
+ memset(bitmap->levels[lev], 0, size * sizeof(unsigned long));
+
+ for (i = 0; i < prev_size; ++i) {
+ if (bitmap->levels[lev + 1][i]) {
+ bitmap->levels[lev][i >> BITS_PER_LEVEL] |=
+ 1UL << (i & (BITS_PER_LONG - 1));
+ }
+ }
+ }
+
+ bitmap->levels[0][0] |= 1UL << (BITS_PER_LONG - 1);
+ bitmap->count = hb_count_between(bitmap, 0, bitmap->size - 1);
+}
+
+void hbitmap_free(HBitmap *hb)
+{
+ unsigned i;
+ assert(!hb->meta);
+ for (i = HBITMAP_LEVELS; i-- > 0; ) {
+ g_free(hb->levels[i]);
+ }
+ g_free(hb);
+}
+
+HBitmap *hbitmap_alloc(uint64_t size, int granularity)
+{
+ HBitmap *hb = g_new0(struct HBitmap, 1);
+ unsigned i;
+
+ assert(size <= INT64_MAX);
+ hb->orig_size = size;
+
+ assert(granularity >= 0 && granularity < 64);
+ size = (size + (1ULL << granularity) - 1) >> granularity;
+ assert(size <= ((uint64_t)1 << HBITMAP_LOG_MAX_SIZE));
+
+ hb->size = size;
+ hb->granularity = granularity;
+ for (i = HBITMAP_LEVELS; i-- > 0; ) {
+ size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1);
+ hb->sizes[i] = size;
+ hb->levels[i] = g_new0(unsigned long, size);
+ }
+
+ /* We necessarily have free bits in level 0 due to the definition
+ * of HBITMAP_LEVELS, so use one for a sentinel. This speeds up
+ * hbitmap_iter_skip_words.
+ */
+ assert(size == 1);
+ hb->levels[0][0] |= 1UL << (BITS_PER_LONG - 1);
+ return hb;
+}
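+
+/* Illustrative sketch (not built): a bitmap tracking 1 MiB of data in
+ * 4 KiB (1 << 12) chunks. Setting any byte range marks whole chunks
+ * dirty, and hbitmap_count() reports the result scaled back to items.
+ */
+#if 0
+static void example_alloc(void)
+{
+    HBitmap *map = hbitmap_alloc(1 << 20, 12);
+
+    hbitmap_set(map, 0x1000, 0x2000); /* two 4 KiB chunks become dirty */
+    assert(hbitmap_count(map) == 0x2000);
+    hbitmap_free(map);
+}
+#endif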
+
+void hbitmap_truncate(HBitmap *hb, uint64_t size)
+{
+ bool shrink;
+ unsigned i;
+ uint64_t num_elements = size;
+ uint64_t old;
+
+ assert(size <= INT64_MAX);
+ hb->orig_size = size;
+
+ /* Size comes in as logical elements, adjust for granularity. */
+ size = (size + (1ULL << hb->granularity) - 1) >> hb->granularity;
+ assert(size <= ((uint64_t)1 << HBITMAP_LOG_MAX_SIZE));
+ shrink = size < hb->size;
+
+ /* bit sizes are identical; nothing to do. */
+ if (size == hb->size) {
+ return;
+ }
+
+ /* If we're losing bits, let's clear those bits before we invalidate all of
+ * our invariants. This helps keep the bitcount consistent, and will prevent
+ * us from carrying around garbage bits beyond the end of the map.
+ */
+ if (shrink) {
+ /* Don't clear partial granularity groups;
+ * start at the first full one. */
+ uint64_t start = ROUND_UP(num_elements, UINT64_C(1) << hb->granularity);
+ uint64_t fix_count = (hb->size << hb->granularity) - start;
+
+ assert(fix_count);
+ hbitmap_reset(hb, start, fix_count);
+ }
+
+ hb->size = size;
+ for (i = HBITMAP_LEVELS; i-- > 0; ) {
+ size = MAX(BITS_TO_LONGS(size), 1);
+ if (hb->sizes[i] == size) {
+ break;
+ }
+ old = hb->sizes[i];
+ hb->sizes[i] = size;
+ hb->levels[i] = g_realloc(hb->levels[i], size * sizeof(unsigned long));
+ if (!shrink) {
+ memset(&hb->levels[i][old], 0x00,
+ (size - old) * sizeof(*hb->levels[i]));
+ }
+ }
+ if (hb->meta) {
+ hbitmap_truncate(hb->meta, hb->size << hb->granularity);
+ }
+}
+
+bool hbitmap_can_merge(const HBitmap *a, const HBitmap *b)
+{
+ return (a->orig_size == b->orig_size);
+}
+
+/**
+ * hbitmap_sparse_merge: perform dst = dst | src.
+ * Works with differing granularities, and is best used when src is
+ * sparsely populated.
+ */
+static void hbitmap_sparse_merge(HBitmap *dst, const HBitmap *src)
+{
+ int64_t offset;
+ int64_t count;
+
+ for (offset = 0;
+ hbitmap_next_dirty_area(src, offset, src->orig_size, INT64_MAX,
+ &offset, &count);
+ offset += count)
+ {
+ hbitmap_set(dst, offset, count);
+ }
+}
+
+/**
+ * Given HBitmaps A and B, let R := A (BITOR) B.
+ * Bitmaps A and B will not be modified,
+ * except when bitmap R is an alias of A or B.
+ *
+ * @return true if the merge was successful,
+ * false if it was not attempted.
+ */
+bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result)
+{
+ int i;
+ uint64_t j;
+
+ if (!hbitmap_can_merge(a, b) || !hbitmap_can_merge(a, result)) {
+ return false;
+ }
+ assert(hbitmap_can_merge(b, result));
+
+ if ((!hbitmap_count(a) && result == b) ||
+ (!hbitmap_count(b) && result == a)) {
+ return true;
+ }
+
+ if (!hbitmap_count(a) && !hbitmap_count(b)) {
+ hbitmap_reset_all(result);
+ return true;
+ }
+
+ if (a->granularity != b->granularity) {
+ if ((a != result) && (b != result)) {
+ hbitmap_reset_all(result);
+ }
+ if (a != result) {
+ hbitmap_sparse_merge(result, a);
+ }
+ if (b != result) {
+ hbitmap_sparse_merge(result, b);
+ }
+ return true;
+ }
+
+ /* This merge is O(size), as BITS_PER_LONG and HBITMAP_LEVELS are constant.
+ * It may be possible to improve running times for sparsely populated maps
+ * by using hbitmap_iter_next, but this is suboptimal for dense maps.
+ */
+ assert(a->size == b->size);
+ for (i = HBITMAP_LEVELS - 1; i >= 0; i--) {
+ for (j = 0; j < a->sizes[i]; j++) {
+ result->levels[i][j] = a->levels[i][j] | b->levels[i][j];
+ }
+ }
+
+ /* Recompute the dirty count */
+ result->count = hb_count_between(result, 0, result->size - 1);
+
+ return true;
+}
+
+char *hbitmap_sha256(const HBitmap *bitmap, Error **errp)
+{
+ size_t size = bitmap->sizes[HBITMAP_LEVELS - 1] * sizeof(unsigned long);
+ char *data = (char *)bitmap->levels[HBITMAP_LEVELS - 1];
+ char *hash = NULL;
+ qcrypto_hash_digest(QCRYPTO_HASH_ALG_SHA256, data, size, &hash, errp);
+
+ return hash;
+}
diff --git a/util/hexdump.c b/util/hexdump.c
new file mode 100644
index 000000000..2c105a884
--- /dev/null
+++ b/util/hexdump.c
@@ -0,0 +1,65 @@
+/*
+ * Helper to hexdump a buffer
+ *
+ * Copyright (c) 2013 Red Hat, Inc.
+ * Copyright (c) 2013 Gerd Hoffmann <kraxel@redhat.com>
+ * Copyright (c) 2013 Peter Crosthwaite <peter.crosthwaite@xilinx.com>
+ * Copyright (c) 2013 Xilinx, Inc
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+void qemu_hexdump_line(char *line, unsigned int b, const void *bufptr,
+ unsigned int len, bool ascii)
+{
+ const char *buf = bufptr;
+ int i, c;
+
+ if (len > QEMU_HEXDUMP_LINE_BYTES) {
+ len = QEMU_HEXDUMP_LINE_BYTES;
+ }
+
+ line += snprintf(line, 6, "%04x:", b);
+ for (i = 0; i < QEMU_HEXDUMP_LINE_BYTES; i++) {
+ if ((i % 4) == 0) {
+ *line++ = ' ';
+ }
+ if (i < len) {
+ line += sprintf(line, " %02x", (unsigned char)buf[b + i]);
+ } else {
+ line += sprintf(line, " ");
+ }
+ }
+ if (ascii) {
+ *line++ = ' ';
+ for (i = 0; i < len; i++) {
+ c = buf[b + i];
+ if (c < ' ' || c > '~') {
+ c = '.';
+ }
+ *line++ = c;
+ }
+ }
+ *line = '\0';
+}
+
+void qemu_hexdump(FILE *fp, const char *prefix,
+ const void *bufptr, size_t size)
+{
+ unsigned int b, len;
+ char line[QEMU_HEXDUMP_LINE_LEN];
+
+ for (b = 0; b < size; b += QEMU_HEXDUMP_LINE_BYTES) {
+ len = size - b;
+ qemu_hexdump_line(line, b, bufptr, len, true);
+ fprintf(fp, "%s: %s\n", prefix, line);
+ }
+}
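+
+/* Illustrative sketch (not built): dumping a buffer to stderr, one
+ * prefixed line per QEMU_HEXDUMP_LINE_BYTES of input. The "virtio"
+ * prefix is just an example.
+ */
+#if 0
+static void example_hexdump(const uint8_t *pkt, size_t len)
+{
+    qemu_hexdump(stderr, "virtio", pkt, len);
+}
+#endif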
diff --git a/util/host-utils.c b/util/host-utils.c
new file mode 100644
index 000000000..bcc772b8e
--- /dev/null
+++ b/util/host-utils.c
@@ -0,0 +1,268 @@
+/*
+ * Utility compute operations used by translated code.
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ * Copyright (c) 2007 Aurelien Jarno
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+
+#ifndef CONFIG_INT128
+/* Long integer helpers */
+static inline void mul64(uint64_t *plow, uint64_t *phigh,
+ uint64_t a, uint64_t b)
+{
+ typedef union {
+ uint64_t ll;
+ struct {
+#ifdef HOST_WORDS_BIGENDIAN
+ uint32_t high, low;
+#else
+ uint32_t low, high;
+#endif
+ } l;
+ } LL;
+ LL rl, rm, rn, rh, a0, b0;
+ uint64_t c;
+
+ a0.ll = a;
+ b0.ll = b;
+
+ rl.ll = (uint64_t)a0.l.low * b0.l.low;
+ rm.ll = (uint64_t)a0.l.low * b0.l.high;
+ rn.ll = (uint64_t)a0.l.high * b0.l.low;
+ rh.ll = (uint64_t)a0.l.high * b0.l.high;
+
+ c = (uint64_t)rl.l.high + rm.l.low + rn.l.low;
+ rl.l.high = c;
+ c >>= 32;
+ c = c + rm.l.high + rn.l.high + rh.l.low;
+ rh.l.low = c;
+ rh.l.high += (uint32_t)(c >> 32);
+
+ *plow = rl.ll;
+ *phigh = rh.ll;
+}
+
+/* Unsigned 64x64 -> 128 multiplication */
+void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b)
+{
+ mul64(plow, phigh, a, b);
+}
+
+/* Signed 64x64 -> 128 multiplication */
+void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
+{
+ uint64_t rh;
+
+ mul64(plow, &rh, a, b);
+
+ /* Adjust for signs. */
+ if (b < 0) {
+ rh -= a;
+ }
+ if (a < 0) {
+ rh -= b;
+ }
+ *phigh = rh;
+}
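+
+/* Illustrative sketch (not built): mulu64() produces the full 128-bit
+ * product, e.g. (2^64 - 1) * 2 == 2^65 - 2, split across *phigh/*plow.
+ */
+#if 0
+static void example_mulu64(void)
+{
+    uint64_t lo, hi;
+
+    mulu64(&lo, &hi, UINT64_MAX, 2);
+    assert(hi == 1 && lo == UINT64_MAX - 1);
+}
+#endif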
+
+/*
+ * Unsigned 128-by-64 division.
+ * Returns the quotient via plow and phigh.
+ * Returns the remainder via the function return value.
+ */
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+{
+ uint64_t dhi = *phigh;
+ uint64_t dlo = *plow;
+ uint64_t rem, dhighest;
+ int sh;
+
+ if (divisor == 0 || dhi == 0) {
+ *plow = dlo / divisor;
+ *phigh = 0;
+ return dlo % divisor;
+ } else {
+ sh = clz64(divisor);
+
+ if (dhi < divisor) {
+ if (sh != 0) {
+ /* normalize the divisor, shifting the dividend accordingly */
+ divisor <<= sh;
+ dhi = (dhi << sh) | (dlo >> (64 - sh));
+ dlo <<= sh;
+ }
+
+ *phigh = 0;
+ *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
+ } else {
+ if (sh != 0) {
+ /* normalize the divisor, shifting the dividend accordingly */
+ divisor <<= sh;
+ dhighest = dhi >> (64 - sh);
+ dhi = (dhi << sh) | (dlo >> (64 - sh));
+ dlo <<= sh;
+
+ *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
+ } else {
+ /**
+ * dhi >= divisor
+ * Since the MSB of divisor is set (sh == 0),
+ * (dhi - divisor) < divisor
+ *
+ * Thus, the high part of the quotient is 1, and we can
+ * calculate the low part with a single call to udiv_qrnnd
+ * after subtracting divisor from dhi
+ */
+ dhi -= divisor;
+ *phigh = 1;
+ }
+
+ *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
+ }
+
+ /*
+ * since the dividend/divisor might have been normalized,
+ * the remainder might also have to be shifted back
+ */
+ return rem >> sh;
+ }
+}
+
+/*
+ * Signed 128-by-64 division.
+ * Returns the quotient via plow and phigh.
+ * Returns the remainder via the function return value.
+ */
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
+{
+ bool neg_quotient = false, neg_remainder = false;
+ uint64_t unsig_hi = *phigh, unsig_lo = *plow;
+ uint64_t rem;
+
+ if (*phigh < 0) {
+ neg_quotient = !neg_quotient;
+ neg_remainder = !neg_remainder;
+
+ if (unsig_lo == 0) {
+ unsig_hi = -unsig_hi;
+ } else {
+ unsig_hi = ~unsig_hi;
+ unsig_lo = -unsig_lo;
+ }
+ }
+
+ if (divisor < 0) {
+ neg_quotient = !neg_quotient;
+
+ divisor = -divisor;
+ }
+
+ rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
+
+ if (neg_quotient) {
+ if (unsig_lo == 0) {
+ *phigh = -unsig_hi;
+ *plow = 0;
+ } else {
+ *phigh = ~unsig_hi;
+ *plow = -unsig_lo;
+ }
+ } else {
+ *phigh = unsig_hi;
+ *plow = unsig_lo;
+ }
+
+ if (neg_remainder) {
+ return -rem;
+ } else {
+ return rem;
+ }
+}
+#endif
+
+/**
+ * urshift - 128-bit Unsigned Right Shift.
+ * @plow: in/out - lower 64-bit integer.
+ * @phigh: in/out - higher 64-bit integer.
+ * @shift: in - bits to shift, between 0 and 127.
+ *
+ * Result is zero-extended and stored in plow/phigh, which are
+ * input/output variables. Shift values outside the range are
+ * reduced modulo 128; the caller is responsible for validating
+ * the shift range and the plow/phigh pointers.
+ */
+void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift)
+{
+ shift &= 127;
+ if (shift == 0) {
+ return;
+ }
+
+ uint64_t h = *phigh >> (shift & 63);
+ if (shift >= 64) {
+ *plow = h;
+ *phigh = 0;
+ } else {
+ *plow = (*plow >> (shift & 63)) | (*phigh << (64 - (shift & 63)));
+ *phigh = h;
+ }
+}
+
+/**
+ * ulshift - 128-bit Unsigned Left Shift.
+ * @plow: in/out - lower 64-bit integer.
+ * @phigh: in/out - higher 64-bit integer.
+ * @shift: in - bits to shift, between 0 and 127.
+ * @overflow: out - true if any 1-bit is shifted out.
+ *
+ * Result is zero-extended and stored in plow/phigh, which are
+ * input/output variables. Shift values outside the range are
+ * reduced modulo 128; the caller is responsible for validating
+ * the shift range and the plow/phigh pointers.
+ */
+void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow)
+{
+ uint64_t low = *plow;
+ uint64_t high = *phigh;
+
+ shift &= 127;
+ if (shift == 0) {
+ return;
+ }
+
+ /* check if any bit will be shifted out */
+ urshift(&low, &high, 128 - shift);
+ if (low | high) {
+ *overflow = true;
+ }
+
+ if (shift >= 64) {
+ *phigh = *plow << (shift & 63);
+ *plow = 0;
+ } else {
+ *phigh = (*plow >> (64 - (shift & 63))) | (*phigh << (shift & 63));
+ *plow = *plow << shift;
+ }
+}
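+
+/* Illustrative sketch (not built): a 128-bit left shift and the matching
+ * right shift restore the original value when nothing overflows.
+ */
+#if 0
+static void example_shift128(void)
+{
+    uint64_t lo = 0x123456789abcdef0ULL, hi = 0;
+    bool overflow = false;
+
+    ulshift(&lo, &hi, 4, &overflow); /* hi:lo == 0x1:0x23456789abcdef00 */
+    assert(!overflow);
+    urshift(&lo, &hi, 4);
+    assert(lo == 0x123456789abcdef0ULL && hi == 0);
+}
+#endif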
diff --git a/util/id.c b/util/id.c
new file mode 100644
index 000000000..ded41c502
--- /dev/null
+++ b/util/id.c
@@ -0,0 +1,69 @@
+/*
+ * Dealing with identifiers
+ *
+ * Copyright (C) 2014 Red Hat, Inc.
+ *
+ * Authors:
+ * Markus Armbruster <armbru@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1
+ * or later. See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/ctype.h"
+#include "qemu/id.h"
+
+bool id_wellformed(const char *id)
+{
+ int i;
+
+ if (!qemu_isalpha(id[0])) {
+ return false;
+ }
+ for (i = 1; id[i]; i++) {
+ if (!qemu_isalnum(id[i]) && !strchr("-._", id[i])) {
+ return false;
+ }
+ }
+ return true;
+}
+
+#define ID_SPECIAL_CHAR '#'
+
+static const char *const id_subsys_str[ID_MAX] = {
+ [ID_QDEV] = "qdev",
+ [ID_BLOCK] = "block",
+ [ID_CHR] = "chr",
+ [ID_NET] = "net",
+};
+
+/*
+ * Generates an ID of the form PREFIX SUBSYSTEM NUMBER
+ * where:
+ *
+ * - PREFIX is the reserved character '#'
+ * - SUBSYSTEM identifies the subsystem creating the ID
+ * - NUMBER is a monotonically increasing per-SUBSYSTEM counter,
+ *   followed by two random decimal digits.
+ *
+ * Example: "#block146"
+ *
+ * Note that these IDs do not satisfy id_wellformed().
+ *
+ * The caller is responsible for freeing the returned string with g_free()
+ */
+char *id_generate(IdSubSystems id)
+{
+ static uint64_t id_counters[ID_MAX];
+ uint32_t rnd;
+
+ assert(id < ARRAY_SIZE(id_subsys_str));
+ assert(id_subsys_str[id]);
+
+ rnd = g_random_int_range(0, 100);
+
+ return g_strdup_printf("%c%s%" PRIu64 "%02" PRId32, ID_SPECIAL_CHAR,
+ id_subsys_str[id],
+ id_counters[id]++,
+ rnd);
+}
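+
+/* Illustrative sketch (not built): generating and releasing an internal
+ * ID such as "#block146" (counter 1, random suffix 46).
+ */
+#if 0
+static void example_id(void)
+{
+    char *id = id_generate(ID_BLOCK);
+
+    /* ... use id ... */
+    g_free(id);
+}
+#endif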
diff --git a/util/iov.c b/util/iov.c
new file mode 100644
index 000000000..58c7b3eee
--- /dev/null
+++ b/util/iov.c
@@ -0,0 +1,764 @@
+/*
+ * Helpers for getting linearized buffers from iov / filling buffers into iovs
+ *
+ * Copyright IBM, Corp. 2007, 2008
+ * Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Author(s):
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Amit Shah <amit.shah@redhat.com>
+ * Michael Tokarev <mjt@tls.msk.ru>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/iov.h"
+#include "qemu/sockets.h"
+#include "qemu/cutils.h"
+
+size_t iov_from_buf_full(const struct iovec *iov, unsigned int iov_cnt,
+ size_t offset, const void *buf, size_t bytes)
+{
+ size_t done;
+ unsigned int i;
+ for (i = 0, done = 0; (offset || done < bytes) && i < iov_cnt; i++) {
+ if (offset < iov[i].iov_len) {
+ size_t len = MIN(iov[i].iov_len - offset, bytes - done);
+ memcpy(iov[i].iov_base + offset, buf + done, len);
+ done += len;
+ offset = 0;
+ } else {
+ offset -= iov[i].iov_len;
+ }
+ }
+ assert(offset == 0);
+ return done;
+}
+
+size_t iov_to_buf_full(const struct iovec *iov, const unsigned int iov_cnt,
+ size_t offset, void *buf, size_t bytes)
+{
+ size_t done;
+ unsigned int i;
+ for (i = 0, done = 0; (offset || done < bytes) && i < iov_cnt; i++) {
+ if (offset < iov[i].iov_len) {
+ size_t len = MIN(iov[i].iov_len - offset, bytes - done);
+ memcpy(buf + done, iov[i].iov_base + offset, len);
+ done += len;
+ offset = 0;
+ } else {
+ offset -= iov[i].iov_len;
+ }
+ }
+ assert(offset == 0);
+ return done;
+}
+
+size_t iov_memset(const struct iovec *iov, const unsigned int iov_cnt,
+ size_t offset, int fillc, size_t bytes)
+{
+ size_t done;
+ unsigned int i;
+ for (i = 0, done = 0; (offset || done < bytes) && i < iov_cnt; i++) {
+ if (offset < iov[i].iov_len) {
+ size_t len = MIN(iov[i].iov_len - offset, bytes - done);
+ memset(iov[i].iov_base + offset, fillc, len);
+ done += len;
+ offset = 0;
+ } else {
+ offset -= iov[i].iov_len;
+ }
+ }
+ assert(offset == 0);
+ return done;
+}
+
+size_t iov_size(const struct iovec *iov, const unsigned int iov_cnt)
+{
+ size_t len;
+ unsigned int i;
+
+ len = 0;
+ for (i = 0; i < iov_cnt; i++) {
+ len += iov[i].iov_len;
+ }
+ return len;
+}
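+
+/* Illustrative sketch (not built): linearizing a two-element vector with
+ * iov_to_buf(), the qemu/iov.h helper wrapping iov_to_buf_full() above.
+ */
+#if 0
+static void example_iov(void)
+{
+    char a[] = "abcd", b[] = "efgh", flat[8];
+    struct iovec v[2] = {
+        { .iov_base = a, .iov_len = 4 },
+        { .iov_base = b, .iov_len = 4 },
+    };
+
+    assert(iov_size(v, 2) == 8);
+    iov_to_buf(v, 2, 0, flat, sizeof(flat)); /* flat holds "abcdefgh" */
+}
+#endif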
+
+/* helper function for iov_send_recv() */
+static ssize_t
+do_send_recv(int sockfd, struct iovec *iov, unsigned iov_cnt, bool do_send)
+{
+#ifdef CONFIG_POSIX
+ ssize_t ret;
+ struct msghdr msg;
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_iov = iov;
+ msg.msg_iovlen = iov_cnt;
+ do {
+ ret = do_send
+ ? sendmsg(sockfd, &msg, 0)
+ : recvmsg(sockfd, &msg, 0);
+ } while (ret < 0 && errno == EINTR);
+ return ret;
+#else
+ /* else send piece-by-piece */
+ /* XXX Note: Windows has WSASend() and WSARecv() */
+ unsigned i = 0;
+ ssize_t ret = 0;
+ while (i < iov_cnt) {
+ ssize_t r = do_send
+ ? send(sockfd, iov[i].iov_base, iov[i].iov_len, 0)
+ : recv(sockfd, iov[i].iov_base, iov[i].iov_len, 0);
+ if (r > 0) {
+ ret += r;
+ } else if (!r) {
+ break;
+ } else if (errno == EINTR) {
+ continue;
+ } else {
+ /* else it is some "other" error,
+ * only return if there was no data processed. */
+ if (ret == 0) {
+ ret = -1;
+ }
+ break;
+ }
+ i++;
+ }
+ return ret;
+#endif
+}
+
+ssize_t iov_send_recv(int sockfd, const struct iovec *_iov, unsigned iov_cnt,
+ size_t offset, size_t bytes,
+ bool do_send)
+{
+ ssize_t total = 0;
+ ssize_t ret;
+ size_t orig_len, tail;
+ unsigned niov;
+ struct iovec *local_iov, *iov;
+
+ if (bytes == 0) {
+ return 0;
+ }
+
+ local_iov = g_new0(struct iovec, iov_cnt);
+ iov_copy(local_iov, iov_cnt, _iov, iov_cnt, offset, bytes);
+ offset = 0;
+ iov = local_iov;
+
+ while (bytes > 0) {
+ /* Find the start position, skipping `offset' bytes:
+ * first, skip all full-sized vector elements, */
+ for (niov = 0; niov < iov_cnt && offset >= iov[niov].iov_len; ++niov) {
+ offset -= iov[niov].iov_len;
+ }
+
+ /* niov == iov_cnt would only be valid if bytes == 0, which
+ * we already ruled out in the loop condition. */
+ assert(niov < iov_cnt);
+ iov += niov;
+ iov_cnt -= niov;
+
+ if (offset) {
+ /* second, skip `offset' bytes from the (now) first element,
+ * undo it on exit */
+ iov[0].iov_base += offset;
+ iov[0].iov_len -= offset;
+ }
+ /* Find the end position skipping `bytes' bytes: */
+ /* first, skip all full-sized elements */
+ tail = bytes;
+ for (niov = 0; niov < iov_cnt && iov[niov].iov_len <= tail; ++niov) {
+ tail -= iov[niov].iov_len;
+ }
+ if (tail) {
+ /* second, fixup the last element, and remember the original
+ * length */
+ assert(niov < iov_cnt);
+ assert(iov[niov].iov_len > tail);
+ orig_len = iov[niov].iov_len;
+ iov[niov++].iov_len = tail;
+ ret = do_send_recv(sockfd, iov, niov, do_send);
+ /* Undo the changes above before checking for errors */
+ iov[niov-1].iov_len = orig_len;
+ } else {
+ ret = do_send_recv(sockfd, iov, niov, do_send);
+ }
+ if (offset) {
+ iov[0].iov_base -= offset;
+ iov[0].iov_len += offset;
+ }
+
+ if (ret < 0) {
+ assert(errno != EINTR);
+ g_free(local_iov);
+ if (errno == EAGAIN && total > 0) {
+ return total;
+ }
+ return -1;
+ }
+
+ if (ret == 0 && !do_send) {
+ /* recv returns 0 when the peer has performed an orderly
+ * shutdown. */
+ break;
+ }
+
+ /* Prepare for the next iteration */
+ offset += ret;
+ total += ret;
+ bytes -= ret;
+ }
+
+ g_free(local_iov);
+ return total;
+}
+
+
+void iov_hexdump(const struct iovec *iov, const unsigned int iov_cnt,
+ FILE *fp, const char *prefix, size_t limit)
+{
+ int v;
+ size_t size = 0;
+ char *buf;
+
+ for (v = 0; v < iov_cnt; v++) {
+ size += iov[v].iov_len;
+ }
+ size = size > limit ? limit : size;
+ buf = g_malloc(size);
+ iov_to_buf(iov, iov_cnt, 0, buf, size);
+ qemu_hexdump(fp, prefix, buf, size);
+ g_free(buf);
+}
+
+unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt,
+ const struct iovec *iov, unsigned int iov_cnt,
+ size_t offset, size_t bytes)
+{
+ size_t len;
+ unsigned int i, j;
+ for (i = 0, j = 0;
+ i < iov_cnt && j < dst_iov_cnt && (offset || bytes); i++) {
+ if (offset >= iov[i].iov_len) {
+ offset -= iov[i].iov_len;
+ continue;
+ }
+ len = MIN(bytes, iov[i].iov_len - offset);
+
+ dst_iov[j].iov_base = iov[i].iov_base + offset;
+ dst_iov[j].iov_len = len;
+ j++;
+ bytes -= len;
+ offset = 0;
+ }
+ assert(offset == 0);
+ return j;
+}
+
+/* io vectors */
+
+void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint)
+{
+ qiov->iov = g_new(struct iovec, alloc_hint);
+ qiov->niov = 0;
+ qiov->nalloc = alloc_hint;
+ qiov->size = 0;
+}
+
+void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov)
+{
+ int i;
+
+ qiov->iov = iov;
+ qiov->niov = niov;
+ qiov->nalloc = -1;
+ qiov->size = 0;
+ for (i = 0; i < niov; i++)
+ qiov->size += iov[i].iov_len;
+}
+
+void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len)
+{
+ assert(qiov->nalloc != -1);
+
+ if (qiov->niov == qiov->nalloc) {
+ qiov->nalloc = 2 * qiov->nalloc + 1;
+ qiov->iov = g_renew(struct iovec, qiov->iov, qiov->nalloc);
+ }
+ qiov->iov[qiov->niov].iov_base = base;
+ qiov->iov[qiov->niov].iov_len = len;
+ qiov->size += len;
+ ++qiov->niov;
+}
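+
+/* Illustrative sketch (not built): building a QEMUIOVector from two
+ * scattered buffers; buf1/len1 and buf2/len2 stand in for caller-owned
+ * data.
+ */
+#if 0
+static void example_qiov(void *buf1, size_t len1, void *buf2, size_t len2)
+{
+    QEMUIOVector qiov;
+
+    qemu_iovec_init(&qiov, 2);
+    qemu_iovec_add(&qiov, buf1, len1);
+    qemu_iovec_add(&qiov, buf2, len2);
+    /* ... submit qiov for I/O ... */
+    qemu_iovec_destroy(&qiov);
+}
+#endif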
+
+/*
+ * Concatenates (partial) iovecs from src_iov to the end of dst.
+ * It starts copying after skipping `soffset' bytes at the
+ * beginning of src_iov and adds individual vectors from src_iov
+ * to dst, copying up to `sbytes' bytes total, or up to the end
+ * of src_iov if that comes first. This way, it is okay to specify
+ * a very large value for `sbytes' to indicate "up to the end
+ * of src_iov".
+ * Only vector pointers are processed, not the actual data buffers.
+ */
+size_t qemu_iovec_concat_iov(QEMUIOVector *dst,
+ struct iovec *src_iov, unsigned int src_cnt,
+ size_t soffset, size_t sbytes)
+{
+ int i;
+ size_t done;
+
+ if (!sbytes) {
+ return 0;
+ }
+ assert(dst->nalloc != -1);
+ for (i = 0, done = 0; done < sbytes && i < src_cnt; i++) {
+ if (soffset < src_iov[i].iov_len) {
+ size_t len = MIN(src_iov[i].iov_len - soffset, sbytes - done);
+ qemu_iovec_add(dst, src_iov[i].iov_base + soffset, len);
+ done += len;
+ soffset = 0;
+ } else {
+ soffset -= src_iov[i].iov_len;
+ }
+ }
+ assert(soffset == 0); /* offset beyond end of src */
+
+ return done;
+}
+
+/*
+ * Concatenates (partial) iovecs from src to the end of dst.
+ * It starts copying after skipping `soffset' bytes at the
+ * beginning of src and adds individual vectors from src to
+ * dst, copying up to `sbytes' bytes total, or up to the end
+ * of src if that comes first. This way, it is okay to specify
+ * a very large value for `sbytes' to indicate "up to the end
+ * of src".
+ * Only vector pointers are processed, not the actual data buffers.
+ */
+void qemu_iovec_concat(QEMUIOVector *dst,
+ QEMUIOVector *src, size_t soffset, size_t sbytes)
+{
+ qemu_iovec_concat_iov(dst, src->iov, src->niov, soffset, sbytes);
+}
+
+/*
+ * iov_skip_offset
+ *
+ * Return a pointer to the iovec element that contains the byte at @offset
+ * in the original vector @iov.
+ * Set *@remaining_offset to the offset of that same byte within the
+ * returned iovec.
+ */
+static struct iovec *iov_skip_offset(struct iovec *iov, size_t offset,
+ size_t *remaining_offset)
+{
+ while (offset > 0 && offset >= iov->iov_len) {
+ offset -= iov->iov_len;
+ iov++;
+ }
+ *remaining_offset = offset;
+
+ return iov;
+}
+
+/*
+ * qiov_slice
+ *
+ * Find the subarray of iovecs containing the requested range. *@head is
+ * set to the offset into the first returned iovec, and *@tail to the
+ * count of extra bytes in the last one (returned iov + *@niov - 1).
+ */
+static struct iovec *qiov_slice(QEMUIOVector *qiov,
+ size_t offset, size_t len,
+ size_t *head, size_t *tail, int *niov)
+{
+ struct iovec *iov, *end_iov;
+
+ assert(offset + len <= qiov->size);
+
+ iov = iov_skip_offset(qiov->iov, offset, head);
+ end_iov = iov_skip_offset(iov, *head + len, tail);
+
+ if (*tail > 0) {
+ assert(*tail < end_iov->iov_len);
+ *tail = end_iov->iov_len - *tail;
+ end_iov++;
+ }
+
+ *niov = end_iov - iov;
+
+ return iov;
+}
+
+int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len)
+{
+ size_t head, tail;
+ int niov;
+
+ qiov_slice(qiov, offset, len, &head, &tail, &niov);
+
+ return niov;
+}
+
+/*
+ * Compile new iovec, combining @head_buf buffer, sub-qiov of @mid_qiov,
+ * and @tail_buf buffer into new qiov.
+ */
+int qemu_iovec_init_extended(
+ QEMUIOVector *qiov,
+ void *head_buf, size_t head_len,
+ QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len,
+ void *tail_buf, size_t tail_len)
+{
+ size_t mid_head, mid_tail;
+ int total_niov, mid_niov = 0;
+ struct iovec *p, *mid_iov = NULL;
+
+ assert(mid_qiov->niov <= IOV_MAX);
+
+ if (SIZE_MAX - head_len < mid_len ||
+ SIZE_MAX - head_len - mid_len < tail_len)
+ {
+ return -EINVAL;
+ }
+
+ if (mid_len) {
+ mid_iov = qiov_slice(mid_qiov, mid_offset, mid_len,
+ &mid_head, &mid_tail, &mid_niov);
+ }
+
+ total_niov = !!head_len + mid_niov + !!tail_len;
+ if (total_niov > IOV_MAX) {
+ return -EINVAL;
+ }
+
+ if (total_niov == 1) {
+ qemu_iovec_init_buf(qiov, NULL, 0);
+ p = &qiov->local_iov;
+ } else {
+ qiov->niov = qiov->nalloc = total_niov;
+ qiov->size = head_len + mid_len + tail_len;
+ p = qiov->iov = g_new(struct iovec, qiov->niov);
+ }
+
+ if (head_len) {
+ p->iov_base = head_buf;
+ p->iov_len = head_len;
+ p++;
+ }
+
+ assert(!mid_niov == !mid_len);
+ if (mid_niov) {
+ memcpy(p, mid_iov, mid_niov * sizeof(*p));
+ p[0].iov_base = (uint8_t *)p[0].iov_base + mid_head;
+ p[0].iov_len -= mid_head;
+ p[mid_niov - 1].iov_len -= mid_tail;
+ p += mid_niov;
+ }
+
+ if (tail_len) {
+ p->iov_base = tail_buf;
+ p->iov_len = tail_len;
+ }
+
+ return 0;
+}
+
+/*
+ * Check if the contents of subrange of qiov data is all zeroes.
+ */
+bool qemu_iovec_is_zero(QEMUIOVector *qiov, size_t offset, size_t bytes)
+{
+ struct iovec *iov;
+ size_t current_offset;
+
+ assert(offset + bytes <= qiov->size);
+
+ iov = iov_skip_offset(qiov->iov, offset, &current_offset);
+
+ while (bytes) {
+ uint8_t *base = (uint8_t *)iov->iov_base + current_offset;
+ size_t len = MIN(iov->iov_len - current_offset, bytes);
+
+ if (!buffer_is_zero(base, len)) {
+ return false;
+ }
+
+ current_offset = 0;
+ bytes -= len;
+ iov++;
+ }
+
+ return true;
+}
+
+void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
+ size_t offset, size_t len)
+{
+ int ret;
+
+ assert(source->size >= len);
+ assert(source->size - len >= offset);
+
+ /* We shrink the request, so we can overflow neither size_t nor IOV_MAX */
+ ret = qemu_iovec_init_extended(qiov, NULL, 0, source, offset, len, NULL, 0);
+ assert(ret == 0);
+}
+
+void qemu_iovec_destroy(QEMUIOVector *qiov)
+{
+ if (qiov->nalloc != -1) {
+ g_free(qiov->iov);
+ }
+
+ memset(qiov, 0, sizeof(*qiov));
+}
+
+void qemu_iovec_reset(QEMUIOVector *qiov)
+{
+ assert(qiov->nalloc != -1);
+
+ qiov->niov = 0;
+ qiov->size = 0;
+}
+
+size_t qemu_iovec_to_buf(QEMUIOVector *qiov, size_t offset,
+ void *buf, size_t bytes)
+{
+ return iov_to_buf(qiov->iov, qiov->niov, offset, buf, bytes);
+}
+
+size_t qemu_iovec_from_buf(QEMUIOVector *qiov, size_t offset,
+ const void *buf, size_t bytes)
+{
+ return iov_from_buf(qiov->iov, qiov->niov, offset, buf, bytes);
+}
+
+size_t qemu_iovec_memset(QEMUIOVector *qiov, size_t offset,
+ int fillc, size_t bytes)
+{
+ return iov_memset(qiov->iov, qiov->niov, offset, fillc, bytes);
+}
+
+/**
+ * Check that I/O vector contents are identical
+ *
+ * The IO vectors must have the same structure (same length of all parts).
+ * A typical usage is to compare vectors created with qemu_iovec_clone().
+ *
+ * @a: I/O vector
+ * @b: I/O vector
+ * @ret: Offset of the first mismatching byte, or -1 if the vectors match
+ */
+ssize_t qemu_iovec_compare(QEMUIOVector *a, QEMUIOVector *b)
+{
+ int i;
+ ssize_t offset = 0;
+
+ assert(a->niov == b->niov);
+ for (i = 0; i < a->niov; i++) {
+ size_t len = 0;
+ uint8_t *p = (uint8_t *)a->iov[i].iov_base;
+ uint8_t *q = (uint8_t *)b->iov[i].iov_base;
+
+ assert(a->iov[i].iov_len == b->iov[i].iov_len);
+ while (len < a->iov[i].iov_len && *p++ == *q++) {
+ len++;
+ }
+
+ offset += len;
+
+ if (len != a->iov[i].iov_len) {
+ return offset;
+ }
+ }
+ return -1;
+}
+
+typedef struct {
+ int src_index;
+ struct iovec *src_iov;
+ void *dest_base;
+} IOVectorSortElem;
+
+static int sortelem_cmp_src_base(const void *a, const void *b)
+{
+ const IOVectorSortElem *elem_a = a;
+ const IOVectorSortElem *elem_b = b;
+
+ /* Compare rather than subtract, to avoid pointer-difference overflow */
+ if (elem_a->src_iov->iov_base < elem_b->src_iov->iov_base) {
+ return -1;
+ } else if (elem_a->src_iov->iov_base > elem_b->src_iov->iov_base) {
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+static int sortelem_cmp_src_index(const void *a, const void *b)
+{
+ const IOVectorSortElem *elem_a = a;
+ const IOVectorSortElem *elem_b = b;
+
+ return elem_a->src_index - elem_b->src_index;
+}
+
+/**
+ * Copy contents of I/O vector
+ *
+ * The relative relationships of overlapping iovecs are preserved. This is
+ * necessary to ensure identical semantics in the cloned I/O vector.
+ */
+void qemu_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src, void *buf)
+{
+ IOVectorSortElem sortelems[src->niov];
+ void *last_end;
+ int i;
+
+ /* Sort source iovecs by base address */
+ for (i = 0; i < src->niov; i++) {
+ sortelems[i].src_index = i;
+ sortelems[i].src_iov = &src->iov[i];
+ }
+ qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_base);
+
+ /* Allocate buffer space taking into account overlapping iovecs */
+ last_end = NULL;
+ for (i = 0; i < src->niov; i++) {
+ struct iovec *cur = sortelems[i].src_iov;
+ ptrdiff_t rewind = 0;
+
+ /* Detect overlap */
+ if (last_end && last_end > cur->iov_base) {
+ rewind = last_end - cur->iov_base;
+ }
+
+ sortelems[i].dest_base = buf - rewind;
+ buf += cur->iov_len - MIN(rewind, cur->iov_len);
+ last_end = MAX(cur->iov_base + cur->iov_len, last_end);
+ }
+
+ /* Sort by source iovec index and build destination iovec */
+ qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_index);
+ for (i = 0; i < src->niov; i++) {
+ qemu_iovec_add(dest, sortelems[i].dest_base, src->iov[i].iov_len);
+ }
+}
+
+void iov_discard_undo(IOVDiscardUndo *undo)
+{
+ /* Restore original iovec if it was modified */
+ if (undo->modified_iov) {
+ *undo->modified_iov = undo->orig;
+ }
+}
+
+size_t iov_discard_front_undoable(struct iovec **iov,
+ unsigned int *iov_cnt,
+ size_t bytes,
+ IOVDiscardUndo *undo)
+{
+ size_t total = 0;
+ struct iovec *cur;
+
+ if (undo) {
+ undo->modified_iov = NULL;
+ }
+
+ for (cur = *iov; *iov_cnt > 0; cur++) {
+ if (cur->iov_len > bytes) {
+ if (undo) {
+ undo->modified_iov = cur;
+ undo->orig = *cur;
+ }
+
+ cur->iov_base += bytes;
+ cur->iov_len -= bytes;
+ total += bytes;
+ break;
+ }
+
+ bytes -= cur->iov_len;
+ total += cur->iov_len;
+ *iov_cnt -= 1;
+ }
+
+ *iov = cur;
+ return total;
+}
+
+size_t iov_discard_front(struct iovec **iov, unsigned int *iov_cnt,
+ size_t bytes)
+{
+ return iov_discard_front_undoable(iov, iov_cnt, bytes, NULL);
+}
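+
+/* Illustrative sketch (not built): temporarily clipping a header off a
+ * vector and undoing it. The caller keeps its own iov/iov_cnt copies;
+ * the undo only restores the iovec element that was modified in place.
+ */
+#if 0
+static void example_discard(struct iovec *iov, unsigned int iov_cnt,
+                            size_t hdr_len)
+{
+    struct iovec *cur = iov;
+    unsigned int cnt = iov_cnt;
+    IOVDiscardUndo undo;
+
+    iov_discard_front_undoable(&cur, &cnt, hdr_len, &undo);
+    /* ... read the payload from cur[0 .. cnt) ... */
+    iov_discard_undo(&undo);
+}
+#endif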
+
+size_t iov_discard_back_undoable(struct iovec *iov,
+ unsigned int *iov_cnt,
+ size_t bytes,
+ IOVDiscardUndo *undo)
+{
+ size_t total = 0;
+ struct iovec *cur;
+
+ if (undo) {
+ undo->modified_iov = NULL;
+ }
+
+ if (*iov_cnt == 0) {
+ return 0;
+ }
+
+ cur = iov + (*iov_cnt - 1);
+
+ while (*iov_cnt > 0) {
+ if (cur->iov_len > bytes) {
+ if (undo) {
+ undo->modified_iov = cur;
+ undo->orig = *cur;
+ }
+
+ cur->iov_len -= bytes;
+ total += bytes;
+ break;
+ }
+
+ bytes -= cur->iov_len;
+ total += cur->iov_len;
+ cur--;
+ *iov_cnt -= 1;
+ }
+
+ return total;
+}
+
+size_t iov_discard_back(struct iovec *iov, unsigned int *iov_cnt,
+ size_t bytes)
+{
+ return iov_discard_back_undoable(iov, iov_cnt, bytes, NULL);
+}
+
+void qemu_iovec_discard_back(QEMUIOVector *qiov, size_t bytes)
+{
+ size_t total;
+ unsigned int niov = qiov->niov;
+
+ assert(qiov->size >= bytes);
+ total = iov_discard_back(qiov->iov, &niov, bytes);
+ assert(total == bytes);
+
+ qiov->niov = niov;
+ qiov->size -= bytes;
+}
diff --git a/util/iova-tree.c b/util/iova-tree.c
new file mode 100644
index 000000000..23ea35b7a
--- /dev/null
+++ b/util/iova-tree.c
@@ -0,0 +1,114 @@
+/*
+ * IOVA tree implementation based on GTree.
+ *
+ * Copyright 2018 Red Hat, Inc.
+ *
+ * Authors:
+ * Peter Xu <peterx@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/iova-tree.h"
+
+struct IOVATree {
+ GTree *tree;
+};
+
+static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data)
+{
+ const DMAMap *m1 = a, *m2 = b;
+
+ if (m1->iova > m2->iova + m2->size) {
+ return 1;
+ }
+
+ if (m1->iova + m1->size < m2->iova) {
+ return -1;
+ }
+
+ /* Overlapped */
+ return 0;
+}
+
+IOVATree *iova_tree_new(void)
+{
+ IOVATree *iova_tree = g_new0(IOVATree, 1);
+
+ /* The key and value share the same DMAMap allocation (see
+ * iova_tree_insert_internal), so only the key needs a destroy function */
+ iova_tree->tree = g_tree_new_full(iova_tree_compare, NULL, g_free, NULL);
+
+ return iova_tree;
+}
+
+const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map)
+{
+ return g_tree_lookup(tree->tree, map);
+}
+
+const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova)
+{
+ const DMAMap map = { .iova = iova, .size = 0 };
+
+ return iova_tree_find(tree, &map);
+}
+
+static inline void iova_tree_insert_internal(GTree *gtree, DMAMap *range)
+{
+ /* Key and value are sharing the same range data */
+ g_tree_insert(gtree, range, range);
+}
+
+int iova_tree_insert(IOVATree *tree, const DMAMap *map)
+{
+ DMAMap *new;
+
+ if (map->iova + map->size < map->iova || map->perm == IOMMU_NONE) {
+ return IOVA_ERR_INVALID;
+ }
+
+ /* We don't allow inserting a range that overlaps an existing one */
+ if (iova_tree_find(tree, map)) {
+ return IOVA_ERR_OVERLAP;
+ }
+
+ new = g_new0(DMAMap, 1);
+ memcpy(new, map, sizeof(*new));
+ iova_tree_insert_internal(tree->tree, new);
+
+ return IOVA_OK;
+}
+
+static gboolean iova_tree_traverse(gpointer key, gpointer value,
+ gpointer data)
+{
+ iova_tree_iterator iterator = data;
+ DMAMap *map = key;
+
+ g_assert(key == value);
+
+ return iterator(map);
+}
+
+void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator)
+{
+ g_tree_foreach(tree->tree, iova_tree_traverse, iterator);
+}
+
+int iova_tree_remove(IOVATree *tree, const DMAMap *map)
+{
+ const DMAMap *overlap;
+
+ while ((overlap = iova_tree_find(tree, map))) {
+ g_tree_remove(tree->tree, overlap);
+ }
+
+ return IOVA_OK;
+}
+
+void iova_tree_destroy(IOVATree *tree)
+{
+ g_tree_destroy(tree->tree);
+ g_free(tree);
+}
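+
+/* Illustrative sketch (not built): mapping the IOVA range
+ * [0x1000, 0x1fff] (DMAMap.size counts the last byte inclusively here,
+ * see iova_tree_compare) and looking an address up again.
+ */
+#if 0
+static void example_iova_tree(void)
+{
+    IOVATree *tree = iova_tree_new();
+    DMAMap map = { .iova = 0x1000, .size = 0xfff, .perm = IOMMU_RW };
+
+    assert(iova_tree_insert(tree, &map) == IOVA_OK);
+    assert(iova_tree_find_address(tree, 0x1234) != NULL);
+    iova_tree_destroy(tree);
+}
+#endif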
diff --git a/util/keyval.c b/util/keyval.c
new file mode 100644
index 000000000..904337c8a
--- /dev/null
+++ b/util/keyval.c
@@ -0,0 +1,577 @@
+/*
+ * Parsing KEY=VALUE,... strings
+ *
+ * Copyright (C) 2017 Red Hat Inc.
+ *
+ * Authors:
+ * Markus Armbruster <armbru@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+/*
+ * KEY=VALUE,... syntax:
+ *
+ * key-vals = [ key-val { ',' key-val } [ ',' ] ]
+ * key-val = key '=' val | help
+ * key = key-fragment { '.' key-fragment }
+ * key-fragment = / [^=,.]+ /
+ * val = { / [^,]+ / | ',,' }
+ * help = 'help' | '?'
+ *
+ * Semantics defined by reduction to JSON:
+ *
+ * key-vals specifies a JSON object, i.e. a tree whose root is an
+ * object, inner nodes other than the root are objects or arrays,
+ * and leaves are strings.
+ *
+ * Each key-val = key-fragment '.' ... '=' val specifies a path from
+ * root to a leaf (left of '='), and the leaf's value (right of
+ * '=').
+ *
+ * A path from the root is defined recursively:
+ * L '.' key-fragment is a child of the node denoted by path L
+ * key-fragment is a child of the tree root
+ * If key-fragment is numeric, the parent is an array and the child
+ * is its key-fragment-th member, counting from zero.
+ * Else, the parent is an object, and the child is its member named
+ * key-fragment.
+ *
+ * This constrains inner nodes to be either array or object. The
+ * constraints must be satisfiable. Counter-example: a.b=1,a=2 is
+ * not, because root.a must be an object to satisfy a.b=1 and a
+ * string to satisfy a=2.
+ *
+ * Array subscripts can occur in any order, but the set of
+ * subscripts must not have gaps. For instance, a.1=v is not okay,
+ * because root.a[0] is missing.
+ *
+ * If multiple key-val denote the same leaf, the last one determines
+ * the value.
+ *
+ * Key-fragments must be valid QAPI names or consist only of decimal
+ * digits.
+ *
+ * The length of any key-fragment must be between 1 and 127.
+ *
+ * If any key-val is help, the object is to be treated as a help
+ * request.
+ *
+ * Design flaw: there is no way to denote an empty array or non-root
+ * object. While interpreting "key absent" as empty seems natural
+ * (removing a key-val from the input string removes the member when
+ * there are more, so why not when it's the last), it doesn't work:
+ * "key absent" already means "optional object/array absent", which
+ * isn't the same as "empty object/array present".
+ *
+ * Design flaw: scalar values can only be strings; there is no way to
+ * denote numbers, true, false or null. The special QObject input
+ * visitor returned by qobject_input_visitor_new_keyval() mostly hides
+ * this by automatically converting strings to the type the visitor
+ * expects. Breaks down for type 'any', where the visitor's
+ * expectation isn't clear. Code visiting 'any' needs to do the
+ * conversion itself, but only when using this keyval visitor.
+ * Awkward. Note that we carefully restrict alternate types to avoid
+ * similar ambiguity.
+ *
+ * Alternative syntax for use with an implied key:
+ *
+ * key-vals = [ key-val-1st { ',' key-val } [ ',' ] ]
+ * key-val-1st = val-no-key | key-val
+ * val-no-key = / [^=,]+ / - help
+ *
+ * where val-no-key is syntactic sugar for implied-key=val-no-key.
+ *
+ * Note that you can't use the sugared form when the value contains
+ * '=' or ','.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qlist.h"
+#include "qapi/qmp/qstring.h"
+#include "qemu/cutils.h"
+#include "qemu/help_option.h"
+#include "qemu/option.h"
+
+/*
+ * Convert @key to a list index.
+ * Convert all leading decimal digits to a (non-negative) number,
+ * capped at INT_MAX.
+ * If @end is non-null, assign a pointer to the first character after
+ * the number to *@end.
+ * Else, fail if any characters follow.
+ * On success, return the converted number.
+ * On failure, return a negative value.
+ * Note: since only digits are converted, no two keys can map to the
+ * same number, except by overflow to INT_MAX.
+ */
+static int key_to_index(const char *key, const char **end)
+{
+ int ret;
+ unsigned long index;
+
+ if (*key < '0' || *key > '9') {
+ return -EINVAL;
+ }
+ ret = qemu_strtoul(key, end, 10, &index);
+ if (ret) {
+ return ret == -ERANGE ? INT_MAX : ret;
+ }
+ return index <= INT_MAX ? index : INT_MAX;
+}
+
+/*
+ * Ensure @cur maps @key_in_cur the right way.
+ * If @value is null, it needs to map to a QDict, else to this
+ * QString.
+ * If @cur doesn't have @key_in_cur, put an empty QDict or @value,
+ * respectively.
+ * Else, if it needs to map to a QDict, and already does, do nothing.
+ * Else, if it needs to map to this QString, and already maps to a
+ * QString, replace it by @value.
+ * Else, fail because we have conflicting needs on how to map
+ * @key_in_cur.
+ * In any case, take over the reference to @value, i.e. if the caller
+ * wants to hold on to a reference, it needs to qobject_ref().
+ * Use @key up to @key_cursor to identify the key in error messages.
+ * On success, return the mapped value.
+ * On failure, store an error through @errp and return NULL.
+ */
+static QObject *keyval_parse_put(QDict *cur,
+ const char *key_in_cur, QString *value,
+ const char *key, const char *key_cursor,
+ Error **errp)
+{
+ QObject *old, *new;
+
+ old = qdict_get(cur, key_in_cur);
+ if (old) {
+ if (qobject_type(old) != (value ? QTYPE_QSTRING : QTYPE_QDICT)) {
+ error_setg(errp, "Parameters '%.*s.*' used inconsistently",
+ (int)(key_cursor - key), key);
+ qobject_unref(value);
+ return NULL;
+ }
+ if (!value) {
+ return old; /* already QDict, do nothing */
+ }
+ new = QOBJECT(value); /* replacement */
+ } else {
+ new = value ? QOBJECT(value) : QOBJECT(qdict_new());
+ }
+ qdict_put_obj(cur, key_in_cur, new);
+ return new;
+}
+
+/*
+ * Parse one parameter from @params.
+ *
+ * If we're looking at KEY=VALUE, store result in @qdict.
+ * The first fragment of KEY applies to @qdict. Subsequent fragments
+ * apply to nested QDicts, which are created on demand. @implied_key
+ * is as in keyval_parse().
+ *
+ * If we're looking at "help" or "?", set *help to true.
+ *
+ * On success, return a pointer to the next parameter, or else to '\0'.
+ * On failure, return NULL.
+ */
+static const char *keyval_parse_one(QDict *qdict, const char *params,
+ const char *implied_key, bool *help,
+ Error **errp)
+{
+ const char *key, *key_end, *val_end, *s, *end;
+ size_t len;
+ char key_in_cur[128];
+ QDict *cur;
+ int ret;
+ QObject *next;
+ GString *val;
+
+ key = params;
+ val_end = NULL;
+ len = strcspn(params, "=,");
+ if (len && key[len] != '=') {
+ if (starts_with_help_option(key) == len) {
+ *help = true;
+ s = key + len;
+ if (*s == ',') {
+ s++;
+ }
+ return s;
+ }
+ if (implied_key) {
+ /* Desugar implied key */
+ key = implied_key;
+ val_end = params + len;
+ len = strlen(implied_key);
+ }
+ }
+ key_end = key + len;
+
+ /*
+ * Loop over key fragments: @s points to current fragment, it
+ * applies to @cur. @key_in_cur[] holds the previous fragment.
+ */
+ cur = qdict;
+ s = key;
+ for (;;) {
+ /* Want a key index (unless it's first) or a QAPI name */
+ if (s != key && key_to_index(s, &end) >= 0) {
+ len = end - s;
+ } else {
+ ret = parse_qapi_name(s, false);
+ len = ret < 0 ? 0 : ret;
+ }
+ assert(s + len <= key_end);
+ if (!len || (s + len < key_end && s[len] != '.')) {
+ assert(key != implied_key);
+ error_setg(errp, "Invalid parameter '%.*s'",
+ (int)(key_end - key), key);
+ return NULL;
+ }
+ if (len >= sizeof(key_in_cur)) {
+ assert(key != implied_key);
+ error_setg(errp, "Parameter%s '%.*s' is too long",
+ s != key || s + len != key_end ? " fragment" : "",
+ (int)len, s);
+ return NULL;
+ }
+
+ if (s != key) {
+ next = keyval_parse_put(cur, key_in_cur, NULL,
+ key, s - 1, errp);
+ if (!next) {
+ return NULL;
+ }
+ cur = qobject_to(QDict, next);
+ assert(cur);
+ }
+
+ memcpy(key_in_cur, s, len);
+ key_in_cur[len] = 0;
+ s += len;
+
+ if (*s != '.') {
+ break;
+ }
+ s++;
+ }
+
+ if (key == implied_key) {
+ assert(!*s);
+ val = g_string_new_len(params, val_end - params);
+ s = val_end;
+ if (*s == ',') {
+ s++;
+ }
+ } else {
+ if (*s != '=') {
+ error_setg(errp, "Expected '=' after parameter '%.*s'",
+ (int)(s - key), key);
+ return NULL;
+ }
+ s++;
+
+ val = g_string_new(NULL);
+ for (;;) {
+ if (!*s) {
+ break;
+ } else if (*s == ',') {
+ s++;
+ if (*s != ',') {
+ break;
+ }
+ }
+ g_string_append_c(val, *s++);
+ }
+ }
+
+ if (!keyval_parse_put(cur, key_in_cur, qstring_from_gstring(val),
+ key, key_end, errp)) {
+ return NULL;
+ }
+ return s;
+}
+
+static char *reassemble_key(GSList *key)
+{
+ GString *s = g_string_new("");
+ GSList *p;
+
+ for (p = key; p; p = p->next) {
+ g_string_prepend_c(s, '.');
+ g_string_prepend(s, (char *)p->data);
+ }
+
+ return g_string_free(s, FALSE);
+}
+
+/*
+ * Recursive worker for keyval_merge.
+ *
+ * @str is the path that led to the current dictionary (to be used for
+ * error messages). It is modified internally but restored before the
+ * function returns.
+ */
+static void keyval_do_merge(QDict *dest, const QDict *merged, GString *str, Error **errp)
+{
+ size_t save_len = str->len;
+ const QDictEntry *ent;
+ QObject *old_value;
+
+ for (ent = qdict_first(merged); ent; ent = qdict_next(merged, ent)) {
+ old_value = qdict_get(dest, ent->key);
+ if (old_value) {
+ if (qobject_type(old_value) != qobject_type(ent->value)) {
+ error_setg(errp, "Parameter '%s%s' used inconsistently",
+ str->str, ent->key);
+ return;
+ } else if (qobject_type(ent->value) == QTYPE_QDICT) {
+ /* Merge sub-dictionaries. */
+ g_string_append(str, ent->key);
+ g_string_append_c(str, '.');
+ keyval_do_merge(qobject_to(QDict, old_value),
+ qobject_to(QDict, ent->value),
+ str, errp);
+ g_string_truncate(str, save_len);
+ continue;
+ } else if (qobject_type(ent->value) == QTYPE_QLIST) {
+ /* Append to old list. */
+ QList *old = qobject_to(QList, old_value);
+ QList *new = qobject_to(QList, ent->value);
+ const QListEntry *item;
+ QLIST_FOREACH_ENTRY(new, item) {
+ qobject_ref(item->value);
+ qlist_append_obj(old, item->value);
+ }
+ continue;
+ } else {
+ assert(qobject_type(ent->value) == QTYPE_QSTRING);
+ }
+ }
+
+ qobject_ref(ent->value);
+ qdict_put_obj(dest, ent->key, ent->value);
+ }
+}
+
+/* Merge the @merged dictionary into @dest.
+ *
+ * The dictionaries are expected to be returned by the keyval parser, and
+ * therefore the only expected scalar type is the string. In case the same
+ * path is present in both @dest and @merged, the semantics are as follows:
+ *
+ * - lists are concatenated
+ *
+ * - dictionaries are merged recursively
+ *
+ * - for scalar values, @merged wins
+ *
+ * In case an error is reported, @dest may already have been modified.
+ *
+ * This function can be used to implement semantics analogous to QemuOpts's
+ * .merge_lists = true case, or to implement -set for options backed by QDicts.
+ *
+ * Note: while QemuOpts is commonly used so that repeated keys overwrite
+ * ("last one wins"), it can also be used so that repeated keys build up
+ * a list. keyval_merge() can only be used when the options' semantics are
+ * the former, not the latter.
+ */
+void keyval_merge(QDict *dest, const QDict *merged, Error **errp)
+{
+ GString *str;
+
+ str = g_string_new("");
+ keyval_do_merge(dest, merged, str, errp);
+ g_string_free(str, TRUE);
+}
+
+/*
+ * Listify @cur recursively.
+ * Replace QDicts whose keys are all valid list indexes by QLists.
+ * @key_of_cur is the list of key fragments leading up to @cur.
+ * On success, return either @cur or its replacement.
+ * On failure, store an error through @errp and return NULL.
+ */
+static QObject *keyval_listify(QDict *cur, GSList *key_of_cur, Error **errp)
+{
+ GSList key_node;
+ bool has_index, has_member;
+ const QDictEntry *ent;
+ QDict *qdict;
+ QObject *val;
+ char *key;
+ size_t nelt;
+ QObject **elt;
+ int index, max_index, i;
+ QList *list;
+
+ key_node.next = key_of_cur;
+
+ /*
+ * Recursively listify @cur's members, and figure out whether @cur
+ * itself is to be listified.
+ */
+ has_index = false;
+ has_member = false;
+ for (ent = qdict_first(cur); ent; ent = qdict_next(cur, ent)) {
+ if (key_to_index(ent->key, NULL) >= 0) {
+ has_index = true;
+ } else {
+ has_member = true;
+ }
+
+ qdict = qobject_to(QDict, ent->value);
+ if (!qdict) {
+ continue;
+ }
+
+ key_node.data = ent->key;
+ val = keyval_listify(qdict, &key_node, errp);
+ if (!val) {
+ return NULL;
+ }
+ if (val != ent->value) {
+ qdict_put_obj(cur, ent->key, val);
+ }
+ }
+
+ if (has_index && has_member) {
+ key = reassemble_key(key_of_cur);
+ error_setg(errp, "Parameters '%s*' used inconsistently", key);
+ g_free(key);
+ return NULL;
+ }
+ if (!has_index) {
+ return QOBJECT(cur);
+ }
+
+ /* Copy @cur's values to @elt[] */
+ nelt = qdict_size(cur) + 1; /* one extra, for use as sentinel */
+ elt = g_new0(QObject *, nelt);
+ max_index = -1;
+ for (ent = qdict_first(cur); ent; ent = qdict_next(cur, ent)) {
+ index = key_to_index(ent->key, NULL);
+ assert(index >= 0);
+ if (index > max_index) {
+ max_index = index;
+ }
+ /*
+ * We iterate @nelt times. If we get one exceeding @nelt
+ * here, we will put less than @nelt values into @elt[],
+ * triggering the error in the next loop.
+ */
+ if ((size_t)index >= nelt - 1) {
+ continue;
+ }
+ /* Even though dict keys are distinct, indexes need not be */
+ elt[index] = ent->value;
+ }
+
+ /*
+ * Make a list from @elt[], reporting the first missing element,
+ * if any.
+ * If we dropped an index >= nelt in the previous loop, this loop
+ * will run into the sentinel and report index @nelt missing.
+ */
+ list = qlist_new();
+ assert(!elt[nelt-1]); /* need the sentinel to be null */
+ for (i = 0; i < MIN(nelt, max_index + 1); i++) {
+ if (!elt[i]) {
+ key = reassemble_key(key_of_cur);
+ error_setg(errp, "Parameter '%s%d' missing", key, i);
+ g_free(key);
+ g_free(elt);
+ qobject_unref(list);
+ return NULL;
+ }
+ qobject_ref(elt[i]);
+ qlist_append_obj(list, elt[i]);
+ }
+
+ g_free(elt);
+ return QOBJECT(list);
+}
+
+/*
+ * Parse @params in QEMU's traditional KEY=VALUE,... syntax.
+ *
+ * If @implied_key is given, the first KEY= can be omitted; @implied_key
+ * is then assumed, and VALUE can't be empty or contain ',' or '='.
+ *
+ * A parameter "help" or "?" without a value isn't added to the
+ * resulting dictionary, but is instead interpreted as a help request.
+ * All other options are parsed and returned normally so that
+ * context-specific help can be printed.
+ *
+ * If @p_help is not NULL, store whether help is requested there.
+ * If @p_help is NULL and help is requested, fail.
+ *
+ * On success, return @dict, now filled with the parsed keys and values.
+ *
+ * On failure, store an error through @errp and return NULL. Any keys
+ * and values parsed so far will be in @dict nevertheless.
+ */
+QDict *keyval_parse_into(QDict *qdict, const char *params, const char *implied_key,
+ bool *p_help, Error **errp)
+{
+ QObject *listified;
+ const char *s;
+ bool help = false;
+
+ s = params;
+ while (*s) {
+ s = keyval_parse_one(qdict, s, implied_key, &help, errp);
+ if (!s) {
+ return NULL;
+ }
+ implied_key = NULL;
+ }
+
+ if (p_help) {
+ *p_help = help;
+ } else if (help) {
+ error_setg(errp, "Help is not available for this option");
+ return NULL;
+ }
+
+ listified = keyval_listify(qdict, NULL, errp);
+ if (!listified) {
+ return NULL;
+ }
+ assert(listified == QOBJECT(qdict));
+ return qdict;
+}
+
+/*
+ * Parse @params in QEMU's traditional KEY=VALUE,... syntax.
+ *
+ * If @implied_key is given, the first KEY= can be omitted; @implied_key
+ * is then assumed, and VALUE can't be empty or contain ',' or '='.
+ *
+ * A parameter "help" or "?" without a value isn't added to the
+ * resulting dictionary, but is instead interpreted as a help request.
+ * All other options are parsed and returned normally so that
+ * context-specific help can be printed.
+ *
+ * If @p_help is not NULL, store whether help is requested there.
+ * If @p_help is NULL and help is requested, fail.
+ *
+ * On success, return a dictionary of the parsed keys and values.
+ * On failure, store an error through @errp and return NULL.
+ */
+QDict *keyval_parse(const char *params, const char *implied_key,
+ bool *p_help, Error **errp)
+{
+ QDict *qdict = qdict_new();
+ QDict *ret = keyval_parse_into(qdict, params, implied_key, p_help, errp);
+
+ if (!ret) {
+ qobject_unref(qdict);
+ }
+ return ret;
+}
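+
+/*
+ * Illustrative parses (not part of this patch):
+ *
+ *     keyval_parse("a.0=x,a.1=y", NULL, NULL, errp)
+ *         => {"a": ["x", "y"]}   (all-index keys are listified)
+ *     keyval_parse("a.0=x,a.b=y", NULL, NULL, errp)
+ *         => error "Parameters 'a.*' used inconsistently"
+ */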
diff --git a/util/lockcnt.c b/util/lockcnt.c
new file mode 100644
index 000000000..5da36946b
--- /dev/null
+++ b/util/lockcnt.c
@@ -0,0 +1,399 @@
+/*
+ * QemuLockCnt implementation
+ *
+ * Copyright Red Hat, Inc. 2017
+ *
+ * Author:
+ * Paolo Bonzini <pbonzini@redhat.com>
+ */
+#include "qemu/osdep.h"
+#include "qemu/thread.h"
+#include "qemu/atomic.h"
+#include "trace.h"
+
+#ifdef CONFIG_LINUX
+#include "qemu/futex.h"
+
+/* On Linux, bits 0-1 are a futex-based lock, bits 2-31 are the counter.
+ * For the mutex algorithm see Ulrich Drepper's "Futexes Are Tricky" (ok,
+ * this is not the most relaxing citation I could make...). It is similar
+ * to mutex2 in the paper.
+ */
+
+#define QEMU_LOCKCNT_STATE_MASK 3
+#define QEMU_LOCKCNT_STATE_FREE 0 /* free, uncontended */
+#define QEMU_LOCKCNT_STATE_LOCKED 1 /* locked, uncontended */
+#define QEMU_LOCKCNT_STATE_WAITING 2 /* locked, contended */
+
+#define QEMU_LOCKCNT_COUNT_STEP 4
+#define QEMU_LOCKCNT_COUNT_SHIFT 2
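+
+/*
+ * Illustrative encoding (not part of this patch): a counter value of 3
+ * with the lock held and contended is stored as
+ *     (3 << QEMU_LOCKCNT_COUNT_SHIFT) | QEMU_LOCKCNT_STATE_WAITING == 14
+ */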
+
+void qemu_lockcnt_init(QemuLockCnt *lockcnt)
+{
+ lockcnt->count = 0;
+}
+
+void qemu_lockcnt_destroy(QemuLockCnt *lockcnt)
+{
+}
+
+/* *val is the current value of lockcnt->count.
+ *
+ * If the lock is free, try a cmpxchg from *val to new_if_free; return
+ * true and set *val to the old value found by the cmpxchg in
+ * lockcnt->count.
+ *
+ * If the lock is taken, wait for it to be released and return false
+ * *without trying again to take the lock*. Again, set *val to the
+ * new value of lockcnt->count.
+ *
+ * If *waited is true on return, new_if_free's bottom two bits must not
+ * be QEMU_LOCKCNT_STATE_LOCKED on subsequent calls, because the caller
+ * does not know if there are other waiters. Furthermore, after *waited
+ * is set the caller has effectively acquired the lock. If it returns
+ * with the lock not taken, it must wake another futex waiter.
+ */
+static bool qemu_lockcnt_cmpxchg_or_wait(QemuLockCnt *lockcnt, int *val,
+ int new_if_free, bool *waited)
+{
+ /* Fast path for when the lock is free. */
+ if ((*val & QEMU_LOCKCNT_STATE_MASK) == QEMU_LOCKCNT_STATE_FREE) {
+ int expected = *val;
+
+ trace_lockcnt_fast_path_attempt(lockcnt, expected, new_if_free);
+ *val = qatomic_cmpxchg(&lockcnt->count, expected, new_if_free);
+ if (*val == expected) {
+ trace_lockcnt_fast_path_success(lockcnt, expected, new_if_free);
+ *val = new_if_free;
+ return true;
+ }
+ }
+
+ /* The slow path moves from locked to waiting if necessary, then
+ * does a futex wait. Both steps can be repeated ad nauseam,
+ * only getting out of the loop if we can have another shot at the
+ * fast path. Once we can, get out to compute the new destination
+ * value for the fast path.
+ */
+ while ((*val & QEMU_LOCKCNT_STATE_MASK) != QEMU_LOCKCNT_STATE_FREE) {
+ if ((*val & QEMU_LOCKCNT_STATE_MASK) == QEMU_LOCKCNT_STATE_LOCKED) {
+ int expected = *val;
+ int new = expected - QEMU_LOCKCNT_STATE_LOCKED + QEMU_LOCKCNT_STATE_WAITING;
+
+ trace_lockcnt_futex_wait_prepare(lockcnt, expected, new);
+ *val = qatomic_cmpxchg(&lockcnt->count, expected, new);
+ if (*val == expected) {
+ *val = new;
+ }
+ continue;
+ }
+
+ if ((*val & QEMU_LOCKCNT_STATE_MASK) == QEMU_LOCKCNT_STATE_WAITING) {
+ *waited = true;
+ trace_lockcnt_futex_wait(lockcnt, *val);
+ qemu_futex_wait(&lockcnt->count, *val);
+ *val = qatomic_read(&lockcnt->count);
+ trace_lockcnt_futex_wait_resume(lockcnt, *val);
+ continue;
+ }
+
+ abort();
+ }
+ return false;
+}
+
+static void lockcnt_wake(QemuLockCnt *lockcnt)
+{
+ trace_lockcnt_futex_wake(lockcnt);
+ qemu_futex_wake(&lockcnt->count, 1);
+}
+
+void qemu_lockcnt_inc(QemuLockCnt *lockcnt)
+{
+ int val = qatomic_read(&lockcnt->count);
+ bool waited = false;
+
+ for (;;) {
+ if (val >= QEMU_LOCKCNT_COUNT_STEP) {
+ int expected = val;
+ val = qatomic_cmpxchg(&lockcnt->count, val,
+ val + QEMU_LOCKCNT_COUNT_STEP);
+ if (val == expected) {
+ break;
+ }
+ } else {
+ /* The fast path is (0, unlocked)->(1, unlocked). */
+ if (qemu_lockcnt_cmpxchg_or_wait(lockcnt, &val, QEMU_LOCKCNT_COUNT_STEP,
+ &waited)) {
+ break;
+ }
+ }
+ }
+
+ /* If we were woken by another thread, we should also wake one because
+ * we are effectively releasing the lock that was given to us. This is
+ * the case where qemu_lockcnt_lock would leave QEMU_LOCKCNT_STATE_WAITING
+ * in the low bits, and qemu_lockcnt_inc_and_unlock would find it and
+ * wake someone.
+ */
+ if (waited) {
+ lockcnt_wake(lockcnt);
+ }
+}
+
+void qemu_lockcnt_dec(QemuLockCnt *lockcnt)
+{
+ qatomic_sub(&lockcnt->count, QEMU_LOCKCNT_COUNT_STEP);
+}
+
+/* Decrement a counter, and return locked if it is decremented to zero.
+ * If the function returns true, it is impossible for the counter to
+ * become nonzero until the next qemu_lockcnt_unlock.
+ */
+bool qemu_lockcnt_dec_and_lock(QemuLockCnt *lockcnt)
+{
+ int val = qatomic_read(&lockcnt->count);
+ int locked_state = QEMU_LOCKCNT_STATE_LOCKED;
+ bool waited = false;
+
+ for (;;) {
+ if (val >= 2 * QEMU_LOCKCNT_COUNT_STEP) {
+ int expected = val;
+ val = qatomic_cmpxchg(&lockcnt->count, val,
+ val - QEMU_LOCKCNT_COUNT_STEP);
+ if (val == expected) {
+ break;
+ }
+ } else {
+ /* If count is going 1->0, take the lock. The fast path is
+ * (1, unlocked)->(0, locked) or (1, unlocked)->(0, waiting).
+ */
+ if (qemu_lockcnt_cmpxchg_or_wait(lockcnt, &val, locked_state, &waited)) {
+ return true;
+ }
+
+ if (waited) {
+ /* At this point we do not know if there are more waiters. Assume
+ * there are.
+ */
+ locked_state = QEMU_LOCKCNT_STATE_WAITING;
+ }
+ }
+ }
+
+ /* If we were woken by another thread, but we're returning in unlocked
+ * state, we should also wake a thread because we are effectively
+ * releasing the lock that was given to us. This is the case where
+ * qemu_lockcnt_lock would leave QEMU_LOCKCNT_STATE_WAITING in the low
+ * bits, and qemu_lockcnt_unlock would find it and wake someone.
+ */
+ if (waited) {
+ lockcnt_wake(lockcnt);
+ }
+ return false;
+}
+
+/* If the counter is one, decrement it and return locked. Otherwise do
+ * nothing.
+ *
+ * If the function returns true, it is impossible for the counter to
+ * become nonzero until the next qemu_lockcnt_unlock.
+ */
+bool qemu_lockcnt_dec_if_lock(QemuLockCnt *lockcnt)
+{
+ int val = qatomic_read(&lockcnt->count);
+ int locked_state = QEMU_LOCKCNT_STATE_LOCKED;
+ bool waited = false;
+
+ while (val < 2 * QEMU_LOCKCNT_COUNT_STEP) {
+ /* If count is going 1->0, take the lock. The fast path is
+ * (1, unlocked)->(0, locked) or (1, unlocked)->(0, waiting).
+ */
+ if (qemu_lockcnt_cmpxchg_or_wait(lockcnt, &val, locked_state, &waited)) {
+ return true;
+ }
+
+ if (waited) {
+ /* At this point we do not know if there are more waiters. Assume
+ * there are.
+ */
+ locked_state = QEMU_LOCKCNT_STATE_WAITING;
+ }
+ }
+
+ /* If we were woken by another thread, but we're returning in unlocked
+ * state, we should also wake a thread because we are effectively
+ * releasing the lock that was given to us. This is the case where
+ * qemu_lockcnt_lock would leave QEMU_LOCKCNT_STATE_WAITING in the low
+ * bits, and qemu_lockcnt_inc_and_unlock would find it and wake someone.
+ */
+ if (waited) {
+ lockcnt_wake(lockcnt);
+ }
+ return false;
+}
+
+void qemu_lockcnt_lock(QemuLockCnt *lockcnt)
+{
+ int val = qatomic_read(&lockcnt->count);
+ int step = QEMU_LOCKCNT_STATE_LOCKED;
+ bool waited = false;
+
+ /* The third argument is only used if the low bits of val are 0
+ * (QEMU_LOCKCNT_STATE_FREE), so just blindly mix in the desired
+ * state.
+ */
+ while (!qemu_lockcnt_cmpxchg_or_wait(lockcnt, &val, val + step, &waited)) {
+ if (waited) {
+ /* At this point we do not know if there are more waiters. Assume
+ * there are.
+ */
+ step = QEMU_LOCKCNT_STATE_WAITING;
+ }
+ }
+}
+
+void qemu_lockcnt_inc_and_unlock(QemuLockCnt *lockcnt)
+{
+ int expected, new, val;
+
+ val = qatomic_read(&lockcnt->count);
+ do {
+ expected = val;
+ new = (val + QEMU_LOCKCNT_COUNT_STEP) & ~QEMU_LOCKCNT_STATE_MASK;
+ trace_lockcnt_unlock_attempt(lockcnt, val, new);
+ val = qatomic_cmpxchg(&lockcnt->count, val, new);
+ } while (val != expected);
+
+ trace_lockcnt_unlock_success(lockcnt, val, new);
+ if (val & QEMU_LOCKCNT_STATE_WAITING) {
+ lockcnt_wake(lockcnt);
+ }
+}
+
+void qemu_lockcnt_unlock(QemuLockCnt *lockcnt)
+{
+ int expected, new, val;
+
+ val = qatomic_read(&lockcnt->count);
+ do {
+ expected = val;
+ new = val & ~QEMU_LOCKCNT_STATE_MASK;
+ trace_lockcnt_unlock_attempt(lockcnt, val, new);
+ val = qatomic_cmpxchg(&lockcnt->count, val, new);
+ } while (val != expected);
+
+ trace_lockcnt_unlock_success(lockcnt, val, new);
+ if (val & QEMU_LOCKCNT_STATE_WAITING) {
+ lockcnt_wake(lockcnt);
+ }
+}
+
+unsigned qemu_lockcnt_count(QemuLockCnt *lockcnt)
+{
+ return qatomic_read(&lockcnt->count) >> QEMU_LOCKCNT_COUNT_SHIFT;
+}
+#else
+void qemu_lockcnt_init(QemuLockCnt *lockcnt)
+{
+ qemu_mutex_init(&lockcnt->mutex);
+ lockcnt->count = 0;
+}
+
+void qemu_lockcnt_destroy(QemuLockCnt *lockcnt)
+{
+ qemu_mutex_destroy(&lockcnt->mutex);
+}
+
+void qemu_lockcnt_inc(QemuLockCnt *lockcnt)
+{
+ int old;
+ for (;;) {
+ old = qatomic_read(&lockcnt->count);
+ if (old == 0) {
+ qemu_lockcnt_lock(lockcnt);
+ qemu_lockcnt_inc_and_unlock(lockcnt);
+ return;
+ } else {
+ if (qatomic_cmpxchg(&lockcnt->count, old, old + 1) == old) {
+ return;
+ }
+ }
+ }
+}
+
+void qemu_lockcnt_dec(QemuLockCnt *lockcnt)
+{
+ qatomic_dec(&lockcnt->count);
+}
+
+/* Decrement a counter, and return locked if it is decremented to zero.
+ * It is impossible for the counter to become nonzero while the mutex
+ * is taken.
+ */
+bool qemu_lockcnt_dec_and_lock(QemuLockCnt *lockcnt)
+{
+ int val = qatomic_read(&lockcnt->count);
+ while (val > 1) {
+ int old = qatomic_cmpxchg(&lockcnt->count, val, val - 1);
+ if (old != val) {
+ val = old;
+ continue;
+ }
+
+ return false;
+ }
+
+ qemu_lockcnt_lock(lockcnt);
+ if (qatomic_fetch_dec(&lockcnt->count) == 1) {
+ return true;
+ }
+
+ qemu_lockcnt_unlock(lockcnt);
+ return false;
+}
+
+/* Decrement a counter and return locked if it is decremented to zero.
+ * Otherwise do nothing.
+ *
+ * It is impossible for the counter to become nonzero while the mutex
+ * is taken.
+ */
+bool qemu_lockcnt_dec_if_lock(QemuLockCnt *lockcnt)
+{
+ /* No need for acquire semantics if we return false. */
+ int val = qatomic_read(&lockcnt->count);
+ if (val > 1) {
+ return false;
+ }
+
+ qemu_lockcnt_lock(lockcnt);
+ if (qatomic_fetch_dec(&lockcnt->count) == 1) {
+ return true;
+ }
+
+ qemu_lockcnt_inc_and_unlock(lockcnt);
+ return false;
+}
+
+void qemu_lockcnt_lock(QemuLockCnt *lockcnt)
+{
+ qemu_mutex_lock(&lockcnt->mutex);
+}
+
+void qemu_lockcnt_inc_and_unlock(QemuLockCnt *lockcnt)
+{
+ qatomic_inc(&lockcnt->count);
+ qemu_mutex_unlock(&lockcnt->mutex);
+}
+
+void qemu_lockcnt_unlock(QemuLockCnt *lockcnt)
+{
+ qemu_mutex_unlock(&lockcnt->mutex);
+}
+
+unsigned qemu_lockcnt_count(QemuLockCnt *lockcnt)
+{
+ return qatomic_read(&lockcnt->count);
+}
+#endif
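+
+/*
+ * Illustrative usage sketch (not part of this patch), following the
+ * visit/remove pattern this API is meant for; visit() and free_node()
+ * are hypothetical:
+ *
+ *     // reader: keep the structure alive while visiting it
+ *     qemu_lockcnt_inc(&lc);
+ *     visit(node);
+ *     qemu_lockcnt_dec(&lc);
+ *
+ *     // writer: free only once no readers remain
+ *     if (qemu_lockcnt_dec_and_lock(&lc)) {
+ *         free_node(node);
+ *         qemu_lockcnt_unlock(&lc);
+ *     }
+ */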
diff --git a/util/log.c b/util/log.c
new file mode 100644
index 000000000..2ee1500be
--- /dev/null
+++ b/util/log.c
@@ -0,0 +1,389 @@
+/*
+ * Logging support
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qemu/range.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "qemu/cutils.h"
+#include "trace/control.h"
+#include "qemu/thread.h"
+#include "qemu/lockable.h"
+
+static char *logfilename;
+static QemuMutex qemu_logfile_mutex;
+QemuLogFile *qemu_logfile;
+int qemu_loglevel;
+static int log_append = 0;
+static GArray *debug_regions;
+
+/* Return the number of characters emitted. */
+int qemu_log(const char *fmt, ...)
+{
+ int ret = 0;
+ QemuLogFile *logfile;
+
+ rcu_read_lock();
+ logfile = qatomic_rcu_read(&qemu_logfile);
+ if (logfile) {
+ va_list ap;
+ va_start(ap, fmt);
+ ret = vfprintf(logfile->fd, fmt, ap);
+ va_end(ap);
+
+ /* Don't pass back error results. */
+ if (ret < 0) {
+ ret = 0;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
+static void __attribute__((__constructor__)) qemu_logfile_init(void)
+{
+ qemu_mutex_init(&qemu_logfile_mutex);
+}
+
+static void qemu_logfile_free(QemuLogFile *logfile)
+{
+ g_assert(logfile);
+
+ if (logfile->fd != stderr) {
+ fclose(logfile->fd);
+ }
+ g_free(logfile);
+}
+
+static bool log_uses_own_buffers;
+
+/* Enable or disable low-level logging */
+void qemu_set_log(int log_flags)
+{
+ bool need_to_open_file = false;
+ QemuLogFile *logfile;
+
+ qemu_loglevel = log_flags;
+#ifdef CONFIG_TRACE_LOG
+ qemu_loglevel |= LOG_TRACE;
+#endif
+ /*
+ * In all cases we only log if qemu_loglevel is set.
+ * Also:
+ * If not daemonized we will always log either to stderr
+ * or to a file (if there is a logfilename).
+ * If we are daemonized,
+ * we will only log if there is a logfilename.
+ */
+ if (qemu_loglevel && (!is_daemonized() || logfilename)) {
+ need_to_open_file = true;
+ }
+ QEMU_LOCK_GUARD(&qemu_logfile_mutex);
+ if (qemu_logfile && !need_to_open_file) {
+ logfile = qemu_logfile;
+ qatomic_rcu_set(&qemu_logfile, NULL);
+ call_rcu(logfile, qemu_logfile_free, rcu);
+ } else if (!qemu_logfile && need_to_open_file) {
+ logfile = g_new0(QemuLogFile, 1);
+ if (logfilename) {
+ logfile->fd = fopen(logfilename, log_append ? "a" : "w");
+ if (!logfile->fd) {
+ g_free(logfile);
+ perror(logfilename);
+ _exit(1);
+ }
+ /* In case we are a daemon, redirect stderr to the logfile */
+ if (is_daemonized()) {
+ dup2(fileno(logfile->fd), STDERR_FILENO);
+ fclose(logfile->fd);
+ /* This will skip closing logfile in qemu_log_close() */
+ logfile->fd = stderr;
+ }
+ } else {
+ /* Default to stderr if no log file specified */
+ assert(!is_daemonized());
+ logfile->fd = stderr;
+ }
+ /* We must avoid glibc's mmap()-based stdio buffering, so set a buffer "by hand" */
+ if (log_uses_own_buffers) {
+ static char logfile_buf[4096];
+
+ setvbuf(logfile->fd, logfile_buf, _IOLBF, sizeof(logfile_buf));
+ } else {
+#if defined(_WIN32)
+ /* Win32 doesn't support line-buffering, so use unbuffered output. */
+ setvbuf(logfile->fd, NULL, _IONBF, 0);
+#else
+ setvbuf(logfile->fd, NULL, _IOLBF, 0);
+#endif
+ log_append = 1;
+ }
+ qatomic_rcu_set(&qemu_logfile, logfile);
+ }
+}
+
+void qemu_log_needs_buffers(void)
+{
+ log_uses_own_buffers = true;
+}
+
+/*
+ * Allow the user to include %d in the logfile name; it will be
+ * substituted with the current PID. This is useful for debugging many
+ * nested linux-user tasks but will result in lots of logs.
+ *
+ * @filename may be NULL. In that case, log output is sent to stderr.
+ */
+void qemu_set_log_filename(const char *filename, Error **errp)
+{
+ g_free(logfilename);
+ logfilename = NULL;
+
+ if (filename) {
+ char *pidstr = strstr(filename, "%");
+ if (pidstr) {
+ /* We only accept one %d, no other format strings */
+ if (pidstr[1] != 'd' || strchr(pidstr + 2, '%')) {
+ error_setg(errp, "Bad logfile format: %s", filename);
+ return;
+ } else {
+ logfilename = g_strdup_printf(filename, getpid());
+ }
+ } else {
+ logfilename = g_strdup(filename);
+ }
+ }
+
+ qemu_log_close();
+ qemu_set_log(qemu_loglevel);
+}
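+
+/*
+ * Example (illustrative, not part of this patch):
+ *
+ *     qemu_set_log_filename("/tmp/qemu-%d.log", &error_fatal);
+ *
+ * opens e.g. /tmp/qemu-12345.log when the current PID is 12345.
+ */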
+
+/*
+ * Return true if @addr is within our debug filter, or if no filter
+ * is defined.
+ */
+bool qemu_log_in_addr_range(uint64_t addr)
+{
+ if (debug_regions) {
+ int i = 0;
+ for (i = 0; i < debug_regions->len; i++) {
+ Range *range = &g_array_index(debug_regions, Range, i);
+ if (range_contains(range, addr)) {
+ return true;
+ }
+ }
+ return false;
+ } else {
+ return true;
+ }
+}
+
+
+void qemu_set_dfilter_ranges(const char *filter_spec, Error **errp)
+{
+ gchar **ranges = g_strsplit(filter_spec, ",", 0);
+ int i;
+
+ if (debug_regions) {
+ g_array_unref(debug_regions);
+ debug_regions = NULL;
+ }
+
+ debug_regions = g_array_sized_new(FALSE, FALSE,
+ sizeof(Range), g_strv_length(ranges));
+ for (i = 0; ranges[i]; i++) {
+ const char *r = ranges[i];
+ const char *range_op, *r2, *e;
+ uint64_t r1val, r2val, lob, upb;
+ struct Range range;
+
+ range_op = strstr(r, "-");
+ r2 = range_op ? range_op + 1 : NULL;
+ if (!range_op) {
+ range_op = strstr(r, "+");
+ r2 = range_op ? range_op + 1 : NULL;
+ }
+ if (!range_op) {
+ range_op = strstr(r, "..");
+ r2 = range_op ? range_op + 2 : NULL;
+ }
+ if (!range_op) {
+ error_setg(errp, "Bad range specifier");
+ goto out;
+ }
+
+ if (qemu_strtou64(r, &e, 0, &r1val)
+ || e != range_op) {
+ error_setg(errp, "Invalid number to the left of %.*s",
+ (int)(r2 - range_op), range_op);
+ goto out;
+ }
+ if (qemu_strtou64(r2, NULL, 0, &r2val)) {
+ error_setg(errp, "Invalid number to the right of %.*s",
+ (int)(r2 - range_op), range_op);
+ goto out;
+ }
+
+ switch (*range_op) {
+ case '+':
+ lob = r1val;
+ upb = r1val + r2val - 1;
+ break;
+ case '-':
+ upb = r1val;
+ lob = r1val - (r2val - 1);
+ break;
+ case '.':
+ lob = r1val;
+ upb = r2val;
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ if (lob > upb) {
+ error_setg(errp, "Invalid range");
+ goto out;
+ }
+ range_set_bounds(&range, lob, upb);
+ g_array_append_val(debug_regions, range);
+ }
+out:
+ g_strfreev(ranges);
+}
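+
+/*
+ * Illustrative specs accepted above (not part of this patch):
+ *
+ *     "0x1000+0x100"      // addresses 0x1000 through 0x10ff
+ *     "0x2000-0x100"      // addresses 0x1f01 through 0x2000
+ *     "0x3000..0x3fff"    // inclusive bounds
+ */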
+
+/* fflush() the log file */
+void qemu_log_flush(void)
+{
+ QemuLogFile *logfile;
+
+ rcu_read_lock();
+ logfile = qatomic_rcu_read(&qemu_logfile);
+ if (logfile) {
+ fflush(logfile->fd);
+ }
+ rcu_read_unlock();
+}
+
+/* Close the log file */
+void qemu_log_close(void)
+{
+ QemuLogFile *logfile;
+
+ qemu_mutex_lock(&qemu_logfile_mutex);
+ logfile = qemu_logfile;
+
+ if (logfile) {
+ qatomic_rcu_set(&qemu_logfile, NULL);
+ call_rcu(logfile, qemu_logfile_free, rcu);
+ }
+ qemu_mutex_unlock(&qemu_logfile_mutex);
+}
+
+const QEMULogItem qemu_log_items[] = {
+ { CPU_LOG_TB_OUT_ASM, "out_asm",
+ "show generated host assembly code for each compiled TB" },
+ { CPU_LOG_TB_IN_ASM, "in_asm",
+ "show target assembly code for each compiled TB" },
+ { CPU_LOG_TB_OP, "op",
+ "show micro ops for each compiled TB" },
+ { CPU_LOG_TB_OP_OPT, "op_opt",
+ "show micro ops after optimization" },
+ { CPU_LOG_TB_OP_IND, "op_ind",
+ "show micro ops before indirect lowering" },
+ { CPU_LOG_INT, "int",
+ "show interrupts/exceptions in short format" },
+ { CPU_LOG_EXEC, "exec",
+ "show trace before each executed TB (lots of logs)" },
+ { CPU_LOG_TB_CPU, "cpu",
+ "show CPU registers before entering a TB (lots of logs)" },
+ { CPU_LOG_TB_FPU, "fpu",
+ "include FPU registers in the 'cpu' logging" },
+ { CPU_LOG_MMU, "mmu",
+ "log MMU-related activities" },
+ { CPU_LOG_PCALL, "pcall",
+ "x86 only: show protected mode far calls/returns/exceptions" },
+ { CPU_LOG_RESET, "cpu_reset",
+ "show CPU state before CPU resets" },
+ { LOG_UNIMP, "unimp",
+ "log unimplemented functionality" },
+ { LOG_GUEST_ERROR, "guest_errors",
+ "log when the guest OS does something invalid (eg accessing a\n"
+ "non-existent register)" },
+ { CPU_LOG_PAGE, "page",
+ "dump pages at beginning of user mode emulation" },
+ { CPU_LOG_TB_NOCHAIN, "nochain",
+ "do not chain compiled TBs so that \"exec\" and \"cpu\" show\n"
+ "complete traces" },
+#ifdef CONFIG_PLUGIN
+ { CPU_LOG_PLUGIN, "plugin", "output from TCG plugins\n"},
+#endif
+ { LOG_STRACE, "strace",
+ "log every user-mode syscall, its input, and its result" },
+ { 0, NULL, NULL },
+};
+
+/* Take a comma-separated list of log masks. Return 0 on error. */
+int qemu_str_to_log_mask(const char *str)
+{
+ const QEMULogItem *item;
+ int mask = 0;
+ char **parts = g_strsplit(str, ",", 0);
+ char **tmp;
+
+ for (tmp = parts; tmp && *tmp; tmp++) {
+ if (g_str_equal(*tmp, "all")) {
+ for (item = qemu_log_items; item->mask != 0; item++) {
+ mask |= item->mask;
+ }
+#ifdef CONFIG_TRACE_LOG
+ } else if (g_str_has_prefix(*tmp, "trace:") && (*tmp)[6] != '\0') {
+ trace_enable_events((*tmp) + 6);
+ mask |= LOG_TRACE;
+#endif
+ } else {
+ for (item = qemu_log_items; item->mask != 0; item++) {
+ if (g_str_equal(*tmp, item->name)) {
+ goto found;
+ }
+ }
+ goto error;
+ found:
+ mask |= item->mask;
+ }
+ }
+
+ g_strfreev(parts);
+ return mask;
+
+ error:
+ g_strfreev(parts);
+ return 0;
+}
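+
+/*
+ * Examples (illustrative, not part of this patch):
+ *     qemu_str_to_log_mask("int,mmu") == CPU_LOG_INT | CPU_LOG_MMU
+ *     qemu_str_to_log_mask("bogus")   == 0
+ */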
+
+void qemu_print_log_usage(FILE *f)
+{
+ const QEMULogItem *item;
+ fprintf(f, "Log items (comma separated):\n");
+ for (item = qemu_log_items; item->mask != 0; item++) {
+ fprintf(f, "%-15s %s\n", item->name, item->help);
+ }
+#ifdef CONFIG_TRACE_LOG
+ fprintf(f, "trace:PATTERN enable trace events\n");
+ fprintf(f, "\nUse \"-d trace:help\" to get a list of trace events.\n\n");
+#endif
+}
diff --git a/util/main-loop.c b/util/main-loop.c
new file mode 100644
index 000000000..06b18b195
--- /dev/null
+++ b/util/main-loop.c
@@ -0,0 +1,594 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/cutils.h"
+#include "qemu/timer.h"
+#include "sysemu/cpu-timers.h"
+#include "sysemu/replay.h"
+#include "qemu/main-loop.h"
+#include "block/aio.h"
+#include "qemu/error-report.h"
+#include "qemu/queue.h"
+#include "qemu/compiler.h"
+
+#ifndef _WIN32
+#include <sys/wait.h>
+#endif
+
+#ifndef _WIN32
+
+/* If we have signalfd, we mask out the signals we want to handle and then
+ * use signalfd to listen for them. We rely on whatever the current signal
+ * handler is to dispatch the signals when we receive them.
+ */
+/*
+ * Disable CFI checks.
+ * We are going to call a signal handler directly. Such a handler may or may not
+ * have been defined in our binary, so there's no guarantee that the pointer
+ * used to set the handler is a cfi-valid pointer. Since the handlers are
+ * stored in kernel memory, changing the handler to an attacker-defined
+ * function requires being able to call a sigaction() syscall,
+ * which is not as easy as overwriting a pointer in memory.
+ */
+QEMU_DISABLE_CFI
+static void sigfd_handler(void *opaque)
+{
+ int fd = (intptr_t)opaque;
+ struct qemu_signalfd_siginfo info;
+ struct sigaction action;
+ ssize_t len;
+
+ while (1) {
+ do {
+ len = read(fd, &info, sizeof(info));
+ } while (len == -1 && errno == EINTR);
+
+ if (len == -1 && errno == EAGAIN) {
+ break;
+ }
+
+ if (len != sizeof(info)) {
+ error_report("read from sigfd returned %zd: %s", len,
+ g_strerror(errno));
+ return;
+ }
+
+ sigaction(info.ssi_signo, NULL, &action);
+ if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) {
+ sigaction_invoke(&action, &info);
+ } else if (action.sa_handler) {
+ action.sa_handler(info.ssi_signo);
+ }
+ }
+}
+
+static int qemu_signal_init(Error **errp)
+{
+ int sigfd;
+ sigset_t set;
+
+ /*
+ * SIG_IPI must be blocked in the main thread and must not be caught
+ * by sigwait() in the signal thread. Otherwise, the cpu thread will
+ * not catch it reliably.
+ */
+ sigemptyset(&set);
+ sigaddset(&set, SIG_IPI);
+ sigaddset(&set, SIGIO);
+ sigaddset(&set, SIGALRM);
+ sigaddset(&set, SIGBUS);
+ /* SIGINT is deliberately not handled via signalfd, so that ^C can be
+ * used to interrupt QEMU when it is being run under gdb. SIGHUP and
+ * SIGTERM are also handled asynchronously, even though it is not
+ * strictly necessary, because they use the same handler as SIGINT.
+ */
+ pthread_sigmask(SIG_BLOCK, &set, NULL);
+
+ sigdelset(&set, SIG_IPI);
+ sigfd = qemu_signalfd(&set);
+ if (sigfd == -1) {
+ error_setg_errno(errp, errno, "failed to create signalfd");
+ return -errno;
+ }
+
+ fcntl_setfl(sigfd, O_NONBLOCK);
+
+ qemu_set_fd_handler(sigfd, sigfd_handler, NULL, (void *)(intptr_t)sigfd);
+
+ return 0;
+}
+
+#else /* _WIN32 */
+
+static int qemu_signal_init(Error **errp)
+{
+ return 0;
+}
+#endif
+
+static AioContext *qemu_aio_context;
+static QEMUBH *qemu_notify_bh;
+
+static void notify_event_cb(void *opaque)
+{
+ /* No need to do anything; this bottom half is only used to
+ * kick the main loop out of ppoll()/poll()/WaitForMultipleObjects().
+ */
+}
+
+AioContext *qemu_get_aio_context(void)
+{
+ return qemu_aio_context;
+}
+
+void qemu_notify_event(void)
+{
+ if (!qemu_aio_context) {
+ return;
+ }
+ qemu_bh_schedule(qemu_notify_bh);
+}
+
+static GArray *gpollfds;
+
+int qemu_init_main_loop(Error **errp)
+{
+ int ret;
+ GSource *src;
+
+ init_clocks(qemu_timer_notify_cb);
+
+ ret = qemu_signal_init(errp);
+ if (ret) {
+ return ret;
+ }
+
+ qemu_aio_context = aio_context_new(errp);
+ if (!qemu_aio_context) {
+ return -EMFILE;
+ }
+ qemu_set_current_aio_context(qemu_aio_context);
+ qemu_notify_bh = qemu_bh_new(notify_event_cb, NULL);
+ gpollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD));
+ src = aio_get_g_source(qemu_aio_context);
+ g_source_set_name(src, "aio-context");
+ g_source_attach(src, NULL);
+ g_source_unref(src);
+ src = iohandler_get_g_source();
+ g_source_set_name(src, "io-handler");
+ g_source_attach(src, NULL);
+ g_source_unref(src);
+ return 0;
+}
+
+static int max_priority;
+
+#ifndef _WIN32
+static int glib_pollfds_idx;
+static int glib_n_poll_fds;
+
+void qemu_fd_register(int fd)
+{
+}
+
+static void glib_pollfds_fill(int64_t *cur_timeout)
+{
+ GMainContext *context = g_main_context_default();
+ int timeout = 0;
+ int64_t timeout_ns;
+ int n;
+
+ g_main_context_prepare(context, &max_priority);
+
+ glib_pollfds_idx = gpollfds->len;
+ n = glib_n_poll_fds;
+ do {
+ GPollFD *pfds;
+ glib_n_poll_fds = n;
+ g_array_set_size(gpollfds, glib_pollfds_idx + glib_n_poll_fds);
+ pfds = &g_array_index(gpollfds, GPollFD, glib_pollfds_idx);
+ n = g_main_context_query(context, max_priority, &timeout, pfds,
+ glib_n_poll_fds);
+ } while (n != glib_n_poll_fds);
+
+ if (timeout < 0) {
+ timeout_ns = -1;
+ } else {
+ timeout_ns = (int64_t)timeout * (int64_t)SCALE_MS;
+ }
+
+ *cur_timeout = qemu_soonest_timeout(timeout_ns, *cur_timeout);
+}
+
+static void glib_pollfds_poll(void)
+{
+ GMainContext *context = g_main_context_default();
+ GPollFD *pfds = &g_array_index(gpollfds, GPollFD, glib_pollfds_idx);
+
+ if (g_main_context_check(context, max_priority, pfds, glib_n_poll_fds)) {
+ g_main_context_dispatch(context);
+ }
+}
+
+#define MAX_MAIN_LOOP_SPIN (1000)
+
+static int os_host_main_loop_wait(int64_t timeout)
+{
+ GMainContext *context = g_main_context_default();
+ int ret;
+
+ g_main_context_acquire(context);
+
+ glib_pollfds_fill(&timeout);
+
+ qemu_mutex_unlock_iothread();
+ replay_mutex_unlock();
+
+ ret = qemu_poll_ns((GPollFD *)gpollfds->data, gpollfds->len, timeout);
+
+ replay_mutex_lock();
+ qemu_mutex_lock_iothread();
+
+ glib_pollfds_poll();
+
+ g_main_context_release(context);
+
+ return ret;
+}
+#else
+/***********************************************************/
+/* Polling handling */
+
+typedef struct PollingEntry {
+ PollingFunc *func;
+ void *opaque;
+ struct PollingEntry *next;
+} PollingEntry;
+
+static PollingEntry *first_polling_entry;
+
+int qemu_add_polling_cb(PollingFunc *func, void *opaque)
+{
+ PollingEntry **ppe, *pe;
+ pe = g_malloc0(sizeof(PollingEntry));
+ pe->func = func;
+ pe->opaque = opaque;
+ for (ppe = &first_polling_entry; *ppe != NULL; ppe = &(*ppe)->next);
+ *ppe = pe;
+ return 0;
+}
+
+void qemu_del_polling_cb(PollingFunc *func, void *opaque)
+{
+ PollingEntry **ppe, *pe;
+ for (ppe = &first_polling_entry; *ppe != NULL; ppe = &(*ppe)->next) {
+ pe = *ppe;
+ if (pe->func == func && pe->opaque == opaque) {
+ *ppe = pe->next;
+ g_free(pe);
+ break;
+ }
+ }
+}
+
+/***********************************************************/
+/* Wait objects support */
+typedef struct WaitObjects {
+ int num;
+ int revents[MAXIMUM_WAIT_OBJECTS + 1];
+ HANDLE events[MAXIMUM_WAIT_OBJECTS + 1];
+ WaitObjectFunc *func[MAXIMUM_WAIT_OBJECTS + 1];
+ void *opaque[MAXIMUM_WAIT_OBJECTS + 1];
+} WaitObjects;
+
+static WaitObjects wait_objects = {0};
+
+int qemu_add_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
+{
+ WaitObjects *w = &wait_objects;
+ if (w->num >= MAXIMUM_WAIT_OBJECTS) {
+ return -1;
+ }
+ w->events[w->num] = handle;
+ w->func[w->num] = func;
+ w->opaque[w->num] = opaque;
+ w->revents[w->num] = 0;
+ w->num++;
+ return 0;
+}
+
+void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque)
+{
+ int i, found;
+ WaitObjects *w = &wait_objects;
+
+ found = 0;
+ for (i = 0; i < w->num; i++) {
+ if (w->events[i] == handle) {
+ found = 1;
+ }
+ if (found) {
+ w->events[i] = w->events[i + 1];
+ w->func[i] = w->func[i + 1];
+ w->opaque[i] = w->opaque[i + 1];
+ w->revents[i] = w->revents[i + 1];
+ }
+ }
+ if (found) {
+ w->num--;
+ }
+}
+
+void qemu_fd_register(int fd)
+{
+ WSAEventSelect(fd, event_notifier_get_handle(&qemu_aio_context->notifier),
+ FD_READ | FD_ACCEPT | FD_CLOSE |
+ FD_CONNECT | FD_WRITE | FD_OOB);
+}
+
+static int pollfds_fill(GArray *pollfds, fd_set *rfds, fd_set *wfds,
+ fd_set *xfds)
+{
+ int nfds = -1;
+ int i;
+
+ for (i = 0; i < pollfds->len; i++) {
+ GPollFD *pfd = &g_array_index(pollfds, GPollFD, i);
+ int fd = pfd->fd;
+ int events = pfd->events;
+ if (events & G_IO_IN) {
+ FD_SET(fd, rfds);
+ nfds = MAX(nfds, fd);
+ }
+ if (events & G_IO_OUT) {
+ FD_SET(fd, wfds);
+ nfds = MAX(nfds, fd);
+ }
+ if (events & G_IO_PRI) {
+ FD_SET(fd, xfds);
+ nfds = MAX(nfds, fd);
+ }
+ }
+ return nfds;
+}
+
+static void pollfds_poll(GArray *pollfds, int nfds, fd_set *rfds,
+ fd_set *wfds, fd_set *xfds)
+{
+ int i;
+
+ for (i = 0; i < pollfds->len; i++) {
+ GPollFD *pfd = &g_array_index(pollfds, GPollFD, i);
+ int fd = pfd->fd;
+ int revents = 0;
+
+ if (FD_ISSET(fd, rfds)) {
+ revents |= G_IO_IN;
+ }
+ if (FD_ISSET(fd, wfds)) {
+ revents |= G_IO_OUT;
+ }
+ if (FD_ISSET(fd, xfds)) {
+ revents |= G_IO_PRI;
+ }
+ pfd->revents = revents & pfd->events;
+ }
+}
+
+static int os_host_main_loop_wait(int64_t timeout)
+{
+ GMainContext *context = g_main_context_default();
+ GPollFD poll_fds[1024 * 2]; /* this is probably overkill */
+ int select_ret = 0;
+ int g_poll_ret, ret, i, n_poll_fds;
+ PollingEntry *pe;
+ WaitObjects *w = &wait_objects;
+ gint poll_timeout;
+ int64_t poll_timeout_ns;
+ static struct timeval tv0;
+ fd_set rfds, wfds, xfds;
+ int nfds;
+
+ g_main_context_acquire(context);
+
+ /* XXX: need to suppress polling by better using win32 events */
+ ret = 0;
+ for (pe = first_polling_entry; pe != NULL; pe = pe->next) {
+ ret |= pe->func(pe->opaque);
+ }
+ if (ret != 0) {
+ g_main_context_release(context);
+ return ret;
+ }
+
+ FD_ZERO(&rfds);
+ FD_ZERO(&wfds);
+ FD_ZERO(&xfds);
+ nfds = pollfds_fill(gpollfds, &rfds, &wfds, &xfds);
+ if (nfds >= 0) {
+ select_ret = select(nfds + 1, &rfds, &wfds, &xfds, &tv0);
+ if (select_ret != 0) {
+ timeout = 0;
+ }
+ if (select_ret > 0) {
+ pollfds_poll(gpollfds, nfds, &rfds, &wfds, &xfds);
+ }
+ }
+
+ g_main_context_prepare(context, &max_priority);
+ n_poll_fds = g_main_context_query(context, max_priority, &poll_timeout,
+ poll_fds, ARRAY_SIZE(poll_fds));
+ g_assert(n_poll_fds + w->num <= ARRAY_SIZE(poll_fds));
+
+ for (i = 0; i < w->num; i++) {
+ poll_fds[n_poll_fds + i].fd = (DWORD_PTR)w->events[i];
+ poll_fds[n_poll_fds + i].events = G_IO_IN;
+ }
+
+ if (poll_timeout < 0) {
+ poll_timeout_ns = -1;
+ } else {
+ poll_timeout_ns = (int64_t)poll_timeout * (int64_t)SCALE_MS;
+ }
+
+ poll_timeout_ns = qemu_soonest_timeout(poll_timeout_ns, timeout);
+
+ qemu_mutex_unlock_iothread();
+
+ replay_mutex_unlock();
+
+ g_poll_ret = qemu_poll_ns(poll_fds, n_poll_fds + w->num, poll_timeout_ns);
+
+ replay_mutex_lock();
+
+ qemu_mutex_lock_iothread();
+ if (g_poll_ret > 0) {
+ for (i = 0; i < w->num; i++) {
+ w->revents[i] = poll_fds[n_poll_fds + i].revents;
+ }
+ for (i = 0; i < w->num; i++) {
+ if (w->revents[i] && w->func[i]) {
+ w->func[i](w->opaque[i]);
+ }
+ }
+ }
+
+ if (g_main_context_check(context, max_priority, poll_fds, n_poll_fds)) {
+ g_main_context_dispatch(context);
+ }
+
+ g_main_context_release(context);
+
+ return select_ret || g_poll_ret;
+}
+#endif
+
+static NotifierList main_loop_poll_notifiers =
+ NOTIFIER_LIST_INITIALIZER(main_loop_poll_notifiers);
+
+void main_loop_poll_add_notifier(Notifier *notify)
+{
+ notifier_list_add(&main_loop_poll_notifiers, notify);
+}
+
+void main_loop_poll_remove_notifier(Notifier *notify)
+{
+ notifier_remove(notify);
+}
+
+void main_loop_wait(int nonblocking)
+{
+ MainLoopPoll mlpoll = {
+ .state = MAIN_LOOP_POLL_FILL,
+ .timeout = UINT32_MAX,
+ .pollfds = gpollfds,
+ };
+ int ret;
+ int64_t timeout_ns;
+
+ if (nonblocking) {
+ mlpoll.timeout = 0;
+ }
+
+ /* poll any events */
+ g_array_set_size(gpollfds, 0); /* reset for new iteration */
+ /* XXX: separate device handlers from system ones */
+ notifier_list_notify(&main_loop_poll_notifiers, &mlpoll);
+
+ if (mlpoll.timeout == UINT32_MAX) {
+ timeout_ns = -1;
+ } else {
+ timeout_ns = (uint64_t)mlpoll.timeout * (int64_t)(SCALE_MS);
+ }
+
+ timeout_ns = qemu_soonest_timeout(timeout_ns,
+ timerlistgroup_deadline_ns(
+ &main_loop_tlg));
+
+ ret = os_host_main_loop_wait(timeout_ns);
+ mlpoll.state = ret < 0 ? MAIN_LOOP_POLL_ERR : MAIN_LOOP_POLL_OK;
+ notifier_list_notify(&main_loop_poll_notifiers, &mlpoll);
+
+ if (icount_enabled()) {
+ /*
+ * CPU thread can infinitely wait for event after
+ * missing the warp
+ */
+ icount_start_warp_timer();
+ }
+ qemu_clock_run_all_timers();
+}
+
+/* Functions to operate on the main QEMU AioContext. */
+
+QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name)
+{
+ return aio_bh_new_full(qemu_aio_context, cb, opaque, name);
+}
+
+/*
+ * Functions to operate on the I/O handler AioContext.
+ * This context runs on top of main loop. We can't reuse qemu_aio_context
+ * because iohandlers mustn't be polled by aio_poll(qemu_aio_context).
+ */
+static AioContext *iohandler_ctx;
+
+static void iohandler_init(void)
+{
+ if (!iohandler_ctx) {
+ iohandler_ctx = aio_context_new(&error_abort);
+ }
+}
+
+AioContext *iohandler_get_aio_context(void)
+{
+ iohandler_init();
+ return iohandler_ctx;
+}
+
+GSource *iohandler_get_g_source(void)
+{
+ iohandler_init();
+ return aio_get_g_source(iohandler_ctx);
+}
+
+void qemu_set_fd_handler(int fd,
+ IOHandler *fd_read,
+ IOHandler *fd_write,
+ void *opaque)
+{
+ iohandler_init();
+ aio_set_fd_handler(iohandler_ctx, fd, false,
+ fd_read, fd_write, NULL, opaque);
+}
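+
+/*
+ * Illustrative usage (not part of this patch); my_read_cb, pipe_fd and
+ * my_state are hypothetical:
+ *
+ *     static void my_read_cb(void *opaque)
+ *     {
+ *         // drain the descriptor referenced by opaque
+ *     }
+ *     ...
+ *     qemu_set_fd_handler(pipe_fd, my_read_cb, NULL, &my_state);
+ */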
+
+void event_notifier_set_handler(EventNotifier *e,
+ EventNotifierHandler *handler)
+{
+ iohandler_init();
+ aio_set_event_notifier(iohandler_ctx, e, false,
+ handler, NULL);
+}
diff --git a/util/memfd.c b/util/memfd.c
new file mode 100644
index 000000000..4a3c07e0b
--- /dev/null
+++ b/util/memfd.c
@@ -0,0 +1,206 @@
+/*
+ * memfd.c
+ *
+ * Copyright (c) 2015 Red Hat, Inc.
+ *
+ * QEMU library functions on POSIX which are shared between QEMU and
+ * the QEMU tools.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qapi/error.h"
+#include "qemu/memfd.h"
+#include "qemu/host-utils.h"
+
+#if defined CONFIG_LINUX && !defined CONFIG_MEMFD
+#include <sys/syscall.h>
+#include <asm/unistd.h>
+
+int memfd_create(const char *name, unsigned int flags)
+{
+#ifdef __NR_memfd_create
+ return syscall(__NR_memfd_create, name, flags);
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+}
+#endif
+
+int qemu_memfd_create(const char *name, size_t size, bool hugetlb,
+ uint64_t hugetlbsize, unsigned int seals, Error **errp)
+{
+ int htsize = hugetlbsize ? ctz64(hugetlbsize) : 0;
+
+ if (htsize && 1ULL << htsize != hugetlbsize) {
+ error_setg(errp, "Hugepage size must be a power of 2");
+ return -1;
+ }
+
+ htsize = htsize << MFD_HUGE_SHIFT;
+
+#ifdef CONFIG_LINUX
+ int mfd = -1;
+ unsigned int flags = MFD_CLOEXEC;
+
+ if (seals) {
+ flags |= MFD_ALLOW_SEALING;
+ }
+ if (hugetlb) {
+ flags |= MFD_HUGETLB;
+ flags |= htsize;
+ }
+ mfd = memfd_create(name, flags);
+ if (mfd < 0) {
+ error_setg_errno(errp, errno,
+ "failed to create memfd with flags 0x%x", flags);
+ goto err;
+ }
+
+ if (ftruncate(mfd, size) == -1) {
+ error_setg_errno(errp, errno, "failed to resize memfd to %zu", size);
+ goto err;
+ }
+
+ if (seals && fcntl(mfd, F_ADD_SEALS, seals) == -1) {
+ error_setg_errno(errp, errno, "failed to add seals 0x%x", seals);
+ goto err;
+ }
+
+ return mfd;
+
+err:
+ if (mfd >= 0) {
+ close(mfd);
+ }
+#else
+ error_setg_errno(errp, ENOSYS, "failed to create memfd");
+#endif
+ return -1;
+}
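+
+/*
+ * Illustrative call (not part of this patch): a sealable 16 MiB memfd
+ * backed by 2 MiB huge pages; the name and seal are made up:
+ *
+ *     int fd = qemu_memfd_create("ram", 16 * 1024 * 1024, true,
+ *                                2 * 1024 * 1024, F_SEAL_GROW,
+ *                                &error_fatal);
+ */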
+
+/*
+ * This is a best-effort helper for shared memory allocation, with
+ * optional sealing. The helper will do its best to allocate using
+ * memfd with sealing, but may fall back to other methods without
+ * sealing.
+ */
+void *qemu_memfd_alloc(const char *name, size_t size, unsigned int seals,
+ int *fd, Error **errp)
+{
+ void *ptr;
+ int mfd = qemu_memfd_create(name, size, false, 0, seals, NULL);
+
+ /* some systems have memfd without sealing */
+ if (mfd == -1) {
+ mfd = qemu_memfd_create(name, size, false, 0, 0, NULL);
+ }
+
+ if (mfd == -1) {
+ const char *tmpdir = g_get_tmp_dir();
+ gchar *fname;
+
+ fname = g_strdup_printf("%s/memfd-XXXXXX", tmpdir);
+ mfd = mkstemp(fname);
+ unlink(fname);
+ g_free(fname);
+
+ if (mfd == -1 ||
+ ftruncate(mfd, size) == -1) {
+ goto err;
+ }
+ }
+
+ ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0);
+ if (ptr == MAP_FAILED) {
+ goto err;
+ }
+
+ *fd = mfd;
+ return ptr;
+
+err:
+ error_setg_errno(errp, errno, "failed to allocate shared memory");
+ if (mfd >= 0) {
+ close(mfd);
+ }
+ return NULL;
+}
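+
+/*
+ * Illustrative usage (not part of this patch); the name and seals are
+ * made up:
+ *
+ *     int fd = -1;
+ *     void *p = qemu_memfd_alloc("scratch", 4096,
+ *                                F_SEAL_GROW | F_SEAL_SHRINK,
+ *                                &fd, &error_fatal);
+ *     ...
+ *     qemu_memfd_free(p, 4096, fd);
+ */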
+
+void qemu_memfd_free(void *ptr, size_t size, int fd)
+{
+ if (ptr) {
+ munmap(ptr, size);
+ }
+
+ if (fd != -1) {
+ close(fd);
+ }
+}
+
+enum {
+ MEMFD_KO,
+ MEMFD_OK,
+ MEMFD_TODO
+};
+
+/**
+ * qemu_memfd_alloc_check():
+ *
+ * Check if qemu_memfd_alloc() can allocate, including using a
+ * fallback implementation when the host doesn't support memfd.
+ */
+bool qemu_memfd_alloc_check(void)
+{
+ static int memfd_check = MEMFD_TODO;
+
+ if (memfd_check == MEMFD_TODO) {
+ int fd;
+ void *ptr;
+
+ fd = -1;
+ ptr = qemu_memfd_alloc("test", 4096, 0, &fd, NULL);
+ memfd_check = ptr ? MEMFD_OK : MEMFD_KO;
+ qemu_memfd_free(ptr, 4096, fd);
+ }
+
+ return memfd_check == MEMFD_OK;
+}
+
+/**
+ * qemu_memfd_check():
+ *
+ * Check if host supports memfd.
+ */
+bool qemu_memfd_check(unsigned int flags)
+{
+#ifdef CONFIG_LINUX
+ int mfd = memfd_create("test", flags | MFD_CLOEXEC);
+
+ if (mfd >= 0) {
+ close(mfd);
+ return true;
+ }
+#endif
+
+ return false;
+}
diff --git a/util/meson.build b/util/meson.build
new file mode 100644
index 000000000..05b593055
--- /dev/null
+++ b/util/meson.build
@@ -0,0 +1,89 @@
+util_ss.add(files('osdep.c', 'cutils.c', 'unicode.c', 'qemu-timer-common.c'))
+if not config_host_data.get('CONFIG_ATOMIC64')
+ util_ss.add(files('atomic64.c'))
+endif
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('aio-posix.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('fdmon-poll.c'))
+if config_host_data.get('CONFIG_EPOLL_CREATE1')
+ util_ss.add(files('fdmon-epoll.c'))
+endif
+util_ss.add(when: linux_io_uring, if_true: files('fdmon-io_uring.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('compatfd.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('event_notifier-posix.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('mmap-alloc.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('oslib-posix.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: [files('qemu-openpty.c'), util])
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('qemu-thread-posix.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('memfd.c'))
+util_ss.add(when: 'CONFIG_WIN32', if_true: files('aio-win32.c'))
+util_ss.add(when: 'CONFIG_WIN32', if_true: files('event_notifier-win32.c'))
+util_ss.add(when: 'CONFIG_WIN32', if_true: files('oslib-win32.c'))
+util_ss.add(when: 'CONFIG_WIN32', if_true: files('qemu-thread-win32.c'))
+util_ss.add(when: 'CONFIG_WIN32', if_true: winmm)
+util_ss.add(files('envlist.c', 'path.c', 'module.c'))
+util_ss.add(files('host-utils.c'))
+util_ss.add(files('bitmap.c', 'bitops.c'))
+util_ss.add(files('fifo8.c'))
+util_ss.add(files('cacheinfo.c', 'cacheflush.c'))
+util_ss.add(files('error.c', 'qemu-error.c'))
+util_ss.add(files('qemu-print.c'))
+util_ss.add(files('id.c'))
+util_ss.add(files('qemu-config.c', 'notify.c'))
+util_ss.add(files('qemu-option.c', 'qemu-progress.c'))
+util_ss.add(files('keyval.c'))
+util_ss.add(files('crc32c.c'))
+util_ss.add(files('uuid.c'))
+util_ss.add(files('getauxval.c'))
+util_ss.add(files('rcu.c'))
+util_ss.add(when: 'CONFIG_MEMBARRIER', if_true: files('sys_membarrier.c'))
+util_ss.add(files('log.c'))
+util_ss.add(files('pagesize.c'))
+util_ss.add(files('qdist.c'))
+util_ss.add(files('qht.c'))
+util_ss.add(files('qsp.c'))
+util_ss.add(files('range.c'))
+util_ss.add(files('stats64.c'))
+util_ss.add(files('systemd.c'))
+util_ss.add(files('transactions.c'))
+util_ss.add(when: 'CONFIG_POSIX', if_true: files('drm.c'))
+util_ss.add(files('guest-random.c'))
+util_ss.add(files('yank.c'))
+
+if have_user
+ util_ss.add(files('selfmap.c'))
+endif
+
+if have_system
+ util_ss.add(files('crc-ccitt.c'))
+ util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio])
+ util_ss.add(when: 'CONFIG_LINUX', if_true: files('userfaultfd.c'))
+endif
+
+if have_block
+ util_ss.add(files('aiocb.c', 'async.c', 'aio-wait.c'))
+ util_ss.add(files('base64.c'))
+ util_ss.add(files('buffer.c'))
+ util_ss.add(files('bufferiszero.c'))
+ util_ss.add(files('coroutine-@0@.c'.format(config_host['CONFIG_COROUTINE_BACKEND'])))
+ util_ss.add(files('hbitmap.c'))
+ util_ss.add(files('hexdump.c'))
+ util_ss.add(files('iova-tree.c'))
+ util_ss.add(files('iov.c', 'qemu-sockets.c', 'uri.c'))
+ util_ss.add(files('lockcnt.c'))
+ util_ss.add(files('main-loop.c'))
+ util_ss.add(files('nvdimm-utils.c'))
+ util_ss.add(files('qemu-coroutine.c', 'qemu-coroutine-lock.c', 'qemu-coroutine-io.c'))
+ util_ss.add(when: 'CONFIG_LINUX', if_true: [
+ files('vhost-user-server.c'), vhost_user
+ ])
+ util_ss.add(files('block-helpers.c'))
+ util_ss.add(files('qemu-coroutine-sleep.c'))
+ util_ss.add(files('qemu-co-shared-resource.c'))
+ util_ss.add(files('thread-pool.c', 'qemu-timer.c'))
+ util_ss.add(files('readline.c'))
+ util_ss.add(files('throttle.c'))
+ util_ss.add(files('timed-average.c'))
+ util_ss.add(when: 'CONFIG_INOTIFY1', if_true: files('filemonitor-inotify.c'),
+ if_false: files('filemonitor-stub.c'))
+ util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c'))
+endif
diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
new file mode 100644
index 000000000..893d86435
--- /dev/null
+++ b/util/mmap-alloc.c
@@ -0,0 +1,306 @@
+/*
+ * Support for RAM backed by mmaped host memory.
+ *
+ * Copyright (c) 2015 Red Hat, Inc.
+ *
+ * Authors:
+ * Michael S. Tsirkin <mst@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#ifdef CONFIG_LINUX
+#include <linux/mman.h>
+#else /* !CONFIG_LINUX */
+#define MAP_SYNC 0x0
+#define MAP_SHARED_VALIDATE 0x0
+#endif /* CONFIG_LINUX */
+
+#include "qemu/osdep.h"
+#include "qemu/mmap-alloc.h"
+#include "qemu/host-utils.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+
+#define HUGETLBFS_MAGIC 0x958458f6
+
+#ifdef CONFIG_LINUX
+#include <sys/vfs.h>
+#endif
+
+size_t qemu_fd_getpagesize(int fd)
+{
+#ifdef CONFIG_LINUX
+ struct statfs fs;
+ int ret;
+
+ if (fd != -1) {
+ do {
+ ret = fstatfs(fd, &fs);
+ } while (ret != 0 && errno == EINTR);
+
+ if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) {
+ return fs.f_bsize;
+ }
+ }
+#ifdef __sparc__
+ /* SPARC Linux needs greater alignment than the pagesize */
+ return QEMU_VMALLOC_ALIGN;
+#endif
+#endif
+
+ return qemu_real_host_page_size;
+}
+
+size_t qemu_mempath_getpagesize(const char *mem_path)
+{
+#ifdef CONFIG_LINUX
+ struct statfs fs;
+ int ret;
+
+ if (mem_path) {
+ do {
+ ret = statfs(mem_path, &fs);
+ } while (ret != 0 && errno == EINTR);
+
+ if (ret != 0) {
+ fprintf(stderr, "Couldn't statfs() memory path: %s\n",
+ strerror(errno));
+ exit(1);
+ }
+
+ if (fs.f_type == HUGETLBFS_MAGIC) {
+ /* It's a hugetlbfs mount; return the huge page size */
+ return fs.f_bsize;
+ }
+ }
+#ifdef __sparc__
+ /* SPARC Linux needs greater alignment than the pagesize */
+ return QEMU_VMALLOC_ALIGN;
+#endif
+#endif
+
+ return qemu_real_host_page_size;
+}
+
+#define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
+static bool map_noreserve_effective(int fd, uint32_t qemu_map_flags)
+{
+#if defined(__linux__)
+ const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
+ const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
+ gchar *content = NULL;
+ const char *endptr;
+ unsigned int tmp;
+
+ /*
+ * hugetlb accounting is different from ordinary swap reservation:
+ * a) Hugetlb pages from the pool are reserved for both private and
+ * shared mappings. For shared mappings, all mappers have to specify
+ * MAP_NORESERVE.
+ * b) MAP_NORESERVE is not affected by /proc/sys/vm/overcommit_memory.
+ */
+ if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size) {
+ return true;
+ }
+
+ /*
+ * Accountable mappings in the kernel that can be affected by MAP_NORESERVE
+ * are private writable mappings (see mm/mmap.c:accountable_mapping() in
+ * Linux). For all shared or readonly mappings, MAP_NORESERVE is always
+ * implicitly active -- no reservation; this includes shmem. The only
+ * exception is shared anonymous memory, it is accounted like private
+ * anonymous memory.
+ */
+ if (readonly || (shared && fd >= 0)) {
+ return true;
+ }
+
+ /*
+ * MAP_NORESERVE is globally ignored for applicable !hugetlb mappings when
+ * memory overcommit is set to "never". Sparse memory regions aren't really
+ * possible in this system configuration.
+ *
+ * Bail out now instead of silently committing way more memory than
+ * currently desired by the user.
+ */
+ if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) &&
+ !qemu_strtoui(content, &endptr, 0, &tmp) &&
+ (!endptr || *endptr == '\n')) {
+ if (tmp == 2) {
+ error_report("Skipping reservation of swap space is not supported:"
+ " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\"");
+ return false;
+ }
+ return true;
+ }
+ /* this interface has been around since Linux 2.6 */
+ error_report("Skipping reservation of swap space is not supported:"
+ " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\"");
+ return false;
+#endif
+ /*
+ * E.g., FreeBSD used to define MAP_NORESERVE, never implemented it,
+ * and removed it a while ago.
+ */
+ error_report("Skipping reservation of swap space is not supported");
+ return false;
+}
+
+/*
+ * Reserve a new memory region of the requested size to be used for mapping
+ * from the given fd (if any).
+ */
+static void *mmap_reserve(size_t size, int fd)
+{
+ int flags = MAP_PRIVATE;
+
+#if defined(__powerpc64__) && defined(__linux__)
+ /*
+ * On ppc64 mappings in the same segment (aka slice) must share the same
+ * page size. Since we will be re-allocating part of this segment
+ * from the supplied fd, we should make sure to use the same page size;
+ * to this end we mmap the supplied fd. In this case, set MAP_NORESERVE to
+ * avoid allocating backing store memory.
+ * We do this unless we are using the system page size, in which case
+ * anonymous memory is OK.
+ */
+ if (fd == -1 || qemu_fd_getpagesize(fd) == qemu_real_host_page_size) {
+ fd = -1;
+ flags |= MAP_ANONYMOUS;
+ } else {
+ flags |= MAP_NORESERVE;
+ }
+#else
+ fd = -1;
+ flags |= MAP_ANONYMOUS;
+#endif
+
+ return mmap(0, size, PROT_NONE, flags, fd, 0);
+}
+
+/*
+ * Activate memory in a reserved region from the given fd (if any), to make
+ * it accessible.
+ */
+static void *mmap_activate(void *ptr, size_t size, int fd,
+ uint32_t qemu_map_flags, off_t map_offset)
+{
+ const bool noreserve = qemu_map_flags & QEMU_MAP_NORESERVE;
+ const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
+ const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
+ const bool sync = qemu_map_flags & QEMU_MAP_SYNC;
+ const int prot = PROT_READ | (readonly ? 0 : PROT_WRITE);
+ int map_sync_flags = 0;
+ int flags = MAP_FIXED;
+ void *activated_ptr;
+
+ if (noreserve && !map_noreserve_effective(fd, qemu_map_flags)) {
+ return MAP_FAILED;
+ }
+
+ flags |= fd == -1 ? MAP_ANONYMOUS : 0;
+ flags |= shared ? MAP_SHARED : MAP_PRIVATE;
+ flags |= noreserve ? MAP_NORESERVE : 0;
+ if (shared && sync) {
+ map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE;
+ }
+
+ activated_ptr = mmap(ptr, size, prot, flags | map_sync_flags, fd,
+ map_offset);
+ if (activated_ptr == MAP_FAILED && map_sync_flags) {
+ if (errno == ENOTSUP) {
+ char *proc_link = g_strdup_printf("/proc/self/fd/%d", fd);
+ char *file_name = g_malloc0(PATH_MAX);
+ int len = readlink(proc_link, file_name, PATH_MAX - 1);
+
+ if (len < 0) {
+ len = 0;
+ }
+ file_name[len] = '\0';
+ fprintf(stderr, "Warning: requesting persistence across crashes "
+ "for backend file %s failed. Proceeding without "
+ "persistence, data might become corrupted in case of host "
+ "crash.\n", file_name);
+ g_free(proc_link);
+ g_free(file_name);
+ warn_report("Using non DAX backing file with 'pmem=on' option"
+ " is deprecated");
+ }
+ /*
+ * If mmap failed with MAP_SHARED_VALIDATE | MAP_SYNC, we will try
+ * again without these flags to handle backwards compatibility.
+ */
+ activated_ptr = mmap(ptr, size, prot, flags, fd, map_offset);
+ }
+ return activated_ptr;
+}
+
+static inline size_t mmap_guard_pagesize(int fd)
+{
+#if defined(__powerpc64__) && defined(__linux__)
+ /* Mappings in the same segment must share the same page size */
+ return qemu_fd_getpagesize(fd);
+#else
+ return qemu_real_host_page_size;
+#endif
+}
+
+void *qemu_ram_mmap(int fd,
+ size_t size,
+ size_t align,
+ uint32_t qemu_map_flags,
+ off_t map_offset)
+{
+ const size_t guard_pagesize = mmap_guard_pagesize(fd);
+ size_t offset, total;
+ void *ptr, *guardptr;
+
+ /*
+ * Note: this always allocates at least one extra page of virtual address
+ * space, even if size is already aligned.
+ */
+ total = size + align;
+
+ guardptr = mmap_reserve(total, fd);
+ if (guardptr == MAP_FAILED) {
+ return MAP_FAILED;
+ }
+
+ assert(is_power_of_2(align));
+ /* Always align to host page size */
+ assert(align >= guard_pagesize);
+
+ offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
+
+ ptr = mmap_activate(guardptr + offset, size, fd, qemu_map_flags,
+ map_offset);
+ if (ptr == MAP_FAILED) {
+ munmap(guardptr, total);
+ return MAP_FAILED;
+ }
+
+ if (offset > 0) {
+ munmap(guardptr, offset);
+ }
+
+ /*
+ * Leave a single PROT_NONE page allocated after the RAM block, to serve
+ * as a guard page against potential buffer overflows.
+ */
+ total -= offset;
+ if (total > size + guard_pagesize) {
+ munmap(ptr + size + guard_pagesize, total - size - guard_pagesize);
+ }
+
+ return ptr;
+}
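+
+/*
+ * Usage sketch (illustrative, not part of this patch): reserve and map
+ * anonymous RAM, then tear it down again. The fd of -1 selects anonymous
+ * memory; the size and flags below are assumptions for the example.
+ *
+ *     size_t align = qemu_real_host_page_size;
+ *     void *p = qemu_ram_mmap(-1, 16 * 1024 * 1024, align, 0, 0);
+ *
+ *     if (p != MAP_FAILED) {
+ *         ...use the RAM block...
+ *         qemu_ram_munmap(-1, p, 16 * 1024 * 1024);
+ *     }
+ */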
+
+void qemu_ram_munmap(int fd, void *ptr, size_t size)
+{
+ if (ptr) {
+ /* Unmap both the RAM block and the guard page */
+ munmap(ptr, size + mmap_guard_pagesize(fd));
+ }
+}
diff --git a/util/module.c b/util/module.c
new file mode 100644
index 000000000..6bb4ad915
--- /dev/null
+++ b/util/module.c
@@ -0,0 +1,387 @@
+/*
+ * QEMU Module Infrastructure
+ *
+ * Copyright IBM, Corp. 2009
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#ifdef CONFIG_MODULES
+#include <gmodule.h>
+#endif
+#include "qemu/queue.h"
+#include "qemu/module.h"
+#include "qemu/cutils.h"
+#include "qemu/config-file.h"
+#ifdef CONFIG_MODULE_UPGRADES
+#include "qemu-version.h"
+#endif
+#include "trace.h"
+
+typedef struct ModuleEntry
+{
+ void (*init)(void);
+ QTAILQ_ENTRY(ModuleEntry) node;
+ module_init_type type;
+} ModuleEntry;
+
+typedef QTAILQ_HEAD(, ModuleEntry) ModuleTypeList;
+
+static ModuleTypeList init_type_list[MODULE_INIT_MAX];
+static bool modules_init_done[MODULE_INIT_MAX];
+
+static ModuleTypeList dso_init_list;
+
+static void init_lists(void)
+{
+ static int inited;
+ int i;
+
+ if (inited) {
+ return;
+ }
+
+ for (i = 0; i < MODULE_INIT_MAX; i++) {
+ QTAILQ_INIT(&init_type_list[i]);
+ }
+
+ QTAILQ_INIT(&dso_init_list);
+
+ inited = 1;
+}
+
+
+static ModuleTypeList *find_type(module_init_type type)
+{
+ init_lists();
+
+ return &init_type_list[type];
+}
+
+void register_module_init(void (*fn)(void), module_init_type type)
+{
+ ModuleEntry *e;
+ ModuleTypeList *l;
+
+ e = g_malloc0(sizeof(*e));
+ e->init = fn;
+ e->type = type;
+
+ l = find_type(type);
+
+ QTAILQ_INSERT_TAIL(l, e, node);
+}
+
+void register_dso_module_init(void (*fn)(void), module_init_type type)
+{
+ ModuleEntry *e;
+
+ init_lists();
+
+ e = g_malloc0(sizeof(*e));
+ e->init = fn;
+ e->type = type;
+
+ QTAILQ_INSERT_TAIL(&dso_init_list, e, node);
+}
+
+void module_call_init(module_init_type type)
+{
+ ModuleTypeList *l;
+ ModuleEntry *e;
+
+ if (modules_init_done[type]) {
+ return;
+ }
+
+ l = find_type(type);
+
+ QTAILQ_FOREACH(e, l, node) {
+ e->init();
+ }
+
+ modules_init_done[type] = true;
+}
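+
+/*
+ * Usage sketch (illustrative, not part of this patch): a subsystem
+ * registers an init hook at startup, typically via the module_init()
+ * macro from qemu/module.h, and the hooks are later drained once per
+ * type. The function name below is a placeholder.
+ *
+ *     static void my_types_register(void) { ... }
+ *     register_module_init(my_types_register, MODULE_INIT_QOM);
+ *     ...
+ *     module_call_init(MODULE_INIT_QOM);  // runs every hook exactly once
+ */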
+
+#ifdef CONFIG_MODULES
+
+static const QemuModinfo module_info_stub[] = { {
+ /* end of list */
+} };
+static const QemuModinfo *module_info = module_info_stub;
+static const char *module_arch;
+
+void module_init_info(const QemuModinfo *info)
+{
+ module_info = info;
+}
+
+void module_allow_arch(const char *arch)
+{
+ module_arch = arch;
+}
+
+static bool module_check_arch(const QemuModinfo *modinfo)
+{
+ if (modinfo->arch) {
+ if (!module_arch) {
+ /* no arch set -> ignore all */
+ return false;
+ }
+ if (strcmp(module_arch, modinfo->arch) != 0) {
+ /* mismatch */
+ return false;
+ }
+ }
+ return true;
+}
+
+static int module_load_file(const char *fname, bool mayfail, bool export_symbols)
+{
+ GModule *g_module;
+ void (*sym)(void);
+ const char *dsosuf = CONFIG_HOST_DSOSUF;
+ int len = strlen(fname);
+ int suf_len = strlen(dsosuf);
+ ModuleEntry *e, *next;
+ int ret, flags;
+
+ if (len <= suf_len || strcmp(&fname[len - suf_len], dsosuf)) {
+ /* wrong suffix */
+ ret = -EINVAL;
+ goto out;
+ }
+ if (access(fname, F_OK)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ assert(QTAILQ_EMPTY(&dso_init_list));
+
+ flags = 0;
+ if (!export_symbols) {
+ flags |= G_MODULE_BIND_LOCAL;
+ }
+ g_module = g_module_open(fname, flags);
+ if (!g_module) {
+ if (!mayfail) {
+ fprintf(stderr, "Failed to open module: %s\n",
+ g_module_error());
+ }
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!g_module_symbol(g_module, DSO_STAMP_FUN_STR, (gpointer *)&sym)) {
+ fprintf(stderr, "Failed to initialize module: %s\n",
+ fname);
+ /* Print some info if this is a QEMU module (but from a different
+ * build), to make debugging user problems easier. */
+ if (g_module_symbol(g_module, "qemu_module_dummy", (gpointer *)&sym)) {
+ fprintf(stderr,
+ "Note: only modules from the same build can be loaded.\n");
+ }
+ g_module_close(g_module);
+ ret = -EINVAL;
+ } else {
+ QTAILQ_FOREACH(e, &dso_init_list, node) {
+ e->init();
+ register_module_init(e->init, e->type);
+ }
+ ret = 0;
+ }
+
+ trace_module_load_module(fname);
+ QTAILQ_FOREACH_SAFE(e, &dso_init_list, node, next) {
+ QTAILQ_REMOVE(&dso_init_list, e, node);
+ g_free(e);
+ }
+out:
+ return ret;
+}
+#endif
+
+bool module_load_one(const char *prefix, const char *lib_name, bool mayfail)
+{
+ bool success = false;
+
+#ifdef CONFIG_MODULES
+ char *fname = NULL;
+#ifdef CONFIG_MODULE_UPGRADES
+ char *version_dir;
+#endif
+ const char *search_dir;
+ char *dirs[5];
+ char *module_name;
+ int i = 0, n_dirs = 0;
+ int ret;
+ bool export_symbols = false;
+ static GHashTable *loaded_modules;
+ const QemuModinfo *modinfo;
+ const char **sl;
+
+ if (!g_module_supported()) {
+ fprintf(stderr, "Module is not supported by system.\n");
+ return false;
+ }
+
+ if (!loaded_modules) {
+ loaded_modules = g_hash_table_new(g_str_hash, g_str_equal);
+ }
+
+ module_name = g_strdup_printf("%s%s", prefix, lib_name);
+
+ if (g_hash_table_contains(loaded_modules, module_name)) {
+ g_free(module_name);
+ return true;
+ }
+ g_hash_table_add(loaded_modules, module_name);
+
+ for (modinfo = module_info; modinfo->name != NULL; modinfo++) {
+ if (modinfo->arch) {
+ if (strcmp(modinfo->name, module_name) == 0) {
+ if (!module_check_arch(modinfo)) {
+ return false;
+ }
+ }
+ }
+ if (modinfo->deps) {
+ if (strcmp(modinfo->name, module_name) == 0) {
+ /* we depend on other module(s) */
+ for (sl = modinfo->deps; *sl != NULL; sl++) {
+ module_load_one("", *sl, false);
+ }
+ } else {
+ for (sl = modinfo->deps; *sl != NULL; sl++) {
+ if (strcmp(module_name, *sl) == 0) {
+ /* another module depends on us */
+ export_symbols = true;
+ }
+ }
+ }
+ }
+ }
+
+ search_dir = getenv("QEMU_MODULE_DIR");
+ if (search_dir != NULL) {
+ dirs[n_dirs++] = g_strdup_printf("%s", search_dir);
+ }
+ dirs[n_dirs++] = get_relocated_path(CONFIG_QEMU_MODDIR);
+ dirs[n_dirs++] = g_strdup(qemu_get_exec_dir());
+
+#ifdef CONFIG_MODULE_UPGRADES
+ version_dir = g_strcanon(g_strdup(QEMU_PKGVERSION),
+ G_CSET_A_2_Z G_CSET_a_2_z G_CSET_DIGITS "+-.~",
+ '_');
+ dirs[n_dirs++] = g_strdup_printf("/var/run/qemu/%s", version_dir);
+#endif
+
+ assert(n_dirs <= ARRAY_SIZE(dirs));
+
+ for (i = 0; i < n_dirs; i++) {
+ fname = g_strdup_printf("%s/%s%s",
+ dirs[i], module_name, CONFIG_HOST_DSOSUF);
+ ret = module_load_file(fname, mayfail, export_symbols);
+ g_free(fname);
+ fname = NULL;
+ /* Stop searching once a module file has been loaded */
+ if (!ret) {
+ success = true;
+ break;
+ }
+ }
+
+ if (!success) {
+ g_hash_table_remove(loaded_modules, module_name);
+ g_free(module_name);
+ }
+
+ for (i = 0; i < n_dirs; i++) {
+ g_free(dirs[i]);
+ }
+
+#endif
+ return success;
+}
+
+#ifdef CONFIG_MODULES
+
+static bool module_loaded_qom_all;
+
+void module_load_qom_one(const char *type)
+{
+ const QemuModinfo *modinfo;
+ const char **sl;
+
+ if (!type) {
+ return;
+ }
+
+ trace_module_lookup_object_type(type);
+ for (modinfo = module_info; modinfo->name != NULL; modinfo++) {
+ if (!modinfo->objs) {
+ continue;
+ }
+ if (!module_check_arch(modinfo)) {
+ continue;
+ }
+ for (sl = modinfo->objs; *sl != NULL; sl++) {
+ if (strcmp(type, *sl) == 0) {
+ module_load_one("", modinfo->name, false);
+ }
+ }
+ }
+}
+
+void module_load_qom_all(void)
+{
+ const QemuModinfo *modinfo;
+
+ if (module_loaded_qom_all) {
+ return;
+ }
+
+ for (modinfo = module_info; modinfo->name != NULL; modinfo++) {
+ if (!modinfo->objs) {
+ continue;
+ }
+ if (!module_check_arch(modinfo)) {
+ continue;
+ }
+ module_load_one("", modinfo->name, false);
+ }
+ module_loaded_qom_all = true;
+}
+
+void qemu_load_module_for_opts(const char *group)
+{
+ const QemuModinfo *modinfo;
+ const char **sl;
+
+ for (modinfo = module_info; modinfo->name != NULL; modinfo++) {
+ if (!modinfo->opts) {
+ continue;
+ }
+ for (sl = modinfo->opts; *sl != NULL; sl++) {
+ if (strcmp(group, *sl) == 0) {
+ module_load_one("", modinfo->name, false);
+ }
+ }
+ }
+}
+
+#else
+
+void module_allow_arch(const char *arch) {}
+void qemu_load_module_for_opts(const char *group) {}
+void module_load_qom_one(const char *type) {}
+void module_load_qom_all(void) {}
+
+#endif
diff --git a/util/notify.c b/util/notify.c
new file mode 100644
index 000000000..76bab212a
--- /dev/null
+++ b/util/notify.c
@@ -0,0 +1,76 @@
+/*
+ * Notifier lists
+ *
+ * Copyright IBM, Corp. 2010
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/notify.h"
+
+void notifier_list_init(NotifierList *list)
+{
+ QLIST_INIT(&list->notifiers);
+}
+
+void notifier_list_add(NotifierList *list, Notifier *notifier)
+{
+ QLIST_INSERT_HEAD(&list->notifiers, notifier, node);
+}
+
+void notifier_remove(Notifier *notifier)
+{
+ QLIST_REMOVE(notifier, node);
+}
+
+void notifier_list_notify(NotifierList *list, void *data)
+{
+ Notifier *notifier, *next;
+
+ QLIST_FOREACH_SAFE(notifier, &list->notifiers, node, next) {
+ notifier->notify(notifier, data);
+ }
+}
+
+bool notifier_list_empty(NotifierList *list)
+{
+ return QLIST_EMPTY(&list->notifiers);
+}
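+
+/*
+ * Usage sketch (illustrative, not part of this patch): embed a Notifier
+ * in caller state, register it, and it fires on every notify until it is
+ * removed. The callback name and data are placeholders.
+ *
+ *     static void on_event(Notifier *n, void *data) { ... }
+ *
+ *     Notifier n = { .notify = on_event };
+ *     notifier_list_add(&list, &n);
+ *     notifier_list_notify(&list, event_data);  // invokes on_event()
+ *     notifier_remove(&n);
+ */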
+
+void notifier_with_return_list_init(NotifierWithReturnList *list)
+{
+ QLIST_INIT(&list->notifiers);
+}
+
+void notifier_with_return_list_add(NotifierWithReturnList *list,
+ NotifierWithReturn *notifier)
+{
+ QLIST_INSERT_HEAD(&list->notifiers, notifier, node);
+}
+
+void notifier_with_return_remove(NotifierWithReturn *notifier)
+{
+ QLIST_REMOVE(notifier, node);
+}
+
+int notifier_with_return_list_notify(NotifierWithReturnList *list, void *data)
+{
+ NotifierWithReturn *notifier, *next;
+ int ret = 0;
+
+ QLIST_FOREACH_SAFE(notifier, &list->notifiers, node, next) {
+ ret = notifier->notify(notifier, data);
+ if (ret != 0) {
+ break;
+ }
+ }
+ return ret;
+}
diff --git a/util/nvdimm-utils.c b/util/nvdimm-utils.c
new file mode 100644
index 000000000..aa3d199f2
--- /dev/null
+++ b/util/nvdimm-utils.c
@@ -0,0 +1,30 @@
+#include "qemu/osdep.h"
+#include "qemu/nvdimm-utils.h"
+#include "hw/mem/nvdimm.h"
+
+static int nvdimm_device_list(Object *obj, void *opaque)
+{
+ GSList **list = opaque;
+
+ if (object_dynamic_cast(obj, TYPE_NVDIMM)) {
+ *list = g_slist_append(*list, DEVICE(obj));
+ }
+
+ object_child_foreach(obj, nvdimm_device_list, opaque);
+ return 0;
+}
+
+/*
+ * Inquire about NVDIMM devices and link them into the list, which is
+ * returned to the caller.
+ *
+ * Note: it is the caller's responsibility to free the list to avoid
+ * a memory leak.
+ */
+GSList *nvdimm_get_device_list(void)
+{
+ GSList *list = NULL;
+
+ object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list);
+ return list;
+}
diff --git a/util/osdep.c b/util/osdep.c
new file mode 100644
index 000000000..42a0a4986
--- /dev/null
+++ b/util/osdep.c
@@ -0,0 +1,617 @@
+/*
+ * QEMU low level functions
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+
+/* Needed early for CONFIG_BSD etc. */
+
+#ifdef CONFIG_SOLARIS
+#include <sys/statvfs.h>
+/* See MySQL bug #7156 (http://bugs.mysql.com/bug.php?id=7156) for
+ discussion about Solaris header problems */
+extern int madvise(char *, size_t, int);
+#endif
+
+#include "qemu-common.h"
+#include "qemu/cutils.h"
+#include "qemu/sockets.h"
+#include "qemu/error-report.h"
+#include "monitor/monitor.h"
+
+static bool fips_enabled = false;
+
+static const char *hw_version = QEMU_HW_VERSION;
+
+int socket_set_cork(int fd, int v)
+{
+#if defined(SOL_TCP) && defined(TCP_CORK)
+ return qemu_setsockopt(fd, SOL_TCP, TCP_CORK, &v, sizeof(v));
+#else
+ return 0;
+#endif
+}
+
+int socket_set_nodelay(int fd)
+{
+ int v = 1;
+ return qemu_setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &v, sizeof(v));
+}
+
+int qemu_madvise(void *addr, size_t len, int advice)
+{
+ if (advice == QEMU_MADV_INVALID) {
+ errno = EINVAL;
+ return -1;
+ }
+#if defined(CONFIG_MADVISE)
+ return madvise(addr, len, advice);
+#elif defined(CONFIG_POSIX_MADVISE)
+ return posix_madvise(addr, len, advice);
+#else
+ errno = EINVAL;
+ return -1;
+#endif
+}
+
+static int qemu_mprotect__osdep(void *addr, size_t size, int prot)
+{
+ g_assert(!((uintptr_t)addr & ~qemu_real_host_page_mask));
+ g_assert(!(size & ~qemu_real_host_page_mask));
+
+#ifdef _WIN32
+ DWORD old_protect;
+
+ if (!VirtualProtect(addr, size, prot, &old_protect)) {
+ g_autofree gchar *emsg = g_win32_error_message(GetLastError());
+ error_report("%s: VirtualProtect failed: %s", __func__, emsg);
+ return -1;
+ }
+ return 0;
+#else
+ if (mprotect(addr, size, prot)) {
+ error_report("%s: mprotect failed: %s", __func__, strerror(errno));
+ return -1;
+ }
+ return 0;
+#endif
+}
+
+int qemu_mprotect_rw(void *addr, size_t size)
+{
+#ifdef _WIN32
+ return qemu_mprotect__osdep(addr, size, PAGE_READWRITE);
+#else
+ return qemu_mprotect__osdep(addr, size, PROT_READ | PROT_WRITE);
+#endif
+}
+
+int qemu_mprotect_rwx(void *addr, size_t size)
+{
+#ifdef _WIN32
+ return qemu_mprotect__osdep(addr, size, PAGE_EXECUTE_READWRITE);
+#else
+ return qemu_mprotect__osdep(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC);
+#endif
+}
+
+int qemu_mprotect_none(void *addr, size_t size)
+{
+#ifdef _WIN32
+ return qemu_mprotect__osdep(addr, size, PAGE_NOACCESS);
+#else
+ return qemu_mprotect__osdep(addr, size, PROT_NONE);
+#endif
+}
+
+#ifndef _WIN32
+
+static int fcntl_op_setlk = -1;
+static int fcntl_op_getlk = -1;
+
+/*
+ * Dups an fd and sets the flags
+ */
+int qemu_dup_flags(int fd, int flags)
+{
+ int ret;
+ int serrno;
+ int dup_flags;
+
+ ret = qemu_dup(fd);
+ if (ret == -1) {
+ goto fail;
+ }
+
+ dup_flags = fcntl(ret, F_GETFL);
+ if (dup_flags == -1) {
+ goto fail;
+ }
+
+ if ((flags & O_SYNC) != (dup_flags & O_SYNC)) {
+ errno = EINVAL;
+ goto fail;
+ }
+
+ /* Set/unset flags that we can with fcntl */
+ if (fcntl(ret, F_SETFL, flags) == -1) {
+ goto fail;
+ }
+
+ /* Truncate the file in the cases that open() would truncate it */
+ if (flags & O_TRUNC ||
+ ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))) {
+ if (ftruncate(ret, 0) == -1) {
+ goto fail;
+ }
+ }
+
+ return ret;
+
+fail:
+ serrno = errno;
+ if (ret != -1) {
+ close(ret);
+ }
+ errno = serrno;
+ return -1;
+}
+
+int qemu_dup(int fd)
+{
+ int ret;
+#ifdef F_DUPFD_CLOEXEC
+ ret = fcntl(fd, F_DUPFD_CLOEXEC, 0);
+#else
+ ret = dup(fd);
+ if (ret != -1) {
+ qemu_set_cloexec(ret);
+ }
+#endif
+ return ret;
+}
+
+static int qemu_parse_fdset(const char *param)
+{
+ return qemu_parse_fd(param);
+}
+
+static void qemu_probe_lock_ops(void)
+{
+ if (fcntl_op_setlk == -1) {
+#ifdef F_OFD_SETLK
+ int fd;
+ int ret;
+ struct flock fl = {
+ .l_whence = SEEK_SET,
+ .l_start = 0,
+ .l_len = 0,
+ .l_type = F_WRLCK,
+ };
+
+ fd = open("/dev/null", O_RDWR);
+ if (fd < 0) {
+ fprintf(stderr,
+ "Failed to open /dev/null for OFD lock probing: %s\n",
+ strerror(errno));
+ fcntl_op_setlk = F_SETLK;
+ fcntl_op_getlk = F_GETLK;
+ return;
+ }
+ ret = fcntl(fd, F_OFD_GETLK, &fl);
+ close(fd);
+ if (!ret) {
+ fcntl_op_setlk = F_OFD_SETLK;
+ fcntl_op_getlk = F_OFD_GETLK;
+ } else {
+ fcntl_op_setlk = F_SETLK;
+ fcntl_op_getlk = F_GETLK;
+ }
+#else
+ fcntl_op_setlk = F_SETLK;
+ fcntl_op_getlk = F_GETLK;
+#endif
+ }
+}
+
+bool qemu_has_ofd_lock(void)
+{
+ qemu_probe_lock_ops();
+#ifdef F_OFD_SETLK
+ return fcntl_op_setlk == F_OFD_SETLK;
+#else
+ return false;
+#endif
+}
+
+static int qemu_lock_fcntl(int fd, int64_t start, int64_t len, int fl_type)
+{
+ int ret;
+ struct flock fl = {
+ .l_whence = SEEK_SET,
+ .l_start = start,
+ .l_len = len,
+ .l_type = fl_type,
+ };
+ qemu_probe_lock_ops();
+ do {
+ ret = fcntl(fd, fcntl_op_setlk, &fl);
+ } while (ret == -1 && errno == EINTR);
+ return ret == -1 ? -errno : 0;
+}
+
+int qemu_lock_fd(int fd, int64_t start, int64_t len, bool exclusive)
+{
+ return qemu_lock_fcntl(fd, start, len, exclusive ? F_WRLCK : F_RDLCK);
+}
+
+int qemu_unlock_fd(int fd, int64_t start, int64_t len)
+{
+ return qemu_lock_fcntl(fd, start, len, F_UNLCK);
+}
+
+int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive)
+{
+ int ret;
+ struct flock fl = {
+ .l_whence = SEEK_SET,
+ .l_start = start,
+ .l_len = len,
+ .l_type = exclusive ? F_WRLCK : F_RDLCK,
+ };
+ qemu_probe_lock_ops();
+ ret = fcntl(fd, fcntl_op_getlk, &fl);
+ if (ret == -1) {
+ return -errno;
+ } else {
+ return fl.l_type == F_UNLCK ? 0 : -EAGAIN;
+ }
+}
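+
+/*
+ * Usage sketch (illustrative, not part of this patch): take and release a
+ * byte-range lock with the helpers above. The range 0/1 (first byte) is
+ * an assumption for the example; negative returns are -errno values.
+ *
+ *     if (qemu_lock_fd(fd, 0, 1, true) < 0) {
+ *         ...another process holds the lock...
+ *     }
+ *     ...
+ *     qemu_unlock_fd(fd, 0, 1);
+ */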
+#endif
+
+static int qemu_open_cloexec(const char *name, int flags, mode_t mode)
+{
+ int ret;
+#ifdef O_CLOEXEC
+ ret = open(name, flags | O_CLOEXEC, mode);
+#else
+ ret = open(name, flags, mode);
+ if (ret >= 0) {
+ qemu_set_cloexec(ret);
+ }
+#endif
+ return ret;
+}
+
+/*
+ * Opens a file with FD_CLOEXEC set
+ */
+static int
+qemu_open_internal(const char *name, int flags, mode_t mode, Error **errp)
+{
+ int ret;
+
+#ifndef _WIN32
+ const char *fdset_id_str;
+
+ /* Attempt dup of fd from fd set */
+ if (strstart(name, "/dev/fdset/", &fdset_id_str)) {
+ int64_t fdset_id;
+ int dupfd;
+
+ fdset_id = qemu_parse_fdset(fdset_id_str);
+ if (fdset_id == -1) {
+ error_setg(errp, "Could not parse fdset %s", name);
+ errno = EINVAL;
+ return -1;
+ }
+
+ dupfd = monitor_fdset_dup_fd_add(fdset_id, flags);
+ if (dupfd == -1) {
+ error_setg_errno(errp, errno, "Could not dup FD for %s flags %x",
+ name, flags);
+ return -1;
+ }
+
+ return dupfd;
+ }
+#endif
+
+ ret = qemu_open_cloexec(name, flags, mode);
+
+ if (ret == -1) {
+ const char *action = flags & O_CREAT ? "create" : "open";
+#ifdef O_DIRECT
+ /* Give more helpful error message for O_DIRECT */
+ if (errno == EINVAL && (flags & O_DIRECT)) {
+ ret = open(name, flags & ~O_DIRECT, mode);
+ if (ret != -1) {
+ close(ret);
+ error_setg(errp, "Could not %s '%s': "
+ "filesystem does not support O_DIRECT",
+ action, name);
+ errno = EINVAL; /* restore first open()'s errno */
+ return -1;
+ }
+ }
+#endif /* O_DIRECT */
+ error_setg_errno(errp, errno, "Could not %s '%s'",
+ action, name);
+ }
+
+ return ret;
+}
+
+
+int qemu_open(const char *name, int flags, Error **errp)
+{
+ assert(!(flags & O_CREAT));
+
+ return qemu_open_internal(name, flags, 0, errp);
+}
+
+
+int qemu_create(const char *name, int flags, mode_t mode, Error **errp)
+{
+ assert(!(flags & O_CREAT));
+
+ return qemu_open_internal(name, flags | O_CREAT, mode, errp);
+}
+
+
+int qemu_open_old(const char *name, int flags, ...)
+{
+ va_list ap;
+ mode_t mode = 0;
+ int ret;
+
+ va_start(ap, flags);
+ if (flags & O_CREAT) {
+ mode = va_arg(ap, int);
+ }
+ va_end(ap);
+
+ ret = qemu_open_internal(name, flags, mode, NULL);
+
+#ifdef O_DIRECT
+ if (ret == -1 && errno == EINVAL && (flags & O_DIRECT)) {
+ error_report("file system may not support O_DIRECT");
+ errno = EINVAL; /* in case it was clobbered */
+ }
+#endif /* O_DIRECT */
+
+ return ret;
+}
+
+int qemu_close(int fd)
+{
+ int64_t fdset_id;
+
+ /* Close fd that was dup'd from an fdset */
+ fdset_id = monitor_fdset_dup_fd_find(fd);
+ if (fdset_id != -1) {
+ int ret;
+
+ ret = close(fd);
+ if (ret == 0) {
+ monitor_fdset_dup_fd_remove(fd);
+ }
+
+ return ret;
+ }
+
+ return close(fd);
+}
+
+/*
+ * Delete a file from the filesystem, unless the filename is /dev/fdset/...
+ *
+ * Returns: On success, zero is returned. On error, -1 is returned,
+ * and errno is set appropriately.
+ */
+int qemu_unlink(const char *name)
+{
+ if (g_str_has_prefix(name, "/dev/fdset/")) {
+ return 0;
+ }
+
+ return unlink(name);
+}
+
+/*
+ * A variant of write(2) which handles partial writes.
+ *
+ * Returns the number of bytes transferred.
+ * Sets errno if fewer than `count' bytes are written.
+ *
+ * This function does not work with non-blocking fds.
+ * Every way of handling a non-blocking fd here would be bad:
+ * - returning a short write (then the function name would be wrong)
+ * - busy-waiting by adding (errno == EAGAIN) to the retry loop
+ */
+ssize_t qemu_write_full(int fd, const void *buf, size_t count)
+{
+ ssize_t ret = 0;
+ ssize_t total = 0;
+
+ while (count) {
+ ret = write(fd, buf, count);
+ if (ret < 0) {
+ if (errno == EINTR) {
+ continue;
+ }
+ break;
+ }
+
+ count -= ret;
+ buf += ret;
+ total += ret;
+ }
+
+ return total;
+}
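+
+/*
+ * Example (illustrative): callers detect a short write by comparing the
+ * return value against the requested count; errno is only meaningful in
+ * that case.
+ *
+ *     ssize_t done = qemu_write_full(fd, buf, len);
+ *     if (done < len) {
+ *         error_report("write failed: %s", strerror(errno));
+ *     }
+ */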
+
+/*
+ * Opens a socket with FD_CLOEXEC set
+ */
+int qemu_socket(int domain, int type, int protocol)
+{
+ int ret;
+
+#ifdef SOCK_CLOEXEC
+ ret = socket(domain, type | SOCK_CLOEXEC, protocol);
+ if (ret != -1 || errno != EINVAL) {
+ return ret;
+ }
+#endif
+ ret = socket(domain, type, protocol);
+ if (ret >= 0) {
+ qemu_set_cloexec(ret);
+ }
+
+ return ret;
+}
+
+/*
+ * Accept a connection and set FD_CLOEXEC
+ */
+int qemu_accept(int s, struct sockaddr *addr, socklen_t *addrlen)
+{
+ int ret;
+
+#ifdef CONFIG_ACCEPT4
+ ret = accept4(s, addr, addrlen, SOCK_CLOEXEC);
+ if (ret != -1 || errno != ENOSYS) {
+ return ret;
+ }
+#endif
+ ret = accept(s, addr, addrlen);
+ if (ret >= 0) {
+ qemu_set_cloexec(ret);
+ }
+
+ return ret;
+}
+
+void qemu_set_hw_version(const char *version)
+{
+ hw_version = version;
+}
+
+const char *qemu_hw_version(void)
+{
+ return hw_version;
+}
+
+void fips_set_state(bool requested)
+{
+#ifdef __linux__
+ if (requested) {
+ FILE *fds = fopen("/proc/sys/crypto/fips_enabled", "r");
+ if (fds != NULL) {
+ fips_enabled = (fgetc(fds) == '1');
+ fclose(fds);
+ }
+ }
+#else
+ fips_enabled = false;
+#endif /* __linux__ */
+
+#ifdef _FIPS_DEBUG
+ fprintf(stderr, "FIPS mode %s (requested %s)\n",
+ (fips_enabled ? "enabled" : "disabled"),
+ (requested ? "enabled" : "disabled"));
+#endif
+}
+
+bool fips_get_state(void)
+{
+ return fips_enabled;
+}
+
+#ifdef _WIN32
+static void socket_cleanup(void)
+{
+ WSACleanup();
+}
+#endif
+
+int socket_init(void)
+{
+#ifdef _WIN32
+ WSADATA Data;
+ int ret, err;
+
+ ret = WSAStartup(MAKEWORD(2, 2), &Data);
+ if (ret != 0) {
+ err = WSAGetLastError();
+ fprintf(stderr, "WSAStartup: %d\n", err);
+ return -1;
+ }
+ atexit(socket_cleanup);
+#endif
+ return 0;
+}
+
+
+#ifndef CONFIG_IOVEC
+/* helper function for iov_send_recv() */
+static ssize_t
+readv_writev(int fd, const struct iovec *iov, int iov_cnt, bool do_write)
+{
+ unsigned i = 0;
+ ssize_t ret = 0;
+ while (i < iov_cnt) {
+ ssize_t r = do_write
+ ? write(fd, iov[i].iov_base, iov[i].iov_len)
+ : read(fd, iov[i].iov_base, iov[i].iov_len);
+ if (r > 0) {
+ ret += r;
+ } else if (!r) {
+ break;
+ } else if (errno == EINTR) {
+ continue;
+ } else {
+ /* Some other error: report failure only if no data has been
+ * processed yet; otherwise return the partial byte count. */
+ if (ret == 0) {
+ ret = -1;
+ }
+ break;
+ }
+ i++;
+ }
+ return ret;
+}
+
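+/*
+ * Note: unlike native readv()/writev(), these fallbacks issue one
+ * read()/write() call per vector element, so the transfer is not atomic
+ * with respect to other users of the same fd.
+ */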
+ssize_t
+readv(int fd, const struct iovec *iov, int iov_cnt)
+{
+ return readv_writev(fd, iov, iov_cnt, false);
+}
+
+ssize_t
+writev(int fd, const struct iovec *iov, int iov_cnt)
+{
+ return readv_writev(fd, iov, iov_cnt, true);
+}
+#endif
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
new file mode 100644
index 000000000..e8bdb02e1
--- /dev/null
+++ b/util/oslib-posix.c
@@ -0,0 +1,860 @@
+/*
+ * os-posix-lib.c
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2010 Red Hat, Inc.
+ *
+ * QEMU library functions on POSIX which are shared between QEMU and
+ * the QEMU tools.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include <termios.h>
+
+#include <glib/gprintf.h>
+
+#include "qemu-common.h"
+#include "sysemu/sysemu.h"
+#include "trace.h"
+#include "qapi/error.h"
+#include "qemu/sockets.h"
+#include "qemu/thread.h"
+#include <libgen.h>
+#include "qemu/cutils.h"
+#include "qemu/compiler.h"
+
+#ifdef CONFIG_LINUX
+#include <sys/syscall.h>
+#endif
+
+#ifdef __FreeBSD__
+#include <sys/sysctl.h>
+#include <sys/user.h>
+#include <sys/thr.h>
+#include <libutil.h>
+#endif
+
+#ifdef __NetBSD__
+#include <sys/sysctl.h>
+#include <lwp.h>
+#endif
+
+#ifdef __APPLE__
+#include <mach-o/dyld.h>
+#endif
+
+#ifdef __HAIKU__
+#include <kernel/image.h>
+#endif
+
+#include "qemu/mmap-alloc.h"
+
+#ifdef CONFIG_DEBUG_STACK_USAGE
+#include "qemu/error-report.h"
+#endif
+
+#define MAX_MEM_PREALLOC_THREAD_COUNT 16
+
+struct MemsetThread {
+ char *addr;
+ size_t numpages;
+ size_t hpagesize;
+ QemuThread pgthread;
+ sigjmp_buf env;
+};
+typedef struct MemsetThread MemsetThread;
+
+static MemsetThread *memset_thread;
+static int memset_num_threads;
+static bool memset_thread_failed;
+
+static QemuMutex page_mutex;
+static QemuCond page_cond;
+static bool threads_created_flag;
+
+int qemu_get_thread_id(void)
+{
+#if defined(__linux__)
+ return syscall(SYS_gettid);
+#elif defined(__FreeBSD__)
+ /* thread id is up to INT_MAX */
+ long tid;
+ thr_self(&tid);
+ return (int)tid;
+#elif defined(__NetBSD__)
+ return _lwp_self();
+#elif defined(__OpenBSD__)
+ return getthrid();
+#else
+ return getpid();
+#endif
+}
+
+int qemu_daemon(int nochdir, int noclose)
+{
+ return daemon(nochdir, noclose);
+}
+
+bool qemu_write_pidfile(const char *path, Error **errp)
+{
+ int fd;
+ char pidstr[32];
+
+ while (1) {
+ struct stat a, b;
+ struct flock lock = {
+ .l_type = F_WRLCK,
+ .l_whence = SEEK_SET,
+ .l_len = 0,
+ };
+
+ fd = qemu_open_old(path, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR);
+ if (fd == -1) {
+ error_setg_errno(errp, errno, "Cannot open pid file");
+ return false;
+ }
+
+ if (fstat(fd, &b) < 0) {
+ error_setg_errno(errp, errno, "Cannot stat file");
+ goto fail_close;
+ }
+
+ if (fcntl(fd, F_SETLK, &lock)) {
+ error_setg_errno(errp, errno, "Cannot lock pid file");
+ goto fail_close;
+ }
+
+ /*
+ * Now make sure the path we locked is the same one that now
+ * exists on the filesystem.
+ */
+ if (stat(path, &a) < 0) {
+ /*
+ * PID file disappeared, someone else must be racing with
+ * us, so try again.
+ */
+ close(fd);
+ continue;
+ }
+
+ if (a.st_ino == b.st_ino) {
+ break;
+ }
+
+ /*
+ * PID file was recreated, someone else must be racing with
+ * us, so try again.
+ */
+ close(fd);
+ }
+
+ if (ftruncate(fd, 0) < 0) {
+ error_setg_errno(errp, errno, "Failed to truncate pid file");
+ goto fail_unlink;
+ }
+
+ snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid());
+ if (write(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) {
+ error_setg(errp, "Failed to write pid file");
+ goto fail_unlink;
+ }
+
+ return true;
+
+fail_unlink:
+ unlink(path);
+fail_close:
+ close(fd);
+ return false;
+}
+
+void *qemu_oom_check(void *ptr)
+{
+ if (ptr == NULL) {
+ fprintf(stderr, "Failed to allocate memory: %s\n", strerror(errno));
+ abort();
+ }
+ return ptr;
+}
+
+void *qemu_try_memalign(size_t alignment, size_t size)
+{
+ void *ptr;
+
+ if (alignment < sizeof(void*)) {
+ alignment = sizeof(void*);
+ } else {
+ g_assert(is_power_of_2(alignment));
+ }
+
+#if defined(CONFIG_POSIX_MEMALIGN)
+ int ret;
+ ret = posix_memalign(&ptr, alignment, size);
+ if (ret != 0) {
+ errno = ret;
+ ptr = NULL;
+ }
+#elif defined(CONFIG_BSD)
+ ptr = valloc(size);
+#else
+ ptr = memalign(alignment, size);
+#endif
+ trace_qemu_memalign(alignment, size, ptr);
+ return ptr;
+}
+
+void *qemu_memalign(size_t alignment, size_t size)
+{
+ return qemu_oom_check(qemu_try_memalign(alignment, size));
+}
+
+/* alloc shared memory pages */
+void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared,
+ bool noreserve)
+{
+ const uint32_t qemu_map_flags = (shared ? QEMU_MAP_SHARED : 0) |
+ (noreserve ? QEMU_MAP_NORESERVE : 0);
+ size_t align = QEMU_VMALLOC_ALIGN;
+ void *ptr = qemu_ram_mmap(-1, size, align, qemu_map_flags, 0);
+
+ if (ptr == MAP_FAILED) {
+ return NULL;
+ }
+
+ if (alignment) {
+ *alignment = align;
+ }
+
+ trace_qemu_anon_ram_alloc(size, ptr);
+ return ptr;
+}
+
+void qemu_vfree(void *ptr)
+{
+ trace_qemu_vfree(ptr);
+ free(ptr);
+}
+
+void qemu_anon_ram_free(void *ptr, size_t size)
+{
+ trace_qemu_anon_ram_free(ptr, size);
+ qemu_ram_munmap(-1, ptr, size);
+}
+
+void qemu_set_block(int fd)
+{
+ int f;
+ f = fcntl(fd, F_GETFL);
+ assert(f != -1);
+ f = fcntl(fd, F_SETFL, f & ~O_NONBLOCK);
+ assert(f != -1);
+}
+
+int qemu_try_set_nonblock(int fd)
+{
+ int f;
+ f = fcntl(fd, F_GETFL);
+ if (f == -1) {
+ return -errno;
+ }
+ if (fcntl(fd, F_SETFL, f | O_NONBLOCK) == -1) {
+ return -errno;
+ }
+ return 0;
+}
+
+void qemu_set_nonblock(int fd)
+{
+ int f;
+ f = qemu_try_set_nonblock(fd);
+ assert(f == 0);
+}
+
+int socket_set_fast_reuse(int fd)
+{
+ int val = 1, ret;
+
+ ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR,
+ (const char *)&val, sizeof(val));
+
+ assert(ret == 0);
+
+ return ret;
+}
+
+void qemu_set_cloexec(int fd)
+{
+ int f;
+ f = fcntl(fd, F_GETFD);
+ assert(f != -1);
+ f = fcntl(fd, F_SETFD, f | FD_CLOEXEC);
+ assert(f != -1);
+}
+
+/*
+ * Creates a pipe with FD_CLOEXEC set on both file descriptors
+ */
+int qemu_pipe(int pipefd[2])
+{
+ int ret;
+
+#ifdef CONFIG_PIPE2
+ ret = pipe2(pipefd, O_CLOEXEC);
+ if (ret != -1 || errno != ENOSYS) {
+ return ret;
+ }
+#endif
+ ret = pipe(pipefd);
+ if (ret == 0) {
+ qemu_set_cloexec(pipefd[0]);
+ qemu_set_cloexec(pipefd[1]);
+ }
+
+ return ret;
+}
+
+char *
+qemu_get_local_state_pathname(const char *relative_pathname)
+{
+ g_autofree char *dir = g_strdup_printf("%s/%s",
+ CONFIG_QEMU_LOCALSTATEDIR,
+ relative_pathname);
+ return get_relocated_path(dir);
+}
+
+void qemu_set_tty_echo(int fd, bool echo)
+{
+ struct termios tty;
+
+ tcgetattr(fd, &tty);
+
+ if (echo) {
+ tty.c_lflag |= ECHO | ECHONL | ICANON | IEXTEN;
+ } else {
+ tty.c_lflag &= ~(ECHO | ECHONL | ICANON | IEXTEN);
+ }
+
+ tcsetattr(fd, TCSANOW, &tty);
+}
+
+static const char *exec_dir;
+
+void qemu_init_exec_dir(const char *argv0)
+{
+ char *p = NULL;
+ char buf[PATH_MAX];
+
+ if (exec_dir) {
+ return;
+ }
+
+#if defined(__linux__)
+ {
+ int len;
+ len = readlink("/proc/self/exe", buf, sizeof(buf) - 1);
+ if (len > 0) {
+ buf[len] = 0;
+ p = buf;
+ }
+ }
+#elif defined(__FreeBSD__) \
+ || (defined(__NetBSD__) && defined(KERN_PROC_PATHNAME))
+ {
+#if defined(__FreeBSD__)
+ static int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1};
+#else
+ static int mib[4] = {CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME};
+#endif
+ size_t len = sizeof(buf) - 1;
+
+ *buf = '\0';
+ if (!sysctl(mib, ARRAY_SIZE(mib), buf, &len, NULL, 0) &&
+ *buf) {
+ buf[sizeof(buf) - 1] = '\0';
+ p = buf;
+ }
+ }
+#elif defined(__APPLE__)
+ {
+ char fpath[PATH_MAX];
+ uint32_t len = sizeof(fpath);
+ if (_NSGetExecutablePath(fpath, &len) == 0) {
+ p = realpath(fpath, buf);
+ if (!p) {
+ return;
+ }
+ }
+ }
+#elif defined(__HAIKU__)
+ {
+ image_info ii;
+ int32_t c = 0;
+
+ *buf = '\0';
+ while (get_next_image_info(0, &c, &ii) == B_OK) {
+ if (ii.type == B_APP_IMAGE) {
+ strncpy(buf, ii.name, sizeof(buf));
+ buf[sizeof(buf) - 1] = 0;
+ p = buf;
+ break;
+ }
+ }
+ }
+#endif
+ /* If we don't have any way of figuring out the actual executable
+ location then try argv[0]. */
+ if (!p && argv0) {
+ p = realpath(argv0, buf);
+ }
+ if (p) {
+ exec_dir = g_path_get_dirname(p);
+ } else {
+ exec_dir = CONFIG_BINDIR;
+ }
+}
+
+const char *qemu_get_exec_dir(void)
+{
+ return exec_dir;
+}
+
+static void sigbus_handler(int signal)
+{
+ int i;
+ if (memset_thread) {
+ for (i = 0; i < memset_num_threads; i++) {
+ if (qemu_thread_is_self(&memset_thread[i].pgthread)) {
+ siglongjmp(memset_thread[i].env, 1);
+ }
+ }
+ }
+}
+
+static void *do_touch_pages(void *arg)
+{
+ MemsetThread *memset_args = (MemsetThread *)arg;
+ sigset_t set, oldset;
+
+ /*
+ * On Linux, the page faults from the loop below can cause mmap_sem
+ * contention with allocation of the thread stacks. Do not start
+ * clearing until all threads have been created.
+ */
+ qemu_mutex_lock(&page_mutex);
+ while (!threads_created_flag) {
+ qemu_cond_wait(&page_cond, &page_mutex);
+ }
+ qemu_mutex_unlock(&page_mutex);
+
+ /* unblock SIGBUS */
+ sigemptyset(&set);
+ sigaddset(&set, SIGBUS);
+ pthread_sigmask(SIG_UNBLOCK, &set, &oldset);
+
+ if (sigsetjmp(memset_args->env, 1)) {
+ memset_thread_failed = true;
+ } else {
+ char *addr = memset_args->addr;
+ size_t numpages = memset_args->numpages;
+ size_t hpagesize = memset_args->hpagesize;
+ size_t i;
+ for (i = 0; i < numpages; i++) {
+ /*
+ * Read & write back the same value, so we don't
+ * corrupt existing user/app data that might be
+ * stored.
+ *
+ * 'volatile' to stop compiler optimizing this away
+ * to a no-op
+ *
+ * TODO: get a better solution from kernel so we
+ * don't need to write at all so we don't cause
+ * wear on the storage backing the region...
+ */
+ *(volatile char *)addr = *addr;
+ addr += hpagesize;
+ }
+ }
+ pthread_sigmask(SIG_SETMASK, &oldset, NULL);
+ return NULL;
+}
+
+static inline int get_memset_num_threads(int smp_cpus)
+{
+ long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
+ int ret = 1;
+
+ if (host_procs > 0) {
+ ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), smp_cpus);
+ }
+ /* In case sysconf() fails, we fall back to single threaded */
+ return ret;
+}
+
+static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
+ int smp_cpus)
+{
+ static gsize initialized = 0;
+ size_t numpages_per_thread, leftover;
+ char *addr = area;
+ int i = 0;
+
+ if (g_once_init_enter(&initialized)) {
+ qemu_mutex_init(&page_mutex);
+ qemu_cond_init(&page_cond);
+ g_once_init_leave(&initialized, 1);
+ }
+
+ memset_thread_failed = false;
+ threads_created_flag = false;
+ memset_num_threads = get_memset_num_threads(smp_cpus);
+ memset_thread = g_new0(MemsetThread, memset_num_threads);
+ numpages_per_thread = numpages / memset_num_threads;
+ leftover = numpages % memset_num_threads;
+ for (i = 0; i < memset_num_threads; i++) {
+ memset_thread[i].addr = addr;
+ memset_thread[i].numpages = numpages_per_thread + (i < leftover);
+ memset_thread[i].hpagesize = hpagesize;
+ qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
+ do_touch_pages, &memset_thread[i],
+ QEMU_THREAD_JOINABLE);
+ addr += memset_thread[i].numpages * hpagesize;
+ }
+
+ qemu_mutex_lock(&page_mutex);
+ threads_created_flag = true;
+ qemu_cond_broadcast(&page_cond);
+ qemu_mutex_unlock(&page_mutex);
+
+ for (i = 0; i < memset_num_threads; i++) {
+ qemu_thread_join(&memset_thread[i].pgthread);
+ }
+ g_free(memset_thread);
+ memset_thread = NULL;
+
+ return memset_thread_failed;
+}
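+
+/*
+ * Worked example (illustrative): with numpages = 10 and 4 memset threads,
+ * numpages_per_thread = 2 and leftover = 2, so the threads touch 3, 3, 2
+ * and 2 pages respectively -- the (i < leftover) term hands one extra
+ * page to each of the first `leftover' threads.
+ */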
+
+void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
+ Error **errp)
+{
+ int ret;
+ struct sigaction act, oldact;
+ size_t hpagesize = qemu_fd_getpagesize(fd);
+ size_t numpages = DIV_ROUND_UP(memory, hpagesize);
+
+ memset(&act, 0, sizeof(act));
+ act.sa_handler = &sigbus_handler;
+ act.sa_flags = 0;
+
+ ret = sigaction(SIGBUS, &act, &oldact);
+ if (ret) {
+ error_setg_errno(errp, errno,
+ "os_mem_prealloc: failed to install signal handler");
+ return;
+ }
+
+ /* touch pages simultaneously */
+ if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) {
+ error_setg(errp, "os_mem_prealloc: Insufficient free host memory "
+ "pages available to allocate guest RAM");
+ }
+
+ ret = sigaction(SIGBUS, &oldact, NULL);
+ if (ret) {
+ /* Terminate QEMU since it can't recover from this error */
+ perror("os_mem_prealloc: failed to reinstall signal handler");
+ exit(1);
+ }
+}
+
+char *qemu_get_pid_name(pid_t pid)
+{
+ char *name = NULL;
+
+#if defined(__FreeBSD__)
+ /* BSDs don't have /proc, but they provide a nice substitute */
+ struct kinfo_proc *proc = kinfo_getproc(pid);
+
+ if (proc) {
+ name = g_strdup(proc->ki_comm);
+ free(proc);
+ }
+#else
+ /* Assume a system with reasonable procfs */
+ char *pid_path;
+ size_t len;
+
+ pid_path = g_strdup_printf("/proc/%d/cmdline", pid);
+ g_file_get_contents(pid_path, &name, &len, NULL);
+ g_free(pid_path);
+#endif
+
+ return name;
+}
+
+
+pid_t qemu_fork(Error **errp)
+{
+ sigset_t oldmask, newmask;
+ struct sigaction sig_action;
+ int saved_errno;
+ pid_t pid;
+
+ /*
+ * Need to block signals now, so that child process can safely
+ * kill off caller's signal handlers without a race.
+ */
+ sigfillset(&newmask);
+ if (pthread_sigmask(SIG_SETMASK, &newmask, &oldmask) != 0) {
+ error_setg_errno(errp, errno,
+ "cannot block signals");
+ return -1;
+ }
+
+ pid = fork();
+ saved_errno = errno;
+
+ if (pid < 0) {
+ /* attempt to restore signal mask, but ignore failure, to
+ * avoid obscuring the fork failure */
+ (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
+ error_setg_errno(errp, saved_errno,
+ "cannot fork child process");
+ errno = saved_errno;
+ return -1;
+ } else if (pid) {
+ /* parent process */
+
+ /* Restore our original signal mask now that the child is
+ * safely running. Only documented failures are EFAULT (not
+ * possible, since we are using just-grabbed mask) or EINVAL
+ * (not possible, since we are using correct arguments). */
+ (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL);
+ } else {
+ /* child process */
+ size_t i;
+
+ /* Clear out all signal handlers from parent so nothing
+ * unexpected can happen in our child once we unblock
+ * signals */
+ sig_action.sa_handler = SIG_DFL;
+ sig_action.sa_flags = 0;
+ sigemptyset(&sig_action.sa_mask);
+
+ for (i = 1; i < NSIG; i++) {
+ /* The only possible errors are EFAULT or EINVAL. The former
+ * won't happen, the latter we expect, so there is no need to
+ * check the return value. */
+ (void)sigaction(i, &sig_action, NULL);
+ }
+
+ /* Unmask all signals in child, since we've no idea what the
+ * caller's done with their signal mask and don't want to
+ * propagate that to children */
+ sigemptyset(&newmask);
+ if (pthread_sigmask(SIG_SETMASK, &newmask, NULL) != 0) {
+ Error *local_err = NULL;
+ error_setg_errno(&local_err, errno,
+ "cannot unblock signals");
+ error_report_err(local_err);
+ _exit(1);
+ }
+ }
+ return pid;
+}
+
+void *qemu_alloc_stack(size_t *sz)
+{
+ void *ptr, *guardpage;
+ int flags;
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ void *ptr2;
+#endif
+ size_t pagesz = qemu_real_host_page_size;
+#ifdef _SC_THREAD_STACK_MIN
+ /* avoid stacks smaller than _SC_THREAD_STACK_MIN */
+ long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN);
+ *sz = MAX(MAX(min_stack_sz, 0), *sz);
+#endif
+ /* adjust stack size to a multiple of the page size */
+ *sz = ROUND_UP(*sz, pagesz);
+ /* allocate one extra page for the guard page */
+ *sz += pagesz;
+
+ flags = MAP_PRIVATE | MAP_ANONYMOUS;
+#if defined(MAP_STACK) && defined(__OpenBSD__)
+ /* Only enable MAP_STACK on OpenBSD. Other OSes such as
+ * Linux/FreeBSD/NetBSD have a flag with the same name but
+ * differing functionality. OpenBSD will SEGV if it spots
+ * execution with a stack pointer pointing at memory that
+ * was not allocated with MAP_STACK.
+ */
+ flags |= MAP_STACK;
+#endif
+
+ ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0);
+ if (ptr == MAP_FAILED) {
+ perror("failed to allocate memory for stack");
+ abort();
+ }
+
+#if defined(HOST_IA64)
+ /* separate register stack */
+ guardpage = ptr + (((*sz - pagesz) / 2) & ~pagesz);
+#elif defined(HOST_HPPA)
+ /* stack grows up */
+ guardpage = ptr + *sz - pagesz;
+#else
+ /* stack grows down */
+ guardpage = ptr;
+#endif
+ if (mprotect(guardpage, pagesz, PROT_NONE) != 0) {
+ perror("failed to set up stack guard page");
+ abort();
+ }
+
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) {
+ *(uint32_t *)ptr2 = 0xdeadbeaf;
+ }
+#endif
+
+ return ptr;
+}
+
+#ifdef CONFIG_DEBUG_STACK_USAGE
+static __thread unsigned int max_stack_usage;
+#endif
+
+void qemu_free_stack(void *stack, size_t sz)
+{
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ unsigned int usage;
+ void *ptr;
+
+ for (ptr = stack + qemu_real_host_page_size; ptr < stack + sz;
+ ptr += sizeof(uint32_t)) {
+ if (*(uint32_t *)ptr != 0xdeadbeaf) {
+ break;
+ }
+ }
+ usage = sz - (uintptr_t) (ptr - stack);
+ if (usage > max_stack_usage) {
+ error_report("thread %d max stack usage increased from %u to %u",
+ qemu_get_thread_id(), max_stack_usage, usage);
+ max_stack_usage = usage;
+ }
+#endif
+
+ munmap(stack, sz);
+}
+
+/*
+ * Disable CFI checks.
+ * We are going to call a signal handler directly. Such a handler may or may not
+ * have been defined in our binary, so there's no guarantee that the pointer
+ * used to set the handler is a cfi-valid pointer. Since the handlers are
+ * stored in kernel memory, changing the handler to an attacker-defined
+ * function requires being able to call a sigaction() syscall,
+ * which is not as easy as overwriting a pointer in memory.
+ */
+QEMU_DISABLE_CFI
+void sigaction_invoke(struct sigaction *action,
+ struct qemu_signalfd_siginfo *info)
+{
+ siginfo_t si = {};
+ si.si_signo = info->ssi_signo;
+ si.si_errno = info->ssi_errno;
+ si.si_code = info->ssi_code;
+
+ /* Convert the minimal set of fields defined by POSIX.
+ * Positive si_code values are reserved for kernel-generated
+ * signals, where the valid siginfo fields are determined by
+ * the signal number. But according to POSIX, it is unspecified
+ * whether SI_USER and SI_QUEUE have values less than or equal to
+ * zero.
+ */
+ if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE ||
+ info->ssi_code <= 0) {
+ /* SIGTERM, etc. */
+ si.si_pid = info->ssi_pid;
+ si.si_uid = info->ssi_uid;
+ } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE ||
+ info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) {
+ si.si_addr = (void *)(uintptr_t)info->ssi_addr;
+ } else if (info->ssi_signo == SIGCHLD) {
+ si.si_pid = info->ssi_pid;
+ si.si_status = info->ssi_status;
+ si.si_uid = info->ssi_uid;
+ }
+ action->sa_sigaction(info->ssi_signo, &si, NULL);
+}
+
+#ifndef HOST_NAME_MAX
+# ifdef _POSIX_HOST_NAME_MAX
+# define HOST_NAME_MAX _POSIX_HOST_NAME_MAX
+# else
+# define HOST_NAME_MAX 255
+# endif
+#endif
+
+char *qemu_get_host_name(Error **errp)
+{
+ long len = -1;
+ g_autofree char *hostname = NULL;
+
+#ifdef _SC_HOST_NAME_MAX
+ len = sysconf(_SC_HOST_NAME_MAX);
+#endif /* _SC_HOST_NAME_MAX */
+
+ if (len < 0) {
+ len = HOST_NAME_MAX;
+ }
+
+ /* Unfortunately, gethostname() below does not guarantee a
+ * NUL-terminated string. Therefore, allocate one byte more
+ * to be sure. */
+ hostname = g_new0(char, len + 1);
+
+ if (gethostname(hostname, len) < 0) {
+ error_setg_errno(errp, errno,
+ "cannot get hostname");
+ return NULL;
+ }
+
+ return g_steal_pointer(&hostname);
+}
+
+size_t qemu_get_host_physmem(void)
+{
+#ifdef _SC_PHYS_PAGES
+ long pages = sysconf(_SC_PHYS_PAGES);
+ if (pages > 0) {
+ if (pages > SIZE_MAX / qemu_real_host_page_size) {
+ return SIZE_MAX;
+ } else {
+ return pages * qemu_real_host_page_size;
+ }
+ }
+#endif
+ return 0;
+}
diff --git a/util/oslib-win32.c b/util/oslib-win32.c
new file mode 100644
index 000000000..af559ef33
--- /dev/null
+++ b/util/oslib-win32.c
@@ -0,0 +1,654 @@
+/*
+ * os-win32.c
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2010-2016 Red Hat, Inc.
+ *
+ * QEMU library functions for win32 which are shared between QEMU and
+ * the QEMU tools.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * The implementation of g_poll (functions poll_rest, g_poll) at the end of
+ * this file are based on code from GNOME glib-2 and use a different license,
+ * see the license comment there.
+ */
+
+#include "qemu/osdep.h"
+#include <windows.h>
+#include "qemu-common.h"
+#include "qapi/error.h"
+#include "qemu/main-loop.h"
+#include "trace.h"
+#include "qemu/sockets.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+#include <malloc.h>
+
+/* this must come after including "trace.h" */
+#include <shlobj.h>
+
+void *qemu_oom_check(void *ptr)
+{
+ if (ptr == NULL) {
+ fprintf(stderr, "Failed to allocate memory: %lu\n", GetLastError());
+ abort();
+ }
+ return ptr;
+}
+
+void *qemu_try_memalign(size_t alignment, size_t size)
+{
+ void *ptr;
+
+ g_assert(size != 0);
+ if (alignment < sizeof(void *)) {
+ alignment = sizeof(void *);
+ } else {
+ g_assert(is_power_of_2(alignment));
+ }
+ ptr = _aligned_malloc(size, alignment);
+ trace_qemu_memalign(alignment, size, ptr);
+ return ptr;
+}
+
+void *qemu_memalign(size_t alignment, size_t size)
+{
+ return qemu_oom_check(qemu_try_memalign(alignment, size));
+}
+
+static int get_allocation_granularity(void)
+{
+ SYSTEM_INFO system_info;
+
+ GetSystemInfo(&system_info);
+ return system_info.dwAllocationGranularity;
+}
+
+void *qemu_anon_ram_alloc(size_t size, uint64_t *align, bool shared,
+ bool noreserve)
+{
+ void *ptr;
+
+ if (noreserve) {
+ /*
+ * We need a MEM_COMMIT before accessing any memory in a MEM_RESERVE
+ * area; we cannot easily mimic POSIX MAP_NORESERVE semantics.
+ */
+ error_report("Skipping reservation of swap space is not supported.");
+ return NULL;
+ }
+
+ ptr = VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE);
+ trace_qemu_anon_ram_alloc(size, ptr);
+
+ if (ptr && align) {
+ *align = MAX(get_allocation_granularity(), getpagesize());
+ }
+ return ptr;
+}
+
+void qemu_vfree(void *ptr)
+{
+ trace_qemu_vfree(ptr);
+ _aligned_free(ptr);
+}
+
+void qemu_anon_ram_free(void *ptr, size_t size)
+{
+ trace_qemu_anon_ram_free(ptr, size);
+ if (ptr) {
+ VirtualFree(ptr, 0, MEM_RELEASE);
+ }
+}
+
+#ifndef _POSIX_THREAD_SAFE_FUNCTIONS
+/* FIXME: add proper locking */
+struct tm *gmtime_r(const time_t *timep, struct tm *result)
+{
+ struct tm *p = gmtime(timep);
+ memset(result, 0, sizeof(*result));
+ if (p) {
+ *result = *p;
+ p = result;
+ }
+ return p;
+}
+
+/* FIXME: add proper locking */
+struct tm *localtime_r(const time_t *timep, struct tm *result)
+{
+ struct tm *p = localtime(timep);
+ memset(result, 0, sizeof(*result));
+ if (p) {
+ *result = *p;
+ p = result;
+ }
+ return p;
+}
+#endif /* _POSIX_THREAD_SAFE_FUNCTIONS */
+
+static int socket_error(void)
+{
+ switch (WSAGetLastError()) {
+ case 0:
+ return 0;
+ case WSAEINTR:
+ return EINTR;
+ case WSAEINVAL:
+ return EINVAL;
+ case WSA_INVALID_HANDLE:
+ return EBADF;
+ case WSA_NOT_ENOUGH_MEMORY:
+ return ENOMEM;
+ case WSA_INVALID_PARAMETER:
+ return EINVAL;
+ case WSAENAMETOOLONG:
+ return ENAMETOOLONG;
+ case WSAENOTEMPTY:
+ return ENOTEMPTY;
+ case WSAEWOULDBLOCK:
+ /* not using EWOULDBLOCK as we don't want code to have
+ * to check both EWOULDBLOCK and EAGAIN */
+ return EAGAIN;
+ case WSAEINPROGRESS:
+ return EINPROGRESS;
+ case WSAEALREADY:
+ return EALREADY;
+ case WSAENOTSOCK:
+ return ENOTSOCK;
+ case WSAEDESTADDRREQ:
+ return EDESTADDRREQ;
+ case WSAEMSGSIZE:
+ return EMSGSIZE;
+ case WSAEPROTOTYPE:
+ return EPROTOTYPE;
+ case WSAENOPROTOOPT:
+ return ENOPROTOOPT;
+ case WSAEPROTONOSUPPORT:
+ return EPROTONOSUPPORT;
+ case WSAEOPNOTSUPP:
+ return EOPNOTSUPP;
+ case WSAEAFNOSUPPORT:
+ return EAFNOSUPPORT;
+ case WSAEADDRINUSE:
+ return EADDRINUSE;
+ case WSAEADDRNOTAVAIL:
+ return EADDRNOTAVAIL;
+ case WSAENETDOWN:
+ return ENETDOWN;
+ case WSAENETUNREACH:
+ return ENETUNREACH;
+ case WSAENETRESET:
+ return ENETRESET;
+ case WSAECONNABORTED:
+ return ECONNABORTED;
+ case WSAECONNRESET:
+ return ECONNRESET;
+ case WSAENOBUFS:
+ return ENOBUFS;
+ case WSAEISCONN:
+ return EISCONN;
+ case WSAENOTCONN:
+ return ENOTCONN;
+ case WSAETIMEDOUT:
+ return ETIMEDOUT;
+ case WSAECONNREFUSED:
+ return ECONNREFUSED;
+ case WSAELOOP:
+ return ELOOP;
+ case WSAEHOSTUNREACH:
+ return EHOSTUNREACH;
+ default:
+ return EIO;
+ }
+}
+
+void qemu_set_block(int fd)
+{
+ unsigned long opt = 0;
+ WSAEventSelect(fd, NULL, 0);
+ ioctlsocket(fd, FIONBIO, &opt);
+}
+
+int qemu_try_set_nonblock(int fd)
+{
+ unsigned long opt = 1;
+ if (ioctlsocket(fd, FIONBIO, &opt) != NO_ERROR) {
+ return -socket_error();
+ }
+ return 0;
+}
+
+void qemu_set_nonblock(int fd)
+{
+ (void)qemu_try_set_nonblock(fd);
+}
+
+int socket_set_fast_reuse(int fd)
+{
+ /* Enabling the reuse of an endpoint that was used by a socket still in
+ * TIME_WAIT state is usually performed by setting SO_REUSEADDR. On Windows
+ * fast reuse is the default and SO_REUSEADDR does strange things. So we
+ * don't have to do anything here. More info can be found at:
+ * http://msdn.microsoft.com/en-us/library/windows/desktop/ms740621.aspx */
+ return 0;
+}
+
+int inet_aton(const char *cp, struct in_addr *ia)
+{
+ uint32_t addr = inet_addr(cp);
+ if (addr == 0xffffffff) {
+ return 0;
+ }
+ ia->s_addr = addr;
+ return 1;
+}
+
+void qemu_set_cloexec(int fd)
+{
+}
+
+/* Offset between 1/1/1601 and 1/1/1970 in 100 nanosec units */
+#define _W32_FT_OFFSET (116444736000000000ULL)
+
+int qemu_gettimeofday(qemu_timeval *tp)
+{
+ union {
+ unsigned long long ns100; /* time since 1 Jan 1601 in 100ns units */
+ FILETIME ft;
+ } _now;
+
+ if (tp) {
+ GetSystemTimeAsFileTime(&_now.ft);
+ tp->tv_usec = (long)((_now.ns100 / 10ULL) % 1000000ULL);
+ tp->tv_sec = (long)((_now.ns100 - _W32_FT_OFFSET) / 10000000ULL);
+ }
+ /* Always return 0 as per Open Group Base Specifications Issue 6.
+ Do not set errno on error. */
+ return 0;
+}
+
+int qemu_get_thread_id(void)
+{
+ return GetCurrentThreadId();
+}
+
+char *
+qemu_get_local_state_pathname(const char *relative_pathname)
+{
+ HRESULT result;
+ char base_path[MAX_PATH+1] = "";
+
+ result = SHGetFolderPath(NULL, CSIDL_COMMON_APPDATA, NULL,
+ /* SHGFP_TYPE_CURRENT */ 0, base_path);
+ if (result != S_OK) {
+ /* misconfigured environment */
+ g_critical("CSIDL_COMMON_APPDATA unavailable: %ld", (long)result);
+ abort();
+ }
+ return g_strdup_printf("%s" G_DIR_SEPARATOR_S "%s", base_path,
+ relative_pathname);
+}
+
+void qemu_set_tty_echo(int fd, bool echo)
+{
+ HANDLE handle = (HANDLE)_get_osfhandle(fd);
+ DWORD dwMode = 0;
+
+ if (handle == INVALID_HANDLE_VALUE) {
+ return;
+ }
+
+ GetConsoleMode(handle, &dwMode);
+
+ if (echo) {
+ SetConsoleMode(handle, dwMode | ENABLE_ECHO_INPUT | ENABLE_LINE_INPUT);
+ } else {
+ SetConsoleMode(handle,
+ dwMode & ~(ENABLE_ECHO_INPUT | ENABLE_LINE_INPUT));
+ }
+}
+
+static const char *exec_dir;
+
+void qemu_init_exec_dir(const char *argv0)
+{
+ char *p;
+ char buf[MAX_PATH];
+ DWORD len;
+
+ if (exec_dir) {
+ return;
+ }
+
+ len = GetModuleFileName(NULL, buf, sizeof(buf) - 1);
+ if (len == 0) {
+ return;
+ }
+
+ buf[len] = 0;
+ p = buf + len - 1;
+ while (p != buf && *p != '\\') {
+ p--;
+ }
+ *p = 0;
+ if (access(buf, R_OK) == 0) {
+ exec_dir = g_strdup(buf);
+ } else {
+ exec_dir = CONFIG_BINDIR;
+ }
+}
+
+const char *qemu_get_exec_dir(void)
+{
+ return exec_dir;
+}
+
+int getpagesize(void)
+{
+ SYSTEM_INFO system_info;
+
+ GetSystemInfo(&system_info);
+ return system_info.dwPageSize;
+}
+
+void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
+ Error **errp)
+{
+ int i;
+ size_t pagesize = qemu_real_host_page_size;
+
+ memory = (memory + pagesize - 1) & -pagesize;
+ for (i = 0; i < memory / pagesize; i++) {
+ memset(area + pagesize * i, 0, 1);
+ }
+}
+
+char *qemu_get_pid_name(pid_t pid)
+{
+ /* XXX Implement me */
+ abort();
+}
+
+
+pid_t qemu_fork(Error **errp)
+{
+ errno = ENOSYS;
+ error_setg_errno(errp, errno,
+ "cannot fork child process");
+ return -1;
+}
+
+
+#undef connect
+int qemu_connect_wrap(int sockfd, const struct sockaddr *addr,
+ socklen_t addrlen)
+{
+ int ret;
+ ret = connect(sockfd, addr, addrlen);
+ if (ret < 0) {
+ if (WSAGetLastError() == WSAEWOULDBLOCK) {
+ errno = EINPROGRESS;
+ } else {
+ errno = socket_error();
+ }
+ }
+ return ret;
+}
+
+
+#undef listen
+int qemu_listen_wrap(int sockfd, int backlog)
+{
+ int ret;
+ ret = listen(sockfd, backlog);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef bind
+int qemu_bind_wrap(int sockfd, const struct sockaddr *addr,
+ socklen_t addrlen)
+{
+ int ret;
+ ret = bind(sockfd, addr, addrlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef socket
+int qemu_socket_wrap(int domain, int type, int protocol)
+{
+ int ret;
+ ret = socket(domain, type, protocol);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef accept
+int qemu_accept_wrap(int sockfd, struct sockaddr *addr,
+ socklen_t *addrlen)
+{
+ int ret;
+ ret = accept(sockfd, addr, addrlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef shutdown
+int qemu_shutdown_wrap(int sockfd, int how)
+{
+ int ret;
+ ret = shutdown(sockfd, how);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef ioctlsocket
+int qemu_ioctlsocket_wrap(int fd, int req, void *val)
+{
+ int ret;
+ ret = ioctlsocket(fd, req, val);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef closesocket
+int qemu_closesocket_wrap(int fd)
+{
+ int ret;
+ ret = closesocket(fd);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef getsockopt
+int qemu_getsockopt_wrap(int sockfd, int level, int optname,
+ void *optval, socklen_t *optlen)
+{
+ int ret;
+ ret = getsockopt(sockfd, level, optname, optval, optlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef setsockopt
+int qemu_setsockopt_wrap(int sockfd, int level, int optname,
+ const void *optval, socklen_t optlen)
+{
+ int ret;
+ ret = setsockopt(sockfd, level, optname, optval, optlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef getpeername
+int qemu_getpeername_wrap(int sockfd, struct sockaddr *addr,
+ socklen_t *addrlen)
+{
+ int ret;
+ ret = getpeername(sockfd, addr, addrlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef getsockname
+int qemu_getsockname_wrap(int sockfd, struct sockaddr *addr,
+ socklen_t *addrlen)
+{
+ int ret;
+ ret = getsockname(sockfd, addr, addrlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef send
+ssize_t qemu_send_wrap(int sockfd, const void *buf, size_t len, int flags)
+{
+ int ret;
+ ret = send(sockfd, buf, len, flags);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef sendto
+ssize_t qemu_sendto_wrap(int sockfd, const void *buf, size_t len, int flags,
+ const struct sockaddr *addr, socklen_t addrlen)
+{
+ int ret;
+ ret = sendto(sockfd, buf, len, flags, addr, addrlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef recv
+ssize_t qemu_recv_wrap(int sockfd, void *buf, size_t len, int flags)
+{
+ int ret;
+ ret = recv(sockfd, buf, len, flags);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+
+#undef recvfrom
+ssize_t qemu_recvfrom_wrap(int sockfd, void *buf, size_t len, int flags,
+ struct sockaddr *addr, socklen_t *addrlen)
+{
+ int ret;
+ ret = recvfrom(sockfd, buf, len, flags, addr, addrlen);
+ if (ret < 0) {
+ errno = socket_error();
+ }
+ return ret;
+}
+
+bool qemu_write_pidfile(const char *filename, Error **errp)
+{
+ char buffer[128];
+ int len;
+ HANDLE file;
+ OVERLAPPED overlap;
+ BOOL ret;
+ memset(&overlap, 0, sizeof(overlap));
+
+ file = CreateFile(filename, GENERIC_WRITE, FILE_SHARE_READ, NULL,
+ OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
+
+ if (file == INVALID_HANDLE_VALUE) {
+ error_setg(errp, "Failed to create PID file");
+ return false;
+ }
+ len = snprintf(buffer, sizeof(buffer), FMT_pid "\n", (pid_t)getpid());
+ ret = WriteFile(file, (LPCVOID)buffer, (DWORD)len,
+ NULL, &overlap);
+ CloseHandle(file);
+ if (ret == 0) {
+ error_setg(errp, "Failed to write PID file");
+ return false;
+ }
+ return true;
+}
+
+char *qemu_get_host_name(Error **errp)
+{
+ wchar_t tmp[MAX_COMPUTERNAME_LENGTH + 1];
+ DWORD size = G_N_ELEMENTS(tmp);
+
+ if (GetComputerNameW(tmp, &size) == 0) {
+ error_setg_win32(errp, GetLastError(), "failed to get computer name");
+ return NULL;
+ }
+
+ return g_utf16_to_utf8(tmp, size, NULL, NULL, NULL);
+}
+
+size_t qemu_get_host_physmem(void)
+{
+ MEMORYSTATUSEX statex;
+ statex.dwLength = sizeof(statex);
+
+ if (GlobalMemoryStatusEx(&statex)) {
+ return statex.ullTotalPhys;
+ }
+ return 0;
+}
diff --git a/util/pagesize.c b/util/pagesize.c
new file mode 100644
index 000000000..998632cf6
--- /dev/null
+++ b/util/pagesize.c
@@ -0,0 +1,18 @@
+/*
+ * pagesize.c - query the host about its page size
+ *
+ * Copyright (C) 2017, Emilio G. Cota <cota@braap.org>
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+uintptr_t qemu_real_host_page_size;
+intptr_t qemu_real_host_page_mask;
+
+static void __attribute__((constructor)) init_real_host_page_size(void)
+{
+ qemu_real_host_page_size = getpagesize();
+ qemu_real_host_page_mask = -(intptr_t)qemu_real_host_page_size;
+}
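+
+/*
+ * Worked example of the mask arithmetic above, assuming a (typical)
+ * 4 KiB host page size:
+ *
+ *   qemu_real_host_page_size = 4096            = 0x1000
+ *   qemu_real_host_page_mask = -(intptr_t)4096 = ~(intptr_t)0xfff
+ *
+ *   addr & qemu_real_host_page_mask  rounds addr down to a page boundary;
+ *   (addr + size - 1) & mask         yields the page holding the last byte.
+ */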
diff --git a/util/path.c b/util/path.c
new file mode 100644
index 000000000..8e174eb43
--- /dev/null
+++ b/util/path.c
@@ -0,0 +1,70 @@
+/* Code to mangle pathnames into those matching a given prefix.
+ eg. open("/lib/foo.so") => open("/usr/gnemul/i386-linux/lib/foo.so");
+
+ The assumption is that this area does not change.
+*/
+#include "qemu/osdep.h"
+#include <sys/param.h>
+#include <dirent.h>
+#include "qemu/cutils.h"
+#include "qemu/path.h"
+#include "qemu/thread.h"
+
+static const char *base;
+static GHashTable *hash;
+static QemuMutex lock;
+
+void init_paths(const char *prefix)
+{
+ if (prefix[0] == '\0' || !strcmp(prefix, "/")) {
+ return;
+ }
+
+ if (prefix[0] == '/') {
+ base = g_strdup(prefix);
+ } else {
+ char *cwd = g_get_current_dir();
+ base = g_build_filename(cwd, prefix, NULL);
+ g_free(cwd);
+ }
+
+ hash = g_hash_table_new(g_str_hash, g_str_equal);
+ qemu_mutex_init(&lock);
+}
+
+/* Look for path in emulation dir, otherwise return name. */
+const char *path(const char *name)
+{
+ gpointer key, value;
+ const char *ret;
+
+ /* Only do absolute paths: quick and dirty, but should mostly be OK. */
+ if (!base || !name || name[0] != '/') {
+ return name;
+ }
+
+ qemu_mutex_lock(&lock);
+
+ /* Have we looked up this file before? */
+ if (g_hash_table_lookup_extended(hash, name, &key, &value)) {
+ ret = value ? value : name;
+ } else {
+ char *save = g_strdup(name);
+ char *full = g_build_filename(base, name, NULL);
+
+ /* Look for the path; record the result, pass or fail. */
+ if (access(full, F_OK) == 0) {
+ /* Exists. */
+ g_hash_table_insert(hash, save, full);
+ ret = full;
+ } else {
+ /* Does not exist. */
+ g_free(full);
+ g_hash_table_insert(hash, save, NULL);
+ ret = name;
+ }
+ }
+
+ qemu_mutex_unlock(&lock);
+ return ret;
+}
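+
+/*
+ * Usage sketch (hypothetical prefix and file names):
+ *
+ *   init_paths("/usr/gnemul/i386-linux");
+ *   path("/lib/foo.so");    // "/usr/gnemul/i386-linux/lib/foo.so" if it exists
+ *   path("/lib/foo.so");    // second call is answered from the hash table
+ *   path("relative/name");  // returned unchanged: only absolute paths mangle
+ */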
diff --git a/util/qdist.c b/util/qdist.c
new file mode 100644
index 000000000..5f75e24c2
--- /dev/null
+++ b/util/qdist.c
@@ -0,0 +1,397 @@
+/*
+ * qdist.c - QEMU helpers for handling frequency distributions of data.
+ *
+ * Copyright (C) 2016, Emilio G. Cota <cota@braap.org>
+ *
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qemu/qdist.h"
+
+#include <math.h>
+#ifndef NAN
+#define NAN (0.0 / 0.0)
+#endif
+
+#define QDIST_EMPTY_STR "(empty)"
+
+void qdist_init(struct qdist *dist)
+{
+ dist->entries = g_new(struct qdist_entry, 1);
+ dist->size = 1;
+ dist->n = 0;
+}
+
+void qdist_destroy(struct qdist *dist)
+{
+ g_free(dist->entries);
+}
+
+static inline int qdist_cmp_double(double a, double b)
+{
+ if (a > b) {
+ return 1;
+ } else if (a < b) {
+ return -1;
+ }
+ return 0;
+}
+
+static int qdist_cmp(const void *ap, const void *bp)
+{
+ const struct qdist_entry *a = ap;
+ const struct qdist_entry *b = bp;
+
+ return qdist_cmp_double(a->x, b->x);
+}
+
+void qdist_add(struct qdist *dist, double x, long count)
+{
+ struct qdist_entry *entry = NULL;
+
+ if (dist->n) {
+ struct qdist_entry e;
+
+ e.x = x;
+ entry = bsearch(&e, dist->entries, dist->n, sizeof(e), qdist_cmp);
+ }
+
+ if (entry) {
+ entry->count += count;
+ return;
+ }
+
+ if (unlikely(dist->n == dist->size)) {
+ dist->size *= 2;
+ dist->entries = g_renew(struct qdist_entry, dist->entries, dist->size);
+ }
+ dist->n++;
+ entry = &dist->entries[dist->n - 1];
+ entry->x = x;
+ entry->count = count;
+ qsort(dist->entries, dist->n, sizeof(*entry), qdist_cmp);
+}
+
+void qdist_inc(struct qdist *dist, double x)
+{
+ qdist_add(dist, x, 1);
+}
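+
+/*
+ * Minimal usage sketch (hypothetical values):
+ *
+ *   struct qdist d;
+ *
+ *   qdist_init(&d);
+ *   qdist_inc(&d, 0.0);     // one sample at x == 0.0
+ *   qdist_add(&d, 1.0, 3);  // three samples at x == 1.0
+ *   // qdist_avg(&d) == (0.0 * 1 + 1.0 * 3) / 4 == 0.75
+ *   qdist_destroy(&d);
+ */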
+
+/*
+ * Unicode for block elements. See:
+ * https://en.wikipedia.org/wiki/Block_Elements
+ */
+static const gunichar qdist_blocks[] = {
+ 0x2581,
+ 0x2582,
+ 0x2583,
+ 0x2584,
+ 0x2585,
+ 0x2586,
+ 0x2587,
+ 0x2588
+};
+
+#define QDIST_NR_BLOCK_CODES ARRAY_SIZE(qdist_blocks)
+
+/*
+ * Print a distribution into a string.
+ *
+ * This function assumes that appropriate binning has been done on the input;
+ * see qdist_bin__internal() and qdist_pr_plain().
+ *
+ * Callers must free the returned string with g_free().
+ */
+static char *qdist_pr_internal(const struct qdist *dist)
+{
+ double min, max;
+ GString *s = g_string_new("");
+ size_t i;
+
+ /* if only one entry, its printout will be either full or empty */
+ if (dist->n == 1) {
+ if (dist->entries[0].count) {
+ g_string_append_unichar(s, qdist_blocks[QDIST_NR_BLOCK_CODES - 1]);
+ } else {
+ g_string_append_c(s, ' ');
+ }
+ goto out;
+ }
+
+ /* get min and max counts */
+ min = dist->entries[0].count;
+ max = min;
+ for (i = 0; i < dist->n; i++) {
+ struct qdist_entry *e = &dist->entries[i];
+
+ if (e->count < min) {
+ min = e->count;
+ }
+ if (e->count > max) {
+ max = e->count;
+ }
+ }
+
+ for (i = 0; i < dist->n; i++) {
+ struct qdist_entry *e = &dist->entries[i];
+ int index;
+
+ /* make an exception with 0; instead of using block[0], print a space */
+ if (e->count) {
+ /* divide first to avoid loss of precision when e->count == max */
+ index = (e->count - min) / (max - min) * (QDIST_NR_BLOCK_CODES - 1);
+ g_string_append_unichar(s, qdist_blocks[index]);
+ } else {
+ g_string_append_c(s, ' ');
+ }
+ }
+ out:
+ return g_string_free(s, FALSE);
+}
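+
+/*
+ * Example of the scaling above: with min == 0, max == 70 and
+ * QDIST_NR_BLOCK_CODES == 8, a count of 35 maps to
+ * index = 35 / 70 * 7 == 3 after truncation, i.e. the fourth block glyph,
+ * while a count of 70 maps to index 7, the full block.
+ */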
+
+/*
+ * Bin the distribution in @from into @n bins of consecutive, non-overlapping
+ * intervals, copying the result to @to.
+ *
+ * This function is internal to qdist: only this file and test code should
+ * ever call it.
+ *
+ * Note: calling this function on an already-binned qdist is a bug.
+ *
+ * If @n == 0 or @from->n == 1, use @from->n.
+ */
+void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n)
+{
+ double xmin, xmax;
+ double step;
+ size_t i, j;
+
+ qdist_init(to);
+
+ if (from->n == 0) {
+ return;
+ }
+ if (n == 0 || from->n == 1) {
+ n = from->n;
+ }
+
+ /* set equally-sized bins between @from's left and right */
+ xmin = qdist_xmin(from);
+ xmax = qdist_xmax(from);
+ step = (xmax - xmin) / n;
+
+ if (n == from->n) {
+ /* if @from's entries are equally spaced, no need to re-bin */
+ for (i = 0; i < from->n; i++) {
+ if (from->entries[i].x != xmin + i * step) {
+ goto rebin;
+ }
+ }
+ /* they're equally spaced, so copy the dist and bail out */
+ to->entries = g_renew(struct qdist_entry, to->entries, n);
+ to->n = from->n;
+ memcpy(to->entries, from->entries, sizeof(*to->entries) * to->n);
+ return;
+ }
+
+ rebin:
+ j = 0;
+ for (i = 0; i < n; i++) {
+ double x;
+ double left, right;
+
+ left = xmin + i * step;
+ right = xmin + (i + 1) * step;
+
+ /* Add x, even if it might not get any counts later */
+ x = left;
+ qdist_add(to, x, 0);
+
+ /*
+ * To avoid double-counting we capture [left, right) ranges, except for
+ * the rightmost bin, which captures a [left, right] range.
+ */
+ while (j < from->n && (from->entries[j].x < right || i == n - 1)) {
+ struct qdist_entry *o = &from->entries[j];
+
+ qdist_add(to, x, o->count);
+ j++;
+ }
+ }
+}
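+
+/*
+ * Binning example (hypothetical input): entries at x == {0, 1, 2, 3} with
+ * counts {5, 5, 1, 1}, re-binned with n == 2, give step == 1.5 and bins
+ * [0, 1.5) -> count 10 and [1.5, 3] -> count 2, recorded at x == 0 and
+ * x == 1.5 respectively.
+ */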
+
+/*
+ * Print @dist into a string, after re-binning it into @n bins of consecutive,
+ * non-overlapping intervals.
+ *
+ * If @n == 0, use @orig->n.
+ *
+ * Callers must free the returned string with g_free().
+ */
+char *qdist_pr_plain(const struct qdist *dist, size_t n)
+{
+ struct qdist binned;
+ char *ret;
+
+ if (dist->n == 0) {
+ return g_strdup(QDIST_EMPTY_STR);
+ }
+ qdist_bin__internal(&binned, dist, n);
+ ret = qdist_pr_internal(&binned);
+ qdist_destroy(&binned);
+ return ret;
+}
+
+static char *qdist_pr_label(const struct qdist *dist, size_t n_bins,
+ uint32_t opt, bool is_left)
+{
+ const char *percent;
+ const char *lparen;
+ const char *rparen;
+ GString *s;
+ double x1, x2, step;
+ double x;
+ double n;
+ int dec;
+
+ s = g_string_new("");
+ if (!(opt & QDIST_PR_LABELS)) {
+ goto out;
+ }
+
+ dec = opt & QDIST_PR_NODECIMAL ? 0 : 1;
+ percent = opt & QDIST_PR_PERCENT ? "%" : "";
+
+ n = n_bins ? n_bins : dist->n;
+ x = is_left ? qdist_xmin(dist) : qdist_xmax(dist);
+ step = (qdist_xmax(dist) - qdist_xmin(dist)) / n;
+
+ if (opt & QDIST_PR_100X) {
+ x *= 100.0;
+ step *= 100.0;
+ }
+ if (opt & QDIST_PR_NOBINRANGE) {
+ lparen = rparen = "";
+ x1 = x;
+ x2 = x; /* unnecessary, but a dumb compiler might not figure it out */
+ } else {
+ lparen = "[";
+ rparen = is_left ? ")" : "]";
+ if (is_left) {
+ x1 = x;
+ x2 = x + step;
+ } else {
+ x1 = x - step;
+ x2 = x;
+ }
+ }
+ g_string_append_printf(s, "%s%.*f", lparen, dec, x1);
+ if (!(opt & QDIST_PR_NOBINRANGE)) {
+ g_string_append_printf(s, ",%.*f%s", dec, x2, rparen);
+ }
+ g_string_append(s, percent);
+ out:
+ return g_string_free(s, FALSE);
+}
+
+/*
+ * Print the distribution's histogram into a string.
+ *
+ * See also: qdist_pr_plain().
+ *
+ * Callers must free the returned string with g_free().
+ */
+char *qdist_pr(const struct qdist *dist, size_t n_bins, uint32_t opt)
+{
+ const char *border = opt & QDIST_PR_BORDER ? "|" : "";
+ char *llabel, *rlabel;
+ char *hgram;
+ GString *s;
+
+ if (dist->n == 0) {
+ return g_strdup(QDIST_EMPTY_STR);
+ }
+
+ s = g_string_new("");
+
+ llabel = qdist_pr_label(dist, n_bins, opt, true);
+ rlabel = qdist_pr_label(dist, n_bins, opt, false);
+ hgram = qdist_pr_plain(dist, n_bins);
+ g_string_append_printf(s, "%s%s%s%s%s",
+ llabel, border, hgram, border, rlabel);
+ g_free(llabel);
+ g_free(rlabel);
+ g_free(hgram);
+
+ return g_string_free(s, FALSE);
+}
+
+static inline double qdist_x(const struct qdist *dist, int index)
+{
+ if (dist->n == 0) {
+ return NAN;
+ }
+ return dist->entries[index].x;
+}
+
+double qdist_xmin(const struct qdist *dist)
+{
+ return qdist_x(dist, 0);
+}
+
+double qdist_xmax(const struct qdist *dist)
+{
+ return qdist_x(dist, dist->n - 1);
+}
+
+size_t qdist_unique_entries(const struct qdist *dist)
+{
+ return dist->n;
+}
+
+unsigned long qdist_sample_count(const struct qdist *dist)
+{
+ unsigned long count = 0;
+ size_t i;
+
+ for (i = 0; i < dist->n; i++) {
+ struct qdist_entry *e = &dist->entries[i];
+
+ count += e->count;
+ }
+ return count;
+}
+
+static double qdist_pairwise_avg(const struct qdist *dist, size_t index,
+ size_t n, unsigned long count)
+{
+ /* amortize the recursion by using a base case > 2 */
+ if (n <= 8) {
+ size_t i;
+ double ret = 0;
+
+ for (i = 0; i < n; i++) {
+ struct qdist_entry *e = &dist->entries[index + i];
+
+ ret += e->x * e->count / count;
+ }
+ return ret;
+ } else {
+ size_t n2 = n / 2;
+
+ return qdist_pairwise_avg(dist, index, n2, count) +
+ qdist_pairwise_avg(dist, index + n2, n - n2, count);
+ }
+}
+
+double qdist_avg(const struct qdist *dist)
+{
+ unsigned long count;
+
+ count = qdist_sample_count(dist);
+ if (!count) {
+ return NAN;
+ }
+ return qdist_pairwise_avg(dist, 0, dist->n, count);
+}
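+
+/*
+ * Note on qdist_pairwise_avg(): pairwise (cascade) summation keeps the
+ * accumulated floating-point rounding error around O(log n) instead of the
+ * O(n) of a naive left-to-right sum, which matters for distributions with
+ * many entries of widely varying magnitude.
+ */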
diff --git a/util/qemu-co-shared-resource.c b/util/qemu-co-shared-resource.c
new file mode 100644
index 000000000..a66cc07e7
--- /dev/null
+++ b/util/qemu-co-shared-resource.c
@@ -0,0 +1,90 @@
+/*
+ * Helper functionality for distributing a fixed total amount of
+ * an abstract resource among multiple coroutines.
+ *
+ * Copyright (c) 2019 Virtuozzo International GmbH
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine.h"
+#include "qemu/co-shared-resource.h"
+
+struct SharedResource {
+ uint64_t total; /* Set in shres_create() and not changed anymore */
+
+ /* State fields protected by lock */
+ uint64_t available;
+ CoQueue queue;
+
+ QemuMutex lock;
+};
+
+SharedResource *shres_create(uint64_t total)
+{
+ SharedResource *s = g_new0(SharedResource, 1);
+
+ s->total = s->available = total;
+ qemu_co_queue_init(&s->queue);
+ qemu_mutex_init(&s->lock);
+
+ return s;
+}
+
+void shres_destroy(SharedResource *s)
+{
+ assert(s->available == s->total);
+ qemu_mutex_destroy(&s->lock);
+ g_free(s);
+}
+
+/* Called with lock held. */
+static bool co_try_get_from_shres_locked(SharedResource *s, uint64_t n)
+{
+ if (s->available >= n) {
+ s->available -= n;
+ return true;
+ }
+
+ return false;
+}
+
+bool co_try_get_from_shres(SharedResource *s, uint64_t n)
+{
+ QEMU_LOCK_GUARD(&s->lock);
+ return co_try_get_from_shres_locked(s, n);
+}
+
+void coroutine_fn co_get_from_shres(SharedResource *s, uint64_t n)
+{
+ assert(n <= s->total);
+ QEMU_LOCK_GUARD(&s->lock);
+ while (!co_try_get_from_shres_locked(s, n)) {
+ qemu_co_queue_wait(&s->queue, &s->lock);
+ }
+}
+
+void coroutine_fn co_put_to_shres(SharedResource *s, uint64_t n)
+{
+ QEMU_LOCK_GUARD(&s->lock);
+ assert(s->total - s->available >= n);
+ s->available += n;
+ qemu_co_queue_restart_all(&s->queue);
+}
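+
+/*
+ * Usage sketch (hypothetical sizes), from coroutine context:
+ *
+ *   SharedResource *mem = shres_create(64 * 1024 * 1024);
+ *
+ *   co_get_from_shres(mem, bytes);  // may yield until enough is available
+ *   // ...do the work that consumes the resource...
+ *   co_put_to_shres(mem, bytes);    // wakes coroutines queued in get
+ *
+ *   shres_destroy(mem);             // all of it must have been returned
+ */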
diff --git a/util/qemu-config.c b/util/qemu-config.c
new file mode 100644
index 000000000..436ab63b1
--- /dev/null
+++ b/util/qemu-config.c
@@ -0,0 +1,565 @@
+#include "qemu/osdep.h"
+#include "block/qdict.h" /* for qdict_extract_subqdict() */
+#include "qapi/error.h"
+#include "qapi/qapi-commands-misc.h"
+#include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qlist.h"
+#include "qemu/error-report.h"
+#include "qemu/option.h"
+#include "qemu/config-file.h"
+
+static QemuOptsList *vm_config_groups[48];
+static QemuOptsList *drive_config_groups[5];
+
+static QemuOptsList *find_list(QemuOptsList **lists, const char *group,
+ Error **errp)
+{
+ int i;
+
+ qemu_load_module_for_opts(group);
+ for (i = 0; lists[i] != NULL; i++) {
+ if (strcmp(lists[i]->name, group) == 0)
+ break;
+ }
+ if (lists[i] == NULL) {
+ error_setg(errp, "There is no option group '%s'", group);
+ }
+ return lists[i];
+}
+
+QemuOptsList *qemu_find_opts(const char *group)
+{
+ QemuOptsList *ret;
+ Error *local_err = NULL;
+
+ ret = find_list(vm_config_groups, group, &local_err);
+ if (local_err) {
+ error_report_err(local_err);
+ }
+
+ return ret;
+}
+
+QemuOpts *qemu_find_opts_singleton(const char *group)
+{
+ QemuOptsList *list;
+ QemuOpts *opts;
+
+ list = qemu_find_opts(group);
+ assert(list);
+ opts = qemu_opts_find(list, NULL);
+ if (!opts) {
+ opts = qemu_opts_create(list, NULL, 0, &error_abort);
+ }
+ return opts;
+}
+
+static CommandLineParameterInfoList *query_option_descs(const QemuOptDesc *desc)
+{
+ CommandLineParameterInfoList *param_list = NULL;
+ CommandLineParameterInfo *info;
+ int i;
+
+ for (i = 0; desc[i].name != NULL; i++) {
+ info = g_malloc0(sizeof(*info));
+ info->name = g_strdup(desc[i].name);
+
+ switch (desc[i].type) {
+ case QEMU_OPT_STRING:
+ info->type = COMMAND_LINE_PARAMETER_TYPE_STRING;
+ break;
+ case QEMU_OPT_BOOL:
+ info->type = COMMAND_LINE_PARAMETER_TYPE_BOOLEAN;
+ break;
+ case QEMU_OPT_NUMBER:
+ info->type = COMMAND_LINE_PARAMETER_TYPE_NUMBER;
+ break;
+ case QEMU_OPT_SIZE:
+ info->type = COMMAND_LINE_PARAMETER_TYPE_SIZE;
+ break;
+ }
+
+ if (desc[i].help) {
+ info->has_help = true;
+ info->help = g_strdup(desc[i].help);
+ }
+ if (desc[i].def_value_str) {
+ info->has_q_default = true;
+ info->q_default = g_strdup(desc[i].def_value_str);
+ }
+
+ QAPI_LIST_PREPEND(param_list, info);
+ }
+
+ return param_list;
+}
+
+/* remove repeated entries from the info list */
+static void cleanup_infolist(CommandLineParameterInfoList *head)
+{
+ CommandLineParameterInfoList *pre_entry, *cur, *del_entry;
+
+ cur = head;
+ while (cur->next) {
+ pre_entry = head;
+ while (pre_entry != cur->next) {
+ if (!strcmp(pre_entry->value->name, cur->next->value->name)) {
+ del_entry = cur->next;
+ cur->next = cur->next->next;
+ del_entry->next = NULL;
+ qapi_free_CommandLineParameterInfoList(del_entry);
+ break;
+ }
+ pre_entry = pre_entry->next;
+ }
+ cur = cur->next;
+ }
+}
+
+/* merge the description items of two parameter infolists */
+static void connect_infolist(CommandLineParameterInfoList *head,
+ CommandLineParameterInfoList *new)
+{
+ CommandLineParameterInfoList *cur;
+
+ cur = head;
+ while (cur->next) {
+ cur = cur->next;
+ }
+ cur->next = new;
+}
+
+/* access all the local QemuOptsLists for drive option */
+static CommandLineParameterInfoList *get_drive_infolist(void)
+{
+ CommandLineParameterInfoList *head = NULL, *cur;
+ int i;
+
+ for (i = 0; drive_config_groups[i] != NULL; i++) {
+ if (!head) {
+ head = query_option_descs(drive_config_groups[i]->desc);
+ } else {
+ cur = query_option_descs(drive_config_groups[i]->desc);
+ connect_infolist(head, cur);
+ }
+ }
+ cleanup_infolist(head);
+
+ return head;
+}
+
+/* restore machine options that are now machine's properties */
+static QemuOptsList machine_opts = {
+ .merge_lists = true,
+ .head = QTAILQ_HEAD_INITIALIZER(machine_opts.head),
+ .desc = {
+ {
+ .name = "type",
+ .type = QEMU_OPT_STRING,
+ .help = "emulated machine"
+ },{
+ .name = "accel",
+ .type = QEMU_OPT_STRING,
+ .help = "accelerator list",
+ },{
+ .name = "kernel_irqchip",
+ .type = QEMU_OPT_BOOL,
+ .help = "use KVM in-kernel irqchip",
+ },{
+ .name = "kvm_shadow_mem",
+ .type = QEMU_OPT_SIZE,
+ .help = "KVM shadow MMU size",
+ },{
+ .name = "kernel",
+ .type = QEMU_OPT_STRING,
+ .help = "Linux kernel image file",
+ },{
+ .name = "initrd",
+ .type = QEMU_OPT_STRING,
+ .help = "Linux initial ramdisk file",
+ },{
+ .name = "append",
+ .type = QEMU_OPT_STRING,
+ .help = "Linux kernel command line",
+ },{
+ .name = "dtb",
+ .type = QEMU_OPT_STRING,
+ .help = "Linux kernel device tree file",
+ },{
+ .name = "dumpdtb",
+ .type = QEMU_OPT_STRING,
+ .help = "Dump current dtb to a file and quit",
+ },{
+ .name = "phandle_start",
+ .type = QEMU_OPT_NUMBER,
+ .help = "The first phandle ID we may generate dynamically",
+ },{
+ .name = "dt_compatible",
+ .type = QEMU_OPT_STRING,
+ .help = "Overrides the \"compatible\" property of the dt root node",
+ },{
+ .name = "dump-guest-core",
+ .type = QEMU_OPT_BOOL,
+ .help = "Include guest memory in a core dump",
+ },{
+ .name = "mem-merge",
+ .type = QEMU_OPT_BOOL,
+ .help = "enable/disable memory merge support",
+ },{
+ .name = "usb",
+ .type = QEMU_OPT_BOOL,
+ .help = "Set on/off to enable/disable usb",
+ },{
+ .name = "firmware",
+ .type = QEMU_OPT_STRING,
+ .help = "firmware image",
+ },{
+ .name = "iommu",
+ .type = QEMU_OPT_BOOL,
+ .help = "Set on/off to enable/disable Intel IOMMU (VT-d)",
+ },{
+ .name = "suppress-vmdesc",
+ .type = QEMU_OPT_BOOL,
+ .help = "Set on to disable self-describing migration",
+ },{
+ .name = "aes-key-wrap",
+ .type = QEMU_OPT_BOOL,
+ .help = "enable/disable AES key wrapping using the CPACF wrapping key",
+ },{
+ .name = "dea-key-wrap",
+ .type = QEMU_OPT_BOOL,
+ .help = "enable/disable DEA key wrapping using the CPACF wrapping key",
+ },{
+ .name = "loadparm",
+ .type = QEMU_OPT_STRING,
+ .help = "Up to 8 chars in set of [A-Za-z0-9. ](lower case chars"
+ " converted to upper case) to pass to machine"
+ " loader, boot manager, and guest kernel",
+ },
+ { /* End of list */ }
+ }
+};
+
+CommandLineOptionInfoList *qmp_query_command_line_options(bool has_option,
+ const char *option,
+ Error **errp)
+{
+ CommandLineOptionInfoList *conf_list = NULL;
+ CommandLineOptionInfo *info;
+ int i;
+
+ for (i = 0; vm_config_groups[i] != NULL; i++) {
+ if (!has_option || !strcmp(option, vm_config_groups[i]->name)) {
+ info = g_malloc0(sizeof(*info));
+ info->option = g_strdup(vm_config_groups[i]->name);
+ if (!strcmp("drive", vm_config_groups[i]->name)) {
+ info->parameters = get_drive_infolist();
+ } else {
+ info->parameters =
+ query_option_descs(vm_config_groups[i]->desc);
+ }
+ QAPI_LIST_PREPEND(conf_list, info);
+ }
+ }
+
+ if (!has_option || !strcmp(option, "machine")) {
+ info = g_malloc0(sizeof(*info));
+ info->option = g_strdup("machine");
+ info->parameters = query_option_descs(machine_opts.desc);
+ QAPI_LIST_PREPEND(conf_list, info);
+ }
+
+ if (conf_list == NULL) {
+ error_setg(errp, "invalid option name: %s", option);
+ }
+
+ return conf_list;
+}
+
+QemuOptsList *qemu_find_opts_err(const char *group, Error **errp)
+{
+ return find_list(vm_config_groups, group, errp);
+}
+
+void qemu_add_drive_opts(QemuOptsList *list)
+{
+ int entries, i;
+
+ entries = ARRAY_SIZE(drive_config_groups);
+ entries--; /* keep list NULL terminated */
+ for (i = 0; i < entries; i++) {
+ if (drive_config_groups[i] == NULL) {
+ drive_config_groups[i] = list;
+ return;
+ }
+ }
+ fprintf(stderr, "ran out of space in drive_config_groups\n");
+ abort();
+}
+
+void qemu_add_opts(QemuOptsList *list)
+{
+ int entries, i;
+
+ entries = ARRAY_SIZE(vm_config_groups);
+ entries--; /* keep list NULL terminated */
+ for (i = 0; i < entries; i++) {
+ if (vm_config_groups[i] == NULL) {
+ vm_config_groups[i] = list;
+ return;
+ }
+ }
+ fprintf(stderr, "ran out of space in vm_config_groups\n");
+ abort();
+}
+
+struct ConfigWriteData {
+ QemuOptsList *list;
+ FILE *fp;
+};
+
+static int config_write_opt(void *opaque, const char *name, const char *value,
+ Error **errp)
+{
+ struct ConfigWriteData *data = opaque;
+
+ fprintf(data->fp, " %s = \"%s\"\n", name, value);
+ return 0;
+}
+
+static int config_write_opts(void *opaque, QemuOpts *opts, Error **errp)
+{
+ struct ConfigWriteData *data = opaque;
+ const char *id = qemu_opts_id(opts);
+
+ if (id) {
+ fprintf(data->fp, "[%s \"%s\"]\n", data->list->name, id);
+ } else {
+ fprintf(data->fp, "[%s]\n", data->list->name);
+ }
+ qemu_opt_foreach(opts, config_write_opt, data, NULL);
+ fprintf(data->fp, "\n");
+ return 0;
+}
+
+void qemu_config_write(FILE *fp)
+{
+ struct ConfigWriteData data = { .fp = fp };
+ QemuOptsList **lists = vm_config_groups;
+ int i;
+
+ fprintf(fp, "# qemu config file\n\n");
+ for (i = 0; lists[i] != NULL; i++) {
+ data.list = lists[i];
+ qemu_opts_foreach(data.list, config_write_opts, &data, NULL);
+ }
+}
+
+/* Returns number of config groups on success, -errno on error */
+static int qemu_config_foreach(FILE *fp, QEMUConfigCB *cb, void *opaque,
+ const char *fname, Error **errp)
+{
+ char line[1024], prev_group[64], group[64], arg[64], value[1024];
+ Location loc;
+ Error *local_err = NULL;
+ QDict *qdict = NULL;
+ int res = -EINVAL, lno = 0;
+ int count = 0;
+
+ loc_push_none(&loc);
+ while (fgets(line, sizeof(line), fp) != NULL) {
+ ++lno;
+ if (line[0] == '\n') {
+ /* skip empty lines */
+ continue;
+ }
+ if (line[0] == '#') {
+ /* comment */
+ continue;
+ }
+ if (line[0] == '[') {
+ QDict *prev = qdict;
+ if (sscanf(line, "[%63s \"%63[^\"]\"]", group, value) == 2) {
+ qdict = qdict_new();
+ qdict_put_str(qdict, "id", value);
+ count++;
+ } else if (sscanf(line, "[%63[^]]]", group) == 1) {
+ qdict = qdict_new();
+ count++;
+ }
+ if (qdict != prev) {
+ if (prev) {
+ cb(prev_group, prev, opaque, &local_err);
+ qobject_unref(prev);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ goto out;
+ }
+ }
+ strcpy(prev_group, group);
+ continue;
+ }
+ }
+ loc_set_file(fname, lno);
+ value[0] = '\0';
+ if (sscanf(line, " %63s = \"%1023[^\"]\"", arg, value) == 2 ||
+ sscanf(line, " %63s = \"\"", arg) == 1) {
+ /* arg = value */
+ if (qdict == NULL) {
+ error_setg(errp, "no group defined");
+ goto out;
+ }
+ qdict_put_str(qdict, arg, value);
+ continue;
+ }
+ error_setg(errp, "parse error");
+ goto out;
+ }
+ if (ferror(fp)) {
+ loc_pop(&loc);
+ error_setg_errno(errp, errno, "Cannot read config file");
+ goto out_no_loc;
+ }
+ res = count;
+ if (qdict) {
+ cb(group, qdict, opaque, errp);
+ }
+out:
+ loc_pop(&loc);
+out_no_loc:
+ qobject_unref(qdict);
+ return res;
+}
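+
+/*
+ * The accepted file format, for reference (hypothetical groups and values):
+ *
+ *   # comment
+ *   [chardev "monitor"]
+ *     backend = "socket"
+ *     path = "/tmp/mon.sock"
+ *
+ *   [machine]
+ *     accel = "kvm"
+ *
+ * A '[group "id"]' or '[group]' line opens a section; 'key = "value"' lines
+ * fill it; each completed section is handed to the callback as a QDict,
+ * with the optional id stored under the "id" key.
+ */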
+
+void qemu_config_do_parse(const char *group, QDict *qdict, void *opaque, Error **errp)
+{
+ QemuOptsList **lists = opaque;
+ QemuOptsList *list;
+
+ list = find_list(lists, group, errp);
+ if (!list) {
+ return;
+ }
+
+ qemu_opts_from_qdict(list, qdict, errp);
+}
+
+int qemu_config_parse(FILE *fp, QemuOptsList **lists, const char *fname, Error **errp)
+{
+ return qemu_config_foreach(fp, qemu_config_do_parse, lists, fname, errp);
+}
+
+int qemu_read_config_file(const char *filename, QEMUConfigCB *cb, Error **errp)
+{
+ FILE *f = fopen(filename, "r");
+ int ret;
+
+ if (f == NULL) {
+ error_setg_file_open(errp, errno, filename);
+ return -errno;
+ }
+
+ ret = qemu_config_foreach(f, cb, vm_config_groups, filename, errp);
+ fclose(f);
+ return ret;
+}
+
+static void config_parse_qdict_section(QDict *options, QemuOptsList *opts,
+ Error **errp)
+{
+ QemuOpts *subopts;
+ QDict *subqdict;
+ QList *list = NULL;
+ size_t orig_size, enum_size;
+ char *prefix;
+
+ prefix = g_strdup_printf("%s.", opts->name);
+ qdict_extract_subqdict(options, &subqdict, prefix);
+ g_free(prefix);
+ orig_size = qdict_size(subqdict);
+ if (!orig_size) {
+ goto out;
+ }
+
+ subopts = qemu_opts_create(opts, NULL, 0, errp);
+ if (!subopts) {
+ goto out;
+ }
+
+ if (!qemu_opts_absorb_qdict(subopts, subqdict, errp)) {
+ goto out;
+ }
+
+ enum_size = qdict_size(subqdict);
+ if (enum_size < orig_size && enum_size) {
+ error_setg(errp, "Unknown option '%s' for [%s]",
+ qdict_first(subqdict)->key, opts->name);
+ goto out;
+ }
+
+ if (enum_size) {
+ /* Multiple, enumerated sections */
+ QListEntry *list_entry;
+ unsigned i = 0;
+
+ /* Not required anymore */
+ qemu_opts_del(subopts);
+
+ qdict_array_split(subqdict, &list);
+ if (qdict_size(subqdict)) {
+ error_setg(errp, "Unused option '%s' for [%s]",
+ qdict_first(subqdict)->key, opts->name);
+ goto out;
+ }
+
+ QLIST_FOREACH_ENTRY(list, list_entry) {
+ QDict *section = qobject_to(QDict, qlist_entry_obj(list_entry));
+ char *opt_name;
+
+ if (!section) {
+ error_setg(errp, "[%s] section (index %u) does not consist of "
+ "keys", opts->name, i);
+ goto out;
+ }
+
+ opt_name = g_strdup_printf("%s.%u", opts->name, i++);
+ subopts = qemu_opts_create(opts, opt_name, 1, errp);
+ g_free(opt_name);
+ if (!subopts) {
+ goto out;
+ }
+
+ if (!qemu_opts_absorb_qdict(subopts, section, errp)) {
+ qemu_opts_del(subopts);
+ goto out;
+ }
+
+ if (qdict_size(section)) {
+ error_setg(errp, "[%s] section doesn't support the option '%s'",
+ opts->name, qdict_first(section)->key);
+ qemu_opts_del(subopts);
+ goto out;
+ }
+ }
+ }
+
+out:
+ qobject_unref(subqdict);
+ qobject_unref(list);
+}
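+
+/*
+ * Example of the two shapes handled above (hypothetical keys), for a
+ * QemuOptsList named "foo":
+ *
+ *   foo.bar=x                  -> one [foo] section with bar=x
+ *   foo.0.bar=x, foo.1.bar=y   -> two enumerated [foo] sections
+ *
+ * Mixing both shapes for the same group is rejected as an unknown option.
+ */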
+
+void qemu_config_parse_qdict(QDict *options, QemuOptsList **lists,
+ Error **errp)
+{
+ int i;
+ Error *local_err = NULL;
+
+ for (i = 0; lists[i]; i++) {
+ config_parse_qdict_section(options, lists[i], &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return;
+ }
+ }
+}
diff --git a/util/qemu-coroutine-io.c b/util/qemu-coroutine-io.c
new file mode 100644
index 000000000..5b80bb416
--- /dev/null
+++ b/util/qemu-coroutine-io.c
@@ -0,0 +1,93 @@
+/*
+ * Coroutine-aware I/O functions
+ *
+ * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
+ * Copyright (c) 2011, Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/sockets.h"
+#include "qemu/coroutine.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+
+ssize_t coroutine_fn
+qemu_co_sendv_recvv(int sockfd, struct iovec *iov, unsigned iov_cnt,
+ size_t offset, size_t bytes, bool do_send)
+{
+ size_t done = 0;
+ ssize_t ret;
+ while (done < bytes) {
+ ret = iov_send_recv(sockfd, iov, iov_cnt,
+ offset + done, bytes - done, do_send);
+ if (ret > 0) {
+ done += ret;
+ } else if (ret < 0) {
+ if (errno == EAGAIN || errno == EWOULDBLOCK) {
+ qemu_coroutine_yield();
+ } else if (done == 0) {
+ return -errno;
+ } else {
+ break;
+ }
+ } else if (ret == 0 && !do_send) {
+ /* send should never return 0; recv returns 0 at end-of-file.
+ * Retrying a recv that returned 0 is pointless, so stop here;
+ * a send that returned 0 is retried anyway, just in case. */
+ break;
+ }
+ }
+ return done;
+}
+
+ssize_t coroutine_fn
+qemu_co_send_recv(int sockfd, void *buf, size_t bytes, bool do_send)
+{
+ struct iovec iov = { .iov_base = buf, .iov_len = bytes };
+ return qemu_co_sendv_recvv(sockfd, &iov, 1, 0, bytes, do_send);
+}
+
+typedef struct {
+ AioContext *ctx;
+ Coroutine *co;
+ int fd;
+} FDYieldUntilData;
+
+static void fd_coroutine_enter(void *opaque)
+{
+ FDYieldUntilData *data = opaque;
+ aio_set_fd_handler(data->ctx, data->fd, false, NULL, NULL, NULL, NULL);
+ qemu_coroutine_enter(data->co);
+}
+
+void coroutine_fn yield_until_fd_readable(int fd)
+{
+ FDYieldUntilData data;
+
+ assert(qemu_in_coroutine());
+ data.ctx = qemu_get_current_aio_context();
+ data.co = qemu_coroutine_self();
+ data.fd = fd;
+ aio_set_fd_handler(
+ data.ctx, fd, false, fd_coroutine_enter, NULL, NULL, &data);
+ qemu_coroutine_yield();
+}
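+
+/*
+ * Usage sketch, from coroutine context (hypothetical non-blocking fd):
+ *
+ *   ssize_t n;
+ *   do {
+ *       n = read(fd, buf, sizeof(buf));
+ *       if (n < 0 && errno == EAGAIN) {
+ *           yield_until_fd_readable(fd);  // resumed by the fd handler
+ *       }
+ *   } while (n < 0 && (errno == EAGAIN || errno == EINTR));
+ */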
diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c
new file mode 100644
index 000000000..266940383
--- /dev/null
+++ b/util/qemu-coroutine-lock.c
@@ -0,0 +1,467 @@
+/*
+ * coroutine queues and locks
+ *
+ * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * The lock-free mutex implementation is based on OSv
+ * (core/lfmutex.cc, include/lockfree/mutex.hh).
+ * Copyright (C) 2013 Cloudius Systems, Ltd.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine.h"
+#include "qemu/coroutine_int.h"
+#include "qemu/processor.h"
+#include "qemu/queue.h"
+#include "block/aio.h"
+#include "trace.h"
+
+void qemu_co_queue_init(CoQueue *queue)
+{
+ QSIMPLEQ_INIT(&queue->entries);
+}
+
+void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock)
+{
+ Coroutine *self = qemu_coroutine_self();
+ QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next);
+
+ if (lock) {
+ qemu_lockable_unlock(lock);
+ }
+
+ /* There is no race condition here. Other threads will call
+ * aio_co_schedule on our AioContext, which can reenter this
+ * coroutine but only after this yield and after the main loop
+ * has gone through the next iteration.
+ */
+ qemu_coroutine_yield();
+ assert(qemu_in_coroutine());
+
+ /* TODO: OSv implements wait morphing here, where the wakeup
+ * primitive automatically places the woken coroutine on the
+ * mutex's queue. This avoids the thundering herd effect.
+ * This could be implemented for CoMutexes, but not really for
+ * other cases of QemuLockable.
+ */
+ if (lock) {
+ qemu_lockable_lock(lock);
+ }
+}
+
+static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)
+{
+ Coroutine *next;
+
+ if (QSIMPLEQ_EMPTY(&queue->entries)) {
+ return false;
+ }
+
+ while ((next = QSIMPLEQ_FIRST(&queue->entries)) != NULL) {
+ QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
+ aio_co_wake(next);
+ if (single) {
+ break;
+ }
+ }
+ return true;
+}
+
+bool qemu_co_queue_next(CoQueue *queue)
+{
+ return qemu_co_queue_do_restart(queue, true);
+}
+
+void qemu_co_queue_restart_all(CoQueue *queue)
+{
+ qemu_co_queue_do_restart(queue, false);
+}
+
+bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable *lock)
+{
+ Coroutine *next;
+
+ next = QSIMPLEQ_FIRST(&queue->entries);
+ if (!next) {
+ return false;
+ }
+
+ QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
+ if (lock) {
+ qemu_lockable_unlock(lock);
+ }
+ aio_co_wake(next);
+ if (lock) {
+ qemu_lockable_lock(lock);
+ }
+ return true;
+}
+
+bool qemu_co_queue_empty(CoQueue *queue)
+{
+ return QSIMPLEQ_FIRST(&queue->entries) == NULL;
+}
+
+/* The wait records are handled with a multiple-producer, single-consumer
+ * lock-free queue. There cannot be two concurrent pop_waiter() calls
+ * because pop_waiter() can only be called while mutex->handoff is zero.
+ * This can happen in three cases:
+ * - in qemu_co_mutex_unlock, before the hand-off protocol has started.
+ * In this case, qemu_co_mutex_lock will see mutex->handoff == 0 and
+ * not take part in the handoff.
+ * - in qemu_co_mutex_lock, if it steals the hand-off responsibility from
+ * qemu_co_mutex_unlock. In this case, qemu_co_mutex_unlock will fail
+ * the cmpxchg (it will see either 0 or the next sequence value) and
+ * exit. The next hand-off cannot begin until qemu_co_mutex_lock has
+ * woken up someone.
+ * - in qemu_co_mutex_unlock, if it takes the hand-off token itself.
+ * In this case another iteration starts with mutex->handoff == 0;
+ * a concurrent qemu_co_mutex_lock will fail the cmpxchg, and
+ * qemu_co_mutex_unlock will go back to case (1).
+ *
+ * The following functions manage this queue.
+ */
+typedef struct CoWaitRecord {
+ Coroutine *co;
+ QSLIST_ENTRY(CoWaitRecord) next;
+} CoWaitRecord;
+
+static void push_waiter(CoMutex *mutex, CoWaitRecord *w)
+{
+ w->co = qemu_coroutine_self();
+ QSLIST_INSERT_HEAD_ATOMIC(&mutex->from_push, w, next);
+}
+
+static void move_waiters(CoMutex *mutex)
+{
+ QSLIST_HEAD(, CoWaitRecord) reversed;
+ QSLIST_MOVE_ATOMIC(&reversed, &mutex->from_push);
+ while (!QSLIST_EMPTY(&reversed)) {
+ CoWaitRecord *w = QSLIST_FIRST(&reversed);
+ QSLIST_REMOVE_HEAD(&reversed, next);
+ QSLIST_INSERT_HEAD(&mutex->to_pop, w, next);
+ }
+}
+
+static CoWaitRecord *pop_waiter(CoMutex *mutex)
+{
+ CoWaitRecord *w;
+
+ if (QSLIST_EMPTY(&mutex->to_pop)) {
+ move_waiters(mutex);
+ if (QSLIST_EMPTY(&mutex->to_pop)) {
+ return NULL;
+ }
+ }
+ w = QSLIST_FIRST(&mutex->to_pop);
+ QSLIST_REMOVE_HEAD(&mutex->to_pop, next);
+ return w;
+}
+
+static bool has_waiters(CoMutex *mutex)
+{
+ return !QSLIST_EMPTY(&mutex->to_pop) || !QSLIST_EMPTY(&mutex->from_push);
+}
+
+void qemu_co_mutex_init(CoMutex *mutex)
+{
+ memset(mutex, 0, sizeof(*mutex));
+}
+
+static void coroutine_fn qemu_co_mutex_wake(CoMutex *mutex, Coroutine *co)
+{
+ /* Read co before co->ctx; pairs with smp_wmb() in
+ * qemu_coroutine_enter().
+ */
+ smp_read_barrier_depends();
+ mutex->ctx = co->ctx;
+ aio_co_wake(co);
+}
+
+static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx,
+ CoMutex *mutex)
+{
+ Coroutine *self = qemu_coroutine_self();
+ CoWaitRecord w;
+ unsigned old_handoff;
+
+ trace_qemu_co_mutex_lock_entry(mutex, self);
+ push_waiter(mutex, &w);
+
+ /* This is the "Responsibility Hand-Off" protocol: a lock() can take
+ * over from a concurrent unlock() the responsibility of waking
+ * somebody up.
+ */
+ old_handoff = qatomic_mb_read(&mutex->handoff);
+ if (old_handoff &&
+ has_waiters(mutex) &&
+ qatomic_cmpxchg(&mutex->handoff, old_handoff, 0) == old_handoff) {
+ /* There can be no concurrent pops, because there can be only
+ * one active handoff at a time.
+ */
+ CoWaitRecord *to_wake = pop_waiter(mutex);
+ Coroutine *co = to_wake->co;
+ if (co == self) {
+ /* We got the lock ourselves! */
+ assert(to_wake == &w);
+ mutex->ctx = ctx;
+ return;
+ }
+
+ qemu_co_mutex_wake(mutex, co);
+ }
+
+ qemu_coroutine_yield();
+ trace_qemu_co_mutex_lock_return(mutex, self);
+}
+
+void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex)
+{
+ AioContext *ctx = qemu_get_current_aio_context();
+ Coroutine *self = qemu_coroutine_self();
+ int waiters, i;
+
+ /* Running a very small critical section on pthread_mutex_t and CoMutex
+ * shows that pthread_mutex_t is much faster because it doesn't actually
+ * go to sleep. What happens is that the critical section is shorter
+ * than the latency of entering the kernel and thus FUTEX_WAIT always
+ * fails. With CoMutex there is no such latency but you still want to
+ * avoid wait and wakeup. So introduce it artificially.
+ */
+ i = 0;
+retry_fast_path:
+ waiters = qatomic_cmpxchg(&mutex->locked, 0, 1);
+ if (waiters != 0) {
+ while (waiters == 1 && ++i < 1000) {
+ if (qatomic_read(&mutex->ctx) == ctx) {
+ break;
+ }
+ if (qatomic_read(&mutex->locked) == 0) {
+ goto retry_fast_path;
+ }
+ cpu_relax();
+ }
+ waiters = qatomic_fetch_inc(&mutex->locked);
+ }
+
+ if (waiters == 0) {
+ /* Uncontended. */
+ trace_qemu_co_mutex_lock_uncontended(mutex, self);
+ mutex->ctx = ctx;
+ } else {
+ qemu_co_mutex_lock_slowpath(ctx, mutex);
+ }
+ mutex->holder = self;
+ self->locks_held++;
+}
+
+void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex)
+{
+ Coroutine *self = qemu_coroutine_self();
+
+ trace_qemu_co_mutex_unlock_entry(mutex, self);
+
+ assert(mutex->locked);
+ assert(mutex->holder == self);
+ assert(qemu_in_coroutine());
+
+ mutex->ctx = NULL;
+ mutex->holder = NULL;
+ self->locks_held--;
+ if (qatomic_fetch_dec(&mutex->locked) == 1) {
+ /* No waiting qemu_co_mutex_lock(). Pfew, that was easy! */
+ return;
+ }
+
+ for (;;) {
+ CoWaitRecord *to_wake = pop_waiter(mutex);
+ unsigned our_handoff;
+
+ if (to_wake) {
+ qemu_co_mutex_wake(mutex, to_wake->co);
+ break;
+ }
+
+ /* Some concurrent lock() is in progress (we know this because
+ * mutex->locked was >1) but it hasn't yet put itself on the wait
+ * queue. Pick a sequence number for the handoff protocol (not 0).
+ */
+ if (++mutex->sequence == 0) {
+ mutex->sequence = 1;
+ }
+
+ our_handoff = mutex->sequence;
+ qatomic_mb_set(&mutex->handoff, our_handoff);
+ if (!has_waiters(mutex)) {
+ /* The concurrent lock has not added itself yet, so it
+ * will be able to pick our handoff.
+ */
+ break;
+ }
+
+ /* Try to do the handoff protocol ourselves; if somebody else has
+ * already taken it, however, we're done and they're responsible.
+ */
+ if (qatomic_cmpxchg(&mutex->handoff, our_handoff, 0) != our_handoff) {
+ break;
+ }
+ }
+
+ trace_qemu_co_mutex_unlock_return(mutex, self);
+}
+
+struct CoRwTicket {
+ bool read;
+ Coroutine *co;
+ QSIMPLEQ_ENTRY(CoRwTicket) next;
+};
+
+void qemu_co_rwlock_init(CoRwlock *lock)
+{
+ qemu_co_mutex_init(&lock->mutex);
+ lock->owners = 0;
+ QSIMPLEQ_INIT(&lock->tickets);
+}
+
+/* Releases the internal CoMutex. */
+static void qemu_co_rwlock_maybe_wake_one(CoRwlock *lock)
+{
+ CoRwTicket *tkt = QSIMPLEQ_FIRST(&lock->tickets);
+ Coroutine *co = NULL;
+
+ /*
+ * Setting lock->owners here prevents rdlock and wrlock from
+ * sneaking in between unlock and wake.
+ */
+
+ if (tkt) {
+ if (tkt->read) {
+ if (lock->owners >= 0) {
+ lock->owners++;
+ co = tkt->co;
+ }
+ } else {
+ if (lock->owners == 0) {
+ lock->owners = -1;
+ co = tkt->co;
+ }
+ }
+ }
+
+ if (co) {
+ QSIMPLEQ_REMOVE_HEAD(&lock->tickets, next);
+ qemu_co_mutex_unlock(&lock->mutex);
+ aio_co_wake(co);
+ } else {
+ qemu_co_mutex_unlock(&lock->mutex);
+ }
+}
+
+void qemu_co_rwlock_rdlock(CoRwlock *lock)
+{
+ Coroutine *self = qemu_coroutine_self();
+
+ qemu_co_mutex_lock(&lock->mutex);
+ /* For fairness, wait if a writer is in line. */
+ if (lock->owners == 0 || (lock->owners > 0 && QSIMPLEQ_EMPTY(&lock->tickets))) {
+ lock->owners++;
+ qemu_co_mutex_unlock(&lock->mutex);
+ } else {
+ CoRwTicket my_ticket = { true, self };
+
+ QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
+ qemu_co_mutex_unlock(&lock->mutex);
+ qemu_coroutine_yield();
+ assert(lock->owners >= 1);
+
+ /* Possibly wake another reader, which will wake the next in line. */
+ qemu_co_mutex_lock(&lock->mutex);
+ qemu_co_rwlock_maybe_wake_one(lock);
+ }
+
+ self->locks_held++;
+}
+
+void qemu_co_rwlock_unlock(CoRwlock *lock)
+{
+ Coroutine *self = qemu_coroutine_self();
+
+ assert(qemu_in_coroutine());
+ self->locks_held--;
+
+ qemu_co_mutex_lock(&lock->mutex);
+ if (lock->owners > 0) {
+ lock->owners--;
+ } else {
+ assert(lock->owners == -1);
+ lock->owners = 0;
+ }
+
+ qemu_co_rwlock_maybe_wake_one(lock);
+}
+
+void qemu_co_rwlock_downgrade(CoRwlock *lock)
+{
+ qemu_co_mutex_lock(&lock->mutex);
+ assert(lock->owners == -1);
+ lock->owners = 1;
+
+ /* Possibly wake another reader, which will wake the next in line. */
+ qemu_co_rwlock_maybe_wake_one(lock);
+}
+
+void qemu_co_rwlock_wrlock(CoRwlock *lock)
+{
+ Coroutine *self = qemu_coroutine_self();
+
+ qemu_co_mutex_lock(&lock->mutex);
+ if (lock->owners == 0) {
+ lock->owners = -1;
+ qemu_co_mutex_unlock(&lock->mutex);
+ } else {
+ CoRwTicket my_ticket = { false, qemu_coroutine_self() };
+
+ QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
+ qemu_co_mutex_unlock(&lock->mutex);
+ qemu_coroutine_yield();
+ assert(lock->owners == -1);
+ }
+
+ self->locks_held++;
+}
+
+void qemu_co_rwlock_upgrade(CoRwlock *lock)
+{
+ qemu_co_mutex_lock(&lock->mutex);
+ assert(lock->owners > 0);
+ /* For fairness, wait if a writer is in line. */
+ if (lock->owners == 1 && QSIMPLEQ_EMPTY(&lock->tickets)) {
+ lock->owners = -1;
+ qemu_co_mutex_unlock(&lock->mutex);
+ } else {
+ CoRwTicket my_ticket = { false, qemu_coroutine_self() };
+
+ lock->owners--;
+ QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next);
+ qemu_co_rwlock_maybe_wake_one(lock);
+ qemu_coroutine_yield();
+ assert(lock->owners == -1);
+ }
+}
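+
+/*
+ * Usage sketch, from coroutine context:
+ *
+ *   CoRwlock lock;
+ *   qemu_co_rwlock_init(&lock);
+ *
+ *   qemu_co_rwlock_rdlock(&lock);     // shared: owners > 0
+ *   // ...read...
+ *   qemu_co_rwlock_unlock(&lock);
+ *
+ *   qemu_co_rwlock_wrlock(&lock);     // exclusive: owners == -1
+ *   // ...write...
+ *   qemu_co_rwlock_downgrade(&lock);  // keep it as a read lock
+ *   qemu_co_rwlock_unlock(&lock);
+ */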
diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c
new file mode 100644
index 000000000..571ab521f
--- /dev/null
+++ b/util/qemu-coroutine-sleep.c
@@ -0,0 +1,80 @@
+/*
+ * QEMU coroutine sleep
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/coroutine.h"
+#include "qemu/coroutine_int.h"
+#include "qemu/timer.h"
+#include "block/aio.h"
+
+static const char *qemu_co_sleep_ns__scheduled = "qemu_co_sleep_ns";
+
+void qemu_co_sleep_wake(QemuCoSleep *w)
+{
+ Coroutine *co;
+
+ co = w->to_wake;
+ w->to_wake = NULL;
+ if (co) {
+ /* Write of schedule protected by barrier write in aio_co_schedule */
+ const char *scheduled = qatomic_cmpxchg(&co->scheduled,
+ qemu_co_sleep_ns__scheduled, NULL);
+
+ assert(scheduled == qemu_co_sleep_ns__scheduled);
+ aio_co_wake(co);
+ }
+}
+
+static void co_sleep_cb(void *opaque)
+{
+ QemuCoSleep *w = opaque;
+ qemu_co_sleep_wake(w);
+}
+
+void coroutine_fn qemu_co_sleep(QemuCoSleep *w)
+{
+ Coroutine *co = qemu_coroutine_self();
+
+ const char *scheduled = qatomic_cmpxchg(&co->scheduled, NULL,
+ qemu_co_sleep_ns__scheduled);
+ if (scheduled) {
+ fprintf(stderr,
+ "%s: Co-routine was already scheduled in '%s'\n",
+ __func__, scheduled);
+ abort();
+ }
+
+ w->to_wake = co;
+ qemu_coroutine_yield();
+
+ /* w->to_wake is cleared before resuming this coroutine. */
+ assert(w->to_wake == NULL);
+}
+
+void coroutine_fn qemu_co_sleep_ns_wakeable(QemuCoSleep *w,
+ QEMUClockType type, int64_t ns)
+{
+ AioContext *ctx = qemu_get_current_aio_context();
+ QEMUTimer ts;
+
+ aio_timer_init(ctx, &ts, type, SCALE_NS, co_sleep_cb, w);
+ timer_mod(&ts, qemu_clock_get_ns(type) + ns);
+
+ /*
+ * The timer will fire in the current AioContext, so the callback
+ * must happen after qemu_co_sleep yields and there is no race
+ * between timer_mod and qemu_co_sleep.
+ */
+ qemu_co_sleep(w);
+ timer_del(&ts);
+}
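+
+/*
+ * Usage sketch: an interruptible one-second delay (hypothetical wait state):
+ *
+ *   QemuCoSleep w = { .to_wake = NULL };
+ *
+ *   // In the sleeping coroutine:
+ *   qemu_co_sleep_ns_wakeable(&w, QEMU_CLOCK_REALTIME, 1000000000);
+ *
+ *   // From elsewhere, to cut the sleep short:
+ *   qemu_co_sleep_wake(&w);
+ */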
diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c
new file mode 100644
index 000000000..38fb6d308
--- /dev/null
+++ b/util/qemu-coroutine.c
@@ -0,0 +1,204 @@
+/*
+ * QEMU coroutines
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Kevin Wolf <kwolf@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "trace.h"
+#include "qemu/thread.h"
+#include "qemu/atomic.h"
+#include "qemu/coroutine.h"
+#include "qemu/coroutine_int.h"
+#include "block/aio.h"
+
+enum {
+ POOL_BATCH_SIZE = 64,
+};
+
+/** Free list to speed up creation */
+static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool);
+static unsigned int release_pool_size;
+static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool);
+static __thread unsigned int alloc_pool_size;
+static __thread Notifier coroutine_pool_cleanup_notifier;
+
+static void coroutine_pool_cleanup(Notifier *n, void *value)
+{
+ Coroutine *co;
+ Coroutine *tmp;
+
+ QSLIST_FOREACH_SAFE(co, &alloc_pool, pool_next, tmp) {
+ QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
+ qemu_coroutine_delete(co);
+ }
+}
+
+Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque)
+{
+ Coroutine *co = NULL;
+
+ if (CONFIG_COROUTINE_POOL) {
+ co = QSLIST_FIRST(&alloc_pool);
+ if (!co) {
+ if (release_pool_size > POOL_BATCH_SIZE) {
+ /* Slow path; a good place to register the destructor, too. */
+ if (!coroutine_pool_cleanup_notifier.notify) {
+ coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup;
+ qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier);
+ }
+
+ /* This is not exact; there could be a little skew between
+ * release_pool_size and the actual size of release_pool. But
+ * it is just a heuristic, it does not need to be perfect.
+ */
+ alloc_pool_size = qatomic_xchg(&release_pool_size, 0);
+ QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool);
+ co = QSLIST_FIRST(&alloc_pool);
+ }
+ }
+ if (co) {
+ QSLIST_REMOVE_HEAD(&alloc_pool, pool_next);
+ alloc_pool_size--;
+ }
+ }
+
+ if (!co) {
+ co = qemu_coroutine_new();
+ }
+
+ co->entry = entry;
+ co->entry_arg = opaque;
+ QSIMPLEQ_INIT(&co->co_queue_wakeup);
+ return co;
+}
+
+static void coroutine_delete(Coroutine *co)
+{
+ co->caller = NULL;
+
+ if (CONFIG_COROUTINE_POOL) {
+ if (release_pool_size < POOL_BATCH_SIZE * 2) {
+ QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next);
+ qatomic_inc(&release_pool_size);
+ return;
+ }
+ if (alloc_pool_size < POOL_BATCH_SIZE) {
+ QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next);
+ alloc_pool_size++;
+ return;
+ }
+ }
+
+ qemu_coroutine_delete(co);
+}
+
+void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co)
+{
+ QSIMPLEQ_HEAD(, Coroutine) pending = QSIMPLEQ_HEAD_INITIALIZER(pending);
+ Coroutine *from = qemu_coroutine_self();
+
+ QSIMPLEQ_INSERT_TAIL(&pending, co, co_queue_next);
+
+ /* Run co and any queued coroutines */
+ while (!QSIMPLEQ_EMPTY(&pending)) {
+ Coroutine *to = QSIMPLEQ_FIRST(&pending);
+ CoroutineAction ret;
+
+ /* Cannot rely on the read barrier for to in aio_co_wake(), as there are
+ * callers outside of aio_co_wake() */
+ const char *scheduled = qatomic_mb_read(&to->scheduled);
+
+ QSIMPLEQ_REMOVE_HEAD(&pending, co_queue_next);
+
+ trace_qemu_aio_coroutine_enter(ctx, from, to, to->entry_arg);
+
+ /* if the Coroutine has already been scheduled, entering it again will
+ * cause us to enter it twice, potentially even after the coroutine has
+ * been deleted */
+ if (scheduled) {
+ fprintf(stderr,
+ "%s: Co-routine was already scheduled in '%s'\n",
+ __func__, scheduled);
+ abort();
+ }
+
+ if (to->caller) {
+ fprintf(stderr, "Co-routine re-entered recursively\n");
+ abort();
+ }
+
+ to->caller = from;
+ to->ctx = ctx;
+
+ /* Store to->ctx before anything that stores to. Matches
+ * barrier in aio_co_wake and qemu_co_mutex_wake.
+ */
+ smp_wmb();
+
+ ret = qemu_coroutine_switch(from, to, COROUTINE_ENTER);
+
+ /* Queued coroutines are run depth-first; previously pending coroutines
+ * run after those queued more recently.
+ */
+ QSIMPLEQ_PREPEND(&pending, &to->co_queue_wakeup);
+
+ switch (ret) {
+ case COROUTINE_YIELD:
+ break;
+ case COROUTINE_TERMINATE:
+ assert(!to->locks_held);
+ trace_qemu_coroutine_terminate(to);
+ coroutine_delete(to);
+ break;
+ default:
+ abort();
+ }
+ }
+}
+
+void qemu_coroutine_enter(Coroutine *co)
+{
+ qemu_aio_coroutine_enter(qemu_get_current_aio_context(), co);
+}
+
+void qemu_coroutine_enter_if_inactive(Coroutine *co)
+{
+ if (!qemu_coroutine_entered(co)) {
+ qemu_coroutine_enter(co);
+ }
+}
+
+void coroutine_fn qemu_coroutine_yield(void)
+{
+ Coroutine *self = qemu_coroutine_self();
+ Coroutine *to = self->caller;
+
+ trace_qemu_coroutine_yield(self, to);
+
+ if (!to) {
+ fprintf(stderr, "Co-routine is yielding to no one\n");
+ abort();
+ }
+
+ self->caller = NULL;
+ qemu_coroutine_switch(self, to, COROUTINE_YIELD);
+}
+
+bool qemu_coroutine_entered(Coroutine *co)
+{
+ return co->caller;
+}
+
+AioContext *coroutine_fn qemu_coroutine_get_aio_context(Coroutine *co)
+{
+ return co->ctx;
+}
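+
+/*
+ * Usage sketch (hypothetical entry point):
+ *
+ *   static void coroutine_fn my_co(void *opaque)
+ *   {
+ *       // ...
+ *       qemu_coroutine_yield();  // back to the caller/main loop
+ *       // ...resumed by a later enter/wake...
+ *   }
+ *
+ *   Coroutine *co = qemu_coroutine_create(my_co, opaque);
+ *   qemu_coroutine_enter(co);    // runs my_co until it yields or returns
+ */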
diff --git a/util/qemu-error.c b/util/qemu-error.c
new file mode 100644
index 000000000..52a9e013c
--- /dev/null
+++ b/util/qemu-error.c
@@ -0,0 +1,413 @@
+/*
+ * Error reporting
+ *
+ * Copyright (C) 2010 Red Hat Inc.
+ *
+ * Authors:
+ * Markus Armbruster <armbru@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "monitor/monitor.h"
+#include "qemu/error-report.h"
+
+/*
+ * @report_type is the type of message: error, warning or
+ * informational.
+ */
+typedef enum {
+ REPORT_TYPE_ERROR,
+ REPORT_TYPE_WARNING,
+ REPORT_TYPE_INFO,
+} report_type;
+
+/* Prepend timestamp to messages */
+bool message_with_timestamp;
+bool error_with_guestname;
+const char *error_guest_name;
+
+int error_printf(const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = error_vprintf(fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+int error_printf_unless_qmp(const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = error_vprintf_unless_qmp(fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+static Location std_loc = {
+ .kind = LOC_NONE
+};
+static Location *cur_loc = &std_loc;
+
+/*
+ * Push location saved in LOC onto the location stack, return it.
+ * The top of that stack is the current location.
+ * Needs a matching loc_pop().
+ */
+Location *loc_push_restore(Location *loc)
+{
+ assert(!loc->prev);
+ loc->prev = cur_loc;
+ cur_loc = loc;
+ return loc;
+}
+
+/*
+ * Initialize *LOC to "nowhere", push it onto the location stack.
+ * The top of that stack is the current location.
+ * Needs a matching loc_pop().
+ * Return LOC.
+ */
+Location *loc_push_none(Location *loc)
+{
+ loc->kind = LOC_NONE;
+ loc->prev = NULL;
+ return loc_push_restore(loc);
+}
+
+/*
+ * Pop the location stack.
+ * LOC must be the current location, i.e. the top of the stack.
+ */
+Location *loc_pop(Location *loc)
+{
+ assert(cur_loc == loc && loc->prev);
+ cur_loc = loc->prev;
+ loc->prev = NULL;
+ return loc;
+}
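+
+/*
+ * Typical pairing (hypothetical caller):
+ *
+ *   Location loc;
+ *
+ *   loc_push_none(&loc);
+ *   loc_set_file("vm.cfg", 12);  // reports now say "vm.cfg:12: ..."
+ *   // ...parse...
+ *   loc_pop(&loc);               // restore the previous location
+ */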
+
+/*
+ * Save the current location in LOC, return LOC.
+ */
+Location *loc_save(Location *loc)
+{
+ *loc = *cur_loc;
+ loc->prev = NULL;
+ return loc;
+}
+
+/*
+ * Change the current location to the one saved in LOC.
+ */
+void loc_restore(Location *loc)
+{
+ Location *prev = cur_loc->prev;
+ assert(!loc->prev);
+ *cur_loc = *loc;
+ cur_loc->prev = prev;
+}
+
+/*
+ * Change the current location to "nowhere in particular".
+ */
+void loc_set_none(void)
+{
+ cur_loc->kind = LOC_NONE;
+}
+
+/*
+ * Change the current location to argument ARGV[IDX..IDX+CNT-1].
+ */
+void loc_set_cmdline(char **argv, int idx, int cnt)
+{
+ cur_loc->kind = LOC_CMDLINE;
+ cur_loc->num = cnt;
+ cur_loc->ptr = argv + idx;
+}
+
+/*
+ * Change the current location to file FNAME, line LNO.
+ */
+void loc_set_file(const char *fname, int lno)
+{
+ assert(fname || cur_loc->kind == LOC_FILE);
+ cur_loc->kind = LOC_FILE;
+ cur_loc->num = lno;
+ if (fname) {
+ cur_loc->ptr = fname;
+ }
+}
+
+static const char *progname;
+
+/*
+ * Set the program name for error_print_loc().
+ */
+static void error_set_progname(const char *argv0)
+{
+ const char *p = strrchr(argv0, '/');
+ progname = p ? p + 1 : argv0;
+}
+
+const char *error_get_progname(void)
+{
+ return progname;
+}
+
+/*
+ * Print current location to current monitor if we have one, else to stderr.
+ */
+static void print_loc(void)
+{
+ const char *sep = "";
+ int i;
+ const char *const *argp;
+
+ if (!monitor_cur() && progname) {
+ fprintf(stderr, "%s:", progname);
+ sep = " ";
+ }
+ switch (cur_loc->kind) {
+ case LOC_CMDLINE:
+ argp = cur_loc->ptr;
+ for (i = 0; i < cur_loc->num; i++) {
+ error_printf("%s%s", sep, argp[i]);
+ sep = " ";
+ }
+ error_printf(": ");
+ break;
+ case LOC_FILE:
+ error_printf("%s:", (const char *)cur_loc->ptr);
+ if (cur_loc->num) {
+ error_printf("%d:", cur_loc->num);
+ }
+ error_printf(" ");
+ break;
+ default:
+ error_printf("%s", sep);
+ }
+}
+
+/*
+ * Print a message to current monitor if we have one, else to stderr.
+ * @report_type is the type of message: error, warning or informational.
+ * Format arguments like vsprintf(). The resulting message should be
+ * a single phrase, with no newline or trailing punctuation.
+ * Prepend the current location and append a newline.
+ */
+static void vreport(report_type type, const char *fmt, va_list ap)
+{
+ GTimeVal tv;
+ gchar *timestr;
+
+ if (message_with_timestamp && !monitor_cur()) {
+ g_get_current_time(&tv);
+ timestr = g_time_val_to_iso8601(&tv);
+ error_printf("%s ", timestr);
+ g_free(timestr);
+ }
+
+ /* Only prepend guest name if -msg guest-name and -name guest=... are set */
+ if (error_with_guestname && error_guest_name && !monitor_cur()) {
+ error_printf("%s ", error_guest_name);
+ }
+
+ print_loc();
+
+ switch (type) {
+ case REPORT_TYPE_ERROR:
+ break;
+ case REPORT_TYPE_WARNING:
+ error_printf("warning: ");
+ break;
+ case REPORT_TYPE_INFO:
+ error_printf("info: ");
+ break;
+ }
+
+ error_vprintf(fmt, ap);
+ error_printf("\n");
+}
+
+/*
+ * Print an error message to current monitor if we have one, else to stderr.
+ * Format arguments like vsprintf(). The resulting message should be
+ * a single phrase, with no newline or trailing punctuation.
+ * Prepend the current location and append a newline.
+ * It's wrong to call this in a QMP monitor. Use error_setg() there.
+ */
+void error_vreport(const char *fmt, va_list ap)
+{
+ vreport(REPORT_TYPE_ERROR, fmt, ap);
+}
+
+/*
+ * Print a warning message to current monitor if we have one, else to stderr.
+ * Format arguments like vsprintf(). The resulting message should be
+ * a single phrase, with no newline or trailing punctuation.
+ * Prepend the current location and append a newline.
+ */
+void warn_vreport(const char *fmt, va_list ap)
+{
+ vreport(REPORT_TYPE_WARNING, fmt, ap);
+}
+
+/*
+ * Print an information message to current monitor if we have one, else to
+ * stderr.
+ * Format arguments like vsprintf(). The resulting message should be
+ * a single phrase, with no newline or trailing punctuation.
+ * Prepend the current location and append a newline.
+ */
+void info_vreport(const char *fmt, va_list ap)
+{
+ vreport(REPORT_TYPE_INFO, fmt, ap);
+}
+
+/*
+ * Print an error message to current monitor if we have one, else to stderr.
+ * Format arguments like sprintf(). The resulting message should be
+ * a single phrase, with no newline or trailing punctuation.
+ * Prepend the current location and append a newline.
+ * It's wrong to call this in a QMP monitor. Use error_setg() there.
+ */
+void error_report(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vreport(REPORT_TYPE_ERROR, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Print a warning message to current monitor if we have one, else to stderr.
+ * Format arguments like sprintf(). The resulting message should be a
+ * single phrase, with no newline or trailing punctuation.
+ * Prepend the current location and append a newline.
+ */
+void warn_report(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vreport(REPORT_TYPE_WARNING, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Print an information message to current monitor if we have one, else to
+ * stderr.
+ * Format arguments like sprintf(). The resulting message should be a
+ * single phrase, with no newline or trailing punctuation.
+ * Prepend the current location and append a newline.
+ */
+void info_report(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vreport(REPORT_TYPE_INFO, fmt, ap);
+ va_end(ap);
+}
+
+/*
+ * Like error_report(), except print just once.
+ * If *printed is false, print the message, and flip *printed to true.
+ * Return whether the message was printed.
+ */
+bool error_report_once_cond(bool *printed, const char *fmt, ...)
+{
+ va_list ap;
+
+ assert(printed);
+ if (*printed) {
+ return false;
+ }
+ *printed = true;
+ va_start(ap, fmt);
+ vreport(REPORT_TYPE_ERROR, fmt, ap);
+ va_end(ap);
+ return true;
+}
+
+/*
+ * Like warn_report(), except print just once.
+ * If *printed is false, print the message, and flip *printed to true.
+ * Return whether the message was printed.
+ */
+bool warn_report_once_cond(bool *printed, const char *fmt, ...)
+{
+ va_list ap;
+
+ assert(printed);
+ if (*printed) {
+ return false;
+ }
+ *printed = true;
+ va_start(ap, fmt);
+ vreport(REPORT_TYPE_WARNING, fmt, ap);
+ va_end(ap);
+ return true;
+}
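
The *_once_cond() helpers support a print-once pattern where the flag
lives at the call site, one per message; a sketch with a hypothetical
handler:

    static void handle_unaligned_access(uint64_t addr)
    {
        static bool warned;

        /* Printed on the first unaligned access only; later calls
         * return false without printing anything. */
        warn_report_once_cond(&warned,
                              "unaligned access at 0x%" PRIx64 ", fixing up",
                              addr);
    }
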
+
+static char *qemu_glog_domains;
+
+static void qemu_log_func(const gchar *log_domain,
+ GLogLevelFlags log_level,
+ const gchar *message,
+ gpointer user_data)
+{
+ switch (log_level & G_LOG_LEVEL_MASK) {
+ case G_LOG_LEVEL_DEBUG:
+ case G_LOG_LEVEL_INFO:
+ /*
+ * Use same G_MESSAGES_DEBUG logic as glib to enable/disable debug
+ * messages
+ */
+ if (qemu_glog_domains == NULL) {
+ break;
+ }
+ if (strcmp(qemu_glog_domains, "all") != 0 &&
+ (log_domain == NULL || !strstr(qemu_glog_domains, log_domain))) {
+ break;
+ }
+ /* Fall through */
+ case G_LOG_LEVEL_MESSAGE:
+ info_report("%s%s%s",
+ log_domain ?: "", log_domain ? ": " : "", message);
+
+ break;
+ case G_LOG_LEVEL_WARNING:
+ warn_report("%s%s%s",
+ log_domain ?: "", log_domain ? ": " : "", message);
+ break;
+ case G_LOG_LEVEL_CRITICAL:
+ case G_LOG_LEVEL_ERROR:
+ error_report("%s%s%s",
+ log_domain ?: "", log_domain ? ": " : "", message);
+ break;
+ }
+}
+
+void error_init(const char *argv0)
+{
+ /* Set the program name for error_print_loc(). */
+ error_set_progname(argv0);
+
+ /*
+ * This sets up glib logging so libraries using it also print their logs
+ * through error_report(), warn_report(), info_report().
+ */
+ g_log_set_default_handler(qemu_log_func, NULL);
+ g_warn_if_fail(qemu_glog_domains == NULL);
+ qemu_glog_domains = g_strdup(g_getenv("G_MESSAGES_DEBUG"));
+}
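
After error_init(), glib logging from any linked library is funneled
through the report functions above; a sketch, with a hypothetical log
domain:

    int main(int argc, char **argv)
    {
        error_init(argv[0]);

        /* Routed to warn_report(): "<progname>: warning: mydomain: link down" */
        g_log("mydomain", G_LOG_LEVEL_WARNING, "link down");

        /* Printed only when G_MESSAGES_DEBUG=mydomain (or =all) is set,
         * mirroring glib's own filter. */
        g_log("mydomain", G_LOG_LEVEL_DEBUG, "probe took %d ms", 3);
        return 0;
    }
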
diff --git a/util/qemu-openpty.c b/util/qemu-openpty.c
new file mode 100644
index 000000000..427f43a76
--- /dev/null
+++ b/util/qemu-openpty.c
@@ -0,0 +1,139 @@
+/*
+ * qemu-openpty.c
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2010 Red Hat, Inc.
+ *
+ * Wrapper function qemu_openpty() implementation.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/*
+ * This is not part of oslib-posix.c because this function
+ * uses openpty(), which is often in -lutil, and if we add this
+ * dependency to oslib-posix.o, every app will have to be
+ * linked with -lutil.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#if defined HAVE_PTY_H
+# include <pty.h>
+#elif defined CONFIG_BSD
+# include <termios.h>
+# if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__DragonFly__)
+# include <libutil.h>
+# else
+# include <util.h>
+# endif
+#elif defined CONFIG_SOLARIS
+# include <termios.h>
+# include <stropts.h>
+#else
+# include <termios.h>
+#endif
+
+#ifdef __sun__
+
+#if !defined(HAVE_OPENPTY)
+/* Once illumos has openpty(), this is going to be removed. */
+static int openpty(int *amaster, int *aslave, char *name,
+ struct termios *termp, struct winsize *winp)
+{
+ const char *slave;
+ int mfd = -1, sfd = -1;
+
+ *amaster = *aslave = -1;
+
+ mfd = open("/dev/ptmx", O_RDWR | O_NOCTTY);
+ if (mfd < 0)
+ goto err;
+
+ if (grantpt(mfd) == -1 || unlockpt(mfd) == -1)
+ goto err;
+
+ if ((slave = ptsname(mfd)) == NULL)
+ goto err;
+
+ if ((sfd = open(slave, O_RDONLY | O_NOCTTY)) == -1)
+ goto err;
+
+ if (ioctl(sfd, I_PUSH, "ptem") == -1 ||
+ (termp != NULL && tcgetattr(sfd, termp) < 0))
+ goto err;
+
+ *amaster = mfd;
+ *aslave = sfd;
+
+ if (winp)
+ ioctl(sfd, TIOCSWINSZ, winp);
+
+ return 0;
+
+err:
+ if (sfd != -1)
+ close(sfd);
+ close(mfd);
+ return -1;
+}
+#endif
+
+static void cfmakeraw(struct termios *termios_p)
+{
+ termios_p->c_iflag &=
+ ~(IGNBRK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR|ICRNL|IXON);
+ termios_p->c_oflag &= ~OPOST;
+ termios_p->c_lflag &= ~(ECHO|ECHONL|ICANON|ISIG|IEXTEN);
+ termios_p->c_cflag &= ~(CSIZE|PARENB);
+ termios_p->c_cflag |= CS8;
+
+ termios_p->c_cc[VMIN] = 0;
+ termios_p->c_cc[VTIME] = 0;
+}
+#endif
+
+int qemu_openpty_raw(int *aslave, char *pty_name)
+{
+ int amaster;
+ struct termios tty;
+#if defined(__OpenBSD__) || defined(__DragonFly__)
+ char pty_buf[PATH_MAX];
+#define q_ptsname(x) pty_buf
+#else
+ char *pty_buf = NULL;
+#define q_ptsname(x) ptsname(x)
+#endif
+
+ if (openpty(&amaster, aslave, pty_buf, NULL, NULL) < 0) {
+ return -1;
+ }
+
+ /* Set raw attributes on the pty. */
+ tcgetattr(*aslave, &tty);
+ cfmakeraw(&tty);
+ tcsetattr(*aslave, TCSAFLUSH, &tty);
+
+ if (pty_name) {
+ strcpy(pty_name, q_ptsname(amaster));
+ }
+
+ return amaster;
+}
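
A usage sketch for qemu_openpty_raw() on a POSIX host; the helper and
the buffer size are assumptions (the caller must supply room for the
platform's pty path):

    static void open_serial_pty(void)
    {
        char name[128];    /* assumed large enough for the pty path */
        int slave_fd;
        int master_fd = qemu_openpty_raw(&slave_fd, name);

        if (master_fd < 0) {
            perror("qemu_openpty_raw");
            return;
        }
        printf("char device redirected to %s\n", name);
        /* master_fd carries the data stream; the slave end is what a
         * terminal program would open. */
    }
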
diff --git a/util/qemu-option.c b/util/qemu-option.c
new file mode 100644
index 000000000..eedd08929
--- /dev/null
+++ b/util/qemu-option.c
@@ -0,0 +1,1226 @@
+/*
+ * Commandline option parsing functions
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2009 Kevin Wolf <kwolf@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qapi/qmp/qbool.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qnum.h"
+#include "qapi/qmp/qstring.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/option_int.h"
+#include "qemu/cutils.h"
+#include "qemu/id.h"
+#include "qemu/help_option.h"
+
+/*
+ * Extracts the name of an option from the parameter string (@p points at the
+ * first byte of the option name)
+ *
+ * The option name is @len characters long and is copied into @option. The
+ * caller is responsible for freeing @option when no longer required.
+ *
+ * The return value is the position of the delimiter/zero byte after the option
+ * name in @p.
+ */
+static const char *get_opt_name(const char *p, char **option, size_t len)
+{
+ *option = g_strndup(p, len);
+ return p + len;
+}
+
+/*
+ * Extracts the value of an option from the parameter string p (p points at the
+ * first byte of the option value)
+ *
+ * This function is comparable to get_opt_name with the difference that the
+ * delimiter is fixed to be a comma, which starts a new option. To specify an
+ * option value that contains commas, double each comma.
+ */
+const char *get_opt_value(const char *p, char **value)
+{
+ size_t capacity = 0, length;
+ const char *offset;
+
+ *value = NULL;
+ while (1) {
+ offset = qemu_strchrnul(p, ',');
+ length = offset - p;
+ if (*offset != '\0' && *(offset + 1) == ',') {
+ length++;
+ }
+ *value = g_renew(char, *value, capacity + length + 1);
+ strncpy(*value + capacity, p, length);
+ (*value)[capacity + length] = '\0';
+ capacity += length;
+ if (*offset == '\0' ||
+ *(offset + 1) != ',') {
+ break;
+ }
+
+ p += (offset - p) + 2;
+ }
+
+ return offset;
+}
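
Since a doubled comma escapes a literal comma, the value "/tmp/a,b" is
written "/tmp/a,,b"; a sketch of what get_opt_value() yields for an
illustrative input:

    char *value;
    const char *rest = get_opt_value("/tmp/a,,b,cache=none", &value);

    /* value -> "/tmp/a,b" (caller frees with g_free());
     * rest  -> ",cache=none", the delimiter before the next option */
    g_free(value);
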
+
+static bool parse_option_number(const char *name, const char *value,
+ uint64_t *ret, Error **errp)
+{
+ uint64_t number;
+ int err;
+
+ err = qemu_strtou64(value, NULL, 0, &number);
+ if (err == -ERANGE) {
+ error_setg(errp, "Value '%s' is too large for parameter '%s'",
+ value, name);
+ return false;
+ }
+ if (err) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, name, "a number");
+ return false;
+ }
+ *ret = number;
+ return true;
+}
+
+static const QemuOptDesc *find_desc_by_name(const QemuOptDesc *desc,
+ const char *name)
+{
+ int i;
+
+ for (i = 0; desc[i].name != NULL; i++) {
+ if (strcmp(desc[i].name, name) == 0) {
+ return &desc[i];
+ }
+ }
+
+ return NULL;
+}
+
+static const char *find_default_by_name(QemuOpts *opts, const char *name)
+{
+ const QemuOptDesc *desc = find_desc_by_name(opts->list->desc, name);
+
+ return desc ? desc->def_value_str : NULL;
+}
+
+bool parse_option_size(const char *name, const char *value,
+ uint64_t *ret, Error **errp)
+{
+ uint64_t size;
+ int err;
+
+ err = qemu_strtosz(value, NULL, &size);
+ if (err == -ERANGE) {
+ error_setg(errp, "Value '%s' is out of range for parameter '%s'",
+ value, name);
+ return false;
+ }
+ if (err) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, name,
+ "a non-negative number below 2^64");
+ error_append_hint(errp, "Optional suffix k, M, G, T, P or E means"
+ " kilo-, mega-, giga-, tera-, peta-\n"
+ "and exabytes, respectively.\n");
+ return false;
+ }
+ *ret = size;
+ return true;
+}
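
A sketch of the size parser above, which accepts the qemu_strtosz()
suffixes named in the hint; error_report_err() is the usual consumer on
the failure path:

    uint64_t len;
    Error *err = NULL;

    if (parse_option_size("len", "4G", &len, &err)) {
        /* len == 4294967296 */
    }
    if (!parse_option_size("len", "lots", &len, &err)) {
        error_report_err(err);    /* message plus the suffix hint */
        err = NULL;
    }
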
+
+static const char *opt_type_to_string(enum QemuOptType type)
+{
+ switch (type) {
+ case QEMU_OPT_STRING:
+ return "str";
+ case QEMU_OPT_BOOL:
+ return "bool (on/off)";
+ case QEMU_OPT_NUMBER:
+ return "num";
+ case QEMU_OPT_SIZE:
+ return "size";
+ }
+
+ g_assert_not_reached();
+}
+
+/**
+ * Print the list of options available in the given list. If
+ * @print_caption is true, a caption (including the list name, if it
+ * exists) is printed. The options themselves will be indented, so
+ * @print_caption should only be set to false if the caller prints its
+ * own custom caption (so that the indentation makes sense).
+ */
+void qemu_opts_print_help(QemuOptsList *list, bool print_caption)
+{
+ QemuOptDesc *desc;
+ int i;
+ GPtrArray *array = g_ptr_array_new();
+
+ assert(list);
+ desc = list->desc;
+ while (desc && desc->name) {
+ GString *str = g_string_new(NULL);
+ g_string_append_printf(str, " %s=<%s>", desc->name,
+ opt_type_to_string(desc->type));
+ if (desc->help) {
+ if (str->len < 24) {
+ g_string_append_printf(str, "%*s", 24 - (int)str->len, "");
+ }
+ g_string_append_printf(str, " - %s", desc->help);
+ }
+ g_ptr_array_add(array, g_string_free(str, false));
+ desc++;
+ }
+
+ g_ptr_array_sort(array, (GCompareFunc)qemu_pstrcmp0);
+ if (print_caption && array->len > 0) {
+ if (list->name) {
+ printf("%s options:\n", list->name);
+ } else {
+ printf("Options:\n");
+ }
+ } else if (array->len == 0) {
+ if (list->name) {
+ printf("There are no options for %s.\n", list->name);
+ } else {
+ printf("No options available.\n");
+ }
+ }
+ for (i = 0; i < array->len; i++) {
+ printf("%s\n", (char *)array->pdata[i]);
+ }
+ g_ptr_array_set_free_func(array, g_free);
+    g_ptr_array_free(array, true);
+}
+
+/* ------------------------------------------------------------------ */
+
+QemuOpt *qemu_opt_find(QemuOpts *opts, const char *name)
+{
+ QemuOpt *opt;
+
+ QTAILQ_FOREACH_REVERSE(opt, &opts->head, next) {
+        if (strcmp(opt->name, name) != 0) {
+            continue;
+        }
+ return opt;
+ }
+ return NULL;
+}
+
+static void qemu_opt_del(QemuOpt *opt)
+{
+ QTAILQ_REMOVE(&opt->opts->head, opt, next);
+ g_free(opt->name);
+ g_free(opt->str);
+ g_free(opt);
+}
+
+/* qemu_opt_set allows many settings for the same option.
+ * This function deletes all settings for an option.
+ */
+static void qemu_opt_del_all(QemuOpts *opts, const char *name)
+{
+ QemuOpt *opt, *next_opt;
+
+ QTAILQ_FOREACH_SAFE(opt, &opts->head, next, next_opt) {
+ if (!strcmp(opt->name, name)) {
+ qemu_opt_del(opt);
+ }
+ }
+}
+
+const char *qemu_opt_get(QemuOpts *opts, const char *name)
+{
+ QemuOpt *opt;
+
+ if (opts == NULL) {
+ return NULL;
+ }
+
+ opt = qemu_opt_find(opts, name);
+ if (!opt) {
+ return find_default_by_name(opts, name);
+ }
+
+ return opt->str;
+}
+
+void qemu_opt_iter_init(QemuOptsIter *iter, QemuOpts *opts, const char *name)
+{
+ iter->opts = opts;
+ iter->opt = QTAILQ_FIRST(&opts->head);
+ iter->name = name;
+}
+
+const char *qemu_opt_iter_next(QemuOptsIter *iter)
+{
+ QemuOpt *ret = iter->opt;
+ if (iter->name) {
+ while (ret && !g_str_equal(iter->name, ret->name)) {
+ ret = QTAILQ_NEXT(ret, next);
+ }
+ }
+ iter->opt = ret ? QTAILQ_NEXT(ret, next) : NULL;
+ return ret ? ret->str : NULL;
+}
+
+/* Get a known option (or its default) and remove it from the list
+ * all in one action. Return a malloced string of the option value.
+ * Result must be freed by caller with g_free().
+ */
+char *qemu_opt_get_del(QemuOpts *opts, const char *name)
+{
+ QemuOpt *opt;
+ char *str;
+
+ if (opts == NULL) {
+ return NULL;
+ }
+
+ opt = qemu_opt_find(opts, name);
+ if (!opt) {
+ return g_strdup(find_default_by_name(opts, name));
+ }
+ str = opt->str;
+ opt->str = NULL;
+ qemu_opt_del_all(opts, name);
+ return str;
+}
+
+bool qemu_opt_has_help_opt(QemuOpts *opts)
+{
+ QemuOpt *opt;
+
+ QTAILQ_FOREACH_REVERSE(opt, &opts->head, next) {
+ if (is_help_option(opt->name)) {
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool qemu_opt_get_bool_helper(QemuOpts *opts, const char *name,
+ bool defval, bool del)
+{
+ QemuOpt *opt;
+ const char *def_val;
+ bool ret = defval;
+
+ if (opts == NULL) {
+ return ret;
+ }
+
+ opt = qemu_opt_find(opts, name);
+ if (opt == NULL) {
+ def_val = find_default_by_name(opts, name);
+ if (def_val) {
+ qapi_bool_parse(name, def_val, &ret, &error_abort);
+ }
+ return ret;
+ }
+ assert(opt->desc && opt->desc->type == QEMU_OPT_BOOL);
+ ret = opt->value.boolean;
+ if (del) {
+ qemu_opt_del_all(opts, name);
+ }
+ return ret;
+}
+
+bool qemu_opt_get_bool(QemuOpts *opts, const char *name, bool defval)
+{
+ return qemu_opt_get_bool_helper(opts, name, defval, false);
+}
+
+bool qemu_opt_get_bool_del(QemuOpts *opts, const char *name, bool defval)
+{
+ return qemu_opt_get_bool_helper(opts, name, defval, true);
+}
+
+static uint64_t qemu_opt_get_number_helper(QemuOpts *opts, const char *name,
+ uint64_t defval, bool del)
+{
+ QemuOpt *opt;
+ const char *def_val;
+ uint64_t ret = defval;
+
+ if (opts == NULL) {
+ return ret;
+ }
+
+ opt = qemu_opt_find(opts, name);
+ if (opt == NULL) {
+ def_val = find_default_by_name(opts, name);
+ if (def_val) {
+ parse_option_number(name, def_val, &ret, &error_abort);
+ }
+ return ret;
+ }
+ assert(opt->desc && opt->desc->type == QEMU_OPT_NUMBER);
+ ret = opt->value.uint;
+ if (del) {
+ qemu_opt_del_all(opts, name);
+ }
+ return ret;
+}
+
+uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval)
+{
+ return qemu_opt_get_number_helper(opts, name, defval, false);
+}
+
+uint64_t qemu_opt_get_number_del(QemuOpts *opts, const char *name,
+ uint64_t defval)
+{
+ return qemu_opt_get_number_helper(opts, name, defval, true);
+}
+
+static uint64_t qemu_opt_get_size_helper(QemuOpts *opts, const char *name,
+ uint64_t defval, bool del)
+{
+ QemuOpt *opt;
+ const char *def_val;
+ uint64_t ret = defval;
+
+ if (opts == NULL) {
+ return ret;
+ }
+
+ opt = qemu_opt_find(opts, name);
+ if (opt == NULL) {
+ def_val = find_default_by_name(opts, name);
+ if (def_val) {
+ parse_option_size(name, def_val, &ret, &error_abort);
+ }
+ return ret;
+ }
+ assert(opt->desc && opt->desc->type == QEMU_OPT_SIZE);
+ ret = opt->value.uint;
+ if (del) {
+ qemu_opt_del_all(opts, name);
+ }
+ return ret;
+}
+
+uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval)
+{
+ return qemu_opt_get_size_helper(opts, name, defval, false);
+}
+
+uint64_t qemu_opt_get_size_del(QemuOpts *opts, const char *name,
+ uint64_t defval)
+{
+ return qemu_opt_get_size_helper(opts, name, defval, true);
+}
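
A sketch of the typed getter family, given an existing QemuOpts *opts;
the option names are illustrative. Defaults apply only when neither a
set option nor a desc def_value_str exists:

    uint64_t cache = qemu_opt_get_size(opts, "cache-size", 16 * 1024 * 1024);
    uint64_t queues = qemu_opt_get_number(opts, "queues", 1);
    bool ro = qemu_opt_get_bool(opts, "read-only", false);

    /* The *_del variants also delete every setting of the name, which
     * is useful when opts is later scanned for unconsumed options. */
    uint64_t bps = qemu_opt_get_number_del(opts, "bps", 0);
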
+
+static bool qemu_opt_parse(QemuOpt *opt, Error **errp)
+{
+    if (opt->desc == NULL) {
+        return true;
+    }
+
+ switch (opt->desc->type) {
+ case QEMU_OPT_STRING:
+ /* nothing */
+ return true;
+ case QEMU_OPT_BOOL:
+ return qapi_bool_parse(opt->name, opt->str, &opt->value.boolean, errp);
+ case QEMU_OPT_NUMBER:
+ return parse_option_number(opt->name, opt->str, &opt->value.uint,
+ errp);
+ case QEMU_OPT_SIZE:
+ return parse_option_size(opt->name, opt->str, &opt->value.uint,
+ errp);
+ default:
+ abort();
+ }
+}
+
+static bool opts_accepts_any(const QemuOptsList *list)
+{
+ return list->desc[0].name == NULL;
+}
+
+int qemu_opt_unset(QemuOpts *opts, const char *name)
+{
+ QemuOpt *opt = qemu_opt_find(opts, name);
+
+ assert(opts_accepts_any(opts->list));
+
+ if (opt == NULL) {
+ return -1;
+ } else {
+ qemu_opt_del(opt);
+ return 0;
+ }
+}
+
+static QemuOpt *opt_create(QemuOpts *opts, const char *name, char *value)
+{
+ QemuOpt *opt = g_malloc0(sizeof(*opt));
+
+ opt->name = g_strdup(name);
+ opt->str = value;
+ opt->opts = opts;
+ QTAILQ_INSERT_TAIL(&opts->head, opt, next);
+
+ return opt;
+}
+
+static bool opt_validate(QemuOpt *opt, Error **errp)
+{
+ const QemuOptDesc *desc;
+ const QemuOptsList *list = opt->opts->list;
+
+ desc = find_desc_by_name(list->desc, opt->name);
+ if (!desc && !opts_accepts_any(list)) {
+ error_setg(errp, QERR_INVALID_PARAMETER, opt->name);
+ return false;
+ }
+
+ opt->desc = desc;
+ if (!qemu_opt_parse(opt, errp)) {
+ return false;
+ }
+
+ return true;
+}
+
+bool qemu_opt_set(QemuOpts *opts, const char *name, const char *value,
+ Error **errp)
+{
+ QemuOpt *opt = opt_create(opts, name, g_strdup(value));
+
+ if (!opt_validate(opt, errp)) {
+ qemu_opt_del(opt);
+ return false;
+ }
+ return true;
+}
+
+bool qemu_opt_set_bool(QemuOpts *opts, const char *name, bool val,
+ Error **errp)
+{
+ QemuOpt *opt;
+ const QemuOptDesc *desc;
+ const QemuOptsList *list = opts->list;
+
+ desc = find_desc_by_name(list->desc, name);
+ if (!desc && !opts_accepts_any(list)) {
+ error_setg(errp, QERR_INVALID_PARAMETER, name);
+ return false;
+ }
+
+ opt = g_malloc0(sizeof(*opt));
+ opt->name = g_strdup(name);
+ opt->opts = opts;
+ opt->desc = desc;
+ opt->value.boolean = !!val;
+ opt->str = g_strdup(val ? "on" : "off");
+ QTAILQ_INSERT_TAIL(&opts->head, opt, next);
+ return true;
+}
+
+bool qemu_opt_set_number(QemuOpts *opts, const char *name, int64_t val,
+ Error **errp)
+{
+ QemuOpt *opt;
+ const QemuOptDesc *desc;
+ const QemuOptsList *list = opts->list;
+
+ desc = find_desc_by_name(list->desc, name);
+ if (!desc && !opts_accepts_any(list)) {
+ error_setg(errp, QERR_INVALID_PARAMETER, name);
+ return false;
+ }
+
+ opt = g_malloc0(sizeof(*opt));
+ opt->name = g_strdup(name);
+ opt->opts = opts;
+ opt->desc = desc;
+ opt->value.uint = val;
+ opt->str = g_strdup_printf("%" PRId64, val);
+ QTAILQ_INSERT_TAIL(&opts->head, opt, next);
+ return true;
+}
+
+/**
+ * For each member of @opts, call @func(@opaque, name, value, @errp).
+ * @func() may store an Error through @errp, but must return non-zero then.
+ * When @func() returns non-zero, break the loop and return that value.
+ * Return zero when the loop completes.
+ */
+int qemu_opt_foreach(QemuOpts *opts, qemu_opt_loopfunc func, void *opaque,
+ Error **errp)
+{
+ QemuOpt *opt;
+ int rc;
+
+ QTAILQ_FOREACH(opt, &opts->head, next) {
+ rc = func(opaque, opt->name, opt->str, errp);
+ if (rc) {
+ return rc;
+ }
+ assert(!errp || !*errp);
+ }
+ return 0;
+}
+
+QemuOpts *qemu_opts_find(QemuOptsList *list, const char *id)
+{
+ QemuOpts *opts;
+
+ QTAILQ_FOREACH(opts, &list->head, next) {
+ if (!opts->id && !id) {
+ return opts;
+ }
+ if (opts->id && id && !strcmp(opts->id, id)) {
+ return opts;
+ }
+ }
+ return NULL;
+}
+
+QemuOpts *qemu_opts_create(QemuOptsList *list, const char *id,
+ int fail_if_exists, Error **errp)
+{
+ QemuOpts *opts = NULL;
+
+ if (list->merge_lists) {
+ if (id) {
+ error_setg(errp, QERR_INVALID_PARAMETER, "id");
+ return NULL;
+ }
+ opts = qemu_opts_find(list, NULL);
+ if (opts) {
+ return opts;
+ }
+ } else if (id) {
+ assert(fail_if_exists);
+ if (!id_wellformed(id)) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "id",
+ "an identifier");
+ error_append_hint(errp, "Identifiers consist of letters, digits, "
+ "'-', '.', '_', starting with a letter.\n");
+ return NULL;
+ }
+ opts = qemu_opts_find(list, id);
+ if (opts != NULL) {
+ error_setg(errp, "Duplicate ID '%s' for %s", id, list->name);
+ return NULL;
+ }
+ }
+ opts = g_malloc0(sizeof(*opts));
+ opts->id = g_strdup(id);
+ opts->list = list;
+ loc_save(&opts->loc);
+ QTAILQ_INIT(&opts->head);
+ QTAILQ_INSERT_TAIL(&list->head, opts, next);
+ return opts;
+}
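
A sketch of a typical list definition plus a create/set/get round trip;
the list, id and option names are illustrative:

    static QemuOptsList demo_opts = {
        .name = "demo",
        .head = QTAILQ_HEAD_INITIALIZER(demo_opts.head),
        .desc = {
            { .name = "path", .type = QEMU_OPT_STRING },
            { .name = "size", .type = QEMU_OPT_SIZE, .def_value_str = "1M" },
            { /* end of list */ }
        },
    };

    static void demo(Error **errp)
    {
        QemuOpts *opts = qemu_opts_create(&demo_opts, "demo0", 1, errp);

        if (!opts) {
            return;
        }
        if (!qemu_opt_set(opts, "path", "/tmp/x", errp)) {
            qemu_opts_del(opts);
            return;
        }
        /* "size" was never set, so it falls back to def_value_str */
        g_assert(qemu_opt_get_size(opts, "size", 0) == 1024 * 1024);
        qemu_opts_del(opts);
    }
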
+
+void qemu_opts_reset(QemuOptsList *list)
+{
+ QemuOpts *opts, *next_opts;
+
+ QTAILQ_FOREACH_SAFE(opts, &list->head, next, next_opts) {
+ qemu_opts_del(opts);
+ }
+}
+
+void qemu_opts_loc_restore(QemuOpts *opts)
+{
+ loc_restore(&opts->loc);
+}
+
+const char *qemu_opts_id(QemuOpts *opts)
+{
+ return opts->id;
+}
+
+/* The id string will be g_free()d by qemu_opts_del */
+void qemu_opts_set_id(QemuOpts *opts, char *id)
+{
+ opts->id = id;
+}
+
+void qemu_opts_del(QemuOpts *opts)
+{
+ QemuOpt *opt;
+
+ if (opts == NULL) {
+ return;
+ }
+
+ for (;;) {
+ opt = QTAILQ_FIRST(&opts->head);
+        if (opt == NULL) {
+            break;
+        }
+ qemu_opt_del(opt);
+ }
+ QTAILQ_REMOVE(&opts->list->head, opts, next);
+ g_free(opts->id);
+ g_free(opts);
+}
+
+/* print value, escaping any commas in value */
+static void escaped_print(const char *value)
+{
+ const char *ptr;
+
+ for (ptr = value; *ptr; ++ptr) {
+ if (*ptr == ',') {
+ putchar(',');
+ }
+ putchar(*ptr);
+ }
+}
+
+void qemu_opts_print(QemuOpts *opts, const char *separator)
+{
+ QemuOpt *opt;
+ QemuOptDesc *desc = opts->list->desc;
+ const char *sep = "";
+
+ if (opts->id) {
+ printf("id=%s", opts->id); /* passed id_wellformed -> no commas */
+ sep = separator;
+ }
+
+ if (desc[0].name == NULL) {
+ QTAILQ_FOREACH(opt, &opts->head, next) {
+ printf("%s%s=", sep, opt->name);
+ escaped_print(opt->str);
+ sep = separator;
+ }
+ return;
+ }
+ for (; desc && desc->name; desc++) {
+ const char *value;
+ opt = qemu_opt_find(opts, desc->name);
+
+ value = opt ? opt->str : desc->def_value_str;
+ if (!value) {
+ continue;
+ }
+ if (desc->type == QEMU_OPT_STRING) {
+ printf("%s%s=", sep, desc->name);
+ escaped_print(value);
+ } else if ((desc->type == QEMU_OPT_SIZE ||
+ desc->type == QEMU_OPT_NUMBER) && opt) {
+ printf("%s%s=%" PRId64, sep, desc->name, opt->value.uint);
+ } else {
+ printf("%s%s=%s", sep, desc->name, value);
+ }
+ sep = separator;
+ }
+}
+
+static const char *get_opt_name_value(const char *params,
+ const char *firstname,
+ bool warn_on_flag,
+ bool *help_wanted,
+ char **name, char **value)
+{
+ const char *p;
+ const char *prefix = "";
+ size_t len;
+ bool is_help = false;
+
+ len = strcspn(params, "=,");
+ if (params[len] != '=') {
+ /* found "foo,more" */
+ if (firstname) {
+ /* implicitly named first option */
+ *name = g_strdup(firstname);
+ p = get_opt_value(params, value);
+ } else {
+ /* option without value, must be a flag */
+ p = get_opt_name(params, name, len);
+ if (strncmp(*name, "no", 2) == 0) {
+ memmove(*name, *name + 2, strlen(*name + 2) + 1);
+ *value = g_strdup("off");
+ prefix = "no";
+ } else {
+ *value = g_strdup("on");
+ is_help = is_help_option(*name);
+ }
+ if (!is_help && warn_on_flag) {
+ warn_report("short-form boolean option '%s%s' deprecated", prefix, *name);
+ if (g_str_equal(*name, "delay")) {
+ error_printf("Please use nodelay=%s instead\n", prefix[0] ? "on" : "off");
+ } else {
+ error_printf("Please use %s=%s instead\n", *name, *value);
+ }
+ }
+ }
+ } else {
+ /* found "foo=bar,more" */
+ p = get_opt_name(params, name, len);
+ assert(*p == '=');
+ p++;
+ p = get_opt_value(p, value);
+ }
+
+ assert(!*p || *p == ',');
+ if (help_wanted && is_help) {
+ *help_wanted = true;
+ }
+ if (*p == ',') {
+ p++;
+ }
+ return p;
+}
+
+static bool opts_do_parse(QemuOpts *opts, const char *params,
+ const char *firstname,
+ bool warn_on_flag, bool *help_wanted, Error **errp)
+{
+ char *option, *value;
+ const char *p;
+ QemuOpt *opt;
+
+ for (p = params; *p;) {
+ p = get_opt_name_value(p, firstname, warn_on_flag, help_wanted, &option, &value);
+ if (help_wanted && *help_wanted) {
+ g_free(option);
+ g_free(value);
+ return false;
+ }
+ firstname = NULL;
+
+ if (!strcmp(option, "id")) {
+ g_free(option);
+ g_free(value);
+ continue;
+ }
+
+ opt = opt_create(opts, option, value);
+ g_free(option);
+ if (!opt_validate(opt, errp)) {
+ qemu_opt_del(opt);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static char *opts_parse_id(const char *params)
+{
+ const char *p;
+ char *name, *value;
+
+ for (p = params; *p;) {
+ p = get_opt_name_value(p, NULL, false, NULL, &name, &value);
+ if (!strcmp(name, "id")) {
+ g_free(name);
+ return value;
+ }
+ g_free(name);
+ g_free(value);
+ }
+
+ return NULL;
+}
+
+bool has_help_option(const char *params)
+{
+ const char *p;
+ char *name, *value;
+ bool ret = false;
+
+ for (p = params; *p;) {
+ p = get_opt_name_value(p, NULL, false, &ret, &name, &value);
+ g_free(name);
+ g_free(value);
+ if (ret) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/**
+ * Store options parsed from @params into @opts.
+ * If @firstname is non-null, the first key=value in @params may omit
+ * key=, and is treated as if key was @firstname.
+ * On error, store an error object through @errp if non-null.
+ */
+bool qemu_opts_do_parse(QemuOpts *opts, const char *params,
+ const char *firstname, Error **errp)
+{
+ return opts_do_parse(opts, params, firstname, false, NULL, errp);
+}
+
+static QemuOpts *opts_parse(QemuOptsList *list, const char *params,
+ bool permit_abbrev,
+ bool warn_on_flag, bool *help_wanted, Error **errp)
+{
+ const char *firstname;
+ char *id = opts_parse_id(params);
+ QemuOpts *opts;
+
+ assert(!permit_abbrev || list->implied_opt_name);
+ firstname = permit_abbrev ? list->implied_opt_name : NULL;
+
+ opts = qemu_opts_create(list, id, !list->merge_lists, errp);
+ g_free(id);
+ if (opts == NULL) {
+ return NULL;
+ }
+
+ if (!opts_do_parse(opts, params, firstname,
+ warn_on_flag, help_wanted, errp)) {
+ qemu_opts_del(opts);
+ return NULL;
+ }
+
+ return opts;
+}
+
+/**
+ * Create a QemuOpts in @list and with options parsed from @params.
+ * If @permit_abbrev, the first key=value in @params may omit key=,
+ * and is treated as if key was @list->implied_opt_name.
+ * On error, store an error object through @errp if non-null.
+ * Return the new QemuOpts on success, null pointer on error.
+ */
+QemuOpts *qemu_opts_parse(QemuOptsList *list, const char *params,
+ bool permit_abbrev, Error **errp)
+{
+ return opts_parse(list, params, permit_abbrev, false, NULL, errp);
+}
+
+/**
+ * Create a QemuOpts in @list and with options parsed from @params.
+ * If @permit_abbrev, the first key=value in @params may omit key=,
+ * and is treated as if key was @list->implied_opt_name.
+ * Report errors with error_report_err(). This is inappropriate in
+ * QMP context. Do not use this function there!
+ * Return the new QemuOpts on success, null pointer on error.
+ */
+QemuOpts *qemu_opts_parse_noisily(QemuOptsList *list, const char *params,
+ bool permit_abbrev)
+{
+ Error *err = NULL;
+ QemuOpts *opts;
+ bool help_wanted = false;
+
+ opts = opts_parse(list, params, permit_abbrev, true,
+ opts_accepts_any(list) ? NULL : &help_wanted,
+ &err);
+ if (!opts) {
+ assert(!!err + !!help_wanted == 1);
+ if (help_wanted) {
+ qemu_opts_print_help(list, true);
+ } else {
+ error_report_err(err);
+ }
+ }
+ return opts;
+}
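
A parsing sketch with an implied first option; demo_opts is the
illustrative list from the earlier sketch, here assumed to also set
.implied_opt_name = "path":

    QemuOpts *opts = qemu_opts_parse_noisily(&demo_opts,
                                             "/tmp/img,size=4G,id=demo1",
                                             true /* permit_abbrev */);
    if (opts) {
        /* "/tmp/img" was stored under the implied name "path"; a
         * short-form flag like "noflag" would have become flag=off,
         * with a deprecation warning. */
    }
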
+
+static bool qemu_opts_from_qdict_entry(QemuOpts *opts,
+ const QDictEntry *entry,
+ Error **errp)
+{
+ const char *key = qdict_entry_key(entry);
+ QObject *obj = qdict_entry_value(entry);
+ char buf[32];
+ g_autofree char *tmp = NULL;
+ const char *value;
+
+ if (!strcmp(key, "id")) {
+ return true;
+ }
+
+ switch (qobject_type(obj)) {
+ case QTYPE_QSTRING:
+ value = qstring_get_str(qobject_to(QString, obj));
+ break;
+ case QTYPE_QNUM:
+ tmp = qnum_to_string(qobject_to(QNum, obj));
+ value = tmp;
+ break;
+ case QTYPE_QBOOL:
+ pstrcpy(buf, sizeof(buf),
+ qbool_get_bool(qobject_to(QBool, obj)) ? "on" : "off");
+ value = buf;
+ break;
+ default:
+ return true;
+ }
+
+ return qemu_opt_set(opts, key, value, errp);
+}
+
+/*
+ * Create QemuOpts from a QDict.
+ * Use value of key "id" as ID if it exists and is a QString. Only
+ * QStrings, QNums and QBools are copied. Entries with other types
+ * are silently ignored.
+ */
+QemuOpts *qemu_opts_from_qdict(QemuOptsList *list, const QDict *qdict,
+ Error **errp)
+{
+ QemuOpts *opts;
+ const QDictEntry *entry;
+
+ opts = qemu_opts_create(list, qdict_get_try_str(qdict, "id"), 1, errp);
+ if (!opts) {
+ return NULL;
+ }
+
+ for (entry = qdict_first(qdict);
+ entry;
+ entry = qdict_next(qdict, entry)) {
+ if (!qemu_opts_from_qdict_entry(opts, entry, errp)) {
+ qemu_opts_del(opts);
+ return NULL;
+ }
+ }
+
+ return opts;
+}
+
+/*
+ * Adds all QDict entries to the QemuOpts that can be added and removes them
+ * from the QDict. When this function returns, the QDict contains only those
+ * entries that couldn't be added to the QemuOpts.
+ */
+bool qemu_opts_absorb_qdict(QemuOpts *opts, QDict *qdict, Error **errp)
+{
+ const QDictEntry *entry, *next;
+
+ entry = qdict_first(qdict);
+
+ while (entry != NULL) {
+ next = qdict_next(qdict, entry);
+
+ if (opts_accepts_any(opts->list) ||
+ find_desc_by_name(opts->list->desc, entry->key)) {
+ if (!qemu_opts_from_qdict_entry(opts, entry, errp)) {
+ return false;
+ }
+ qdict_del(qdict, entry->key);
+ }
+
+ entry = next;
+ }
+
+ return true;
+}
+
+/*
+ * Convert from QemuOpts to QDict. The QDict values are of type QString.
+ *
+ * If @list is given, only add those options to the QDict that are contained in
+ * the list. If @del is true, any options added to the QDict are removed from
+ * the QemuOpts, otherwise they remain there.
+ *
+ * If two options in @opts have the same name, they are processed in order
+ * so that the last one wins (consistent with the reverse iteration in
+ * qemu_opt_find()), but all of them are deleted if @del is true.
+ *
+ * TODO We'll want to use types appropriate for opt->desc->type, but
+ * this is enough for now.
+ */
+QDict *qemu_opts_to_qdict_filtered(QemuOpts *opts, QDict *qdict,
+ QemuOptsList *list, bool del)
+{
+ QemuOpt *opt, *next;
+
+ if (!qdict) {
+ qdict = qdict_new();
+ }
+ if (opts->id) {
+ qdict_put_str(qdict, "id", opts->id);
+ }
+ QTAILQ_FOREACH_SAFE(opt, &opts->head, next, next) {
+ if (list) {
+ QemuOptDesc *desc;
+ bool found = false;
+ for (desc = list->desc; desc->name; desc++) {
+ if (!strcmp(desc->name, opt->name)) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ continue;
+ }
+ }
+ qdict_put_str(qdict, opt->name, opt->str);
+ if (del) {
+ qemu_opt_del(opt);
+ }
+ }
+ return qdict;
+}
+
+/* Copy all options in a QemuOpts to the given QDict. See
+ * qemu_opts_to_qdict_filtered() for details. */
+QDict *qemu_opts_to_qdict(QemuOpts *opts, QDict *qdict)
+{
+ return qemu_opts_to_qdict_filtered(opts, qdict, NULL, false);
+}
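
A round-trip sketch for the two conversion helpers above; opts is
assumed to exist and demo_opts is the illustrative list from the
earlier sketch:

    QDict *all = qemu_opts_to_qdict(opts, NULL);    /* copy everything */
    QDict *taken = qemu_opts_to_qdict_filtered(opts, NULL,
                                               &demo_opts, true);

    /* "taken" holds only options named in demo_opts->desc, and those
     * were deleted from opts; values are QStrings in both dicts. */
    qobject_unref(all);
    qobject_unref(taken);
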
+
+/* Validate parsed opts against @desc when the opts' own QemuOptsList
+ * provided no descriptions of its own (i.e. it accepts any option).
+ */
+bool qemu_opts_validate(QemuOpts *opts, const QemuOptDesc *desc, Error **errp)
+{
+ QemuOpt *opt;
+
+ assert(opts_accepts_any(opts->list));
+
+ QTAILQ_FOREACH(opt, &opts->head, next) {
+ opt->desc = find_desc_by_name(desc, opt->name);
+ if (!opt->desc) {
+ error_setg(errp, QERR_INVALID_PARAMETER, opt->name);
+ return false;
+ }
+
+ if (!qemu_opt_parse(opt, errp)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * For each member of @list, call @func(@opaque, member, @errp).
+ * Call it with the current location temporarily set to the member's.
+ * @func() may store an Error through @errp, but must return non-zero then.
+ * When @func() returns non-zero, break the loop and return that value.
+ * Return zero when the loop completes.
+ */
+int qemu_opts_foreach(QemuOptsList *list, qemu_opts_loopfunc func,
+ void *opaque, Error **errp)
+{
+ Location loc;
+ QemuOpts *opts, *next;
+ int rc = 0;
+
+ loc_push_none(&loc);
+ QTAILQ_FOREACH_SAFE(opts, &list->head, next, next) {
+ loc_restore(&opts->loc);
+ rc = func(opaque, opts, errp);
+ if (rc) {
+ break;
+ }
+ assert(!errp || !*errp);
+ }
+ loc_pop(&loc);
+ return rc;
+}
+
+static size_t count_opts_list(QemuOptsList *list)
+{
+ QemuOptDesc *desc = NULL;
+ size_t num_opts = 0;
+
+ if (!list) {
+ return 0;
+ }
+
+ desc = list->desc;
+ while (desc && desc->name) {
+ num_opts++;
+ desc++;
+ }
+
+ return num_opts;
+}
+
+void qemu_opts_free(QemuOptsList *list)
+{
+ g_free(list);
+}
+
+/* Realloc the @dst option list and append the option descriptors from
+ * @list to it. @dst may be NULL or a previously malloc'ed list.
+ * The lifetime of dst must be shorter than the input list because the
+ * QemuOptDesc->name, ->help, and ->def_value_str strings are shared.
+ */
+QemuOptsList *qemu_opts_append(QemuOptsList *dst,
+ QemuOptsList *list)
+{
+ size_t num_opts, num_dst_opts;
+ QemuOptDesc *desc;
+ bool need_init = false;
+ bool need_head_update;
+
+ if (!list) {
+ return dst;
+ }
+
+ /* If dst is NULL, after realloc, some area of dst should be initialized
+ * before adding options to it.
+ */
+ if (!dst) {
+ need_init = true;
+ need_head_update = true;
+ } else {
+ /* Moreover, even if dst is not NULL, the realloc may move it to a
+ * different address in which case we may get a stale tail pointer
+ * in dst->head. */
+ need_head_update = QTAILQ_EMPTY(&dst->head);
+ }
+
+ num_opts = count_opts_list(dst);
+ num_dst_opts = num_opts;
+ num_opts += count_opts_list(list);
+ dst = g_realloc(dst, sizeof(QemuOptsList) +
+ (num_opts + 1) * sizeof(QemuOptDesc));
+ if (need_init) {
+ dst->name = NULL;
+ dst->implied_opt_name = NULL;
+ dst->merge_lists = false;
+ }
+ if (need_head_update) {
+ QTAILQ_INIT(&dst->head);
+ }
+ dst->desc[num_dst_opts].name = NULL;
+
+ /* append list->desc to dst->desc */
+ if (list) {
+ desc = list->desc;
+ while (desc && desc->name) {
+ if (find_desc_by_name(dst->desc, desc->name) == NULL) {
+ dst->desc[num_dst_opts++] = *desc;
+ dst->desc[num_dst_opts].name = NULL;
+ }
+ desc++;
+ }
+ }
+
+ return dst;
+}
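
A merge sketch for qemu_opts_append(); generic_opts and driver_opts are
hypothetical descriptor lists:

    QemuOptsList *merged = qemu_opts_append(NULL, &generic_opts);

    merged = qemu_opts_append(merged, &driver_opts);
    /* duplicate names in driver_opts were skipped; merged->desc stays
     * NULL-terminated and must not outlive either input list */
    qemu_opts_free(merged);
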
diff --git a/util/qemu-print.c b/util/qemu-print.c
new file mode 100644
index 000000000..69ba612f5
--- /dev/null
+++ b/util/qemu-print.c
@@ -0,0 +1,70 @@
+/*
+ * Print to stream or current monitor
+ *
+ * Copyright (C) 2019 Red Hat Inc.
+ *
+ * Authors:
+ * Markus Armbruster <armbru@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "monitor/monitor.h"
+#include "qemu/qemu-print.h"
+
+/*
+ * Print like vprintf().
+ * Print to current monitor if we have one, else to stdout.
+ */
+int qemu_vprintf(const char *fmt, va_list ap)
+{
+ Monitor *cur_mon = monitor_cur();
+ if (cur_mon) {
+ return monitor_vprintf(cur_mon, fmt, ap);
+ }
+ return vprintf(fmt, ap);
+}
+
+/*
+ * Print like printf().
+ * Print to current monitor if we have one, else to stdout.
+ */
+int qemu_printf(const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = qemu_vprintf(fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+/*
+ * Print like vfprintf().
+ * Print to @stream if non-null, else to current monitor.
+ */
+int qemu_vfprintf(FILE *stream, const char *fmt, va_list ap)
+{
+ if (!stream) {
+ return monitor_vprintf(monitor_cur(), fmt, ap);
+ }
+ return vfprintf(stream, fmt, ap);
+}
+
+/*
+ * Print like fprintf().
+ * Print to @stream if non-null, else to current monitor.
+ */
+int qemu_fprintf(FILE *stream, const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+
+ va_start(ap, fmt);
+ ret = qemu_vfprintf(stream, fmt, ap);
+ va_end(ap);
+ return ret;
+}
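
The intended use is command handlers that must work both under a
monitor and in a plain stdio tool; a sketch with a hypothetical
HMP-style handler:

    static void hmp_info_demo(Monitor *mon, const QDict *qdict)
    {
        /* goes to the current monitor when one exists, else stdout */
        qemu_printf("demo state: %s\n", "running");
    }
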
diff --git a/util/qemu-progress.c b/util/qemu-progress.c
new file mode 100644
index 000000000..20d51f8c1
--- /dev/null
+++ b/util/qemu-progress.c
@@ -0,0 +1,162 @@
+/*
+ * QEMU progress printing utility functions
+ *
+ * Copyright (C) 2011 Jes Sorensen <Jes.Sorensen@redhat.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+struct progress_state {
+ float current;
+ float last_print;
+ float min_skip;
+ void (*print)(void);
+ void (*end)(void);
+};
+
+static struct progress_state state;
+static volatile sig_atomic_t print_pending;
+
+/*
+ * Simple progress print function.
+ * Prints the overall completion percentage kept in state.current.
+ */
+static void progress_simple_print(void)
+{
+ printf(" (%3.2f/100%%)\r", state.current);
+ fflush(stdout);
+}
+
+static void progress_simple_end(void)
+{
+ printf("\n");
+}
+
+static void progress_simple_init(void)
+{
+ state.print = progress_simple_print;
+ state.end = progress_simple_end;
+}
+
+#ifdef CONFIG_POSIX
+static void sigusr_print(int signal)
+{
+ print_pending = 1;
+}
+#endif
+
+static void progress_dummy_print(void)
+{
+ if (print_pending) {
+ fprintf(stderr, " (%3.2f/100%%)\n", state.current);
+ print_pending = 0;
+ }
+}
+
+static void progress_dummy_end(void)
+{
+}
+
+static void progress_dummy_init(void)
+{
+#ifdef CONFIG_POSIX
+ struct sigaction action;
+ sigset_t set;
+
+ memset(&action, 0, sizeof(action));
+ sigfillset(&action.sa_mask);
+ action.sa_handler = sigusr_print;
+ action.sa_flags = 0;
+ sigaction(SIGUSR1, &action, NULL);
+#ifdef SIGINFO
+ sigaction(SIGINFO, &action, NULL);
+#endif
+
+ /*
+     * SIGUSR1 is SIG_IPI and gets blocked in qemu_init_main_loop(). In the
+     * tools that use the progress report, SIGUSR1 has no such meaning and
+     * should instead print the progress, so re-enable it.
+ */
+ sigemptyset(&set);
+ sigaddset(&set, SIGUSR1);
+ pthread_sigmask(SIG_UNBLOCK, &set, NULL);
+#endif
+
+ state.print = progress_dummy_print;
+ state.end = progress_dummy_end;
+}
+
+/*
+ * Initialize progress reporting.
+ * If @enabled is false, actual reporting is suppressed. The user can
+ * still trigger a report by sending a SIGUSR1.
+ * Reports are also suppressed unless we've had at least @min_skip
+ * percent progress since the last report.
+ */
+void qemu_progress_init(int enabled, float min_skip)
+{
+ state.min_skip = min_skip;
+ if (enabled) {
+ progress_simple_init();
+ } else {
+ progress_dummy_init();
+ }
+}
+
+void qemu_progress_end(void)
+{
+ state.end();
+}
+
+/*
+ * Report progress.
+ * @delta is how much progress we made.
+ * If @max is zero, @delta is an absolute value of the total job done.
+ * Else, @delta is a progress delta since the last call, as a fraction
+ * of @max. I.e. the delta is @delta * @max / 100. This allows
+ * relative accounting of functions which may be a different fraction of
+ * the full job, depending on the context they are called in. I.e.
+ * a function might be considered 40% of the full job if used from
+ * bdrv_img_create() but only 20% if called from img_convert().
+ */
+void qemu_progress_print(float delta, int max)
+{
+ float current;
+
+ if (max == 0) {
+ current = delta;
+ } else {
+ current = state.current + delta / 100 * max;
+ }
+ if (current > 100) {
+ current = 100;
+ }
+ state.current = current;
+
+ if (current > (state.last_print + state.min_skip) ||
+ current < (state.last_print - state.min_skip) ||
+ current == 100 || current == 0) {
+ state.last_print = state.current;
+ state.print();
+ }
+}
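
A reporting sketch tying the pieces together; convert_one_block() is a
hypothetical work item and the loop is assumed to be the whole job:

    static void convert_all(int blocks)
    {
        qemu_progress_init(true, 2.0);     /* print at most every 2% */
        qemu_progress_print(0, 100);       /* force the initial 0% line */

        for (int i = 0; i < blocks; i++) {
            convert_one_block(i);
            /* each call adds (100.0 / blocks) percent of the total job */
            qemu_progress_print(100.0 / blocks, 100);
        }
        qemu_progress_end();
    }
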
diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
new file mode 100644
index 000000000..0585e7a62
--- /dev/null
+++ b/util/qemu-sockets.c
@@ -0,0 +1,1482 @@
+/*
+ * inet and unix socket functions for qemu
+ *
+ * (c) 2008 Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+#include "qemu/osdep.h"
+
+#ifdef CONFIG_AF_VSOCK
+#include <linux/vm_sockets.h>
+#endif /* CONFIG_AF_VSOCK */
+
+#include "qemu-common.h"
+#include "monitor/monitor.h"
+#include "qapi/clone-visitor.h"
+#include "qapi/error.h"
+#include "qapi/qapi-visit-sockets.h"
+#include "qemu/sockets.h"
+#include "qemu/main-loop.h"
+#include "qapi/qobject-input-visitor.h"
+#include "qapi/qobject-output-visitor.h"
+#include "qemu/cutils.h"
+#include "trace.h"
+
+#ifndef AI_ADDRCONFIG
+# define AI_ADDRCONFIG 0
+#endif
+
+#ifndef AI_V4MAPPED
+# define AI_V4MAPPED 0
+#endif
+
+#ifndef AI_NUMERICSERV
+# define AI_NUMERICSERV 0
+#endif
+
+
+static int inet_getport(struct addrinfo *e)
+{
+ struct sockaddr_in *i4;
+ struct sockaddr_in6 *i6;
+
+ switch (e->ai_family) {
+    case PF_INET6:
+        i6 = (void *)e->ai_addr;
+        return ntohs(i6->sin6_port);
+    case PF_INET:
+        i4 = (void *)e->ai_addr;
+        return ntohs(i4->sin_port);
+ default:
+ return 0;
+ }
+}
+
+static void inet_setport(struct addrinfo *e, int port)
+{
+ struct sockaddr_in *i4;
+ struct sockaddr_in6 *i6;
+
+ switch (e->ai_family) {
+    case PF_INET6:
+        i6 = (void *)e->ai_addr;
+        i6->sin6_port = htons(port);
+        break;
+    case PF_INET:
+        i4 = (void *)e->ai_addr;
+        i4->sin_port = htons(port);
+        break;
+ }
+}
+
+NetworkAddressFamily inet_netfamily(int family)
+{
+ switch (family) {
+ case PF_INET6: return NETWORK_ADDRESS_FAMILY_IPV6;
+ case PF_INET: return NETWORK_ADDRESS_FAMILY_IPV4;
+ case PF_UNIX: return NETWORK_ADDRESS_FAMILY_UNIX;
+#ifdef CONFIG_AF_VSOCK
+ case PF_VSOCK: return NETWORK_ADDRESS_FAMILY_VSOCK;
+#endif /* CONFIG_AF_VSOCK */
+ }
+ return NETWORK_ADDRESS_FAMILY_UNKNOWN;
+}
+
+bool fd_is_socket(int fd)
+{
+ int optval;
+ socklen_t optlen = sizeof(optval);
+ return !qemu_getsockopt(fd, SOL_SOCKET, SO_TYPE, &optval, &optlen);
+}
+
+
+/*
+ * Matrix we're trying to apply
+ *
+ * ipv4 ipv6 family
+ * - - PF_UNSPEC
+ * - f PF_INET
+ * - t PF_INET6
+ * f - PF_INET6
+ * f f <error>
+ * f t PF_INET6
+ * t - PF_INET
+ * t f PF_INET
+ * t t PF_INET6/PF_UNSPEC
+ *
+ * NB, this matrix is only about getting the necessary results
+ * from getaddrinfo(). Some of the cases require further work
+ * after reading results from getaddrinfo in order to fully
+ * apply the logic the end user wants.
+ *
+ * In the first and last cases, we must set IPV6_V6ONLY=0
+ * when binding, to allow a single listener to potentially
+ * accept both IPv4+6 addresses.
+ */
+int inet_ai_family_from_address(InetSocketAddress *addr,
+ Error **errp)
+{
+ if (addr->has_ipv6 && addr->has_ipv4 &&
+ !addr->ipv6 && !addr->ipv4) {
+ error_setg(errp, "Cannot disable IPv4 and IPv6 at same time");
+ return PF_UNSPEC;
+ }
+ if ((addr->has_ipv6 && addr->ipv6) && (addr->has_ipv4 && addr->ipv4)) {
+ /*
+ * Some backends can only do a single listener. In that case
+ * we want empty hostname to resolve to "::" and then use the
+ * flag IPV6_V6ONLY==0 to get both protocols on 1 socket. This
+ * doesn't work for addresses other than "", so they're just
+ * inevitably broken until multiple listeners can be used,
+         * and thus we honour getaddrinfo's automatic protocol detection.
+ * Once all backends do multi-listener, remove the PF_INET6
+ * branch entirely.
+ */
+ if (!addr->host || g_str_equal(addr->host, "")) {
+ return PF_INET6;
+ } else {
+ return PF_UNSPEC;
+ }
+ }
+ if ((addr->has_ipv6 && addr->ipv6) || (addr->has_ipv4 && !addr->ipv4)) {
+ return PF_INET6;
+ }
+ if ((addr->has_ipv4 && addr->ipv4) || (addr->has_ipv6 && !addr->ipv6)) {
+ return PF_INET;
+ }
+ return PF_UNSPEC;
+}
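
How the matrix above resolves for two of its rows; a sketch (passing
NULL as errp is legal in QEMU's Error API and simply discards the
error):

    InetSocketAddress a = { .host = (char *)"", .port = (char *)"5900" };

    /* neither ipv4 nor ipv6 given -> PF_UNSPEC; try_bind() will then
     * clear IPV6_V6ONLY so one listener serves both protocols */
    assert(inet_ai_family_from_address(&a, NULL) == PF_UNSPEC);

    a.has_ipv4 = true;    /* ipv4=off: IPv6 only */
    assert(inet_ai_family_from_address(&a, NULL) == PF_INET6);
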
+
+static int create_fast_reuse_socket(struct addrinfo *e)
+{
+ int slisten = qemu_socket(e->ai_family, e->ai_socktype, e->ai_protocol);
+ if (slisten < 0) {
+ return -1;
+ }
+ socket_set_fast_reuse(slisten);
+ return slisten;
+}
+
+static int try_bind(int socket, InetSocketAddress *saddr, struct addrinfo *e)
+{
+#ifndef IPV6_V6ONLY
+ return bind(socket, e->ai_addr, e->ai_addrlen);
+#else
+ /*
+ * Deals with first & last cases in matrix in comment
+ * for inet_ai_family_from_address().
+ */
+ int v6only =
+ ((!saddr->has_ipv4 && !saddr->has_ipv6) ||
+ (saddr->has_ipv4 && saddr->ipv4 &&
+ saddr->has_ipv6 && saddr->ipv6)) ? 0 : 1;
+ int stat;
+
+ rebind:
+ if (e->ai_family == PF_INET6) {
+ qemu_setsockopt(socket, IPPROTO_IPV6, IPV6_V6ONLY, &v6only,
+ sizeof(v6only));
+ }
+
+ stat = bind(socket, e->ai_addr, e->ai_addrlen);
+ if (!stat) {
+ return 0;
+ }
+
+ /* If we got EADDRINUSE from an IPv6 bind & v6only is unset,
+ * it could be that the IPv4 port is already claimed, so retry
+     * with v6only set.
+ */
+ if (e->ai_family == PF_INET6 && errno == EADDRINUSE && !v6only) {
+ v6only = 1;
+ goto rebind;
+ }
+ return stat;
+#endif
+}
+
+static int inet_listen_saddr(InetSocketAddress *saddr,
+ int port_offset,
+ int num,
+ Error **errp)
+{
+    struct addrinfo ai, *res, *e;
+ char port[33];
+ char uaddr[INET6_ADDRSTRLEN+1];
+ char uport[33];
+ int rc, port_min, port_max, p;
+ int slisten = -1;
+ int saved_errno = 0;
+ bool socket_created = false;
+ Error *err = NULL;
+
+ if (saddr->keep_alive) {
+ error_setg(errp, "keep-alive option is not supported for passive "
+ "sockets");
+ return -1;
+ }
+
+    memset(&ai, 0, sizeof(ai));
+ ai.ai_flags = AI_PASSIVE;
+ if (saddr->has_numeric && saddr->numeric) {
+ ai.ai_flags |= AI_NUMERICHOST | AI_NUMERICSERV;
+ }
+ ai.ai_family = inet_ai_family_from_address(saddr, &err);
+ ai.ai_socktype = SOCK_STREAM;
+
+ if (err) {
+ error_propagate(errp, err);
+ return -1;
+ }
+
+ if (saddr->host == NULL) {
+ error_setg(errp, "host not specified");
+ return -1;
+ }
+ if (saddr->port != NULL) {
+ pstrcpy(port, sizeof(port), saddr->port);
+ } else {
+ port[0] = '\0';
+ }
+
+ /* lookup */
+ if (port_offset) {
+ unsigned long long baseport;
+ if (strlen(port) == 0) {
+ error_setg(errp, "port not specified");
+ return -1;
+ }
+ if (parse_uint_full(port, &baseport, 10) < 0) {
+ error_setg(errp, "can't convert to a number: %s", port);
+ return -1;
+ }
+ if (baseport > 65535 ||
+ baseport + port_offset > 65535) {
+ error_setg(errp, "port %s out of range", port);
+ return -1;
+ }
+ snprintf(port, sizeof(port), "%d", (int)baseport + port_offset);
+ }
+ rc = getaddrinfo(strlen(saddr->host) ? saddr->host : NULL,
+ strlen(port) ? port : NULL, &ai, &res);
+ if (rc != 0) {
+ error_setg(errp, "address resolution failed for %s:%s: %s",
+ saddr->host, port, gai_strerror(rc));
+ return -1;
+ }
+
+ /* create socket + bind/listen */
+ for (e = res; e != NULL; e = e->ai_next) {
+#ifdef HAVE_IPPROTO_MPTCP
+ if (saddr->has_mptcp && saddr->mptcp) {
+ e->ai_protocol = IPPROTO_MPTCP;
+ }
+#endif
+        getnameinfo((struct sockaddr *)e->ai_addr, e->ai_addrlen,
+                    uaddr, INET6_ADDRSTRLEN, uport, 32,
+                    NI_NUMERICHOST | NI_NUMERICSERV);
+
+ port_min = inet_getport(e);
+ port_max = saddr->has_to ? saddr->to + port_offset : port_min;
+ for (p = port_min; p <= port_max; p++) {
+ inet_setport(e, p);
+
+ slisten = create_fast_reuse_socket(e);
+ if (slisten < 0) {
+                /* The first iteration may fail to create the socket,
+                 * e.g. if 'e' has AF_INET6 but the ipv6 kmod is not
+                 * loaded. Later iterations should always succeed if
+                 * the first one worked, though, so treat that as fatal.
+ */
+ if (p == port_min) {
+ continue;
+ } else {
+ error_setg_errno(errp, errno,
+ "Failed to recreate failed listening socket");
+ goto listen_failed;
+ }
+ }
+ socket_created = true;
+
+ rc = try_bind(slisten, saddr, e);
+ if (rc < 0) {
+ if (errno != EADDRINUSE) {
+ error_setg_errno(errp, errno, "Failed to bind socket");
+ goto listen_failed;
+ }
+ } else {
+ if (!listen(slisten, num)) {
+ goto listen_ok;
+ }
+ if (errno != EADDRINUSE) {
+ error_setg_errno(errp, errno, "Failed to listen on socket");
+ goto listen_failed;
+ }
+ }
+ /* Someone else managed to bind to the same port and beat us
+             * to listen on it! Socket semantics do not allow us to
+ * recover from this situation, so we need to recreate the
+ * socket to allow bind attempts for subsequent ports:
+ */
+ closesocket(slisten);
+ slisten = -1;
+ }
+ }
+ error_setg_errno(errp, errno,
+ socket_created ?
+ "Failed to find an available port" :
+ "Failed to create a socket");
+listen_failed:
+ saved_errno = errno;
+ if (slisten >= 0) {
+ closesocket(slisten);
+ }
+ freeaddrinfo(res);
+ errno = saved_errno;
+ return -1;
+
+listen_ok:
+ freeaddrinfo(res);
+ return slisten;
+}
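
A listening sketch using the port-range scan above; inet_listen_saddr()
is static here and is normally reached through the public
socket_listen() path, so the direct call and the values are purely
illustrative:

    InetSocketAddress addr = {
        .host = (char *)"0.0.0.0",
        .port = (char *)"5900",
        .has_to = true,
        .to = 5910,    /* scan ports 5900..5910 for a free one */
    };
    Error *err = NULL;
    int fd = inet_listen_saddr(&addr, 0 /* port_offset */,
                               1 /* backlog */, &err);

    if (fd < 0) {
        error_report_err(err);
    }
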
+
+#ifdef _WIN32
+#define QEMU_SOCKET_RC_INPROGRESS(rc) \
+ ((rc) == -EINPROGRESS || (rc) == -EWOULDBLOCK || (rc) == -WSAEALREADY)
+#else
+#define QEMU_SOCKET_RC_INPROGRESS(rc) \
+ ((rc) == -EINPROGRESS)
+#endif
+
+static int inet_connect_addr(const InetSocketAddress *saddr,
+ struct addrinfo *addr, Error **errp)
+{
+ int sock, rc;
+
+ sock = qemu_socket(addr->ai_family, addr->ai_socktype, addr->ai_protocol);
+ if (sock < 0) {
+ error_setg_errno(errp, errno, "Failed to create socket family %d",
+ addr->ai_family);
+ return -1;
+ }
+ socket_set_fast_reuse(sock);
+
+ /* connect to peer */
+ do {
+ rc = 0;
+ if (connect(sock, addr->ai_addr, addr->ai_addrlen) < 0) {
+ rc = -errno;
+ }
+ } while (rc == -EINTR);
+
+ if (rc < 0) {
+ error_setg_errno(errp, errno, "Failed to connect to '%s:%s'",
+ saddr->host, saddr->port);
+ closesocket(sock);
+ return -1;
+ }
+
+ return sock;
+}
+
+static struct addrinfo *inet_parse_connect_saddr(InetSocketAddress *saddr,
+ Error **errp)
+{
+ struct addrinfo ai, *res;
+ int rc;
+ Error *err = NULL;
+ static int useV4Mapped = 1;
+
+ memset(&ai, 0, sizeof(ai));
+
+ ai.ai_flags = AI_CANONNAME | AI_ADDRCONFIG;
+ if (qatomic_read(&useV4Mapped)) {
+ ai.ai_flags |= AI_V4MAPPED;
+ }
+ ai.ai_family = inet_ai_family_from_address(saddr, &err);
+ ai.ai_socktype = SOCK_STREAM;
+
+ if (err) {
+ error_propagate(errp, err);
+ return NULL;
+ }
+
+ if (saddr->host == NULL || saddr->port == NULL) {
+ error_setg(errp, "host and/or port not specified");
+ return NULL;
+ }
+
+ /* lookup */
+ rc = getaddrinfo(saddr->host, saddr->port, &ai, &res);
+
+ /* At least FreeBSD and OS-X 10.6 declare AI_V4MAPPED but
+ * then don't implement it in their getaddrinfo(). Detect
+ * this and retry without the flag since that's preferable
+ * to a fatal error
+     * to a fatal error.
+ if (rc == EAI_BADFLAGS &&
+ (ai.ai_flags & AI_V4MAPPED)) {
+ qatomic_set(&useV4Mapped, 0);
+ ai.ai_flags &= ~AI_V4MAPPED;
+ rc = getaddrinfo(saddr->host, saddr->port, &ai, &res);
+ }
+ if (rc != 0) {
+ error_setg(errp, "address resolution failed for %s:%s: %s",
+ saddr->host, saddr->port, gai_strerror(rc));
+ return NULL;
+ }
+ return res;
+}
+
+/**
+ * Create a socket and connect it to an address.
+ *
+ * @saddr: Inet socket address specification
+ * @errp: set on error
+ *
+ * Returns: -1 on error, file descriptor on success.
+ */
+int inet_connect_saddr(InetSocketAddress *saddr, Error **errp)
+{
+ Error *local_err = NULL;
+ struct addrinfo *res, *e;
+ int sock = -1;
+
+ res = inet_parse_connect_saddr(saddr, errp);
+ if (!res) {
+ return -1;
+ }
+
+ for (e = res; e != NULL; e = e->ai_next) {
+ error_free(local_err);
+ local_err = NULL;
+
+#ifdef HAVE_IPPROTO_MPTCP
+ if (saddr->has_mptcp && saddr->mptcp) {
+ e->ai_protocol = IPPROTO_MPTCP;
+ }
+#endif
+
+ sock = inet_connect_addr(saddr, e, &local_err);
+ if (sock >= 0) {
+ break;
+ }
+ }
+
+ freeaddrinfo(res);
+
+ if (sock < 0) {
+ error_propagate(errp, local_err);
+ return sock;
+ }
+
+ if (saddr->keep_alive) {
+ int val = 1;
+ int ret = qemu_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
+ &val, sizeof(val));
+
+ if (ret < 0) {
+ error_setg_errno(errp, errno, "Unable to set KEEPALIVE");
+ close(sock);
+ return -1;
+ }
+ }
+
+ return sock;
+}
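+
+/*
+ * Example (hypothetical caller): connect to a TCP endpoint described
+ * by an InetSocketAddress. A minimal sketch; the host and port values
+ * are illustrative only:
+ *
+ *     InetSocketAddress addr = { .host = (char *)"127.0.0.1",
+ *                                .port = (char *)"5900" };
+ *     Error *err = NULL;
+ *     int fd = inet_connect_saddr(&addr, &err);
+ *     if (fd < 0) {
+ *         error_report_err(err);
+ *     }
+ */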
+
+static int inet_dgram_saddr(InetSocketAddress *sraddr,
+ InetSocketAddress *sladdr,
+ Error **errp)
+{
+ struct addrinfo ai, *peer = NULL, *local = NULL;
+ const char *addr;
+ const char *port;
+ int sock = -1, rc;
+ Error *err = NULL;
+
+ /* lookup peer addr */
+    memset(&ai, 0, sizeof(ai));
+ ai.ai_flags = AI_CANONNAME | AI_V4MAPPED | AI_ADDRCONFIG;
+ ai.ai_family = inet_ai_family_from_address(sraddr, &err);
+ ai.ai_socktype = SOCK_DGRAM;
+
+ if (err) {
+ error_propagate(errp, err);
+ goto err;
+ }
+
+ addr = sraddr->host;
+ port = sraddr->port;
+ if (addr == NULL || strlen(addr) == 0) {
+ addr = "localhost";
+ }
+ if (port == NULL || strlen(port) == 0) {
+ error_setg(errp, "remote port not specified");
+ goto err;
+ }
+
+ if ((rc = getaddrinfo(addr, port, &ai, &peer)) != 0) {
+ error_setg(errp, "address resolution failed for %s:%s: %s", addr, port,
+ gai_strerror(rc));
+ goto err;
+ }
+
+ /* lookup local addr */
+    memset(&ai, 0, sizeof(ai));
+ ai.ai_flags = AI_PASSIVE;
+ ai.ai_family = peer->ai_family;
+ ai.ai_socktype = SOCK_DGRAM;
+
+ if (sladdr) {
+ addr = sladdr->host;
+ port = sladdr->port;
+ if (addr == NULL || strlen(addr) == 0) {
+ addr = NULL;
+ }
+ if (!port || strlen(port) == 0) {
+ port = "0";
+ }
+ } else {
+ addr = NULL;
+ port = "0";
+ }
+
+ if ((rc = getaddrinfo(addr, port, &ai, &local)) != 0) {
+ error_setg(errp, "address resolution failed for %s:%s: %s", addr, port,
+ gai_strerror(rc));
+ goto err;
+ }
+
+ /* create socket */
+ sock = qemu_socket(peer->ai_family, peer->ai_socktype, peer->ai_protocol);
+ if (sock < 0) {
+ error_setg_errno(errp, errno, "Failed to create socket family %d",
+ peer->ai_family);
+ goto err;
+ }
+ socket_set_fast_reuse(sock);
+
+ /* bind socket */
+ if (bind(sock, local->ai_addr, local->ai_addrlen) < 0) {
+ error_setg_errno(errp, errno, "Failed to bind socket");
+ goto err;
+ }
+
+    /* connect the datagram socket to the peer: this fixes the default
+     * destination so plain send()/recv() work without an address */
+    if (connect(sock, peer->ai_addr, peer->ai_addrlen) < 0) {
+ error_setg_errno(errp, errno, "Failed to connect to '%s:%s'",
+ addr, port);
+ goto err;
+ }
+
+ freeaddrinfo(local);
+ freeaddrinfo(peer);
+ return sock;
+
+err:
+ if (sock != -1) {
+ closesocket(sock);
+ }
+ if (local) {
+ freeaddrinfo(local);
+ }
+ if (peer) {
+ freeaddrinfo(peer);
+ }
+
+ return -1;
+}
+
+/* parse a "flag=on|off" style option within an option string */
+static int inet_parse_flag(const char *flagname, const char *optstr, bool *val,
+ Error **errp)
+{
+ char *end;
+ size_t len;
+
+ end = strstr(optstr, ",");
+ if (end) {
+ if (end[1] == ',') { /* Reject 'ipv6=on,,foo' */
+ error_setg(errp, "error parsing '%s' flag '%s'", flagname, optstr);
+ return -1;
+ }
+ len = end - optstr;
+ } else {
+ len = strlen(optstr);
+ }
+ if (len == 0 || (len == 3 && strncmp(optstr, "=on", len) == 0)) {
+ *val = true;
+ } else if (len == 4 && strncmp(optstr, "=off", len) == 0) {
+ *val = false;
+ } else {
+ error_setg(errp, "error parsing '%s' flag '%s'", flagname, optstr);
+ return -1;
+ }
+ return 0;
+}
+
+int inet_parse(InetSocketAddress *addr, const char *str, Error **errp)
+{
+ const char *optstr, *h;
+ char host[65];
+ char port[33];
+ int to;
+ int pos;
+ char *begin;
+
+ memset(addr, 0, sizeof(*addr));
+
+ /* parse address */
+ if (str[0] == ':') {
+ /* no host given */
+ host[0] = '\0';
+ if (sscanf(str, ":%32[^,]%n", port, &pos) != 1) {
+ error_setg(errp, "error parsing port in address '%s'", str);
+ return -1;
+ }
+ } else if (str[0] == '[') {
+ /* IPv6 addr */
+ if (sscanf(str, "[%64[^]]]:%32[^,]%n", host, port, &pos) != 2) {
+ error_setg(errp, "error parsing IPv6 address '%s'", str);
+ return -1;
+ }
+ } else {
+ /* hostname or IPv4 addr */
+ if (sscanf(str, "%64[^:]:%32[^,]%n", host, port, &pos) != 2) {
+ error_setg(errp, "error parsing address '%s'", str);
+ return -1;
+ }
+ }
+
+ addr->host = g_strdup(host);
+ addr->port = g_strdup(port);
+
+ /* parse options */
+ optstr = str + pos;
+ h = strstr(optstr, ",to=");
+ if (h) {
+ h += 4;
+ if (sscanf(h, "%d%n", &to, &pos) != 1 ||
+ (h[pos] != '\0' && h[pos] != ',')) {
+ error_setg(errp, "error parsing to= argument");
+ return -1;
+ }
+ addr->has_to = true;
+ addr->to = to;
+ }
+ begin = strstr(optstr, ",ipv4");
+ if (begin) {
+ if (inet_parse_flag("ipv4", begin + 5, &addr->ipv4, errp) < 0) {
+ return -1;
+ }
+ addr->has_ipv4 = true;
+ }
+ begin = strstr(optstr, ",ipv6");
+ if (begin) {
+ if (inet_parse_flag("ipv6", begin + 5, &addr->ipv6, errp) < 0) {
+ return -1;
+ }
+ addr->has_ipv6 = true;
+ }
+ begin = strstr(optstr, ",keep-alive");
+ if (begin) {
+ if (inet_parse_flag("keep-alive", begin + strlen(",keep-alive"),
+ &addr->keep_alive, errp) < 0)
+ {
+ return -1;
+ }
+ addr->has_keep_alive = true;
+ }
+#ifdef HAVE_IPPROTO_MPTCP
+ begin = strstr(optstr, ",mptcp");
+ if (begin) {
+ if (inet_parse_flag("mptcp", begin + strlen(",mptcp"),
+ &addr->mptcp, errp) < 0)
+ {
+ return -1;
+ }
+ addr->has_mptcp = true;
+ }
+#endif
+ return 0;
+}
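+
+/*
+ * Example strings accepted by inet_parse() (hypothetical values):
+ *
+ *     "localhost:4444"              host and port
+ *     ":4444,to=4448"               any host; to= sets an upper port
+ *                                   bound used when listening
+ *     "[2001:db8::1]:443,ipv6=on"   IPv6 literal plus an option flag
+ *
+ * Flags such as ipv4, ipv6 and keep-alive accept the bare form
+ * (",ipv6"), ",ipv6=on" or ",ipv6=off", as handled by
+ * inet_parse_flag() above.
+ */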
+
+
+/**
+ * Create a blocking socket and connect it to an address.
+ *
+ * @str: address string
+ * @errp: set in case of an error
+ *
+ * Returns -1 in case of error, file descriptor on success
+ **/
+int inet_connect(const char *str, Error **errp)
+{
+ int sock = -1;
+ InetSocketAddress *addr = g_new(InetSocketAddress, 1);
+
+ if (!inet_parse(addr, str, errp)) {
+ sock = inet_connect_saddr(addr, errp);
+ }
+ qapi_free_InetSocketAddress(addr);
+ return sock;
+}
+
+#ifdef CONFIG_AF_VSOCK
+static bool vsock_parse_vaddr_to_sockaddr(const VsockSocketAddress *vaddr,
+ struct sockaddr_vm *svm,
+ Error **errp)
+{
+ unsigned long long val;
+
+ memset(svm, 0, sizeof(*svm));
+ svm->svm_family = AF_VSOCK;
+
+ if (parse_uint_full(vaddr->cid, &val, 10) < 0 ||
+ val > UINT32_MAX) {
+ error_setg(errp, "Failed to parse cid '%s'", vaddr->cid);
+ return false;
+ }
+ svm->svm_cid = val;
+
+ if (parse_uint_full(vaddr->port, &val, 10) < 0 ||
+ val > UINT32_MAX) {
+ error_setg(errp, "Failed to parse port '%s'", vaddr->port);
+ return false;
+ }
+ svm->svm_port = val;
+
+ return true;
+}
+
+static int vsock_connect_addr(const VsockSocketAddress *vaddr,
+ const struct sockaddr_vm *svm, Error **errp)
+{
+ int sock, rc;
+
+ sock = qemu_socket(AF_VSOCK, SOCK_STREAM, 0);
+ if (sock < 0) {
+ error_setg_errno(errp, errno, "Failed to create socket family %d",
+ AF_VSOCK);
+ return -1;
+ }
+
+ /* connect to peer */
+ do {
+ rc = 0;
+ if (connect(sock, (const struct sockaddr *)svm, sizeof(*svm)) < 0) {
+ rc = -errno;
+ }
+ } while (rc == -EINTR);
+
+ if (rc < 0) {
+ error_setg_errno(errp, errno, "Failed to connect to '%s:%s'",
+ vaddr->cid, vaddr->port);
+ closesocket(sock);
+ return -1;
+ }
+
+ return sock;
+}
+
+static int vsock_connect_saddr(VsockSocketAddress *vaddr, Error **errp)
+{
+ struct sockaddr_vm svm;
+
+ if (!vsock_parse_vaddr_to_sockaddr(vaddr, &svm, errp)) {
+ return -1;
+ }
+
+ return vsock_connect_addr(vaddr, &svm, errp);
+}
+
+static int vsock_listen_saddr(VsockSocketAddress *vaddr,
+ int num,
+ Error **errp)
+{
+ struct sockaddr_vm svm;
+ int slisten;
+
+ if (!vsock_parse_vaddr_to_sockaddr(vaddr, &svm, errp)) {
+ return -1;
+ }
+
+ slisten = qemu_socket(AF_VSOCK, SOCK_STREAM, 0);
+ if (slisten < 0) {
+ error_setg_errno(errp, errno, "Failed to create socket");
+ return -1;
+ }
+
+ if (bind(slisten, (const struct sockaddr *)&svm, sizeof(svm)) != 0) {
+ error_setg_errno(errp, errno, "Failed to bind socket");
+ closesocket(slisten);
+ return -1;
+ }
+
+ if (listen(slisten, num) != 0) {
+ error_setg_errno(errp, errno, "Failed to listen on socket");
+ closesocket(slisten);
+ return -1;
+ }
+ return slisten;
+}
+
+static int vsock_parse(VsockSocketAddress *addr, const char *str,
+ Error **errp)
+{
+ char cid[33];
+ char port[33];
+ int n;
+
+ if (sscanf(str, "%32[^:]:%32[^,]%n", cid, port, &n) != 2) {
+ error_setg(errp, "error parsing address '%s'", str);
+ return -1;
+ }
+ if (str[n] != '\0') {
+ error_setg(errp, "trailing characters in address '%s'", str);
+ return -1;
+ }
+
+ addr->cid = g_strdup(cid);
+ addr->port = g_strdup(port);
+ return 0;
+}
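+
+/*
+ * Example (hypothetical values): "3:1234" parses to cid "3" (the
+ * peer's context ID) and port "1234".
+ */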
+#else
+static void vsock_unsupported(Error **errp)
+{
+ error_setg(errp, "socket family AF_VSOCK unsupported");
+}
+
+static int vsock_connect_saddr(VsockSocketAddress *vaddr, Error **errp)
+{
+ vsock_unsupported(errp);
+ return -1;
+}
+
+static int vsock_listen_saddr(VsockSocketAddress *vaddr,
+ int num,
+ Error **errp)
+{
+ vsock_unsupported(errp);
+ return -1;
+}
+
+static int vsock_parse(VsockSocketAddress *addr, const char *str,
+ Error **errp)
+{
+ vsock_unsupported(errp);
+ return -1;
+}
+#endif /* CONFIG_AF_VSOCK */
+
+#ifndef _WIN32
+
+static bool saddr_is_abstract(UnixSocketAddress *saddr)
+{
+#ifdef CONFIG_LINUX
+ return saddr->abstract;
+#else
+ return false;
+#endif
+}
+
+static bool saddr_is_tight(UnixSocketAddress *saddr)
+{
+#ifdef CONFIG_LINUX
+ return !saddr->has_tight || saddr->tight;
+#else
+ return false;
+#endif
+}
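+
+/*
+ * Note: for Linux abstract sockets the kernel takes the address length
+ * literally, so padding NUL bytes become part of the name. A "tight"
+ * sockaddr length (leading NUL plus the name only) and a full
+ * sizeof(sun_path) length therefore denote different addresses.
+ */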
+
+static int unix_listen_saddr(UnixSocketAddress *saddr,
+ int num,
+ Error **errp)
+{
+ bool abstract = saddr_is_abstract(saddr);
+ struct sockaddr_un un;
+ int sock, fd;
+ char *pathbuf = NULL;
+ const char *path;
+ size_t pathlen;
+ size_t addrlen;
+
+ sock = qemu_socket(PF_UNIX, SOCK_STREAM, 0);
+ if (sock < 0) {
+ error_setg_errno(errp, errno, "Failed to create Unix socket");
+ return -1;
+ }
+
+ if (saddr->path[0] || abstract) {
+ path = saddr->path;
+ } else {
+ const char *tmpdir = getenv("TMPDIR");
+ tmpdir = tmpdir ? tmpdir : "/tmp";
+ path = pathbuf = g_strdup_printf("%s/qemu-socket-XXXXXX", tmpdir);
+ }
+
+ pathlen = strlen(path);
+ if (pathlen > sizeof(un.sun_path) ||
+ (abstract && pathlen > (sizeof(un.sun_path) - 1))) {
+ error_setg(errp, "UNIX socket path '%s' is too long", path);
+ error_append_hint(errp, "Path must be less than %zu bytes\n",
+ abstract ? sizeof(un.sun_path) - 1 :
+ sizeof(un.sun_path));
+ goto err;
+ }
+
+ if (pathbuf != NULL) {
+ /*
+         * This dummy fd usage silences the mktemp() insecure warning.
+         * Using mkstemp() doesn't make things more secure here
+         * though. bind() complains about existing files, so we have
+         * to unlink first and thus re-open the race window. The
+         * worst possible outcome is bind() failing, i.e. a DoS attack.
+ */
+ fd = mkstemp(pathbuf);
+ if (fd < 0) {
+ error_setg_errno(errp, errno,
+ "Failed to make a temporary socket %s", pathbuf);
+ goto err;
+ }
+ close(fd);
+ }
+
+ if (!abstract && unlink(path) < 0 && errno != ENOENT) {
+ error_setg_errno(errp, errno,
+ "Failed to unlink socket %s", path);
+ goto err;
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ addrlen = sizeof(un);
+
+ if (abstract) {
+ un.sun_path[0] = '\0';
+ memcpy(&un.sun_path[1], path, pathlen);
+ if (saddr_is_tight(saddr)) {
+ addrlen = offsetof(struct sockaddr_un, sun_path) + 1 + pathlen;
+ }
+ } else {
+ memcpy(un.sun_path, path, pathlen);
+ }
+
+ if (bind(sock, (struct sockaddr *) &un, addrlen) < 0) {
+ error_setg_errno(errp, errno, "Failed to bind socket to %s", path);
+ goto err;
+ }
+ if (listen(sock, num) < 0) {
+ error_setg_errno(errp, errno, "Failed to listen on socket");
+ goto err;
+ }
+
+ g_free(pathbuf);
+ return sock;
+
+err:
+ g_free(pathbuf);
+ closesocket(sock);
+ return -1;
+}
+
+static int unix_connect_saddr(UnixSocketAddress *saddr, Error **errp)
+{
+ bool abstract = saddr_is_abstract(saddr);
+ struct sockaddr_un un;
+ int sock, rc;
+ size_t pathlen;
+ size_t addrlen;
+
+ if (saddr->path == NULL) {
+ error_setg(errp, "unix connect: no path specified");
+ return -1;
+ }
+
+ sock = qemu_socket(PF_UNIX, SOCK_STREAM, 0);
+ if (sock < 0) {
+ error_setg_errno(errp, errno, "Failed to create socket");
+ return -1;
+ }
+
+ pathlen = strlen(saddr->path);
+ if (pathlen > sizeof(un.sun_path) ||
+ (abstract && pathlen > (sizeof(un.sun_path) - 1))) {
+ error_setg(errp, "UNIX socket path '%s' is too long", saddr->path);
+ error_append_hint(errp, "Path must be less than %zu bytes\n",
+ abstract ? sizeof(un.sun_path) - 1 :
+ sizeof(un.sun_path));
+ goto err;
+ }
+
+ memset(&un, 0, sizeof(un));
+ un.sun_family = AF_UNIX;
+ addrlen = sizeof(un);
+
+ if (abstract) {
+ un.sun_path[0] = '\0';
+ memcpy(&un.sun_path[1], saddr->path, pathlen);
+ if (saddr_is_tight(saddr)) {
+ addrlen = offsetof(struct sockaddr_un, sun_path) + 1 + pathlen;
+ }
+ } else {
+ memcpy(un.sun_path, saddr->path, pathlen);
+ }
+ /* connect to peer */
+ do {
+ rc = 0;
+ if (connect(sock, (struct sockaddr *) &un, addrlen) < 0) {
+ rc = -errno;
+ }
+ } while (rc == -EINTR);
+
+ if (rc < 0) {
+ error_setg_errno(errp, -rc, "Failed to connect to '%s'",
+ saddr->path);
+ goto err;
+ }
+
+ return sock;
+
+ err:
+ close(sock);
+ return -1;
+}
+
+#else
+
+static int unix_listen_saddr(UnixSocketAddress *saddr,
+ int num,
+ Error **errp)
+{
+ error_setg(errp, "unix sockets are not available on windows");
+ errno = ENOTSUP;
+ return -1;
+}
+
+static int unix_connect_saddr(UnixSocketAddress *saddr, Error **errp)
+{
+ error_setg(errp, "unix sockets are not available on windows");
+ errno = ENOTSUP;
+ return -1;
+}
+#endif
+
+/* compatibility wrapper */
+int unix_listen(const char *str, Error **errp)
+{
+ UnixSocketAddress *saddr;
+ int sock;
+
+ saddr = g_new0(UnixSocketAddress, 1);
+ saddr->path = g_strdup(str);
+ sock = unix_listen_saddr(saddr, 1, errp);
+ qapi_free_UnixSocketAddress(saddr);
+ return sock;
+}
+
+int unix_connect(const char *path, Error **errp)
+{
+ UnixSocketAddress *saddr;
+ int sock;
+
+ saddr = g_new0(UnixSocketAddress, 1);
+ saddr->path = g_strdup(path);
+ sock = unix_connect_saddr(saddr, errp);
+ qapi_free_UnixSocketAddress(saddr);
+ return sock;
+}
+
+
+SocketAddress *socket_parse(const char *str, Error **errp)
+{
+ SocketAddress *addr;
+
+ addr = g_new0(SocketAddress, 1);
+ if (strstart(str, "unix:", NULL)) {
+ if (str[5] == '\0') {
+ error_setg(errp, "invalid Unix socket address");
+ goto fail;
+ } else {
+ addr->type = SOCKET_ADDRESS_TYPE_UNIX;
+ addr->u.q_unix.path = g_strdup(str + 5);
+ }
+ } else if (strstart(str, "fd:", NULL)) {
+ if (str[3] == '\0') {
+ error_setg(errp, "invalid file descriptor address");
+ goto fail;
+ } else {
+ addr->type = SOCKET_ADDRESS_TYPE_FD;
+ addr->u.fd.str = g_strdup(str + 3);
+ }
+ } else if (strstart(str, "vsock:", NULL)) {
+ addr->type = SOCKET_ADDRESS_TYPE_VSOCK;
+ if (vsock_parse(&addr->u.vsock, str + strlen("vsock:"), errp)) {
+ goto fail;
+ }
+ } else {
+ addr->type = SOCKET_ADDRESS_TYPE_INET;
+ if (inet_parse(&addr->u.inet, str, errp)) {
+ goto fail;
+ }
+ }
+ return addr;
+
+fail:
+ qapi_free_SocketAddress(addr);
+ return NULL;
+}
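+
+/*
+ * Examples (hypothetical values): "unix:/tmp/qemu.sock" selects a Unix
+ * socket, "fd:mysock" a pre-opened descriptor, "vsock:3:1234" a vsock
+ * endpoint; anything else, e.g. "localhost:4444", is parsed as an inet
+ * address by inet_parse() above.
+ */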
+
+static int socket_get_fd(const char *fdstr, Error **errp)
+{
+ Monitor *cur_mon = monitor_cur();
+ int fd;
+ if (cur_mon) {
+ fd = monitor_get_fd(cur_mon, fdstr, errp);
+ if (fd < 0) {
+ return -1;
+ }
+ } else {
+ if (qemu_strtoi(fdstr, NULL, 10, &fd) < 0) {
+ error_setg_errno(errp, errno,
+ "Unable to parse FD number %s",
+ fdstr);
+ return -1;
+ }
+ }
+ if (!fd_is_socket(fd)) {
+ error_setg(errp, "File descriptor '%s' is not a socket", fdstr);
+ close(fd);
+ return -1;
+ }
+ return fd;
+}
+
+int socket_address_parse_named_fd(SocketAddress *addr, Error **errp)
+{
+ int fd;
+
+ if (addr->type != SOCKET_ADDRESS_TYPE_FD) {
+ return 0;
+ }
+
+ fd = socket_get_fd(addr->u.fd.str, errp);
+ if (fd < 0) {
+ return fd;
+ }
+
+ g_free(addr->u.fd.str);
+ addr->u.fd.str = g_strdup_printf("%d", fd);
+
+ return 0;
+}
+
+int socket_connect(SocketAddress *addr, Error **errp)
+{
+ int fd;
+
+ switch (addr->type) {
+ case SOCKET_ADDRESS_TYPE_INET:
+ fd = inet_connect_saddr(&addr->u.inet, errp);
+ break;
+
+ case SOCKET_ADDRESS_TYPE_UNIX:
+ fd = unix_connect_saddr(&addr->u.q_unix, errp);
+ break;
+
+ case SOCKET_ADDRESS_TYPE_FD:
+ fd = socket_get_fd(addr->u.fd.str, errp);
+ break;
+
+ case SOCKET_ADDRESS_TYPE_VSOCK:
+ fd = vsock_connect_saddr(&addr->u.vsock, errp);
+ break;
+
+ default:
+ abort();
+ }
+ return fd;
+}
+
+int socket_listen(SocketAddress *addr, int num, Error **errp)
+{
+ int fd;
+
+ trace_socket_listen(num);
+ switch (addr->type) {
+ case SOCKET_ADDRESS_TYPE_INET:
+ fd = inet_listen_saddr(&addr->u.inet, 0, num, errp);
+ break;
+
+ case SOCKET_ADDRESS_TYPE_UNIX:
+ fd = unix_listen_saddr(&addr->u.q_unix, num, errp);
+ break;
+
+ case SOCKET_ADDRESS_TYPE_FD:
+ fd = socket_get_fd(addr->u.fd.str, errp);
+ if (fd < 0) {
+ return -1;
+ }
+
+ /*
+ * If the socket is not yet in the listen state, then transition it to
+ * the listen state now.
+ *
+ * If it's already listening then this updates the backlog value as
+ * requested.
+ *
+ * If this socket cannot listen because it's already in another state
+ * (e.g. unbound or connected) then we'll catch the error here.
+ */
+ if (listen(fd, num) != 0) {
+ error_setg_errno(errp, errno, "Failed to listen on fd socket");
+ closesocket(fd);
+ return -1;
+ }
+ break;
+
+ case SOCKET_ADDRESS_TYPE_VSOCK:
+ fd = vsock_listen_saddr(&addr->u.vsock, num, errp);
+ break;
+
+ default:
+ abort();
+ }
+ return fd;
+}
+
+void socket_listen_cleanup(int fd, Error **errp)
+{
+ SocketAddress *addr;
+
+ addr = socket_local_address(fd, errp);
+ if (!addr) {
+ return;
+ }
+
+ if (addr->type == SOCKET_ADDRESS_TYPE_UNIX
+ && addr->u.q_unix.path) {
+ if (unlink(addr->u.q_unix.path) < 0 && errno != ENOENT) {
+ error_setg_errno(errp, errno,
+ "Failed to unlink socket %s",
+ addr->u.q_unix.path);
+ }
+ }
+
+ qapi_free_SocketAddress(addr);
+}
+
+int socket_dgram(SocketAddress *remote, SocketAddress *local, Error **errp)
+{
+ int fd;
+
+ /*
+ * TODO SOCKET_ADDRESS_TYPE_FD when fd is AF_INET or AF_INET6
+ * (although other address families can do SOCK_DGRAM, too)
+ */
+ switch (remote->type) {
+ case SOCKET_ADDRESS_TYPE_INET:
+ fd = inet_dgram_saddr(&remote->u.inet,
+ local ? &local->u.inet : NULL, errp);
+ break;
+
+ default:
+ error_setg(errp, "socket type unsupported for datagram");
+ fd = -1;
+ }
+ return fd;
+}
+
+
+static SocketAddress *
+socket_sockaddr_to_address_inet(struct sockaddr_storage *sa,
+ socklen_t salen,
+ Error **errp)
+{
+ char host[NI_MAXHOST];
+ char serv[NI_MAXSERV];
+ SocketAddress *addr;
+ InetSocketAddress *inet;
+ int ret;
+
+ ret = getnameinfo((struct sockaddr *)sa, salen,
+ host, sizeof(host),
+ serv, sizeof(serv),
+ NI_NUMERICHOST | NI_NUMERICSERV);
+ if (ret != 0) {
+ error_setg(errp, "Cannot format numeric socket address: %s",
+ gai_strerror(ret));
+ return NULL;
+ }
+
+ addr = g_new0(SocketAddress, 1);
+ addr->type = SOCKET_ADDRESS_TYPE_INET;
+ inet = &addr->u.inet;
+ inet->host = g_strdup(host);
+ inet->port = g_strdup(serv);
+ if (sa->ss_family == AF_INET) {
+ inet->has_ipv4 = inet->ipv4 = true;
+ } else {
+ inet->has_ipv6 = inet->ipv6 = true;
+ }
+
+ return addr;
+}
+
+
+#ifndef WIN32
+static SocketAddress *
+socket_sockaddr_to_address_unix(struct sockaddr_storage *sa,
+ socklen_t salen,
+ Error **errp)
+{
+ SocketAddress *addr;
+ struct sockaddr_un *su = (struct sockaddr_un *)sa;
+
+ addr = g_new0(SocketAddress, 1);
+ addr->type = SOCKET_ADDRESS_TYPE_UNIX;
+ salen -= offsetof(struct sockaddr_un, sun_path);
+#ifdef CONFIG_LINUX
+ if (salen > 0 && !su->sun_path[0]) {
+ /* Linux abstract socket */
+ addr->u.q_unix.path = g_strndup(su->sun_path + 1, salen - 1);
+ addr->u.q_unix.has_abstract = true;
+ addr->u.q_unix.abstract = true;
+ addr->u.q_unix.has_tight = true;
+ addr->u.q_unix.tight = salen < sizeof(su->sun_path);
+ return addr;
+ }
+#endif
+
+ addr->u.q_unix.path = g_strndup(su->sun_path, salen);
+ return addr;
+}
+#endif /* WIN32 */
+
+#ifdef CONFIG_AF_VSOCK
+static SocketAddress *
+socket_sockaddr_to_address_vsock(struct sockaddr_storage *sa,
+ socklen_t salen,
+ Error **errp)
+{
+ SocketAddress *addr;
+ VsockSocketAddress *vaddr;
+ struct sockaddr_vm *svm = (struct sockaddr_vm *)sa;
+
+ addr = g_new0(SocketAddress, 1);
+ addr->type = SOCKET_ADDRESS_TYPE_VSOCK;
+ vaddr = &addr->u.vsock;
+ vaddr->cid = g_strdup_printf("%u", svm->svm_cid);
+ vaddr->port = g_strdup_printf("%u", svm->svm_port);
+
+ return addr;
+}
+#endif /* CONFIG_AF_VSOCK */
+
+SocketAddress *
+socket_sockaddr_to_address(struct sockaddr_storage *sa,
+ socklen_t salen,
+ Error **errp)
+{
+ switch (sa->ss_family) {
+ case AF_INET:
+ case AF_INET6:
+ return socket_sockaddr_to_address_inet(sa, salen, errp);
+
+#ifndef WIN32
+ case AF_UNIX:
+ return socket_sockaddr_to_address_unix(sa, salen, errp);
+#endif /* WIN32 */
+
+#ifdef CONFIG_AF_VSOCK
+ case AF_VSOCK:
+ return socket_sockaddr_to_address_vsock(sa, salen, errp);
+#endif
+
+ default:
+ error_setg(errp, "socket family %d unsupported",
+ sa->ss_family);
+ return NULL;
+ }
+    return NULL;
+}
+
+
+SocketAddress *socket_local_address(int fd, Error **errp)
+{
+ struct sockaddr_storage ss;
+ socklen_t sslen = sizeof(ss);
+
+ if (getsockname(fd, (struct sockaddr *)&ss, &sslen) < 0) {
+ error_setg_errno(errp, errno, "%s",
+ "Unable to query local socket address");
+ return NULL;
+ }
+
+ return socket_sockaddr_to_address(&ss, sslen, errp);
+}
+
+
+SocketAddress *socket_remote_address(int fd, Error **errp)
+{
+ struct sockaddr_storage ss;
+ socklen_t sslen = sizeof(ss);
+
+ if (getpeername(fd, (struct sockaddr *)&ss, &sslen) < 0) {
+ error_setg_errno(errp, errno, "%s",
+ "Unable to query remote socket address");
+ return NULL;
+ }
+
+ return socket_sockaddr_to_address(&ss, sslen, errp);
+}
+
+
+SocketAddress *socket_address_flatten(SocketAddressLegacy *addr_legacy)
+{
+ SocketAddress *addr;
+
+ if (!addr_legacy) {
+ return NULL;
+ }
+
+ addr = g_new(SocketAddress, 1);
+
+ switch (addr_legacy->type) {
+ case SOCKET_ADDRESS_TYPE_INET:
+ addr->type = SOCKET_ADDRESS_TYPE_INET;
+ QAPI_CLONE_MEMBERS(InetSocketAddress, &addr->u.inet,
+ addr_legacy->u.inet.data);
+ break;
+ case SOCKET_ADDRESS_TYPE_UNIX:
+ addr->type = SOCKET_ADDRESS_TYPE_UNIX;
+ QAPI_CLONE_MEMBERS(UnixSocketAddress, &addr->u.q_unix,
+ addr_legacy->u.q_unix.data);
+ break;
+ case SOCKET_ADDRESS_TYPE_VSOCK:
+ addr->type = SOCKET_ADDRESS_TYPE_VSOCK;
+ QAPI_CLONE_MEMBERS(VsockSocketAddress, &addr->u.vsock,
+ addr_legacy->u.vsock.data);
+ break;
+ case SOCKET_ADDRESS_TYPE_FD:
+ addr->type = SOCKET_ADDRESS_TYPE_FD;
+ QAPI_CLONE_MEMBERS(String, &addr->u.fd, addr_legacy->u.fd.data);
+ break;
+ default:
+ abort();
+ }
+
+ return addr;
+}
diff --git a/util/qemu-thread-common.h b/util/qemu-thread-common.h
new file mode 100644
index 000000000..2af6b1208
--- /dev/null
+++ b/util/qemu-thread-common.h
@@ -0,0 +1,54 @@
+/*
+ * Common qemu-thread implementation header file.
+ *
+ * Copyright Red Hat, Inc. 2018
+ *
+ * Authors:
+ * Peter Xu <peterx@redhat.com>,
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_THREAD_COMMON_H
+#define QEMU_THREAD_COMMON_H
+
+#include "qemu/thread.h"
+#include "trace.h"
+
+static inline void qemu_mutex_post_init(QemuMutex *mutex)
+{
+#ifdef CONFIG_DEBUG_MUTEX
+ mutex->file = NULL;
+ mutex->line = 0;
+#endif
+ mutex->initialized = true;
+}
+
+static inline void qemu_mutex_pre_lock(QemuMutex *mutex,
+ const char *file, int line)
+{
+ trace_qemu_mutex_lock(mutex, file, line);
+}
+
+static inline void qemu_mutex_post_lock(QemuMutex *mutex,
+ const char *file, int line)
+{
+#ifdef CONFIG_DEBUG_MUTEX
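+    /* Record the acquisition site so a stuck mutex can be attributed
+     * to the file/line that last locked it. */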
+ mutex->file = file;
+ mutex->line = line;
+#endif
+ trace_qemu_mutex_locked(mutex, file, line);
+}
+
+static inline void qemu_mutex_pre_unlock(QemuMutex *mutex,
+ const char *file, int line)
+{
+#ifdef CONFIG_DEBUG_MUTEX
+ mutex->file = NULL;
+ mutex->line = 0;
+#endif
+ trace_qemu_mutex_unlock(mutex, file, line);
+}
+
+#endif
diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c
new file mode 100644
index 000000000..e1225b63b
--- /dev/null
+++ b/util/qemu-thread-posix.c
@@ -0,0 +1,632 @@
+/*
+ * Wrappers around mutex/cond/thread functions
+ *
+ * Copyright Red Hat, Inc. 2009
+ *
+ * Author:
+ * Marcelo Tosatti <mtosatti@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+#include "qemu/osdep.h"
+#include "qemu/thread.h"
+#include "qemu/atomic.h"
+#include "qemu/notify.h"
+#include "qemu-thread-common.h"
+#include "qemu/tsan.h"
+
+static bool name_threads;
+
+void qemu_thread_naming(bool enable)
+{
+ name_threads = enable;
+
+#if !defined CONFIG_PTHREAD_SETNAME_NP_W_TID && \
+ !defined CONFIG_PTHREAD_SETNAME_NP_WO_TID
+ /* This is a debugging option, not fatal */
+ if (enable) {
+ fprintf(stderr, "qemu: thread naming not supported on this host\n");
+ }
+#endif
+}
+
+static void error_exit(int err, const char *msg)
+{
+ fprintf(stderr, "qemu: %s: %s\n", msg, strerror(err));
+ abort();
+}
+
+static void compute_abs_deadline(struct timespec *ts, int ms)
+{
+ struct timeval tv;
+ gettimeofday(&tv, NULL);
+ ts->tv_nsec = tv.tv_usec * 1000 + (ms % 1000) * 1000000;
+ ts->tv_sec = tv.tv_sec + ms / 1000;
+ if (ts->tv_nsec >= 1000000000) {
+ ts->tv_sec++;
+ ts->tv_nsec -= 1000000000;
+ }
+}
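+
+/*
+ * Worked example (hypothetical numbers): with tv = 10s + 900000us and
+ * ms = 2500, tv_nsec = 900000 * 1000 + 500 * 1000000 = 1400000000 and
+ * tv_sec = 10 + 2 = 12; the carry then yields 13s + 400000000ns,
+ * i.e. exactly 10.9s + 2.5s.
+ */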
+
+void qemu_mutex_init(QemuMutex *mutex)
+{
+ int err;
+
+ err = pthread_mutex_init(&mutex->lock, NULL);
+ if (err)
+ error_exit(err, __func__);
+ qemu_mutex_post_init(mutex);
+}
+
+void qemu_mutex_destroy(QemuMutex *mutex)
+{
+ int err;
+
+ assert(mutex->initialized);
+ mutex->initialized = false;
+ err = pthread_mutex_destroy(&mutex->lock);
+ if (err)
+ error_exit(err, __func__);
+}
+
+void qemu_mutex_lock_impl(QemuMutex *mutex, const char *file, const int line)
+{
+ int err;
+
+ assert(mutex->initialized);
+ qemu_mutex_pre_lock(mutex, file, line);
+ err = pthread_mutex_lock(&mutex->lock);
+ if (err)
+ error_exit(err, __func__);
+ qemu_mutex_post_lock(mutex, file, line);
+}
+
+int qemu_mutex_trylock_impl(QemuMutex *mutex, const char *file, const int line)
+{
+ int err;
+
+ assert(mutex->initialized);
+ err = pthread_mutex_trylock(&mutex->lock);
+ if (err == 0) {
+ qemu_mutex_post_lock(mutex, file, line);
+ return 0;
+ }
+ if (err != EBUSY) {
+ error_exit(err, __func__);
+ }
+ return -EBUSY;
+}
+
+void qemu_mutex_unlock_impl(QemuMutex *mutex, const char *file, const int line)
+{
+ int err;
+
+ assert(mutex->initialized);
+ qemu_mutex_pre_unlock(mutex, file, line);
+ err = pthread_mutex_unlock(&mutex->lock);
+ if (err)
+ error_exit(err, __func__);
+}
+
+void qemu_rec_mutex_init(QemuRecMutex *mutex)
+{
+ int err;
+ pthread_mutexattr_t attr;
+
+ pthread_mutexattr_init(&attr);
+ pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+ err = pthread_mutex_init(&mutex->m.lock, &attr);
+ pthread_mutexattr_destroy(&attr);
+ if (err) {
+ error_exit(err, __func__);
+ }
+ mutex->m.initialized = true;
+}
+
+void qemu_rec_mutex_destroy(QemuRecMutex *mutex)
+{
+ qemu_mutex_destroy(&mutex->m);
+}
+
+void qemu_rec_mutex_lock_impl(QemuRecMutex *mutex, const char *file, int line)
+{
+ qemu_mutex_lock_impl(&mutex->m, file, line);
+}
+
+int qemu_rec_mutex_trylock_impl(QemuRecMutex *mutex, const char *file, int line)
+{
+ return qemu_mutex_trylock_impl(&mutex->m, file, line);
+}
+
+void qemu_rec_mutex_unlock_impl(QemuRecMutex *mutex, const char *file, int line)
+{
+ qemu_mutex_unlock_impl(&mutex->m, file, line);
+}
+
+void qemu_cond_init(QemuCond *cond)
+{
+ int err;
+
+ err = pthread_cond_init(&cond->cond, NULL);
+ if (err)
+ error_exit(err, __func__);
+ cond->initialized = true;
+}
+
+void qemu_cond_destroy(QemuCond *cond)
+{
+ int err;
+
+ assert(cond->initialized);
+ cond->initialized = false;
+ err = pthread_cond_destroy(&cond->cond);
+ if (err)
+ error_exit(err, __func__);
+}
+
+void qemu_cond_signal(QemuCond *cond)
+{
+ int err;
+
+ assert(cond->initialized);
+ err = pthread_cond_signal(&cond->cond);
+ if (err)
+ error_exit(err, __func__);
+}
+
+void qemu_cond_broadcast(QemuCond *cond)
+{
+ int err;
+
+ assert(cond->initialized);
+ err = pthread_cond_broadcast(&cond->cond);
+ if (err)
+ error_exit(err, __func__);
+}
+
+void qemu_cond_wait_impl(QemuCond *cond, QemuMutex *mutex, const char *file, const int line)
+{
+ int err;
+
+ assert(cond->initialized);
+ qemu_mutex_pre_unlock(mutex, file, line);
+ err = pthread_cond_wait(&cond->cond, &mutex->lock);
+ qemu_mutex_post_lock(mutex, file, line);
+ if (err)
+ error_exit(err, __func__);
+}
+
+bool qemu_cond_timedwait_impl(QemuCond *cond, QemuMutex *mutex, int ms,
+ const char *file, const int line)
+{
+ int err;
+ struct timespec ts;
+
+ assert(cond->initialized);
+ trace_qemu_mutex_unlock(mutex, file, line);
+ compute_abs_deadline(&ts, ms);
+ err = pthread_cond_timedwait(&cond->cond, &mutex->lock, &ts);
+ trace_qemu_mutex_locked(mutex, file, line);
+ if (err && err != ETIMEDOUT) {
+ error_exit(err, __func__);
+ }
+ return err != ETIMEDOUT;
+}
+
+void qemu_sem_init(QemuSemaphore *sem, int init)
+{
+ int rc;
+
+#ifndef CONFIG_SEM_TIMEDWAIT
+ rc = pthread_mutex_init(&sem->lock, NULL);
+ if (rc != 0) {
+ error_exit(rc, __func__);
+ }
+ rc = pthread_cond_init(&sem->cond, NULL);
+ if (rc != 0) {
+ error_exit(rc, __func__);
+ }
+ if (init < 0) {
+ error_exit(EINVAL, __func__);
+ }
+ sem->count = init;
+#else
+ rc = sem_init(&sem->sem, 0, init);
+ if (rc < 0) {
+ error_exit(errno, __func__);
+ }
+#endif
+ sem->initialized = true;
+}
+
+void qemu_sem_destroy(QemuSemaphore *sem)
+{
+ int rc;
+
+ assert(sem->initialized);
+ sem->initialized = false;
+#ifndef CONFIG_SEM_TIMEDWAIT
+ rc = pthread_cond_destroy(&sem->cond);
+ if (rc < 0) {
+ error_exit(rc, __func__);
+ }
+ rc = pthread_mutex_destroy(&sem->lock);
+ if (rc < 0) {
+ error_exit(rc, __func__);
+ }
+#else
+ rc = sem_destroy(&sem->sem);
+ if (rc < 0) {
+ error_exit(errno, __func__);
+ }
+#endif
+}
+
+void qemu_sem_post(QemuSemaphore *sem)
+{
+ int rc;
+
+ assert(sem->initialized);
+#ifndef CONFIG_SEM_TIMEDWAIT
+ pthread_mutex_lock(&sem->lock);
+ if (sem->count == UINT_MAX) {
+ rc = EINVAL;
+ } else {
+ sem->count++;
+ rc = pthread_cond_signal(&sem->cond);
+ }
+ pthread_mutex_unlock(&sem->lock);
+ if (rc != 0) {
+ error_exit(rc, __func__);
+ }
+#else
+ rc = sem_post(&sem->sem);
+ if (rc < 0) {
+ error_exit(errno, __func__);
+ }
+#endif
+}
+
+int qemu_sem_timedwait(QemuSemaphore *sem, int ms)
+{
+ int rc;
+ struct timespec ts;
+
+ assert(sem->initialized);
+#ifndef CONFIG_SEM_TIMEDWAIT
+ rc = 0;
+ compute_abs_deadline(&ts, ms);
+ pthread_mutex_lock(&sem->lock);
+ while (sem->count == 0) {
+ rc = pthread_cond_timedwait(&sem->cond, &sem->lock, &ts);
+ if (rc == ETIMEDOUT) {
+ break;
+ }
+ if (rc != 0) {
+ error_exit(rc, __func__);
+ }
+ }
+ if (rc != ETIMEDOUT) {
+ --sem->count;
+ }
+ pthread_mutex_unlock(&sem->lock);
+ return (rc == ETIMEDOUT ? -1 : 0);
+#else
+ if (ms <= 0) {
+ /* This is cheaper than sem_timedwait. */
+ do {
+ rc = sem_trywait(&sem->sem);
+ } while (rc == -1 && errno == EINTR);
+ if (rc == -1 && errno == EAGAIN) {
+ return -1;
+ }
+ } else {
+ compute_abs_deadline(&ts, ms);
+ do {
+ rc = sem_timedwait(&sem->sem, &ts);
+ } while (rc == -1 && errno == EINTR);
+ if (rc == -1 && errno == ETIMEDOUT) {
+ return -1;
+ }
+ }
+ if (rc < 0) {
+ error_exit(errno, __func__);
+ }
+ return 0;
+#endif
+}
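+
+/*
+ * Example (hypothetical): wait up to 100 ms for a worker to post the
+ * semaphore; a negative return value means the wait timed out.
+ *
+ *     if (qemu_sem_timedwait(&sem, 100) < 0) {
+ *         handle_timeout();    (hypothetical handler)
+ *     }
+ */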
+
+void qemu_sem_wait(QemuSemaphore *sem)
+{
+ int rc;
+
+ assert(sem->initialized);
+#ifndef CONFIG_SEM_TIMEDWAIT
+ pthread_mutex_lock(&sem->lock);
+ while (sem->count == 0) {
+ rc = pthread_cond_wait(&sem->cond, &sem->lock);
+ if (rc != 0) {
+ error_exit(rc, __func__);
+ }
+ }
+ --sem->count;
+ pthread_mutex_unlock(&sem->lock);
+#else
+ do {
+ rc = sem_wait(&sem->sem);
+ } while (rc == -1 && errno == EINTR);
+ if (rc < 0) {
+ error_exit(errno, __func__);
+ }
+#endif
+}
+
+#ifdef __linux__
+#include "qemu/futex.h"
+#else
+static inline void qemu_futex_wake(QemuEvent *ev, int n)
+{
+ assert(ev->initialized);
+ pthread_mutex_lock(&ev->lock);
+ if (n == 1) {
+ pthread_cond_signal(&ev->cond);
+ } else {
+ pthread_cond_broadcast(&ev->cond);
+ }
+ pthread_mutex_unlock(&ev->lock);
+}
+
+static inline void qemu_futex_wait(QemuEvent *ev, unsigned val)
+{
+ assert(ev->initialized);
+ pthread_mutex_lock(&ev->lock);
+ if (ev->value == val) {
+ pthread_cond_wait(&ev->cond, &ev->lock);
+ }
+ pthread_mutex_unlock(&ev->lock);
+}
+#endif
+
+/* Valid transitions:
+ * - free->set, when setting the event
+ * - busy->set, when setting the event, followed by qemu_futex_wake
+ * - set->free, when resetting the event
+ * - free->busy, when waiting
+ *
+ * set->busy does not happen (it can be observed from the outside but
+ * it really is set->free->busy).
+ *
+ * busy->free provably cannot happen; to enforce it, the set->free transition
+ * is done with an OR, which becomes a no-op if the event has concurrently
+ * transitioned to free or busy.
+ */
+
+#define EV_SET 0
+#define EV_FREE 1
+#define EV_BUSY -1
+
+void qemu_event_init(QemuEvent *ev, bool init)
+{
+#ifndef __linux__
+ pthread_mutex_init(&ev->lock, NULL);
+ pthread_cond_init(&ev->cond, NULL);
+#endif
+
+ ev->value = (init ? EV_SET : EV_FREE);
+ ev->initialized = true;
+}
+
+void qemu_event_destroy(QemuEvent *ev)
+{
+ assert(ev->initialized);
+ ev->initialized = false;
+#ifndef __linux__
+ pthread_mutex_destroy(&ev->lock);
+ pthread_cond_destroy(&ev->cond);
+#endif
+}
+
+void qemu_event_set(QemuEvent *ev)
+{
+ /* qemu_event_set has release semantics, but because it *loads*
+ * ev->value we need a full memory barrier here.
+ */
+ assert(ev->initialized);
+ smp_mb();
+ if (qatomic_read(&ev->value) != EV_SET) {
+ if (qatomic_xchg(&ev->value, EV_SET) == EV_BUSY) {
+ /* There were waiters, wake them up. */
+ qemu_futex_wake(ev, INT_MAX);
+ }
+ }
+}
+
+void qemu_event_reset(QemuEvent *ev)
+{
+ unsigned value;
+
+ assert(ev->initialized);
+ value = qatomic_read(&ev->value);
+ smp_mb_acquire();
+ if (value == EV_SET) {
+ /*
+ * If there was a concurrent reset (or even reset+wait),
+ * do nothing. Otherwise change EV_SET->EV_FREE.
+ */
+ qatomic_or(&ev->value, EV_FREE);
+ }
+}
+
+void qemu_event_wait(QemuEvent *ev)
+{
+ unsigned value;
+
+ assert(ev->initialized);
+ value = qatomic_read(&ev->value);
+ smp_mb_acquire();
+ if (value != EV_SET) {
+ if (value == EV_FREE) {
+ /*
+ * Leave the event reset and tell qemu_event_set that there
+ * are waiters. No need to retry, because there cannot be
+ * a concurrent busy->free transition. After the CAS, the
+ * event will be either set or busy.
+ */
+ if (qatomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) {
+ return;
+ }
+ }
+ qemu_futex_wait(ev, EV_BUSY);
+ }
+}
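+
+/*
+ * Sketch of the intended test-reset-test-wait pattern ("ev" and
+ * resource_ready() are hypothetical):
+ *
+ *     for (;;) {
+ *         if (resource_ready()) {
+ *             break;
+ *         }
+ *         qemu_event_reset(&ev);
+ *         if (resource_ready()) {
+ *             break;
+ *         }
+ *         qemu_event_wait(&ev);
+ *     }
+ *
+ * The producer makes the resource ready and then calls
+ * qemu_event_set(&ev).
+ */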
+
+static __thread NotifierList thread_exit;
+
+/*
+ * Note that in this implementation you can register a thread-exit
+ * notifier for the main thread, but it will never be called.
+ * This is OK because main thread exit can only happen when the
+ * entire process is exiting, and the API allows notifiers to not
+ * be called on process exit.
+ */
+void qemu_thread_atexit_add(Notifier *notifier)
+{
+ notifier_list_add(&thread_exit, notifier);
+}
+
+void qemu_thread_atexit_remove(Notifier *notifier)
+{
+ notifier_remove(notifier);
+}
+
+static void qemu_thread_atexit_notify(void *arg)
+{
+ /*
+     * Called when a non-main thread exits (via qemu_thread_exit()
+     * or by returning from its start routine).
+ */
+ notifier_list_notify(&thread_exit, NULL);
+}
+
+typedef struct {
+ void *(*start_routine)(void *);
+ void *arg;
+ char *name;
+} QemuThreadArgs;
+
+static void *qemu_thread_start(void *args)
+{
+ QemuThreadArgs *qemu_thread_args = args;
+ void *(*start_routine)(void *) = qemu_thread_args->start_routine;
+ void *arg = qemu_thread_args->arg;
+ void *r;
+
+    /* Attempt to set the thread's name; note that this is for debug, so
+ * we're not going to fail if we can't set it.
+ */
+ if (name_threads && qemu_thread_args->name) {
+# if defined(CONFIG_PTHREAD_SETNAME_NP_W_TID)
+ pthread_setname_np(pthread_self(), qemu_thread_args->name);
+# elif defined(CONFIG_PTHREAD_SETNAME_NP_WO_TID)
+ pthread_setname_np(qemu_thread_args->name);
+# endif
+ }
+ QEMU_TSAN_ANNOTATE_THREAD_NAME(qemu_thread_args->name);
+ g_free(qemu_thread_args->name);
+ g_free(qemu_thread_args);
+
+ /*
+ * GCC 11 with glibc 2.17 on PowerPC reports
+ *
+ * qemu-thread-posix.c:540:5: error: ‘__sigsetjmp’ accessing 656 bytes
+ * in a region of size 528 [-Werror=stringop-overflow=]
+ * 540 | pthread_cleanup_push(qemu_thread_atexit_notify, NULL);
+ * | ^~~~~~~~~~~~~~~~~~~~
+ *
+ * which is clearly nonsense.
+ */
+#pragma GCC diagnostic push
+#ifndef __clang__
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
+
+ pthread_cleanup_push(qemu_thread_atexit_notify, NULL);
+ r = start_routine(arg);
+ pthread_cleanup_pop(1);
+
+#pragma GCC diagnostic pop
+
+ return r;
+}
+
+void qemu_thread_create(QemuThread *thread, const char *name,
+ void *(*start_routine)(void*),
+ void *arg, int mode)
+{
+ sigset_t set, oldset;
+ int err;
+ pthread_attr_t attr;
+ QemuThreadArgs *qemu_thread_args;
+
+ err = pthread_attr_init(&attr);
+ if (err) {
+ error_exit(err, __func__);
+ }
+
+ if (mode == QEMU_THREAD_DETACHED) {
+ pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+ }
+
+ /* Leave signal handling to the iothread. */
+ sigfillset(&set);
+ /* Blocking the signals can result in undefined behaviour. */
+ sigdelset(&set, SIGSEGV);
+ sigdelset(&set, SIGFPE);
+ sigdelset(&set, SIGILL);
+ /* TODO avoid SIGBUS loss on macOS */
+ pthread_sigmask(SIG_SETMASK, &set, &oldset);
+
+ qemu_thread_args = g_new0(QemuThreadArgs, 1);
+ qemu_thread_args->name = g_strdup(name);
+ qemu_thread_args->start_routine = start_routine;
+ qemu_thread_args->arg = arg;
+
+ err = pthread_create(&thread->thread, &attr,
+ qemu_thread_start, qemu_thread_args);
+
+ if (err)
+ error_exit(err, __func__);
+
+ pthread_sigmask(SIG_SETMASK, &oldset, NULL);
+
+ pthread_attr_destroy(&attr);
+}
+
+void qemu_thread_get_self(QemuThread *thread)
+{
+ thread->thread = pthread_self();
+}
+
+bool qemu_thread_is_self(QemuThread *thread)
+{
+ return pthread_equal(pthread_self(), thread->thread);
+}
+
+void qemu_thread_exit(void *retval)
+{
+ pthread_exit(retval);
+}
+
+void *qemu_thread_join(QemuThread *thread)
+{
+ int err;
+ void *ret;
+
+ err = pthread_join(thread->thread, &ret);
+ if (err) {
+ error_exit(err, __func__);
+ }
+ return ret;
+}
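+
+/*
+ * Example (hypothetical worker): create a joinable thread and collect
+ * its result.
+ *
+ *     static void *worker(void *opaque) { return opaque; }
+ *     ...
+ *     QemuThread t;
+ *     qemu_thread_create(&t, "worker", worker, arg, QEMU_THREAD_JOINABLE);
+ *     void *ret = qemu_thread_join(&t);
+ */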
diff --git a/util/qemu-thread-win32.c b/util/qemu-thread-win32.c
new file mode 100644
index 000000000..52eb19f35
--- /dev/null
+++ b/util/qemu-thread-win32.c
@@ -0,0 +1,461 @@
+/*
+ * Win32 implementation for mutex/cond/thread functions
+ *
+ * Copyright Red Hat, Inc. 2010
+ *
+ * Author:
+ * Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/thread.h"
+#include "qemu/notify.h"
+#include "qemu-thread-common.h"
+#include <process.h>
+
+static bool name_threads;
+
+void qemu_thread_naming(bool enable)
+{
+ /* But note we don't actually name them on Windows yet */
+ name_threads = enable;
+
+ fprintf(stderr, "qemu: thread naming not supported on this host\n");
+}
+
+static void error_exit(int err, const char *msg)
+{
+ char *pstr;
+
+ FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_ALLOCATE_BUFFER,
+ NULL, err, 0, (LPTSTR)&pstr, 2, NULL);
+ fprintf(stderr, "qemu: %s: %s\n", msg, pstr);
+ LocalFree(pstr);
+ abort();
+}
+
+void qemu_mutex_init(QemuMutex *mutex)
+{
+ InitializeSRWLock(&mutex->lock);
+ qemu_mutex_post_init(mutex);
+}
+
+void qemu_mutex_destroy(QemuMutex *mutex)
+{
+ assert(mutex->initialized);
+ mutex->initialized = false;
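+    /* SRW locks have no destroy function; re-initializing the lock is
+     * all that is needed, and helps catch use after destroy. */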
+ InitializeSRWLock(&mutex->lock);
+}
+
+void qemu_mutex_lock_impl(QemuMutex *mutex, const char *file, const int line)
+{
+ assert(mutex->initialized);
+ qemu_mutex_pre_lock(mutex, file, line);
+ AcquireSRWLockExclusive(&mutex->lock);
+ qemu_mutex_post_lock(mutex, file, line);
+}
+
+int qemu_mutex_trylock_impl(QemuMutex *mutex, const char *file, const int line)
+{
+ int owned;
+
+ assert(mutex->initialized);
+ owned = TryAcquireSRWLockExclusive(&mutex->lock);
+ if (owned) {
+ qemu_mutex_post_lock(mutex, file, line);
+ return 0;
+ }
+ return -EBUSY;
+}
+
+void qemu_mutex_unlock_impl(QemuMutex *mutex, const char *file, const int line)
+{
+ assert(mutex->initialized);
+ qemu_mutex_pre_unlock(mutex, file, line);
+ ReleaseSRWLockExclusive(&mutex->lock);
+}
+
+void qemu_rec_mutex_init(QemuRecMutex *mutex)
+{
+ InitializeCriticalSection(&mutex->lock);
+ mutex->initialized = true;
+}
+
+void qemu_rec_mutex_destroy(QemuRecMutex *mutex)
+{
+ assert(mutex->initialized);
+ mutex->initialized = false;
+ DeleteCriticalSection(&mutex->lock);
+}
+
+void qemu_rec_mutex_lock_impl(QemuRecMutex *mutex, const char *file, int line)
+{
+ assert(mutex->initialized);
+ EnterCriticalSection(&mutex->lock);
+}
+
+int qemu_rec_mutex_trylock_impl(QemuRecMutex *mutex, const char *file, int line)
+{
+ assert(mutex->initialized);
+ return !TryEnterCriticalSection(&mutex->lock);
+}
+
+void qemu_rec_mutex_unlock_impl(QemuRecMutex *mutex, const char *file, int line)
+{
+ assert(mutex->initialized);
+ LeaveCriticalSection(&mutex->lock);
+}
+
+void qemu_cond_init(QemuCond *cond)
+{
+ memset(cond, 0, sizeof(*cond));
+ InitializeConditionVariable(&cond->var);
+ cond->initialized = true;
+}
+
+void qemu_cond_destroy(QemuCond *cond)
+{
+ assert(cond->initialized);
+ cond->initialized = false;
+ InitializeConditionVariable(&cond->var);
+}
+
+void qemu_cond_signal(QemuCond *cond)
+{
+ assert(cond->initialized);
+ WakeConditionVariable(&cond->var);
+}
+
+void qemu_cond_broadcast(QemuCond *cond)
+{
+ assert(cond->initialized);
+ WakeAllConditionVariable(&cond->var);
+}
+
+void qemu_cond_wait_impl(QemuCond *cond, QemuMutex *mutex, const char *file, const int line)
+{
+ assert(cond->initialized);
+ qemu_mutex_pre_unlock(mutex, file, line);
+ SleepConditionVariableSRW(&cond->var, &mutex->lock, INFINITE, 0);
+ qemu_mutex_post_lock(mutex, file, line);
+}
+
+bool qemu_cond_timedwait_impl(QemuCond *cond, QemuMutex *mutex, int ms,
+ const char *file, const int line)
+{
+ int rc = 0;
+
+ assert(cond->initialized);
+ trace_qemu_mutex_unlock(mutex, file, line);
+ if (!SleepConditionVariableSRW(&cond->var, &mutex->lock, ms, 0)) {
+ rc = GetLastError();
+ }
+ trace_qemu_mutex_locked(mutex, file, line);
+ if (rc && rc != ERROR_TIMEOUT) {
+ error_exit(rc, __func__);
+ }
+ return rc != ERROR_TIMEOUT;
+}
+
+void qemu_sem_init(QemuSemaphore *sem, int init)
+{
+    /* Counting semaphore; LONG_MAX means effectively no upper bound. */
+ sem->sema = CreateSemaphore(NULL, init, LONG_MAX, NULL);
+ sem->initialized = true;
+}
+
+void qemu_sem_destroy(QemuSemaphore *sem)
+{
+ assert(sem->initialized);
+ sem->initialized = false;
+ CloseHandle(sem->sema);
+}
+
+void qemu_sem_post(QemuSemaphore *sem)
+{
+ assert(sem->initialized);
+ ReleaseSemaphore(sem->sema, 1, NULL);
+}
+
+int qemu_sem_timedwait(QemuSemaphore *sem, int ms)
+{
+ int rc;
+
+ assert(sem->initialized);
+ rc = WaitForSingleObject(sem->sema, ms);
+ if (rc == WAIT_OBJECT_0) {
+ return 0;
+ }
+ if (rc != WAIT_TIMEOUT) {
+ error_exit(GetLastError(), __func__);
+ }
+ return -1;
+}
+
+void qemu_sem_wait(QemuSemaphore *sem)
+{
+ assert(sem->initialized);
+ if (WaitForSingleObject(sem->sema, INFINITE) != WAIT_OBJECT_0) {
+ error_exit(GetLastError(), __func__);
+ }
+}
+
+/* Wrap a Win32 manual-reset event with a fast userspace path. The idea
+ * is to reset the Win32 event lazily, as part of a test-reset-test-wait
+ * sequence. Such a sequence is, indeed, how QemuEvents are used by
+ * RCU and other subsystems!
+ *
+ * Valid transitions:
+ * - free->set, when setting the event
+ * - busy->set, when setting the event, followed by SetEvent
+ * - set->free, when resetting the event
+ * - free->busy, when waiting
+ *
+ * set->busy does not happen (it can be observed from the outside but
+ * it really is set->free->busy).
+ *
+ * busy->free provably cannot happen; to enforce it, the set->free transition
+ * is done with an OR, which becomes a no-op if the event has concurrently
+ * transitioned to free or busy (and is faster than cmpxchg).
+ */
+
+#define EV_SET 0
+#define EV_FREE 1
+#define EV_BUSY -1
+
+void qemu_event_init(QemuEvent *ev, bool init)
+{
+ /* Manual reset. */
+ ev->event = CreateEvent(NULL, TRUE, TRUE, NULL);
+ ev->value = (init ? EV_SET : EV_FREE);
+ ev->initialized = true;
+}
+
+void qemu_event_destroy(QemuEvent *ev)
+{
+ assert(ev->initialized);
+ ev->initialized = false;
+ CloseHandle(ev->event);
+}
+
+void qemu_event_set(QemuEvent *ev)
+{
+ assert(ev->initialized);
+ /* qemu_event_set has release semantics, but because it *loads*
+ * ev->value we need a full memory barrier here.
+ */
+ smp_mb();
+ if (qatomic_read(&ev->value) != EV_SET) {
+ if (qatomic_xchg(&ev->value, EV_SET) == EV_BUSY) {
+ /* There were waiters, wake them up. */
+ SetEvent(ev->event);
+ }
+ }
+}
+
+void qemu_event_reset(QemuEvent *ev)
+{
+ unsigned value;
+
+ assert(ev->initialized);
+ value = qatomic_read(&ev->value);
+ smp_mb_acquire();
+ if (value == EV_SET) {
+ /* If there was a concurrent reset (or even reset+wait),
+ * do nothing. Otherwise change EV_SET->EV_FREE.
+ */
+ qatomic_or(&ev->value, EV_FREE);
+ }
+}
+
+void qemu_event_wait(QemuEvent *ev)
+{
+ unsigned value;
+
+ assert(ev->initialized);
+ value = qatomic_read(&ev->value);
+ smp_mb_acquire();
+ if (value != EV_SET) {
+ if (value == EV_FREE) {
+ /* qemu_event_set is not yet going to call SetEvent, but we are
+ * going to do another check for EV_SET below when setting EV_BUSY.
+ * At that point it is safe to call WaitForSingleObject.
+ */
+ ResetEvent(ev->event);
+
+ /* Tell qemu_event_set that there are waiters. No need to retry
+ * because there cannot be a concurrent busy->free transition.
+ * After the CAS, the event will be either set or busy.
+ */
+ if (qatomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) {
+ value = EV_SET;
+ } else {
+ value = EV_BUSY;
+ }
+ }
+ if (value == EV_BUSY) {
+ WaitForSingleObject(ev->event, INFINITE);
+ }
+ }
+}
+
+struct QemuThreadData {
+ /* Passed to win32_start_routine. */
+ void *(*start_routine)(void *);
+ void *arg;
+ short mode;
+ NotifierList exit;
+
+ /* Only used for joinable threads. */
+ bool exited;
+ void *ret;
+ CRITICAL_SECTION cs;
+};
+
+static bool atexit_registered;
+static NotifierList main_thread_exit;
+
+static __thread QemuThreadData *qemu_thread_data;
+
+static void run_main_thread_exit(void)
+{
+ notifier_list_notify(&main_thread_exit, NULL);
+}
+
+void qemu_thread_atexit_add(Notifier *notifier)
+{
+ if (!qemu_thread_data) {
+ if (!atexit_registered) {
+ atexit_registered = true;
+ atexit(run_main_thread_exit);
+ }
+ notifier_list_add(&main_thread_exit, notifier);
+ } else {
+ notifier_list_add(&qemu_thread_data->exit, notifier);
+ }
+}
+
+void qemu_thread_atexit_remove(Notifier *notifier)
+{
+ notifier_remove(notifier);
+}
+
+static unsigned __stdcall win32_start_routine(void *arg)
+{
+ QemuThreadData *data = (QemuThreadData *) arg;
+ void *(*start_routine)(void *) = data->start_routine;
+ void *thread_arg = data->arg;
+
+ qemu_thread_data = data;
+ qemu_thread_exit(start_routine(thread_arg));
+ abort();
+}
+
+void qemu_thread_exit(void *arg)
+{
+ QemuThreadData *data = qemu_thread_data;
+
+ notifier_list_notify(&data->exit, NULL);
+ if (data->mode == QEMU_THREAD_JOINABLE) {
+ data->ret = arg;
+ EnterCriticalSection(&data->cs);
+ data->exited = true;
+ LeaveCriticalSection(&data->cs);
+ } else {
+ g_free(data);
+ }
+ _endthreadex(0);
+}
+
+void *qemu_thread_join(QemuThread *thread)
+{
+ QemuThreadData *data;
+ void *ret;
+ HANDLE handle;
+
+ data = thread->data;
+ if (data->mode == QEMU_THREAD_DETACHED) {
+ return NULL;
+ }
+
+ /*
+ * Because multiple copies of the QemuThread can exist via
+ * qemu_thread_get_self, we need to store a value that cannot
+ * leak there. The simplest, non racy way is to store the TID,
+ * discard the handle that _beginthreadex gives back, and
+ * get another copy of the handle here.
+ */
+ handle = qemu_thread_get_handle(thread);
+ if (handle) {
+ WaitForSingleObject(handle, INFINITE);
+ CloseHandle(handle);
+ }
+ ret = data->ret;
+ DeleteCriticalSection(&data->cs);
+ g_free(data);
+ return ret;
+}
+
+void qemu_thread_create(QemuThread *thread, const char *name,
+ void *(*start_routine)(void *),
+ void *arg, int mode)
+{
+ HANDLE hThread;
+ struct QemuThreadData *data;
+
+ data = g_malloc(sizeof *data);
+ data->start_routine = start_routine;
+ data->arg = arg;
+ data->mode = mode;
+ data->exited = false;
+ notifier_list_init(&data->exit);
+
+ if (data->mode != QEMU_THREAD_DETACHED) {
+ InitializeCriticalSection(&data->cs);
+ }
+
+ hThread = (HANDLE) _beginthreadex(NULL, 0, win32_start_routine,
+ data, 0, &thread->tid);
+ if (!hThread) {
+ error_exit(GetLastError(), __func__);
+ }
+ CloseHandle(hThread);
+ thread->data = data;
+}
+
+void qemu_thread_get_self(QemuThread *thread)
+{
+ thread->data = qemu_thread_data;
+ thread->tid = GetCurrentThreadId();
+}
+
+HANDLE qemu_thread_get_handle(QemuThread *thread)
+{
+ QemuThreadData *data;
+ HANDLE handle;
+
+ data = thread->data;
+ if (data->mode == QEMU_THREAD_DETACHED) {
+ return NULL;
+ }
+
+ EnterCriticalSection(&data->cs);
+ if (!data->exited) {
+ handle = OpenThread(SYNCHRONIZE | THREAD_SUSPEND_RESUME |
+ THREAD_SET_CONTEXT, FALSE, thread->tid);
+ } else {
+ handle = NULL;
+ }
+ LeaveCriticalSection(&data->cs);
+ return handle;
+}
+
+bool qemu_thread_is_self(QemuThread *thread)
+{
+ return GetCurrentThreadId() == thread->tid;
+}
diff --git a/util/qemu-timer-common.c b/util/qemu-timer-common.c
new file mode 100644
index 000000000..cc1326f72
--- /dev/null
+++ b/util/qemu-timer-common.c
@@ -0,0 +1,63 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+#include "qemu/timer.h"
+
+/***********************************************************/
+/* real time host monotonic timer */
+
+int64_t clock_start;
+
+#ifdef _WIN32
+
+int64_t clock_freq;
+
+static void __attribute__((constructor)) init_get_clock(void)
+{
+ LARGE_INTEGER freq;
+ int ret;
+ ret = QueryPerformanceFrequency(&freq);
+ if (ret == 0) {
+ fprintf(stderr, "Could not calibrate ticks\n");
+ exit(1);
+ }
+ clock_freq = freq.QuadPart;
+ clock_start = get_clock();
+}
+
+#else
+
+int use_rt_clock;
+
+static void __attribute__((constructor)) init_get_clock(void)
+{
+ struct timespec ts;
+
+ use_rt_clock = 0;
+ if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) {
+ use_rt_clock = 1;
+ }
+ clock_start = get_clock();
+}
+#endif
diff --git a/util/qemu-timer.c b/util/qemu-timer.c
new file mode 100644
index 000000000..f36c75e59
--- /dev/null
+++ b/util/qemu-timer.c
@@ -0,0 +1,674 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "qemu/timer.h"
+#include "qemu/lockable.h"
+#include "sysemu/cpu-timers.h"
+#include "sysemu/replay.h"
+#include "sysemu/cpus.h"
+
+#ifdef CONFIG_POSIX
+#include <pthread.h>
+#endif
+
+#ifdef CONFIG_PPOLL
+#include <poll.h>
+#endif
+
+#ifdef CONFIG_PRCTL_PR_SET_TIMERSLACK
+#include <sys/prctl.h>
+#endif
+
+/***********************************************************/
+/* timers */
+
+typedef struct QEMUClock {
+ /* We rely on BQL to protect the timerlists */
+ QLIST_HEAD(, QEMUTimerList) timerlists;
+
+ QEMUClockType type;
+ bool enabled;
+} QEMUClock;
+
+QEMUTimerListGroup main_loop_tlg;
+static QEMUClock qemu_clocks[QEMU_CLOCK_MAX];
+
+/* A QEMUTimerList is a list of timers attached to a clock. More
+ * than one QEMUTimerList can be attached to each clock, for instance
+ * used by different AioContexts / threads. Each clock also has
+ * a list of the QEMUTimerLists associated with it, in order that
+ * reenabling the clock can call all the notifiers.
+ */
+
+struct QEMUTimerList {
+ QEMUClock *clock;
+ QemuMutex active_timers_lock;
+ QEMUTimer *active_timers;
+ QLIST_ENTRY(QEMUTimerList) list;
+ QEMUTimerListNotifyCB *notify_cb;
+ void *notify_opaque;
+
+    /* lightweight way to mark the end of this timerlist's run */
+ QemuEvent timers_done_ev;
+};
+
+/**
+ * qemu_clock_ptr:
+ * @type: type of clock
+ *
+ * Translate a clock type into a pointer to QEMUClock object.
+ *
+ * Returns: a pointer to the QEMUClock object
+ */
+static inline QEMUClock *qemu_clock_ptr(QEMUClockType type)
+{
+ return &qemu_clocks[type];
+}
+
+static bool timer_expired_ns(QEMUTimer *timer_head, int64_t current_time)
+{
+ return timer_head && (timer_head->expire_time <= current_time);
+}
+
+QEMUTimerList *timerlist_new(QEMUClockType type,
+ QEMUTimerListNotifyCB *cb,
+ void *opaque)
+{
+ QEMUTimerList *timer_list;
+ QEMUClock *clock = qemu_clock_ptr(type);
+
+ timer_list = g_malloc0(sizeof(QEMUTimerList));
+ qemu_event_init(&timer_list->timers_done_ev, true);
+ timer_list->clock = clock;
+ timer_list->notify_cb = cb;
+ timer_list->notify_opaque = opaque;
+ qemu_mutex_init(&timer_list->active_timers_lock);
+ QLIST_INSERT_HEAD(&clock->timerlists, timer_list, list);
+ return timer_list;
+}
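+
+/*
+ * Example (hypothetical): a subsystem with its own event loop creates
+ * a private timer list on the realtime clock and is kicked via cb:
+ *
+ *     QEMUTimerList *tl = timerlist_new(QEMU_CLOCK_REALTIME, cb, opaque);
+ *     ...
+ *     timerlist_free(tl);
+ */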
+
+void timerlist_free(QEMUTimerList *timer_list)
+{
+ assert(!timerlist_has_timers(timer_list));
+ if (timer_list->clock) {
+ QLIST_REMOVE(timer_list, list);
+ }
+ qemu_mutex_destroy(&timer_list->active_timers_lock);
+ g_free(timer_list);
+}
+
+static void qemu_clock_init(QEMUClockType type, QEMUTimerListNotifyCB *notify_cb)
+{
+ QEMUClock *clock = qemu_clock_ptr(type);
+
+ /* Assert that the clock of type TYPE has not been initialized yet. */
+ assert(main_loop_tlg.tl[type] == NULL);
+
+ clock->type = type;
+    clock->enabled = (type != QEMU_CLOCK_VIRTUAL);
+ QLIST_INIT(&clock->timerlists);
+ main_loop_tlg.tl[type] = timerlist_new(type, notify_cb, NULL);
+}
+
+bool qemu_clock_use_for_deadline(QEMUClockType type)
+{
+ return !(icount_enabled() && (type == QEMU_CLOCK_VIRTUAL));
+}
+
+void qemu_clock_notify(QEMUClockType type)
+{
+ QEMUTimerList *timer_list;
+ QEMUClock *clock = qemu_clock_ptr(type);
+ QLIST_FOREACH(timer_list, &clock->timerlists, list) {
+ timerlist_notify(timer_list);
+ }
+}
+
+/* Disabling the clock will wait for related timerlists to stop
+ * executing qemu_run_timers. Thus, this function should not
+ * be used from the callback of a timer that is based on @clock.
+ * Doing so would cause a deadlock.
+ *
+ * Caller should hold BQL.
+ */
+void qemu_clock_enable(QEMUClockType type, bool enabled)
+{
+ QEMUClock *clock = qemu_clock_ptr(type);
+ QEMUTimerList *tl;
+ bool old = clock->enabled;
+ clock->enabled = enabled;
+ if (enabled && !old) {
+ qemu_clock_notify(type);
+ } else if (!enabled && old) {
+ QLIST_FOREACH(tl, &clock->timerlists, list) {
+ qemu_event_wait(&tl->timers_done_ev);
+ }
+ }
+}
+
+bool timerlist_has_timers(QEMUTimerList *timer_list)
+{
+ return !!qatomic_read(&timer_list->active_timers);
+}
+
+bool qemu_clock_has_timers(QEMUClockType type)
+{
+ return timerlist_has_timers(
+ main_loop_tlg.tl[type]);
+}
+
+bool timerlist_expired(QEMUTimerList *timer_list)
+{
+ int64_t expire_time;
+
+ if (!qatomic_read(&timer_list->active_timers)) {
+ return false;
+ }
+
+ WITH_QEMU_LOCK_GUARD(&timer_list->active_timers_lock) {
+ if (!timer_list->active_timers) {
+ return false;
+ }
+ expire_time = timer_list->active_timers->expire_time;
+ }
+
+ return expire_time <= qemu_clock_get_ns(timer_list->clock->type);
+}
+
+bool qemu_clock_expired(QEMUClockType type)
+{
+ return timerlist_expired(
+ main_loop_tlg.tl[type]);
+}
+
+/*
+ * Compute the deadline of a timer list: return -1 for no deadline, and
+ * do not cap the result to 2^32, as we know it is always non-negative.
+ */
+
+int64_t timerlist_deadline_ns(QEMUTimerList *timer_list)
+{
+ int64_t delta;
+ int64_t expire_time;
+
+ if (!qatomic_read(&timer_list->active_timers)) {
+ return -1;
+ }
+
+ if (!timer_list->clock->enabled) {
+ return -1;
+ }
+
+ /* The active timers list may be modified before the caller uses our
+ * return value, but ->notify_cb() is called whenever the deadline
+ * changes, so the caller will notice the change; there is no race
+ * condition.
+ */
+ WITH_QEMU_LOCK_GUARD(&timer_list->active_timers_lock) {
+ if (!timer_list->active_timers) {
+ return -1;
+ }
+ expire_time = timer_list->active_timers->expire_time;
+ }
+
+ delta = expire_time - qemu_clock_get_ns(timer_list->clock->type);
+
+ if (delta <= 0) {
+ return 0;
+ }
+
+ return delta;
+}
+
+/* Calculate the soonest deadline across all timerlists attached
+ * to the clock. This is used for the icount timeout so we
+ * ignore whether or not the clock should be used in deadline
+ * calculations.
+ */
+int64_t qemu_clock_deadline_ns_all(QEMUClockType type, int attr_mask)
+{
+ int64_t deadline = -1;
+ int64_t delta;
+ int64_t expire_time;
+ QEMUTimer *ts;
+ QEMUTimerList *timer_list;
+ QEMUClock *clock = qemu_clock_ptr(type);
+
+ if (!clock->enabled) {
+ return -1;
+ }
+
+ QLIST_FOREACH(timer_list, &clock->timerlists, list) {
+ qemu_mutex_lock(&timer_list->active_timers_lock);
+ ts = timer_list->active_timers;
+ /* Skip all external timers */
+ while (ts && (ts->attributes & ~attr_mask)) {
+ ts = ts->next;
+ }
+ if (!ts) {
+ qemu_mutex_unlock(&timer_list->active_timers_lock);
+ continue;
+ }
+ expire_time = ts->expire_time;
+ qemu_mutex_unlock(&timer_list->active_timers_lock);
+
+ delta = expire_time - qemu_clock_get_ns(type);
+ if (delta <= 0) {
+ delta = 0;
+ }
+ deadline = qemu_soonest_timeout(deadline, delta);
+ }
+ return deadline;
+}
+
+QEMUClockType timerlist_get_clock(QEMUTimerList *timer_list)
+{
+ return timer_list->clock->type;
+}
+
+QEMUTimerList *qemu_clock_get_main_loop_timerlist(QEMUClockType type)
+{
+ return main_loop_tlg.tl[type];
+}
+
+void timerlist_notify(QEMUTimerList *timer_list)
+{
+ if (timer_list->notify_cb) {
+ timer_list->notify_cb(timer_list->notify_opaque, timer_list->clock->type);
+ } else {
+ qemu_notify_event();
+ }
+}
+
+/* Convert a nanosecond timeout to milliseconds.
+ * This is used where a system does not support ppoll.
+ */
+int qemu_timeout_ns_to_ms(int64_t ns)
+{
+ int64_t ms;
+ if (ns < 0) {
+ return -1;
+ }
+
+ if (!ns) {
+ return 0;
+ }
+
+ /* Always round up, because it's better to wait too long than to wait too
+ * little and effectively busy-wait
+ */
+ ms = DIV_ROUND_UP(ns, SCALE_MS);
+
+ /* To avoid overflow problems, limit this to 2^31, i.e. approx 25 days */
+ return MIN(ms, INT32_MAX);
+}
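+
+/*
+ * For example: qemu_timeout_ns_to_ms(-1) == -1 (block forever),
+ * qemu_timeout_ns_to_ms(1) == 1 (a 1 ns timeout rounds up to a full
+ * millisecond rather than busy-waiting), and anything that converts to
+ * more than INT32_MAX ms is clamped to INT32_MAX.
+ */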
+
+
+/* qemu implementation of g_poll which uses a nanosecond timeout but is
+ * otherwise identical to g_poll
+ */
+int qemu_poll_ns(GPollFD *fds, guint nfds, int64_t timeout)
+{
+#ifdef CONFIG_PPOLL
+ if (timeout < 0) {
+ return ppoll((struct pollfd *)fds, nfds, NULL, NULL);
+ } else {
+ struct timespec ts;
+ int64_t tvsec = timeout / 1000000000LL;
+ /* Avoid possibly overflowing and specifying a negative number of
+ * seconds, which would turn a very long timeout into a busy-wait.
+ */
+ if (tvsec > (int64_t)INT32_MAX) {
+ tvsec = INT32_MAX;
+ }
+ ts.tv_sec = tvsec;
+ ts.tv_nsec = timeout % 1000000000LL;
+ return ppoll((struct pollfd *)fds, nfds, &ts, NULL);
+ }
+#else
+ return g_poll(fds, nfds, qemu_timeout_ns_to_ms(timeout));
+#endif
+}
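+
+/*
+ * As with ppoll()/g_poll(), a negative @timeout blocks indefinitely and a
+ * zero @timeout polls the fds without blocking.
+ */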
+
+
+void timer_init_full(QEMUTimer *ts,
+ QEMUTimerListGroup *timer_list_group, QEMUClockType type,
+ int scale, int attributes,
+ QEMUTimerCB *cb, void *opaque)
+{
+ if (!timer_list_group) {
+ timer_list_group = &main_loop_tlg;
+ }
+ ts->timer_list = timer_list_group->tl[type];
+ ts->cb = cb;
+ ts->opaque = opaque;
+ ts->scale = scale;
+ ts->attributes = attributes;
+ ts->expire_time = -1;
+}
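+
+/*
+ * Example (illustrative sketch; "my_timer" and "my_cb" are hypothetical):
+ * initialize a nanosecond-scale timer on the main-loop timer list, then
+ * arm it 100 ms of virtual time into the future.  my_cb() runs once the
+ * virtual clock passes the deadline:
+ *
+ *     static QEMUTimer my_timer;
+ *
+ *     static void my_cb(void *opaque)
+ *     {
+ *     }
+ *
+ *     timer_init_full(&my_timer, NULL, QEMU_CLOCK_VIRTUAL, SCALE_NS, 0,
+ *                     my_cb, NULL);
+ *     timer_mod(&my_timer,
+ *               qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 100 * SCALE_MS);
+ */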
+
+void timer_deinit(QEMUTimer *ts)
+{
+ assert(ts->expire_time == -1);
+ ts->timer_list = NULL;
+}
+
+static void timer_del_locked(QEMUTimerList *timer_list, QEMUTimer *ts)
+{
+ QEMUTimer **pt, *t;
+
+ ts->expire_time = -1;
+ pt = &timer_list->active_timers;
+ for (;;) {
+ t = *pt;
+ if (!t) {
+ break;
+ }
+ if (t == ts) {
+ qatomic_set(pt, t->next);
+ break;
+ }
+ pt = &t->next;
+ }
+}
+
+static bool timer_mod_ns_locked(QEMUTimerList *timer_list,
+ QEMUTimer *ts, int64_t expire_time)
+{
+ QEMUTimer **pt, *t;
+
+ /* add the timer in the sorted list */
+ pt = &timer_list->active_timers;
+ for (;;) {
+ t = *pt;
+ if (!timer_expired_ns(t, expire_time)) {
+ break;
+ }
+ pt = &t->next;
+ }
+ ts->expire_time = MAX(expire_time, 0);
+ ts->next = *pt;
+ qatomic_set(pt, ts);
+
+ return pt == &timer_list->active_timers;
+}
+
+static void timerlist_rearm(QEMUTimerList *timer_list)
+{
+ /* Interrupt execution to force deadline recalculation. */
+ if (icount_enabled() && timer_list->clock->type == QEMU_CLOCK_VIRTUAL) {
+ icount_start_warp_timer();
+ }
+ timerlist_notify(timer_list);
+}
+
+/* stop a timer, but do not dealloc it */
+void timer_del(QEMUTimer *ts)
+{
+ QEMUTimerList *timer_list = ts->timer_list;
+
+ if (timer_list) {
+ qemu_mutex_lock(&timer_list->active_timers_lock);
+ timer_del_locked(timer_list, ts);
+ qemu_mutex_unlock(&timer_list->active_timers_lock);
+ }
+}
+
+/* modify the current timer so that it will be fired when current_time
+ >= expire_time. The corresponding callback will be called. */
+void timer_mod_ns(QEMUTimer *ts, int64_t expire_time)
+{
+ QEMUTimerList *timer_list = ts->timer_list;
+ bool rearm;
+
+ qemu_mutex_lock(&timer_list->active_timers_lock);
+ timer_del_locked(timer_list, ts);
+ rearm = timer_mod_ns_locked(timer_list, ts, expire_time);
+ qemu_mutex_unlock(&timer_list->active_timers_lock);
+
+ if (rearm) {
+ timerlist_rearm(timer_list);
+ }
+}
+
+/* modify the current timer so that it will be fired when current_time
+ >= expire_time or the current deadline, whichever comes earlier.
+ The corresponding callback will be called. */
+void timer_mod_anticipate_ns(QEMUTimer *ts, int64_t expire_time)
+{
+ QEMUTimerList *timer_list = ts->timer_list;
+ bool rearm;
+
+ WITH_QEMU_LOCK_GUARD(&timer_list->active_timers_lock) {
+ if (ts->expire_time == -1 || ts->expire_time > expire_time) {
+ if (ts->expire_time != -1) {
+ timer_del_locked(timer_list, ts);
+ }
+ rearm = timer_mod_ns_locked(timer_list, ts, expire_time);
+ } else {
+ rearm = false;
+ }
+ }
+ if (rearm) {
+ timerlist_rearm(timer_list);
+ }
+}
+
+void timer_mod(QEMUTimer *ts, int64_t expire_time)
+{
+ timer_mod_ns(ts, expire_time * ts->scale);
+}
+
+void timer_mod_anticipate(QEMUTimer *ts, int64_t expire_time)
+{
+ timer_mod_anticipate_ns(ts, expire_time * ts->scale);
+}
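+
+/*
+ * For example, with a timer pending at t=500 (in its scale units),
+ * timer_mod(ts, 800) moves the deadline out to 800, whereas
+ * timer_mod_anticipate(ts, 800) leaves it at 500: anticipate can only
+ * move a deadline earlier, never later.
+ */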
+
+bool timer_pending(QEMUTimer *ts)
+{
+ return ts->expire_time >= 0;
+}
+
+bool timer_expired(QEMUTimer *timer_head, int64_t current_time)
+{
+ return timer_expired_ns(timer_head, current_time * timer_head->scale);
+}
+
+bool timerlist_run_timers(QEMUTimerList *timer_list)
+{
+ QEMUTimer *ts;
+ int64_t current_time;
+ bool progress = false;
+ QEMUTimerCB *cb;
+ void *opaque;
+
+ if (!qatomic_read(&timer_list->active_timers)) {
+ return false;
+ }
+
+ qemu_event_reset(&timer_list->timers_done_ev);
+ if (!timer_list->clock->enabled) {
+ goto out;
+ }
+
+ switch (timer_list->clock->type) {
+ case QEMU_CLOCK_REALTIME:
+ break;
+ default:
+ case QEMU_CLOCK_VIRTUAL:
+ break;
+ case QEMU_CLOCK_HOST:
+ if (!replay_checkpoint(CHECKPOINT_CLOCK_HOST)) {
+ goto out;
+ }
+ break;
+ case QEMU_CLOCK_VIRTUAL_RT:
+ if (!replay_checkpoint(CHECKPOINT_CLOCK_VIRTUAL_RT)) {
+ goto out;
+ }
+ break;
+ }
+
+ /*
+ * Extract expired timers from the active timers list and process them.
+ *
+ * In rr mode we need "filtered" checkpointing for the virtual clock: the
+ * checkpoint must be recorded/replayed before processing any non-EXTERNAL
+ * timer, and only once, since the clock value stays the same. Because
+ * non-EXTERNAL timers may appear in the list while it is being processed,
+ * the checkpoint cannot be issued up front; it is issued from within the
+ * loop, just before the first non-EXTERNAL timer runs.
+ */
+ current_time = qemu_clock_get_ns(timer_list->clock->type);
+ qemu_mutex_lock(&timer_list->active_timers_lock);
+ while ((ts = timer_list->active_timers)) {
+ if (!timer_expired_ns(ts, current_time)) {
+ /* No expired timers left. The checkpoint can be skipped
+ * if no timers fired or they were all external.
+ */
+ break;
+ }
+ /* A checkpoint for the virtual clock is needed only when a non-EXTERNAL
+ * timer runs; if only EXTERNAL timers fire, the checkpoint is redundant,
+ * because those timers do not change guest state directly.
+ */
+ if (replay_mode != REPLAY_MODE_NONE
+ && timer_list->clock->type == QEMU_CLOCK_VIRTUAL
+ && !(ts->attributes & QEMU_TIMER_ATTR_EXTERNAL)
+ && !replay_checkpoint(CHECKPOINT_CLOCK_VIRTUAL)) {
+ qemu_mutex_unlock(&timer_list->active_timers_lock);
+ goto out;
+ }
+
+ /* remove timer from the list before calling the callback */
+ timer_list->active_timers = ts->next;
+ ts->next = NULL;
+ ts->expire_time = -1;
+ cb = ts->cb;
+ opaque = ts->opaque;
+
+ /* run the callback (the timer list can be modified) */
+ qemu_mutex_unlock(&timer_list->active_timers_lock);
+ cb(opaque);
+ qemu_mutex_lock(&timer_list->active_timers_lock);
+
+ progress = true;
+ }
+ qemu_mutex_unlock(&timer_list->active_timers_lock);
+
+out:
+ qemu_event_set(&timer_list->timers_done_ev);
+ return progress;
+}
+
+bool qemu_clock_run_timers(QEMUClockType type)
+{
+ return timerlist_run_timers(main_loop_tlg.tl[type]);
+}
+
+void timerlistgroup_init(QEMUTimerListGroup *tlg,
+ QEMUTimerListNotifyCB *cb, void *opaque)
+{
+ QEMUClockType type;
+ for (type = 0; type < QEMU_CLOCK_MAX; type++) {
+ tlg->tl[type] = timerlist_new(type, cb, opaque);
+ }
+}
+
+void timerlistgroup_deinit(QEMUTimerListGroup *tlg)
+{
+ QEMUClockType type;
+ for (type = 0; type < QEMU_CLOCK_MAX; type++) {
+ timerlist_free(tlg->tl[type]);
+ }
+}
+
+bool timerlistgroup_run_timers(QEMUTimerListGroup *tlg)
+{
+ QEMUClockType type;
+ bool progress = false;
+ for (type = 0; type < QEMU_CLOCK_MAX; type++) {
+ progress |= timerlist_run_timers(tlg->tl[type]);
+ }
+ return progress;
+}
+
+int64_t timerlistgroup_deadline_ns(QEMUTimerListGroup *tlg)
+{
+ int64_t deadline = -1;
+ QEMUClockType type;
+ for (type = 0; type < QEMU_CLOCK_MAX; type++) {
+ if (qemu_clock_use_for_deadline(type)) {
+ deadline = qemu_soonest_timeout(deadline,
+ timerlist_deadline_ns(tlg->tl[type]));
+ }
+ }
+ return deadline;
+}
+
+int64_t qemu_clock_get_ns(QEMUClockType type)
+{
+ switch (type) {
+ case QEMU_CLOCK_REALTIME:
+ return get_clock();
+ default:
+ case QEMU_CLOCK_VIRTUAL:
+ return cpus_get_virtual_clock();
+ case QEMU_CLOCK_HOST:
+ return REPLAY_CLOCK(REPLAY_CLOCK_HOST, get_clock_realtime());
+ case QEMU_CLOCK_VIRTUAL_RT:
+ return REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT, cpu_get_clock());
+ }
+}
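+
+/*
+ * Note the naming: QEMU_CLOCK_REALTIME is backed by get_clock(), the
+ * host's monotonic clock, whereas QEMU_CLOCK_HOST follows the host's wall
+ * clock and can therefore jump backwards or forwards when the host time
+ * is changed (e.g. by NTP).
+ */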
+
+void init_clocks(QEMUTimerListNotifyCB *notify_cb)
+{
+ QEMUClockType type;
+ for (type = 0; type < QEMU_CLOCK_MAX; type++) {
+ qemu_clock_init(type, notify_cb);
+ }
+
+#ifdef CONFIG_PRCTL_PR_SET_TIMERSLACK
+ prctl(PR_SET_TIMERSLACK, 1, 0, 0, 0);
+#endif
+}
+
+uint64_t timer_expire_time_ns(QEMUTimer *ts)
+{
+ return timer_pending(ts) ? ts->expire_time : -1;
+}
+
+bool qemu_clock_run_all_timers(void)
+{
+ bool progress = false;
+ QEMUClockType type;
+
+ for (type = 0; type < QEMU_CLOCK_MAX; type++) {
+ if (qemu_clock_use_for_deadline(type)) {
+ progress |= qemu_clock_run_timers(type);
+ }
+ }
+
+ return progress;
+}
diff --git a/util/qht.c b/util/qht.c
new file mode 100644
index 000000000..079605121
--- /dev/null
+++ b/util/qht.c
@@ -0,0 +1,963 @@
+/*
+ * qht.c - QEMU Hash Table, designed to scale for read-mostly workloads.
+ *
+ * Copyright (C) 2016, Emilio G. Cota <cota@braap.org>
+ *
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ * Assumptions:
+ * - NULL cannot be inserted/removed as a pointer value.
+ * - Trying to insert an already-existing hash-pointer pair is OK. However,
+ * it is not OK to insert into the same hash table different hash-pointer
+ *   pairs that share the same pointer value but have different hashes.
+ * - Lookups are performed under an RCU read-critical section; removals
+ * must wait for a grace period to elapse before freeing removed objects.
+ *
+ * Features:
+ * - Reads (i.e. lookups and iterators) can be concurrent with other reads.
+ * Lookups that are concurrent with writes to the same bucket will retry
+ * via a seqlock; iterators acquire all bucket locks and therefore can be
+ * concurrent with lookups and are serialized wrt writers.
+ * - Writes (i.e. insertions/removals) can be concurrent with writes to
+ * different buckets; writes to the same bucket are serialized through a lock.
+ * - Optional auto-resizing: the hash table resizes up if the load surpasses
+ * a certain threshold. Resizing is done concurrently with readers; writes
+ * are serialized with the resize operation.
+ *
+ * The key structure is the bucket, which is cacheline-sized. Buckets
+ * contain a few hash values and pointers; the u32 hash values are stored in
+ * full so that resizing is fast. Having this structure instead of directly
+ * chaining items has two advantages:
+ * - Failed lookups fail fast, and touch a minimum number of cache lines.
+ * - Resizing the hash table with concurrent lookups is easy.
+ *
+ * There are two types of buckets:
+ * 1. "head" buckets are the ones allocated in the array of buckets in qht_map.
+ * 2. all "non-head" buckets (i.e. all others) are members of a chain that
+ * starts from a head bucket.
+ * Note that the seqlock and spinlock of a head bucket applies to all buckets
+ * chained to it; these two fields are unused in non-head buckets.
+ *
+ * On removals, we move the last valid item in the chain to the position of the
+ * just-removed entry. This makes lookups slightly faster, since the moment an
+ * invalid entry is found, the (failed) lookup is over.
+ *
+ * Resizing is done by taking all bucket spinlocks (so that no other writers can
+ * race with us) and then copying all entries into a new hash map. Then, the
+ * ht->map pointer is set, and the old map is freed once no RCU readers can see
+ * it anymore.
+ *
+ * Writers check for concurrent resizes by comparing ht->map before and after
+ * acquiring their bucket lock. If they don't match, a resize has occurred
+ * while the bucket spinlock was being acquired.
+ *
+ * Related Work:
+ * - Idea of cacheline-sized buckets with full hashes taken from:
+ * David, Guerraoui & Trigonakis, "Asynchronized Concurrency:
+ * The Secret to Scaling Concurrent Search Data Structures", ASPLOS'15.
+ * - Why not RCU-based hash tables? They would allow us to get rid of the
+ * seqlock, but resizing would take forever since RCU read critical
+ * sections in QEMU take quite a long time.
+ * More info on relativistic hash tables:
+ * + Triplett, McKenney & Walpole, "Resizable, Scalable, Concurrent Hash
+ * Tables via Relativistic Programming", USENIX ATC'11.
+ * + Corbet, "Relativistic hash tables, part 1: Algorithms", @ lwn.net, 2014.
+ * https://lwn.net/Articles/612021/
+ */
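+
+/*
+ * Example (illustrative sketch; MyEntry, my_cmp and the choice of
+ * qemu_xxhash2() as the hash are hypothetical -- any stable hash works,
+ * as long as insertion, lookup and removal all use the same one).
+ * Lookups must run inside an RCU read-side critical section, per the
+ * rules above:
+ *
+ *     typedef struct { uint64_t key; void *val; } MyEntry;
+ *
+ *     static bool my_cmp(const void *a, const void *b)
+ *     {
+ *         return ((const MyEntry *)a)->key == ((const MyEntry *)b)->key;
+ *     }
+ *
+ *     struct qht ht;
+ *     qht_init(&ht, my_cmp, 128, QHT_MODE_AUTO_RESIZE);
+ *
+ *     MyEntry *e = g_new(MyEntry, 1);
+ *     e->key = 42;
+ *     qht_insert(&ht, e, qemu_xxhash2(e->key), NULL);
+ *
+ *     MyEntry needle = { .key = 42 };
+ *     rcu_read_lock();
+ *     MyEntry *found = qht_lookup(&ht, &needle, qemu_xxhash2(42));
+ *     rcu_read_unlock();
+ */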
+#include "qemu/osdep.h"
+#include "qemu/qht.h"
+#include "qemu/atomic.h"
+#include "qemu/rcu.h"
+
+//#define QHT_DEBUG
+
+/*
+ * We want to avoid false sharing of cache lines. Most systems have 64-byte
+ * cache lines so we go with it for simplicity.
+ *
+ * Note that systems with smaller cache lines will be fine (the struct is
+ * almost 64 bytes); systems with larger cache lines might suffer from
+ * some false sharing.
+ */
+#define QHT_BUCKET_ALIGN 64
+
+/* define these to keep sizeof(qht_bucket) within QHT_BUCKET_ALIGN */
+#if HOST_LONG_BITS == 32
+#define QHT_BUCKET_ENTRIES 6
+#else /* 64-bit */
+#define QHT_BUCKET_ENTRIES 4
+#endif
+
+enum qht_iter_type {
+ QHT_ITER_VOID, /* do nothing; use retvoid */
+ QHT_ITER_RM, /* remove element if retbool returns true */
+};
+
+struct qht_iter {
+ union {
+ qht_iter_func_t retvoid;
+ qht_iter_bool_func_t retbool;
+ } f;
+ enum qht_iter_type type;
+};
+
+/*
+ * Do _not_ use qemu_mutex_[try]lock directly! Use these macros, otherwise
+ * the profiler (QSP) will deadlock.
+ */
+static inline void qht_lock(struct qht *ht)
+{
+ if (ht->mode & QHT_MODE_RAW_MUTEXES) {
+ qemu_mutex_lock__raw(&ht->lock);
+ } else {
+ qemu_mutex_lock(&ht->lock);
+ }
+}
+
+static inline int qht_trylock(struct qht *ht)
+{
+ if (ht->mode & QHT_MODE_RAW_MUTEXES) {
+ return qemu_mutex_trylock__raw(&ht->lock);
+ }
+ return qemu_mutex_trylock(&ht->lock);
+}
+
+/* this inline is not really necessary, but it helps keep code consistent */
+static inline void qht_unlock(struct qht *ht)
+{
+ qemu_mutex_unlock(&ht->lock);
+}
+
+/*
+ * Note: reading partially-updated pointers in @pointers could lead to
+ * segfaults. We thus access them with qatomic_read/set; this guarantees
+ * that the compiler makes all those accesses atomic. We also need the
+ * volatile-like behavior in qatomic_read, since otherwise the compiler
+ * might refetch the pointer.
+ * qatomic_read's are of course not necessary when the bucket lock is held.
+ *
+ * If both ht->lock and b->lock are grabbed, ht->lock should always
+ * be grabbed first.
+ */
+struct qht_bucket {
+ QemuSpin lock;
+ QemuSeqLock sequence;
+ uint32_t hashes[QHT_BUCKET_ENTRIES];
+ void *pointers[QHT_BUCKET_ENTRIES];
+ struct qht_bucket *next;
+} QEMU_ALIGNED(QHT_BUCKET_ALIGN);
+
+QEMU_BUILD_BUG_ON(sizeof(struct qht_bucket) > QHT_BUCKET_ALIGN);
+
+/**
+ * struct qht_map - structure to track an array of buckets
+ * @rcu: used by RCU. Keep it as the top field in the struct to help valgrind
+ * find the whole struct.
+ * @buckets: array of head buckets. It is constant once the map is created.
+ * @n_buckets: number of head buckets. It is constant once the map is created.
+ * @n_added_buckets: number of added (i.e. "non-head") buckets
+ * @n_added_buckets_threshold: threshold to trigger an upward resize once the
+ * number of added buckets surpasses it.
+ *
+ * Buckets are tracked in what we call a "map", i.e. this structure.
+ */
+struct qht_map {
+ struct rcu_head rcu;
+ struct qht_bucket *buckets;
+ size_t n_buckets;
+ size_t n_added_buckets;
+ size_t n_added_buckets_threshold;
+};
+
+/* trigger a resize when n_added_buckets > n_buckets / div */
+#define QHT_NR_ADDED_BUCKETS_THRESHOLD_DIV 8
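+/* e.g. with 1024 head buckets, an upward resize triggers once more than
+ * 1024 / 8 = 128 non-head buckets have been added */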
+
+static void qht_do_resize_reset(struct qht *ht, struct qht_map *new,
+ bool reset);
+static void qht_grow_maybe(struct qht *ht);
+
+#ifdef QHT_DEBUG
+
+#define qht_debug_assert(X) do { assert(X); } while (0)
+
+static void qht_bucket_debug__locked(struct qht_bucket *b)
+{
+ bool seen_empty = false;
+ bool corrupt = false;
+ int i;
+
+ do {
+ for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+ if (b->pointers[i] == NULL) {
+ seen_empty = true;
+ continue;
+ }
+ if (seen_empty) {
+ fprintf(stderr, "%s: b: %p, pos: %i, hash: 0x%x, p: %p\n",
+ __func__, b, i, b->hashes[i], b->pointers[i]);
+ corrupt = true;
+ }
+ }
+ b = b->next;
+ } while (b);
+ qht_debug_assert(!corrupt);
+}
+
+static void qht_map_debug__all_locked(struct qht_map *map)
+{
+ int i;
+
+ for (i = 0; i < map->n_buckets; i++) {
+ qht_bucket_debug__locked(&map->buckets[i]);
+ }
+}
+#else
+
+#define qht_debug_assert(X) do { (void)(X); } while (0)
+
+static inline void qht_bucket_debug__locked(struct qht_bucket *b)
+{ }
+
+static inline void qht_map_debug__all_locked(struct qht_map *map)
+{ }
+#endif /* QHT_DEBUG */
+
+static inline size_t qht_elems_to_buckets(size_t n_elems)
+{
+ return pow2ceil(n_elems / QHT_BUCKET_ENTRIES);
+}
+
+static inline void qht_head_init(struct qht_bucket *b)
+{
+ memset(b, 0, sizeof(*b));
+ qemu_spin_init(&b->lock);
+ seqlock_init(&b->sequence);
+}
+
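+/* n_buckets is always a power of two (see qht_elems_to_buckets), so the
+ * mask below is equivalent to hash % n_buckets */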
+static inline
+struct qht_bucket *qht_map_to_bucket(const struct qht_map *map, uint32_t hash)
+{
+ return &map->buckets[hash & (map->n_buckets - 1)];
+}
+
+/* acquire all bucket locks from a map */
+static void qht_map_lock_buckets(struct qht_map *map)
+{
+ size_t i;
+
+ for (i = 0; i < map->n_buckets; i++) {
+ struct qht_bucket *b = &map->buckets[i];
+
+ qemu_spin_lock(&b->lock);
+ }
+}
+
+static void qht_map_unlock_buckets(struct qht_map *map)
+{
+ size_t i;
+
+ for (i = 0; i < map->n_buckets; i++) {
+ struct qht_bucket *b = &map->buckets[i];
+
+ qemu_spin_unlock(&b->lock);
+ }
+}
+
+/*
+ * Call with at least a bucket lock held.
+ * @map should be the value read before acquiring the lock (or locks).
+ */
+static inline bool qht_map_is_stale__locked(const struct qht *ht,
+ const struct qht_map *map)
+{
+ return map != ht->map;
+}
+
+/*
+ * Grab all bucket locks, and set @pmap after making sure the map isn't stale.
+ *
+ * Pairs with qht_map_unlock_buckets(), hence the pass-by-reference.
+ *
+ * Note: callers cannot have ht->lock held.
+ */
+static inline
+void qht_map_lock_buckets__no_stale(struct qht *ht, struct qht_map **pmap)
+{
+ struct qht_map *map;
+
+ map = qatomic_rcu_read(&ht->map);
+ qht_map_lock_buckets(map);
+ if (likely(!qht_map_is_stale__locked(ht, map))) {
+ *pmap = map;
+ return;
+ }
+ qht_map_unlock_buckets(map);
+
+ /* we raced with a resize; acquire ht->lock to see the updated ht->map */
+ qht_lock(ht);
+ map = ht->map;
+ qht_map_lock_buckets(map);
+ qht_unlock(ht);
+ *pmap = map;
+ return;
+}
+
+/*
+ * Get a head bucket and lock it, making sure its parent map is not stale.
+ * @pmap is filled with a pointer to the bucket's parent map.
+ *
+ * Unlock with qemu_spin_unlock(&b->lock).
+ *
+ * Note: callers cannot have ht->lock held.
+ */
+static inline
+struct qht_bucket *qht_bucket_lock__no_stale(struct qht *ht, uint32_t hash,
+ struct qht_map **pmap)
+{
+ struct qht_bucket *b;
+ struct qht_map *map;
+
+ map = qatomic_rcu_read(&ht->map);
+ b = qht_map_to_bucket(map, hash);
+
+ qemu_spin_lock(&b->lock);
+ if (likely(!qht_map_is_stale__locked(ht, map))) {
+ *pmap = map;
+ return b;
+ }
+ qemu_spin_unlock(&b->lock);
+
+ /* we raced with a resize; acquire ht->lock to see the updated ht->map */
+ qht_lock(ht);
+ map = ht->map;
+ b = qht_map_to_bucket(map, hash);
+ qemu_spin_lock(&b->lock);
+ qht_unlock(ht);
+ *pmap = map;
+ return b;
+}
+
+static inline bool qht_map_needs_resize(const struct qht_map *map)
+{
+ return qatomic_read(&map->n_added_buckets) >
+ map->n_added_buckets_threshold;
+}
+
+static inline void qht_chain_destroy(const struct qht_bucket *head)
+{
+ struct qht_bucket *curr = head->next;
+ struct qht_bucket *prev;
+
+ qemu_spin_destroy(&head->lock);
+ while (curr) {
+ prev = curr;
+ curr = curr->next;
+ qemu_vfree(prev);
+ }
+}
+
+/* pass only an orphan map */
+static void qht_map_destroy(struct qht_map *map)
+{
+ size_t i;
+
+ for (i = 0; i < map->n_buckets; i++) {
+ qht_chain_destroy(&map->buckets[i]);
+ }
+ qemu_vfree(map->buckets);
+ g_free(map);
+}
+
+static struct qht_map *qht_map_create(size_t n_buckets)
+{
+ struct qht_map *map;
+ size_t i;
+
+ map = g_malloc(sizeof(*map));
+ map->n_buckets = n_buckets;
+
+ map->n_added_buckets = 0;
+ map->n_added_buckets_threshold = n_buckets /
+ QHT_NR_ADDED_BUCKETS_THRESHOLD_DIV;
+
+ /* let tiny hash tables add at least one non-head bucket */
+ if (unlikely(map->n_added_buckets_threshold == 0)) {
+ map->n_added_buckets_threshold = 1;
+ }
+
+ map->buckets = qemu_memalign(QHT_BUCKET_ALIGN,
+ sizeof(*map->buckets) * n_buckets);
+ for (i = 0; i < n_buckets; i++) {
+ qht_head_init(&map->buckets[i]);
+ }
+ return map;
+}
+
+void qht_init(struct qht *ht, qht_cmp_func_t cmp, size_t n_elems,
+ unsigned int mode)
+{
+ struct qht_map *map;
+ size_t n_buckets = qht_elems_to_buckets(n_elems);
+
+ g_assert(cmp);
+ ht->cmp = cmp;
+ ht->mode = mode;
+ qemu_mutex_init(&ht->lock);
+ map = qht_map_create(n_buckets);
+ qatomic_rcu_set(&ht->map, map);
+}
+
+/* call only when there are no readers/writers left */
+void qht_destroy(struct qht *ht)
+{
+ qht_map_destroy(ht->map);
+ memset(ht, 0, sizeof(*ht));
+}
+
+static void qht_bucket_reset__locked(struct qht_bucket *head)
+{
+ struct qht_bucket *b = head;
+ int i;
+
+ seqlock_write_begin(&head->sequence);
+ do {
+ for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+ if (b->pointers[i] == NULL) {
+ goto done;
+ }
+ qatomic_set(&b->hashes[i], 0);
+ qatomic_set(&b->pointers[i], NULL);
+ }
+ b = b->next;
+ } while (b);
+ done:
+ seqlock_write_end(&head->sequence);
+}
+
+/* call with all bucket locks held */
+static void qht_map_reset__all_locked(struct qht_map *map)
+{
+ size_t i;
+
+ for (i = 0; i < map->n_buckets; i++) {
+ qht_bucket_reset__locked(&map->buckets[i]);
+ }
+ qht_map_debug__all_locked(map);
+}
+
+void qht_reset(struct qht *ht)
+{
+ struct qht_map *map;
+
+ qht_map_lock_buckets__no_stale(ht, &map);
+ qht_map_reset__all_locked(map);
+ qht_map_unlock_buckets(map);
+}
+
+static inline void qht_do_resize(struct qht *ht, struct qht_map *new)
+{
+ qht_do_resize_reset(ht, new, false);
+}
+
+static inline void qht_do_resize_and_reset(struct qht *ht, struct qht_map *new)
+{
+ qht_do_resize_reset(ht, new, true);
+}
+
+bool qht_reset_size(struct qht *ht, size_t n_elems)
+{
+ struct qht_map *new = NULL;
+ struct qht_map *map;
+ size_t n_buckets;
+
+ n_buckets = qht_elems_to_buckets(n_elems);
+
+ qht_lock(ht);
+ map = ht->map;
+ if (n_buckets != map->n_buckets) {
+ new = qht_map_create(n_buckets);
+ }
+ qht_do_resize_and_reset(ht, new);
+ qht_unlock(ht);
+
+ return !!new;
+}
+
+static inline
+void *qht_do_lookup(const struct qht_bucket *head, qht_lookup_func_t func,
+ const void *userp, uint32_t hash)
+{
+ const struct qht_bucket *b = head;
+ int i;
+
+ do {
+ for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+ if (qatomic_read(&b->hashes[i]) == hash) {
+ /* The pointer is dereferenced before seqlock_read_retry,
+ * so (unlike qht_insert__locked) we need to use
+ * qatomic_rcu_read here.
+ */
+ void *p = qatomic_rcu_read(&b->pointers[i]);
+
+ if (likely(p) && likely(func(p, userp))) {
+ return p;
+ }
+ }
+ }
+ b = qatomic_rcu_read(&b->next);
+ } while (b);
+
+ return NULL;
+}
+
+static __attribute__((noinline))
+void *qht_lookup__slowpath(const struct qht_bucket *b, qht_lookup_func_t func,
+ const void *userp, uint32_t hash)
+{
+ unsigned int version;
+ void *ret;
+
+ do {
+ version = seqlock_read_begin(&b->sequence);
+ ret = qht_do_lookup(b, func, userp, hash);
+ } while (seqlock_read_retry(&b->sequence, version));
+ return ret;
+}
+
+void *qht_lookup_custom(const struct qht *ht, const void *userp, uint32_t hash,
+ qht_lookup_func_t func)
+{
+ const struct qht_bucket *b;
+ const struct qht_map *map;
+ unsigned int version;
+ void *ret;
+
+ map = qatomic_rcu_read(&ht->map);
+ b = qht_map_to_bucket(map, hash);
+
+ version = seqlock_read_begin(&b->sequence);
+ ret = qht_do_lookup(b, func, userp, hash);
+ if (likely(!seqlock_read_retry(&b->sequence, version))) {
+ return ret;
+ }
+ /*
+ * Removing the do/while from the fastpath gives a 4% perf. increase when
+ * running a 100%-lookup microbenchmark.
+ */
+ return qht_lookup__slowpath(b, func, userp, hash);
+}
+
+void *qht_lookup(const struct qht *ht, const void *userp, uint32_t hash)
+{
+ return qht_lookup_custom(ht, userp, hash, ht->cmp);
+}
+
+/*
+ * call with head->lock held
+ * @ht is const since it is only used for ht->cmp()
+ */
+static void *qht_insert__locked(const struct qht *ht, struct qht_map *map,
+ struct qht_bucket *head, void *p, uint32_t hash,
+ bool *needs_resize)
+{
+ struct qht_bucket *b = head;
+ struct qht_bucket *prev = NULL;
+ struct qht_bucket *new = NULL;
+ int i;
+
+ do {
+ for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+ if (b->pointers[i]) {
+ if (unlikely(b->hashes[i] == hash &&
+ ht->cmp(b->pointers[i], p))) {
+ return b->pointers[i];
+ }
+ } else {
+ goto found;
+ }
+ }
+ prev = b;
+ b = b->next;
+ } while (b);
+
+ b = qemu_memalign(QHT_BUCKET_ALIGN, sizeof(*b));
+ memset(b, 0, sizeof(*b));
+ new = b;
+ i = 0;
+ qatomic_inc(&map->n_added_buckets);
+ if (unlikely(qht_map_needs_resize(map)) && needs_resize) {
+ *needs_resize = true;
+ }
+
+ found:
+ /* found an empty slot: acquire the seqlock and write */
+ seqlock_write_begin(&head->sequence);
+ if (new) {
+ qatomic_rcu_set(&prev->next, b);
+ }
+ /* smp_wmb() implicit in seqlock_write_begin. */
+ qatomic_set(&b->hashes[i], hash);
+ qatomic_set(&b->pointers[i], p);
+ seqlock_write_end(&head->sequence);
+ return NULL;
+}
+
+static __attribute__((noinline)) void qht_grow_maybe(struct qht *ht)
+{
+ struct qht_map *map;
+
+ /*
+ * If the lock is taken it probably means there's an ongoing resize,
+ * so bail out.
+ */
+ if (qht_trylock(ht)) {
+ return;
+ }
+ map = ht->map;
+ /* another thread might have just performed the resize we were after */
+ if (qht_map_needs_resize(map)) {
+ struct qht_map *new = qht_map_create(map->n_buckets * 2);
+
+ qht_do_resize(ht, new);
+ }
+ qht_unlock(ht);
+}
+
+bool qht_insert(struct qht *ht, void *p, uint32_t hash, void **existing)
+{
+ struct qht_bucket *b;
+ struct qht_map *map;
+ bool needs_resize = false;
+ void *prev;
+
+ /* NULL pointers are not supported */
+ qht_debug_assert(p);
+
+ b = qht_bucket_lock__no_stale(ht, hash, &map);
+ prev = qht_insert__locked(ht, map, b, p, hash, &needs_resize);
+ qht_bucket_debug__locked(b);
+ qemu_spin_unlock(&b->lock);
+
+ if (unlikely(needs_resize) && ht->mode & QHT_MODE_AUTO_RESIZE) {
+ qht_grow_maybe(ht);
+ }
+ if (likely(prev == NULL)) {
+ return true;
+ }
+ if (existing) {
+ *existing = prev;
+ }
+ return false;
+}
+
+static inline bool qht_entry_is_last(const struct qht_bucket *b, int pos)
+{
+ if (pos == QHT_BUCKET_ENTRIES - 1) {
+ if (b->next == NULL) {
+ return true;
+ }
+ return b->next->pointers[0] == NULL;
+ }
+ return b->pointers[pos + 1] == NULL;
+}
+
+static void
+qht_entry_move(struct qht_bucket *to, int i, struct qht_bucket *from, int j)
+{
+ qht_debug_assert(!(to == from && i == j));
+ qht_debug_assert(to->pointers[i]);
+ qht_debug_assert(from->pointers[j]);
+
+ qatomic_set(&to->hashes[i], from->hashes[j]);
+ qatomic_set(&to->pointers[i], from->pointers[j]);
+
+ qatomic_set(&from->hashes[j], 0);
+ qatomic_set(&from->pointers[j], NULL);
+}
+
+/*
+ * Find the last valid entry in @orig, and swap it with @orig[pos], which has
+ * just been invalidated.
+ */
+static inline void qht_bucket_remove_entry(struct qht_bucket *orig, int pos)
+{
+ struct qht_bucket *b = orig;
+ struct qht_bucket *prev = NULL;
+ int i;
+
+ if (qht_entry_is_last(orig, pos)) {
+ orig->hashes[pos] = 0;
+ qatomic_set(&orig->pointers[pos], NULL);
+ return;
+ }
+ do {
+ for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+ if (b->pointers[i]) {
+ continue;
+ }
+ if (i > 0) {
+ return qht_entry_move(orig, pos, b, i - 1);
+ }
+ qht_debug_assert(prev);
+ return qht_entry_move(orig, pos, prev, QHT_BUCKET_ENTRIES - 1);
+ }
+ prev = b;
+ b = b->next;
+ } while (b);
+ /* no free entries other than orig[pos], so swap it with the last one */
+ qht_entry_move(orig, pos, prev, QHT_BUCKET_ENTRIES - 1);
+}
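+
+/*
+ * For example, removing the second entry from a bucket holding
+ * [ A B C D ] yields [ A D C ]: the last valid entry (D) is moved into
+ * the vacated slot, so lookups can still stop at the first NULL pointer.
+ */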
+
+/* call with b->lock held */
+static inline
+bool qht_remove__locked(struct qht_bucket *head, const void *p, uint32_t hash)
+{
+ struct qht_bucket *b = head;
+ int i;
+
+ do {
+ for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+ void *q = b->pointers[i];
+
+ if (unlikely(q == NULL)) {
+ return false;
+ }
+ if (q == p) {
+ qht_debug_assert(b->hashes[i] == hash);
+ seqlock_write_begin(&head->sequence);
+ qht_bucket_remove_entry(b, i);
+ seqlock_write_end(&head->sequence);
+ return true;
+ }
+ }
+ b = b->next;
+ } while (b);
+ return false;
+}
+
+bool qht_remove(struct qht *ht, const void *p, uint32_t hash)
+{
+ struct qht_bucket *b;
+ struct qht_map *map;
+ bool ret;
+
+ /* NULL pointers are not supported */
+ qht_debug_assert(p);
+
+ b = qht_bucket_lock__no_stale(ht, hash, &map);
+ ret = qht_remove__locked(b, p, hash);
+ qht_bucket_debug__locked(b);
+ qemu_spin_unlock(&b->lock);
+ return ret;
+}
+
+static inline void qht_bucket_iter(struct qht_bucket *head,
+ const struct qht_iter *iter, void *userp)
+{
+ struct qht_bucket *b = head;
+ int i;
+
+ do {
+ for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+ if (b->pointers[i] == NULL) {
+ return;
+ }
+ switch (iter->type) {
+ case QHT_ITER_VOID:
+ iter->f.retvoid(b->pointers[i], b->hashes[i], userp);
+ break;
+ case QHT_ITER_RM:
+ if (iter->f.retbool(b->pointers[i], b->hashes[i], userp)) {
+ /* replace i with the last valid element in the bucket */
+ seqlock_write_begin(&head->sequence);
+ qht_bucket_remove_entry(b, i);
+ seqlock_write_end(&head->sequence);
+ qht_bucket_debug__locked(b);
+ /* reevaluate i, since it just got replaced */
+ i--;
+ continue;
+ }
+ break;
+ default:
+ g_assert_not_reached();
+ }
+ }
+ b = b->next;
+ } while (b);
+}
+
+/* call with all of the map's locks held */
+static inline void qht_map_iter__all_locked(struct qht_map *map,
+ const struct qht_iter *iter,
+ void *userp)
+{
+ size_t i;
+
+ for (i = 0; i < map->n_buckets; i++) {
+ qht_bucket_iter(&map->buckets[i], iter, userp);
+ }
+}
+
+static inline void
+do_qht_iter(struct qht *ht, const struct qht_iter *iter, void *userp)
+{
+ struct qht_map *map;
+
+ map = qatomic_rcu_read(&ht->map);
+ qht_map_lock_buckets(map);
+ qht_map_iter__all_locked(map, iter, userp);
+ qht_map_unlock_buckets(map);
+}
+
+void qht_iter(struct qht *ht, qht_iter_func_t func, void *userp)
+{
+ const struct qht_iter iter = {
+ .f.retvoid = func,
+ .type = QHT_ITER_VOID,
+ };
+
+ do_qht_iter(ht, &iter, userp);
+}
+
+void qht_iter_remove(struct qht *ht, qht_iter_bool_func_t func, void *userp)
+{
+ const struct qht_iter iter = {
+ .f.retbool = func,
+ .type = QHT_ITER_RM,
+ };
+
+ do_qht_iter(ht, &iter, userp);
+}
+
+struct qht_map_copy_data {
+ struct qht *ht;
+ struct qht_map *new;
+};
+
+static void qht_map_copy(void *p, uint32_t hash, void *userp)
+{
+ struct qht_map_copy_data *data = userp;
+ struct qht *ht = data->ht;
+ struct qht_map *new = data->new;
+ struct qht_bucket *b = qht_map_to_bucket(new, hash);
+
+ /* no need to acquire b->lock because no thread has seen this map yet */
+ qht_insert__locked(ht, new, b, p, hash, NULL);
+}
+
+/*
+ * Atomically perform a resize and/or reset.
+ * Call with ht->lock held.
+ */
+static void qht_do_resize_reset(struct qht *ht, struct qht_map *new, bool reset)
+{
+ struct qht_map *old;
+ const struct qht_iter iter = {
+ .f.retvoid = qht_map_copy,
+ .type = QHT_ITER_VOID,
+ };
+ struct qht_map_copy_data data;
+
+ old = ht->map;
+ qht_map_lock_buckets(old);
+
+ if (reset) {
+ qht_map_reset__all_locked(old);
+ }
+
+ if (new == NULL) {
+ qht_map_unlock_buckets(old);
+ return;
+ }
+
+ g_assert(new->n_buckets != old->n_buckets);
+ data.ht = ht;
+ data.new = new;
+ qht_map_iter__all_locked(old, &iter, &data);
+ qht_map_debug__all_locked(new);
+
+ qatomic_rcu_set(&ht->map, new);
+ qht_map_unlock_buckets(old);
+ call_rcu(old, qht_map_destroy, rcu);
+}
+
+bool qht_resize(struct qht *ht, size_t n_elems)
+{
+ size_t n_buckets = qht_elems_to_buckets(n_elems);
+ bool ret = false;
+
+ qht_lock(ht);
+ if (n_buckets != ht->map->n_buckets) {
+ struct qht_map *new;
+
+ new = qht_map_create(n_buckets);
+ qht_do_resize(ht, new);
+ ret = true;
+ }
+ qht_unlock(ht);
+
+ return ret;
+}
+
+/* pass @stats to qht_statistics_destroy() when done */
+void qht_statistics_init(const struct qht *ht, struct qht_stats *stats)
+{
+ const struct qht_map *map;
+ int i;
+
+ map = qatomic_rcu_read(&ht->map);
+
+ stats->used_head_buckets = 0;
+ stats->entries = 0;
+ qdist_init(&stats->chain);
+ qdist_init(&stats->occupancy);
+ /* bail out if the qht has not yet been initialized */
+ if (unlikely(map == NULL)) {
+ stats->head_buckets = 0;
+ return;
+ }
+ stats->head_buckets = map->n_buckets;
+
+ for (i = 0; i < map->n_buckets; i++) {
+ const struct qht_bucket *head = &map->buckets[i];
+ const struct qht_bucket *b;
+ unsigned int version;
+ size_t buckets;
+ size_t entries;
+ int j;
+
+ do {
+ version = seqlock_read_begin(&head->sequence);
+ buckets = 0;
+ entries = 0;
+ b = head;
+ do {
+ for (j = 0; j < QHT_BUCKET_ENTRIES; j++) {
+ if (qatomic_read(&b->pointers[j]) == NULL) {
+ break;
+ }
+ entries++;
+ }
+ buckets++;
+ b = qatomic_rcu_read(&b->next);
+ } while (b);
+ } while (seqlock_read_retry(&head->sequence, version));
+
+ if (entries) {
+ qdist_inc(&stats->chain, buckets);
+ qdist_inc(&stats->occupancy,
+ (double)entries / QHT_BUCKET_ENTRIES / buckets);
+ stats->used_head_buckets++;
+ stats->entries += entries;
+ } else {
+ qdist_inc(&stats->occupancy, 0);
+ }
+ }
+}
+
+void qht_statistics_destroy(struct qht_stats *stats)
+{
+ qdist_destroy(&stats->occupancy);
+ qdist_destroy(&stats->chain);
+}
diff --git a/util/qsp.c b/util/qsp.c
new file mode 100644
index 000000000..8562b14a8
--- /dev/null
+++ b/util/qsp.c
@@ -0,0 +1,813 @@
+/*
+ * qsp.c - QEMU Synchronization Profiler
+ *
+ * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
+ *
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ * QSP profiles the time spent in synchronization primitives, which can
+ * help diagnose performance problems, e.g. scalability issues when
+ * contention is high.
+ *
+ * The primitives currently supported are mutexes, recursive mutexes and
+ * condition variables. Note that not all related functions are intercepted;
+ * instead we profile only those functions that can have a performance impact,
+ * either due to blocking (e.g. cond_wait, mutex_lock) or cache line
+ * contention (e.g. mutex_lock, mutex_trylock).
+ *
+ * QSP's design focuses on speed and scalability. This is achieved
+ * by having threads do their profiling entirely on thread-local data.
+ * The appropriate thread-local data is found via a QHT, i.e. a concurrent hash
+ * table. To aggregate data in order to generate a report, we iterate over
+ * all entries in the hash table. Depending on the number of threads and
+ * synchronization objects this might be expensive, but note that it is
+ * very rarely called -- reports are generated only when requested by users.
+ *
+ * Reports are generated as a table where each row represents a call site. A
+ * call site is the triplet formed by the __file__ and __LINE__ of the caller
+ * as well as the address of the "object" (i.e. mutex, rec. mutex or condvar)
+ * being operated on. Optionally, call sites that operate on different objects
+ * of the same type can be coalesced, which can be particularly useful when
+ * profiling dynamically-allocated objects.
+ *
+ * Alternative designs considered:
+ *
+ * - Use an off-the-shelf profiler such as mutrace. This is not a viable option
+ * for us because QEMU has __malloc_hook set (by one of the libraries it
+ * uses); leaving this hook unset is required to avoid deadlock in mutrace.
+ *
+ * - Use a glib HT for each thread, protecting each HT with its own lock.
+ * This isn't simpler than the current design, and is 10% slower in the
+ * atomic_add-bench microbenchmark (-m option).
+ *
+ * - For reports, just use a binary tree as we aggregate data, instead of having
+ * an intermediate hash table. This would simplify the code only slightly, but
+ * would perform badly if there were many threads and objects to track.
+ *
+ * - Wrap operations on qsp entries with RCU read-side critical sections, so
+ * that qsp_reset() can delete entries. Unfortunately, the overhead of calling
+ * rcu_read_lock/unlock slows down atomic_add-bench -m by 24%. Having
+ * a snapshot that is updated on qsp_reset() avoids this overhead.
+ *
+ * Related Work:
+ * - Lennart Poettering's mutrace: http://0pointer.de/blog/projects/mutrace.html
+ * - Lozi, David, Thomas, Lawall and Muller. "Remote Core Locking: Migrating
+ * Critical-Section Execution to Improve the Performance of Multithreaded
+ * Applications", USENIX ATC'12.
+ */
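+
+/*
+ * Example (illustrative sketch; the report size and sort order below are
+ * arbitrary choices):
+ *
+ *     qsp_enable();
+ *     (run the workload to be profiled)
+ *     qsp_report(20, QSP_SORT_BY_TOTAL_WAIT_TIME, false);
+ *
+ * qsp_reset() starts a new measurement window: reports generated after it
+ * only include activity recorded since the reset.
+ */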
+
+#include "qemu/osdep.h"
+#include "qemu/qemu-print.h"
+#include "qemu/thread.h"
+#include "qemu/timer.h"
+#include "qemu/qht.h"
+#include "qemu/rcu.h"
+#include "qemu/xxhash.h"
+
+enum QSPType {
+ QSP_MUTEX,
+ QSP_BQL_MUTEX,
+ QSP_REC_MUTEX,
+ QSP_CONDVAR,
+};
+
+struct QSPCallSite {
+ const void *obj;
+ const char *file; /* i.e. __FILE__; shortened later */
+ int line;
+ enum QSPType type;
+};
+typedef struct QSPCallSite QSPCallSite;
+
+struct QSPEntry {
+ void *thread_ptr;
+ const QSPCallSite *callsite;
+ aligned_uint64_t n_acqs;
+ aligned_uint64_t ns;
+ unsigned int n_objs; /* count of coalesced objs; only used for reporting */
+};
+typedef struct QSPEntry QSPEntry;
+
+struct QSPSnapshot {
+ struct rcu_head rcu;
+ struct qht ht;
+};
+typedef struct QSPSnapshot QSPSnapshot;
+
+/* initial sizing for hash tables */
+#define QSP_INITIAL_SIZE 64
+
+/* If this file is moved, QSP_REL_PATH should be updated accordingly */
+#define QSP_REL_PATH "util/qsp.c"
+
+/* this file's full path. Used to present all call sites with relative paths */
+static size_t qsp_qemu_path_len;
+
+/* the address of qsp_thread gives us a unique 'thread ID' */
+static __thread int qsp_thread;
+
+/*
+ * Call sites are the same for all threads, so we track them in a separate hash
+ * table to save memory.
+ */
+static struct qht qsp_callsite_ht;
+
+static struct qht qsp_ht;
+static QSPSnapshot *qsp_snapshot;
+static bool qsp_initialized, qsp_initializing;
+
+static const char * const qsp_typenames[] = {
+ [QSP_MUTEX] = "mutex",
+ [QSP_BQL_MUTEX] = "BQL mutex",
+ [QSP_REC_MUTEX] = "rec_mutex",
+ [QSP_CONDVAR] = "condvar",
+};
+
+QemuMutexLockFunc qemu_bql_mutex_lock_func = qemu_mutex_lock_impl;
+QemuMutexLockFunc qemu_mutex_lock_func = qemu_mutex_lock_impl;
+QemuMutexTrylockFunc qemu_mutex_trylock_func = qemu_mutex_trylock_impl;
+QemuRecMutexLockFunc qemu_rec_mutex_lock_func = qemu_rec_mutex_lock_impl;
+QemuRecMutexTrylockFunc qemu_rec_mutex_trylock_func =
+ qemu_rec_mutex_trylock_impl;
+QemuCondWaitFunc qemu_cond_wait_func = qemu_cond_wait_impl;
+QemuCondTimedWaitFunc qemu_cond_timedwait_func = qemu_cond_timedwait_impl;
+
+/*
+ * It pays off to _not_ hash callsite->file; hashing a string is slow, and
+ * without it we still get a pretty unique hash.
+ */
+static inline
+uint32_t do_qsp_callsite_hash(const QSPCallSite *callsite, uint64_t ab)
+{
+ uint64_t cd = (uint64_t)(uintptr_t)callsite->obj;
+ uint32_t e = callsite->line;
+ uint32_t f = callsite->type;
+
+ return qemu_xxhash6(ab, cd, e, f);
+}
+
+static inline
+uint32_t qsp_callsite_hash(const QSPCallSite *callsite)
+{
+ return do_qsp_callsite_hash(callsite, 0);
+}
+
+static inline uint32_t do_qsp_entry_hash(const QSPEntry *entry, uint64_t a)
+{
+ return do_qsp_callsite_hash(entry->callsite, a);
+}
+
+static uint32_t qsp_entry_hash(const QSPEntry *entry)
+{
+ return do_qsp_entry_hash(entry, (uint64_t)(uintptr_t)entry->thread_ptr);
+}
+
+static uint32_t qsp_entry_no_thread_hash(const QSPEntry *entry)
+{
+ return do_qsp_entry_hash(entry, 0);
+}
+
+/* without the objects we need to hash the file name to get a decent hash */
+static uint32_t qsp_entry_no_thread_obj_hash(const QSPEntry *entry)
+{
+ const QSPCallSite *callsite = entry->callsite;
+ uint64_t ab = g_str_hash(callsite->file);
+ uint64_t cd = callsite->line;
+ uint32_t e = callsite->type;
+
+ return qemu_xxhash5(ab, cd, e);
+}
+
+static bool qsp_callsite_cmp(const void *ap, const void *bp)
+{
+ const QSPCallSite *a = ap;
+ const QSPCallSite *b = bp;
+
+ return a == b ||
+ (a->obj == b->obj &&
+ a->line == b->line &&
+ a->type == b->type &&
+ (a->file == b->file || !strcmp(a->file, b->file)));
+}
+
+static bool qsp_callsite_no_obj_cmp(const void *ap, const void *bp)
+{
+ const QSPCallSite *a = ap;
+ const QSPCallSite *b = bp;
+
+ return a == b ||
+ (a->line == b->line &&
+ a->type == b->type &&
+ (a->file == b->file || !strcmp(a->file, b->file)));
+}
+
+static bool qsp_entry_no_thread_cmp(const void *ap, const void *bp)
+{
+ const QSPEntry *a = ap;
+ const QSPEntry *b = bp;
+
+ return qsp_callsite_cmp(a->callsite, b->callsite);
+}
+
+static bool qsp_entry_no_thread_obj_cmp(const void *ap, const void *bp)
+{
+ const QSPEntry *a = ap;
+ const QSPEntry *b = bp;
+
+ return qsp_callsite_no_obj_cmp(a->callsite, b->callsite);
+}
+
+static bool qsp_entry_cmp(const void *ap, const void *bp)
+{
+ const QSPEntry *a = ap;
+ const QSPEntry *b = bp;
+
+ return a->thread_ptr == b->thread_ptr &&
+ qsp_callsite_cmp(a->callsite, b->callsite);
+}
+
+/*
+ * Normally we'd call this from a constructor function, but we want it to work
+ * via libutil as well.
+ */
+static void qsp_do_init(void)
+{
+ /* make sure this file's path in the tree is up to date with QSP_REL_PATH */
+ g_assert(strstr(__FILE__, QSP_REL_PATH));
+ qsp_qemu_path_len = strlen(__FILE__) - strlen(QSP_REL_PATH);
+
+ qht_init(&qsp_ht, qsp_entry_cmp, QSP_INITIAL_SIZE,
+ QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
+ qht_init(&qsp_callsite_ht, qsp_callsite_cmp, QSP_INITIAL_SIZE,
+ QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
+}
+
+static __attribute__((noinline)) void qsp_init__slowpath(void)
+{
+ if (qatomic_cmpxchg(&qsp_initializing, false, true) == false) {
+ qsp_do_init();
+ qatomic_set(&qsp_initialized, true);
+ } else {
+ while (!qatomic_read(&qsp_initialized)) {
+ cpu_relax();
+ }
+ }
+}
+
+/* qsp_init() must be called from _all_ exported functions */
+static inline void qsp_init(void)
+{
+ if (likely(qatomic_read(&qsp_initialized))) {
+ return;
+ }
+ qsp_init__slowpath();
+}
+
+static QSPCallSite *qsp_callsite_find(const QSPCallSite *orig)
+{
+ QSPCallSite *callsite;
+ uint32_t hash;
+
+ hash = qsp_callsite_hash(orig);
+ callsite = qht_lookup(&qsp_callsite_ht, orig, hash);
+ if (callsite == NULL) {
+ void *existing = NULL;
+
+ callsite = g_new(QSPCallSite, 1);
+ memcpy(callsite, orig, sizeof(*callsite));
+ qht_insert(&qsp_callsite_ht, callsite, hash, &existing);
+ if (unlikely(existing)) {
+ g_free(callsite);
+ callsite = existing;
+ }
+ }
+ return callsite;
+}
+
+static QSPEntry *
+qsp_entry_create(struct qht *ht, const QSPEntry *entry, uint32_t hash)
+{
+ QSPEntry *e;
+ void *existing = NULL;
+
+ e = g_new0(QSPEntry, 1);
+ e->thread_ptr = entry->thread_ptr;
+ e->callsite = qsp_callsite_find(entry->callsite);
+
+ qht_insert(ht, e, hash, &existing);
+ if (unlikely(existing)) {
+ g_free(e);
+ e = existing;
+ }
+ return e;
+}
+
+static QSPEntry *
+qsp_entry_find(struct qht *ht, const QSPEntry *entry, uint32_t hash)
+{
+ QSPEntry *e;
+
+ e = qht_lookup(ht, entry, hash);
+ if (e == NULL) {
+ e = qsp_entry_create(ht, entry, hash);
+ }
+ return e;
+}
+
+/*
+ * Note: Entries are never removed, so callers do not have to be in an RCU
+ * read-side critical section.
+ */
+static QSPEntry *qsp_entry_get(const void *obj, const char *file, int line,
+ enum QSPType type)
+{
+ QSPCallSite callsite = {
+ .obj = obj,
+ .file = file,
+ .line = line,
+ .type = type,
+ };
+ QSPEntry orig;
+ uint32_t hash;
+
+ qsp_init();
+
+ orig.thread_ptr = &qsp_thread;
+ orig.callsite = &callsite;
+
+ hash = qsp_entry_hash(&orig);
+ return qsp_entry_find(&qsp_ht, &orig, hash);
+}
+
+/*
+ * @e is in the global hash table; it is only written to by the current thread,
+ * so we write to it atomically (as in "write once") to prevent torn reads.
+ */
+static inline void do_qsp_entry_record(QSPEntry *e, int64_t delta, bool acq)
+{
+ qatomic_set_u64(&e->ns, e->ns + delta);
+ if (acq) {
+ qatomic_set_u64(&e->n_acqs, e->n_acqs + 1);
+ }
+}
+
+static inline void qsp_entry_record(QSPEntry *e, int64_t delta)
+{
+ do_qsp_entry_record(e, delta, true);
+}
+
+#define QSP_GEN_VOID(type_, qsp_t_, func_, impl_) \
+ static void func_(type_ *obj, const char *file, int line) \
+ { \
+ QSPEntry *e; \
+ int64_t t0, t1; \
+ \
+ t0 = get_clock(); \
+ impl_(obj, file, line); \
+ t1 = get_clock(); \
+ \
+ e = qsp_entry_get(obj, file, line, qsp_t_); \
+ qsp_entry_record(e, t1 - t0); \
+ }
+
+#define QSP_GEN_RET1(type_, qsp_t_, func_, impl_) \
+ static int func_(type_ *obj, const char *file, int line) \
+ { \
+ QSPEntry *e; \
+ int64_t t0, t1; \
+ int err; \
+ \
+ t0 = get_clock(); \
+ err = impl_(obj, file, line); \
+ t1 = get_clock(); \
+ \
+ e = qsp_entry_get(obj, file, line, qsp_t_); \
+ do_qsp_entry_record(e, t1 - t0, !err); \
+ return err; \
+ }
+
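+/* e.g. the first line below generates qsp_bql_mutex_lock(), a wrapper
+ * that times qemu_mutex_lock_impl() and charges the elapsed ns to the
+ * caller's (file, line, object) entry */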
+QSP_GEN_VOID(QemuMutex, QSP_BQL_MUTEX, qsp_bql_mutex_lock, qemu_mutex_lock_impl)
+QSP_GEN_VOID(QemuMutex, QSP_MUTEX, qsp_mutex_lock, qemu_mutex_lock_impl)
+QSP_GEN_RET1(QemuMutex, QSP_MUTEX, qsp_mutex_trylock, qemu_mutex_trylock_impl)
+
+QSP_GEN_VOID(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_lock,
+ qemu_rec_mutex_lock_impl)
+QSP_GEN_RET1(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_trylock,
+ qemu_rec_mutex_trylock_impl)
+
+#undef QSP_GEN_RET1
+#undef QSP_GEN_VOID
+
+static void
+qsp_cond_wait(QemuCond *cond, QemuMutex *mutex, const char *file, int line)
+{
+ QSPEntry *e;
+ int64_t t0, t1;
+
+ t0 = get_clock();
+ qemu_cond_wait_impl(cond, mutex, file, line);
+ t1 = get_clock();
+
+ e = qsp_entry_get(cond, file, line, QSP_CONDVAR);
+ qsp_entry_record(e, t1 - t0);
+}
+
+static bool
+qsp_cond_timedwait(QemuCond *cond, QemuMutex *mutex, int ms,
+ const char *file, int line)
+{
+ QSPEntry *e;
+ int64_t t0, t1;
+ bool ret;
+
+ t0 = get_clock();
+ ret = qemu_cond_timedwait_impl(cond, mutex, ms, file, line);
+ t1 = get_clock();
+
+ e = qsp_entry_get(cond, file, line, QSP_CONDVAR);
+ qsp_entry_record(e, t1 - t0);
+ return ret;
+}
+
+bool qsp_is_enabled(void)
+{
+ return qatomic_read(&qemu_mutex_lock_func) == qsp_mutex_lock;
+}
+
+void qsp_enable(void)
+{
+ qatomic_set(&qemu_mutex_lock_func, qsp_mutex_lock);
+ qatomic_set(&qemu_mutex_trylock_func, qsp_mutex_trylock);
+ qatomic_set(&qemu_bql_mutex_lock_func, qsp_bql_mutex_lock);
+ qatomic_set(&qemu_rec_mutex_lock_func, qsp_rec_mutex_lock);
+ qatomic_set(&qemu_rec_mutex_trylock_func, qsp_rec_mutex_trylock);
+ qatomic_set(&qemu_cond_wait_func, qsp_cond_wait);
+ qatomic_set(&qemu_cond_timedwait_func, qsp_cond_timedwait);
+}
+
+void qsp_disable(void)
+{
+ qatomic_set(&qemu_mutex_lock_func, qemu_mutex_lock_impl);
+ qatomic_set(&qemu_mutex_trylock_func, qemu_mutex_trylock_impl);
+ qatomic_set(&qemu_bql_mutex_lock_func, qemu_mutex_lock_impl);
+ qatomic_set(&qemu_rec_mutex_lock_func, qemu_rec_mutex_lock_impl);
+ qatomic_set(&qemu_rec_mutex_trylock_func, qemu_rec_mutex_trylock_impl);
+ qatomic_set(&qemu_cond_wait_func, qemu_cond_wait_impl);
+ qatomic_set(&qemu_cond_timedwait_func, qemu_cond_timedwait_impl);
+}
+
+static gint qsp_tree_cmp(gconstpointer ap, gconstpointer bp, gpointer up)
+{
+ const QSPEntry *a = ap;
+ const QSPEntry *b = bp;
+ enum QSPSortBy sort_by = *(enum QSPSortBy *)up;
+ const QSPCallSite *ca;
+ const QSPCallSite *cb;
+
+ switch (sort_by) {
+ case QSP_SORT_BY_TOTAL_WAIT_TIME:
+ if (a->ns > b->ns) {
+ return -1;
+ } else if (a->ns < b->ns) {
+ return 1;
+ }
+ break;
+ case QSP_SORT_BY_AVG_WAIT_TIME:
+ {
+ double avg_a = a->n_acqs ? a->ns / a->n_acqs : 0;
+ double avg_b = b->n_acqs ? b->ns / b->n_acqs : 0;
+
+ if (avg_a > avg_b) {
+ return -1;
+ } else if (avg_a < avg_b) {
+ return 1;
+ }
+ break;
+ }
+ default:
+ g_assert_not_reached();
+ }
+
+ ca = a->callsite;
+ cb = b->callsite;
+ /* Break the tie with the object's address */
+ if (ca->obj < cb->obj) {
+ return -1;
+ } else if (ca->obj > cb->obj) {
+ return 1;
+ } else {
+ int cmp;
+
+ /* same obj. Break the tie with the callsite's file */
+ cmp = strcmp(ca->file, cb->file);
+ if (cmp) {
+ return cmp;
+ }
+ /* same callsite file. Break the tie with the callsite's line */
+ g_assert(ca->line != cb->line);
+ if (ca->line < cb->line) {
+ return -1;
+ } else if (ca->line > cb->line) {
+ return 1;
+ } else {
+ /* break the tie with the callsite's type */
+ return cb->type - ca->type;
+ }
+ }
+}
+
+static void qsp_sort(void *p, uint32_t h, void *userp)
+{
+ QSPEntry *e = p;
+ GTree *tree = userp;
+
+ g_tree_insert(tree, e, NULL);
+}
+
+static void qsp_aggregate(void *p, uint32_t h, void *up)
+{
+ struct qht *ht = up;
+ const QSPEntry *e = p;
+ QSPEntry *agg;
+ uint32_t hash;
+
+ hash = qsp_entry_no_thread_hash(e);
+ agg = qsp_entry_find(ht, e, hash);
+ /*
+ * The entry is in the global hash table; read from it atomically (as in
+ * "read once").
+ */
+ agg->ns += qatomic_read_u64(&e->ns);
+ agg->n_acqs += qatomic_read_u64(&e->n_acqs);
+}
+
+static void qsp_iter_diff(void *p, uint32_t hash, void *htp)
+{
+ struct qht *ht = htp;
+ QSPEntry *old = p;
+ QSPEntry *new;
+
+ new = qht_lookup(ht, old, hash);
+ /* entries are never deleted, so we must have this one */
+ g_assert(new != NULL);
+ /* our reading of the stats happened after the snapshot was taken */
+ g_assert(new->n_acqs >= old->n_acqs);
+ g_assert(new->ns >= old->ns);
+
+ new->n_acqs -= old->n_acqs;
+ new->ns -= old->ns;
+
+ /* No point in reporting an empty entry */
+ if (new->n_acqs == 0 && new->ns == 0) {
+ bool removed = qht_remove(ht, new, hash);
+
+ g_assert(removed);
+ g_free(new);
+ }
+}
+
+static void qsp_diff(struct qht *orig, struct qht *new)
+{
+ qht_iter(orig, qsp_iter_diff, new);
+}
+
+static void qsp_iter_callsite_coalesce(void *p, uint32_t h, void *htp)
+{
+ struct qht *ht = htp;
+ QSPEntry *old = p;
+ QSPEntry *e;
+ uint32_t hash;
+
+ hash = qsp_entry_no_thread_obj_hash(old);
+ e = qht_lookup(ht, old, hash);
+ if (e == NULL) {
+ e = qsp_entry_create(ht, old, hash);
+ e->n_objs = 1;
+ } else if (e->callsite->obj != old->callsite->obj) {
+ e->n_objs++;
+ }
+ e->ns += old->ns;
+ e->n_acqs += old->n_acqs;
+}
+
+static void qsp_ht_delete(void *p, uint32_t h, void *htp)
+{
+ g_free(p);
+}
+
+static void qsp_mktree(GTree *tree, bool callsite_coalesce)
+{
+ struct qht ht, coalesce_ht;
+ struct qht *htp;
+
+ /*
+ * First, see if there's a prior snapshot, so that we read the global hash
+ * table _after_ the snapshot has been created, which guarantees that
+ * the entries we'll read will be a superset of the snapshot's entries.
+ *
+ * We must remain in an RCU read-side critical section until we're done
+ * with the snapshot.
+ */
+ WITH_RCU_READ_LOCK_GUARD() {
+ QSPSnapshot *snap = qatomic_rcu_read(&qsp_snapshot);
+
+ /* Aggregate all results from the global hash table into a local one */
+ qht_init(&ht, qsp_entry_no_thread_cmp, QSP_INITIAL_SIZE,
+ QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
+ qht_iter(&qsp_ht, qsp_aggregate, &ht);
+
+ /* compute the difference wrt the snapshot, if any */
+ if (snap) {
+ qsp_diff(&snap->ht, &ht);
+ }
+ }
+
+ htp = &ht;
+ if (callsite_coalesce) {
+ qht_init(&coalesce_ht, qsp_entry_no_thread_obj_cmp, QSP_INITIAL_SIZE,
+ QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
+ qht_iter(&ht, qsp_iter_callsite_coalesce, &coalesce_ht);
+
+ /* free the previous hash table, and point htp to coalesce_ht */
+ qht_iter(&ht, qsp_ht_delete, NULL);
+ qht_destroy(&ht);
+ htp = &coalesce_ht;
+ }
+
+ /* sort the hash table elements by using a tree */
+ qht_iter(htp, qsp_sort, tree);
+
+ /* free the hash table, but keep the elements (those are in the tree now) */
+ qht_destroy(htp);
+}
+
+/* the returned string must be freed with g_free() */
+static char *qsp_at(const QSPCallSite *callsite)
+{
+ GString *s = g_string_new(NULL);
+ const char *shortened;
+
+ /* remove the absolute path to qemu */
+ if (unlikely(strlen(callsite->file) < qsp_qemu_path_len)) {
+ shortened = callsite->file;
+ } else {
+ shortened = callsite->file + qsp_qemu_path_len;
+ }
+ g_string_append_printf(s, "%s:%u", shortened, callsite->line);
+ return g_string_free(s, FALSE);
+}
+
+struct QSPReportEntry {
+ const void *obj;
+ char *callsite_at;
+ const char *typename;
+ double time_s;
+ double ns_avg;
+ uint64_t n_acqs;
+ unsigned int n_objs;
+};
+typedef struct QSPReportEntry QSPReportEntry;
+
+struct QSPReport {
+ QSPReportEntry *entries;
+ size_t n_entries;
+ size_t max_n_entries;
+};
+typedef struct QSPReport QSPReport;
+
+static gboolean qsp_tree_report(gpointer key, gpointer value, gpointer udata)
+{
+ const QSPEntry *e = key;
+ QSPReport *report = udata;
+ QSPReportEntry *entry;
+
+ if (report->n_entries == report->max_n_entries) {
+ return TRUE;
+ }
+ entry = &report->entries[report->n_entries];
+ report->n_entries++;
+
+ entry->obj = e->callsite->obj;
+ entry->n_objs = e->n_objs;
+ entry->callsite_at = qsp_at(e->callsite);
+ entry->typename = qsp_typenames[e->callsite->type];
+ entry->time_s = e->ns * 1e-9;
+ entry->n_acqs = e->n_acqs;
+ entry->ns_avg = e->n_acqs ? e->ns / e->n_acqs : 0;
+ return FALSE;
+}
+
+static void pr_report(const QSPReport *rep)
+{
+ char *dashes;
+ size_t max_len = 0;
+ int callsite_len = 0;
+ int callsite_rspace;
+ int n_dashes;
+ size_t i;
+
+ /* find out the maximum length of all 'callsite' fields */
+ for (i = 0; i < rep->n_entries; i++) {
+ const QSPReportEntry *e = &rep->entries[i];
+ size_t len = strlen(e->callsite_at);
+
+ if (len > max_len) {
+ max_len = len;
+ }
+ }
+
+ callsite_len = MAX(max_len, strlen("Call site"));
+ /* white space to leave to the right of "Call site" */
+ callsite_rspace = callsite_len - strlen("Call site");
+
+ qemu_printf("Type Object Call site%*s Wait Time (s) "
+ " Count Average (us)\n", callsite_rspace, "");
+
+ /* build a horizontal rule with dashes */
+ n_dashes = 79 + callsite_rspace;
+ dashes = g_malloc(n_dashes + 1);
+ memset(dashes, '-', n_dashes);
+ dashes[n_dashes] = '\0';
+ qemu_printf("%s\n", dashes);
+
+ for (i = 0; i < rep->n_entries; i++) {
+ const QSPReportEntry *e = &rep->entries[i];
+ GString *s = g_string_new(NULL);
+
+ g_string_append_printf(s, "%-9s ", e->typename);
+ if (e->n_objs > 1) {
+ g_string_append_printf(s, "[%12u]", e->n_objs);
+ } else {
+ g_string_append_printf(s, "%14p", e->obj);
+ }
+ g_string_append_printf(s, " %s%*s %13.5f %12" PRIu64 " %12.2f\n",
+ e->callsite_at,
+ callsite_len - (int)strlen(e->callsite_at), "",
+ e->time_s, e->n_acqs, e->ns_avg * 1e-3);
+ qemu_printf("%s", s->str);
+ g_string_free(s, TRUE);
+ }
+
+ qemu_printf("%s\n", dashes);
+ g_free(dashes);
+}
+
+static void report_destroy(QSPReport *rep)
+{
+ size_t i;
+
+ for (i = 0; i < rep->n_entries; i++) {
+ QSPReportEntry *e = &rep->entries[i];
+
+ g_free(e->callsite_at);
+ }
+ g_free(rep->entries);
+}
+
+void qsp_report(size_t max, enum QSPSortBy sort_by,
+ bool callsite_coalesce)
+{
+ GTree *tree = g_tree_new_full(qsp_tree_cmp, &sort_by, g_free, NULL);
+ QSPReport rep;
+
+ qsp_init();
+
+ rep.entries = g_new0(QSPReportEntry, max);
+ rep.n_entries = 0;
+ rep.max_n_entries = max;
+
+ qsp_mktree(tree, callsite_coalesce);
+ g_tree_foreach(tree, qsp_tree_report, &rep);
+ g_tree_destroy(tree);
+
+ pr_report(&rep);
+ report_destroy(&rep);
+}
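+
+/*
+ * Note (informal): the report is computed relative to the last snapshot
+ * taken by qsp_reset(), so "qsp_reset(); <workload>; qsp_report(...)"
+ * profiles only the contention incurred by that workload.
+ */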
+
+static void qsp_snapshot_destroy(QSPSnapshot *snap)
+{
+ qht_iter(&snap->ht, qsp_ht_delete, NULL);
+ qht_destroy(&snap->ht);
+ g_free(snap);
+}
+
+void qsp_reset(void)
+{
+ QSPSnapshot *new = g_new(QSPSnapshot, 1);
+ QSPSnapshot *old;
+
+ qsp_init();
+
+ qht_init(&new->ht, qsp_entry_cmp, QSP_INITIAL_SIZE,
+ QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES);
+
+ /* take a snapshot of the current state */
+ qht_iter(&qsp_ht, qsp_aggregate, &new->ht);
+
+ /* replace the previous snapshot, if any */
+ old = qatomic_xchg(&qsp_snapshot, new);
+ if (old) {
+ call_rcu(old, qsp_snapshot_destroy, rcu);
+ }
+}
diff --git a/util/range.c b/util/range.c
new file mode 100644
index 000000000..098d9d2dc
--- /dev/null
+++ b/util/range.c
@@ -0,0 +1,72 @@
+/*
+ * QEMU 64-bit address ranges
+ *
+ * Copyright (c) 2015-2016 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/range.h"
+
+/*
+ * Return -1 if @a < @b, 1 if @a > @b, and 0 if they touch or overlap.
+ * Both @a and @b must not be empty.
+ */
+static inline int range_compare(Range *a, Range *b)
+{
+ assert(!range_is_empty(a) && !range_is_empty(b));
+
+ /* Careful, avoid wraparound */
+ if (b->lob && b->lob - 1 > a->upb) {
+ return -1;
+ }
+ if (a->lob && a->lob - 1 > b->upb) {
+ return 1;
+ }
+ return 0;
+}
+
+/* Insert @data into @list of ranges; caller no longer owns @data */
+GList *range_list_insert(GList *list, Range *data)
+{
+ GList *l;
+
+ assert(!range_is_empty(data));
+
+ /* Skip all list elements strictly less than data */
+ for (l = list; l && range_compare(l->data, data) < 0; l = l->next) {
+ }
+
+ if (!l || range_compare(l->data, data) > 0) {
+ /* Rest of the list (if any) is strictly greater than @data */
+ return g_list_insert_before(list, l, data);
+ }
+
+ /* Current list element overlaps @data, merge the two */
+ range_extend(l->data, data);
+ g_free(data);
+
+ /* Merge any subsequent list elements that now also overlap */
+ while (l->next && range_compare(l->data, l->next->data) == 0) {
+ GList *new_l;
+
+ range_extend(l->data, l->next->data);
+ g_free(l->next->data);
+ new_l = g_list_delete_link(list, l->next);
+ assert(new_l == list);
+ }
+
+ return list;
+}
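+
+/*
+ * Usage sketch (illustrative only; the list and the bounds are made
+ * up). The caller transfers ownership of the Range to the list:
+ *
+ *     GList *list = NULL;
+ *     Range *r = g_new0(Range, 1);
+ *
+ *     range_set_bounds(r, 0x1000, 0x1fff);
+ *     list = range_list_insert(list, r);
+ */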
diff --git a/util/rcu.c b/util/rcu.c
new file mode 100644
index 000000000..c91da9f13
--- /dev/null
+++ b/util/rcu.c
@@ -0,0 +1,455 @@
+/*
+ * urcu-mb.c
+ *
+ * Userspace RCU library with explicit memory barriers
+ *
+ * Copyright (c) 2009 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * Copyright (c) 2009 Paul E. McKenney, IBM Corporation.
+ * Copyright 2015 Red Hat, Inc.
+ *
+ * Ported to QEMU by Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * IBM's contributions to this file may be relicensed under LGPLv2 or later.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/rcu.h"
+#include "qemu/atomic.h"
+#include "qemu/thread.h"
+#include "qemu/main-loop.h"
+#include "qemu/lockable.h"
+#if defined(CONFIG_MALLOC_TRIM)
+#include <malloc.h>
+#endif
+
+/*
+ * Global grace period counter. Bit 0 is always one in rcu_gp_ctr.
+ * Bits 1 and above are defined in synchronize_rcu.
+ */
+#define RCU_GP_LOCKED (1UL << 0)
+#define RCU_GP_CTR (1UL << 1)
+
+unsigned long rcu_gp_ctr = RCU_GP_LOCKED;
+
+QemuEvent rcu_gp_event;
+static int in_drain_call_rcu;
+static QemuMutex rcu_registry_lock;
+static QemuMutex rcu_sync_lock;
+
+/*
+ * Check whether a quiescent state was crossed between the beginning of
+ * update_counter_and_wait and now.
+ */
+static inline int rcu_gp_ongoing(unsigned long *ctr)
+{
+ unsigned long v;
+
+ v = qatomic_read(ctr);
+ return v && (v != rcu_gp_ctr);
+}
+
+/* Written to only by each individual reader. Read by both the reader and the
+ * writers.
+ */
+__thread struct rcu_reader_data rcu_reader;
+
+/* Protected by rcu_registry_lock. */
+typedef QLIST_HEAD(, rcu_reader_data) ThreadList;
+static ThreadList registry = QLIST_HEAD_INITIALIZER(registry);
+
+/* Wait for previous parity/grace period to be empty of readers. */
+static void wait_for_readers(void)
+{
+ ThreadList qsreaders = QLIST_HEAD_INITIALIZER(qsreaders);
+ struct rcu_reader_data *index, *tmp;
+
+ for (;;) {
+ /* We want to be notified of changes made to rcu_gp_ongoing
+ * while we walk the list.
+ */
+ qemu_event_reset(&rcu_gp_event);
+
+ /* Instead of using qatomic_mb_set for index->waiting, and
+ * qatomic_mb_read for index->ctr, memory barriers are placed
+ * manually since writes to different threads are independent.
+ * qemu_event_reset has acquire semantics, so no memory barrier
+ * is needed here.
+ */
+ QLIST_FOREACH(index, &registry, node) {
+ qatomic_set(&index->waiting, true);
+ }
+
+ /* Here, order the stores to index->waiting before the loads of
+ * index->ctr. Pairs with smp_mb_placeholder() in rcu_read_unlock(),
+ * ensuring that the loads of index->ctr are sequentially consistent.
+ */
+ smp_mb_global();
+
+ QLIST_FOREACH_SAFE(index, &registry, node, tmp) {
+ if (!rcu_gp_ongoing(&index->ctr)) {
+ QLIST_REMOVE(index, node);
+ QLIST_INSERT_HEAD(&qsreaders, index, node);
+
+ /* No need for mb_set here, worst of all we
+ * get some extra futex wakeups.
+ */
+ qatomic_set(&index->waiting, false);
+ } else if (qatomic_read(&in_drain_call_rcu)) {
+ notifier_list_notify(&index->force_rcu, NULL);
+ }
+ }
+
+ if (QLIST_EMPTY(&registry)) {
+ break;
+ }
+
+ /* Wait for one thread to report a quiescent state and try again.
+ * Release rcu_registry_lock, so rcu_(un)register_thread() doesn't
+ * wait too much time.
+ *
+ * rcu_register_thread() may add nodes to &registry; it will not
+ * wake up synchronize_rcu, but that is okay because at least another
+ * thread must exit its RCU read-side critical section before
+ * synchronize_rcu is done. The next iteration of the loop will
+ * move the new thread's rcu_reader from &registry to &qsreaders,
+ * because rcu_gp_ongoing() will return false.
+ *
+ * rcu_unregister_thread() may remove nodes from &qsreaders instead
+ * of &registry if it runs during qemu_event_wait. That's okay;
+ * the node then will not be added back to &registry by QLIST_SWAP
+ * below. The invariant is that the node is part of one list when
+ * rcu_registry_lock is released.
+ */
+ qemu_mutex_unlock(&rcu_registry_lock);
+ qemu_event_wait(&rcu_gp_event);
+ qemu_mutex_lock(&rcu_registry_lock);
+ }
+
+ /* put back the reader list in the registry */
+ QLIST_SWAP(&registry, &qsreaders, node);
+}
+
+void synchronize_rcu(void)
+{
+ QEMU_LOCK_GUARD(&rcu_sync_lock);
+
+ /* Write RCU-protected pointers before reading p_rcu_reader->ctr.
+ * Pairs with smp_mb_placeholder() in rcu_read_lock().
+ */
+ smp_mb_global();
+
+ QEMU_LOCK_GUARD(&rcu_registry_lock);
+ if (!QLIST_EMPTY(&registry)) {
+ /* In either case, the qatomic_mb_set below blocks stores that free
+ * old RCU-protected pointers.
+ */
+ if (sizeof(rcu_gp_ctr) < 8) {
+ /* For architectures with 32-bit longs, a two-subphase algorithm
+ * ensures we do not encounter overflow bugs.
+ *
+ * Switch parity: 0 -> 1, 1 -> 0.
+ */
+ qatomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR);
+ wait_for_readers();
+ qatomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR);
+ } else {
+ /* Increment current grace period. */
+ qatomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr + RCU_GP_CTR);
+ }
+
+ wait_for_readers();
+ }
+}
+
+
+#define RCU_CALL_MIN_SIZE 30
+
+/* Multi-producer, single-consumer queue based on urcu/static/wfqueue.h
+ * from liburcu. Note that head is only used by the consumer.
+ */
+static struct rcu_head dummy;
+static struct rcu_head *head = &dummy, **tail = &dummy.next;
+static int rcu_call_count;
+static QemuEvent rcu_call_ready_event;
+
+static void enqueue(struct rcu_head *node)
+{
+ struct rcu_head **old_tail;
+
+ node->next = NULL;
+ old_tail = qatomic_xchg(&tail, &node->next);
+ qatomic_mb_set(old_tail, node);
+}
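+
+/*
+ * Note (informal): the qatomic_xchg() above serializes concurrent
+ * producers on the tail, and the qatomic_mb_set() then publishes the
+ * node. In the window between the two, the consumer may observe a NULL
+ * ->next pointer and must wait, which is what try_dequeue() checks for.
+ */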
+
+static struct rcu_head *try_dequeue(void)
+{
+ struct rcu_head *node, *next;
+
+retry:
+ /* Test for an empty list, which we do not expect. Note that for
+ * the consumer, head and tail are always consistent. The head
+ * is consistent because only the consumer reads/writes it; the
+ * tail, because updating it is the first step of the enqueue.
+ * It is only the next pointers that might be inconsistent.
+ */
+ if (head == &dummy && qatomic_mb_read(&tail) == &dummy.next) {
+ abort();
+ }
+
+ /* If the head node has NULL in its next pointer, the value is
+ * wrong and we need to wait until its enqueuer finishes the update.
+ */
+ node = head;
+ next = qatomic_mb_read(&head->next);
+ if (!next) {
+ return NULL;
+ }
+
+ /* Since we are the sole consumer, and we excluded the empty case
+ * above, the queue will always have at least two nodes: the
+ * dummy node, and the one being removed. So we do not need to update
+ * the tail pointer.
+ */
+ head = next;
+
+ /* If we dequeued the dummy node, add it back at the end and retry. */
+ if (node == &dummy) {
+ enqueue(node);
+ goto retry;
+ }
+
+ return node;
+}
+
+static void *call_rcu_thread(void *opaque)
+{
+ struct rcu_head *node;
+
+ rcu_register_thread();
+
+ for (;;) {
+ int tries = 0;
+ int n = qatomic_read(&rcu_call_count);
+
+ /* Heuristically wait for a decent number of callbacks to pile up.
+ * Fetch rcu_call_count now; we must only process elements that were
+ * added before synchronize_rcu() starts.
+ */
+ while (n == 0 || (n < RCU_CALL_MIN_SIZE && ++tries <= 5)) {
+ g_usleep(10000);
+ if (n == 0) {
+ qemu_event_reset(&rcu_call_ready_event);
+ n = qatomic_read(&rcu_call_count);
+ if (n == 0) {
+#if defined(CONFIG_MALLOC_TRIM)
+ malloc_trim(4 * 1024 * 1024);
+#endif
+ qemu_event_wait(&rcu_call_ready_event);
+ }
+ }
+ n = qatomic_read(&rcu_call_count);
+ }
+
+ qatomic_sub(&rcu_call_count, n);
+ synchronize_rcu();
+ qemu_mutex_lock_iothread();
+ while (n > 0) {
+ node = try_dequeue();
+ while (!node) {
+ qemu_mutex_unlock_iothread();
+ qemu_event_reset(&rcu_call_ready_event);
+ node = try_dequeue();
+ if (!node) {
+ qemu_event_wait(&rcu_call_ready_event);
+ node = try_dequeue();
+ }
+ qemu_mutex_lock_iothread();
+ }
+
+ n--;
+ node->func(node);
+ }
+ qemu_mutex_unlock_iothread();
+ }
+ abort();
+}
+
+void call_rcu1(struct rcu_head *node, void (*func)(struct rcu_head *node))
+{
+ node->func = func;
+ enqueue(node);
+ qatomic_inc(&rcu_call_count);
+ qemu_event_set(&rcu_call_ready_event);
+}
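+
+/*
+ * Usage sketch (illustrative only; struct Foo and foo_free are
+ * hypothetical). Reclaimers typically embed a struct rcu_head in the
+ * object and free the whole object from the callback:
+ *
+ *     struct Foo { struct rcu_head rcu; int val; };
+ *
+ *     static void foo_free(struct rcu_head *head)
+ *     {
+ *         g_free(container_of(head, struct Foo, rcu));
+ *     }
+ *
+ *     call_rcu1(&foo->rcu, foo_free);
+ */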
+
+
+struct rcu_drain {
+ struct rcu_head rcu;
+ QemuEvent drain_complete_event;
+};
+
+static void drain_rcu_callback(struct rcu_head *node)
+{
+ struct rcu_drain *event = (struct rcu_drain *)node;
+ qemu_event_set(&event->drain_complete_event);
+}
+
+/*
+ * This function ensures that all pending RCU callbacks
+ * on the current thread are done executing.
+ *
+ * It drops the big QEMU lock during the wait to allow the RCU
+ * thread to process the callbacks.
+ */
+
+void drain_call_rcu(void)
+{
+ struct rcu_drain rcu_drain;
+ bool locked = qemu_mutex_iothread_locked();
+
+ memset(&rcu_drain, 0, sizeof(struct rcu_drain));
+ qemu_event_init(&rcu_drain.drain_complete_event, false);
+
+ if (locked) {
+ qemu_mutex_unlock_iothread();
+ }
+
+ /*
+ * RCU callbacks are invoked in the same order as in which they
+ * are registered, thus we can be sure that when 'drain_rcu_callback'
+ * is called, all RCU callbacks that were registered on this thread
+ * prior to calling this function are completed.
+ *
+ * Note that since we have only one global queue of the RCU callbacks,
+ * we also end up waiting for most of the RCU callbacks that were
+ * registered on the other threads, but this is a side effect that
+ * shouldn't be relied upon.
+ */
+
+ qatomic_inc(&in_drain_call_rcu);
+ call_rcu1(&rcu_drain.rcu, drain_rcu_callback);
+ qemu_event_wait(&rcu_drain.drain_complete_event);
+ qatomic_dec(&in_drain_call_rcu);
+
+ if (locked) {
+ qemu_mutex_lock_iothread();
+ }
+}
+
+void rcu_register_thread(void)
+{
+ assert(rcu_reader.ctr == 0);
+ qemu_mutex_lock(&rcu_registry_lock);
+ QLIST_INSERT_HEAD(&registry, &rcu_reader, node);
+ qemu_mutex_unlock(&rcu_registry_lock);
+}
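+
+/*
+ * Reader-side usage sketch (illustrative only; global_foo is a
+ * hypothetical RCU-protected pointer). A registered thread brackets
+ * its accesses with a read-side critical section:
+ *
+ *     rcu_register_thread();
+ *     WITH_RCU_READ_LOCK_GUARD() {
+ *         Foo *p = qatomic_rcu_read(&global_foo);
+ *         use(p); // p stays valid until the end of the critical section
+ *     }
+ */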
+
+void rcu_unregister_thread(void)
+{
+ qemu_mutex_lock(&rcu_registry_lock);
+ QLIST_REMOVE(&rcu_reader, node);
+ qemu_mutex_unlock(&rcu_registry_lock);
+}
+
+void rcu_add_force_rcu_notifier(Notifier *n)
+{
+ qemu_mutex_lock(&rcu_registry_lock);
+ notifier_list_add(&rcu_reader.force_rcu, n);
+ qemu_mutex_unlock(&rcu_registry_lock);
+}
+
+void rcu_remove_force_rcu_notifier(Notifier *n)
+{
+ qemu_mutex_lock(&rcu_registry_lock);
+ notifier_remove(n);
+ qemu_mutex_unlock(&rcu_registry_lock);
+}
+
+static void rcu_init_complete(void)
+{
+ QemuThread thread;
+
+ qemu_mutex_init(&rcu_registry_lock);
+ qemu_mutex_init(&rcu_sync_lock);
+ qemu_event_init(&rcu_gp_event, true);
+
+ qemu_event_init(&rcu_call_ready_event, false);
+
+ /* The caller is assumed to have iothread lock, so the call_rcu thread
+ * must have been quiescent even after forking, just recreate it.
+ */
+ qemu_thread_create(&thread, "call_rcu", call_rcu_thread,
+ NULL, QEMU_THREAD_DETACHED);
+
+ rcu_register_thread();
+}
+
+static int atfork_depth = 1;
+
+void rcu_enable_atfork(void)
+{
+ atfork_depth++;
+}
+
+void rcu_disable_atfork(void)
+{
+ atfork_depth--;
+}
+
+#ifdef CONFIG_POSIX
+static void rcu_init_lock(void)
+{
+ if (atfork_depth < 1) {
+ return;
+ }
+
+ qemu_mutex_lock(&rcu_sync_lock);
+ qemu_mutex_lock(&rcu_registry_lock);
+}
+
+static void rcu_init_unlock(void)
+{
+ if (atfork_depth < 1) {
+ return;
+ }
+
+ qemu_mutex_unlock(&rcu_registry_lock);
+ qemu_mutex_unlock(&rcu_sync_lock);
+}
+
+static void rcu_init_child(void)
+{
+ if (atfork_depth < 1) {
+ return;
+ }
+
+ memset(&registry, 0, sizeof(registry));
+ rcu_init_complete();
+}
+#endif
+
+static void __attribute__((__constructor__)) rcu_init(void)
+{
+ smp_mb_global_init();
+#ifdef CONFIG_POSIX
+ pthread_atfork(rcu_init_lock, rcu_init_unlock, rcu_init_child);
+#endif
+ rcu_init_complete();
+}
diff --git a/util/readline.c b/util/readline.c
new file mode 100644
index 000000000..f1ac6e476
--- /dev/null
+++ b/util/readline.c
@@ -0,0 +1,549 @@
+/*
+ * QEMU readline utility
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/readline.h"
+#include "qemu/ctype.h"
+#include "qemu/cutils.h"
+
+#define IS_NORM 0
+#define IS_ESC 1
+#define IS_CSI 2
+#define IS_SS3 3
+
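+/*
+ * Input is handled as a small state machine: IS_NORM processes plain
+ * bytes and control characters; ESC switches to IS_CSI for "\033["
+ * sequences or to IS_SS3 for "\033O" sequences (cursor and Home/End
+ * keys).
+ */
+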
+void readline_show_prompt(ReadLineState *rs)
+{
+ rs->printf_func(rs->opaque, "%s", rs->prompt);
+ rs->flush_func(rs->opaque);
+ rs->last_cmd_buf_index = 0;
+ rs->last_cmd_buf_size = 0;
+ rs->esc_state = IS_NORM;
+}
+
+/* update the displayed command line */
+static void readline_update(ReadLineState *rs)
+{
+ int i, delta, len;
+
+ if (rs->cmd_buf_size != rs->last_cmd_buf_size ||
+ memcmp(rs->cmd_buf, rs->last_cmd_buf, rs->cmd_buf_size) != 0) {
+ for (i = 0; i < rs->last_cmd_buf_index; i++) {
+ rs->printf_func(rs->opaque, "\033[D");
+ }
+ rs->cmd_buf[rs->cmd_buf_size] = '\0';
+ if (rs->read_password) {
+ len = strlen(rs->cmd_buf);
+ for (i = 0; i < len; i++) {
+ rs->printf_func(rs->opaque, "*");
+ }
+ } else {
+ rs->printf_func(rs->opaque, "%s", rs->cmd_buf);
+ }
+ rs->printf_func(rs->opaque, "\033[K");
+ memcpy(rs->last_cmd_buf, rs->cmd_buf, rs->cmd_buf_size);
+ rs->last_cmd_buf_size = rs->cmd_buf_size;
+ rs->last_cmd_buf_index = rs->cmd_buf_size;
+ }
+ if (rs->cmd_buf_index != rs->last_cmd_buf_index) {
+ delta = rs->cmd_buf_index - rs->last_cmd_buf_index;
+ if (delta > 0) {
+ for (i = 0; i < delta; i++) {
+ rs->printf_func(rs->opaque, "\033[C");
+ }
+ } else {
+ delta = -delta;
+ for (i = 0; i < delta; i++) {
+ rs->printf_func(rs->opaque, "\033[D");
+ }
+ }
+ rs->last_cmd_buf_index = rs->cmd_buf_index;
+ }
+ rs->flush_func(rs->opaque);
+}
+
+static void readline_insert_char(ReadLineState *rs, int ch)
+{
+ if (rs->cmd_buf_index < READLINE_CMD_BUF_SIZE) {
+ memmove(rs->cmd_buf + rs->cmd_buf_index + 1,
+ rs->cmd_buf + rs->cmd_buf_index,
+ rs->cmd_buf_size - rs->cmd_buf_index);
+ rs->cmd_buf[rs->cmd_buf_index] = ch;
+ rs->cmd_buf_size++;
+ rs->cmd_buf_index++;
+ }
+}
+
+static void readline_backward_char(ReadLineState *rs)
+{
+ if (rs->cmd_buf_index > 0) {
+ rs->cmd_buf_index--;
+ }
+}
+
+static void readline_forward_char(ReadLineState *rs)
+{
+ if (rs->cmd_buf_index < rs->cmd_buf_size) {
+ rs->cmd_buf_index++;
+ }
+}
+
+static void readline_delete_char(ReadLineState *rs)
+{
+ if (rs->cmd_buf_index < rs->cmd_buf_size) {
+ memmove(rs->cmd_buf + rs->cmd_buf_index,
+ rs->cmd_buf + rs->cmd_buf_index + 1,
+ rs->cmd_buf_size - rs->cmd_buf_index - 1);
+ rs->cmd_buf_size--;
+ }
+}
+
+static void readline_backspace(ReadLineState *rs)
+{
+ if (rs->cmd_buf_index > 0) {
+ readline_backward_char(rs);
+ readline_delete_char(rs);
+ }
+}
+
+static void readline_backword(ReadLineState *rs)
+{
+ int start;
+
+ if (rs->cmd_buf_index == 0 || rs->cmd_buf_index > rs->cmd_buf_size) {
+ return;
+ }
+
+ start = rs->cmd_buf_index - 1;
+
+ /* find first word (backwards) */
+ while (start > 0) {
+ if (!qemu_isspace(rs->cmd_buf[start])) {
+ break;
+ }
+
+ --start;
+ }
+
+ /* find first space (backwards) */
+ while (start > 0) {
+ if (qemu_isspace(rs->cmd_buf[start])) {
+ ++start;
+ break;
+ }
+
+ --start;
+ }
+
+ /* remove word */
+ if (start < rs->cmd_buf_index) {
+ memmove(rs->cmd_buf + start,
+ rs->cmd_buf + rs->cmd_buf_index,
+ rs->cmd_buf_size - rs->cmd_buf_index);
+ rs->cmd_buf_size -= rs->cmd_buf_index - start;
+ rs->cmd_buf_index = start;
+ }
+}
+
+static void readline_bol(ReadLineState *rs)
+{
+ rs->cmd_buf_index = 0;
+}
+
+static void readline_eol(ReadLineState *rs)
+{
+ rs->cmd_buf_index = rs->cmd_buf_size;
+}
+
+static void readline_up_char(ReadLineState *rs)
+{
+ int idx;
+
+ if (rs->hist_entry == 0) {
+ return;
+ }
+ if (rs->hist_entry == -1) {
+ /* Find latest entry */
+ for (idx = 0; idx < READLINE_MAX_CMDS; idx++) {
+ if (rs->history[idx] == NULL) {
+ break;
+ }
+ }
+ rs->hist_entry = idx;
+ }
+ rs->hist_entry--;
+ if (rs->hist_entry >= 0) {
+ pstrcpy(rs->cmd_buf, sizeof(rs->cmd_buf),
+ rs->history[rs->hist_entry]);
+ rs->cmd_buf_index = rs->cmd_buf_size = strlen(rs->cmd_buf);
+ }
+}
+
+static void readline_down_char(ReadLineState *rs)
+{
+ if (rs->hist_entry == -1) {
+ return;
+ }
+ if (rs->hist_entry < READLINE_MAX_CMDS - 1 &&
+ rs->history[++rs->hist_entry] != NULL) {
+ pstrcpy(rs->cmd_buf, sizeof(rs->cmd_buf),
+ rs->history[rs->hist_entry]);
+ } else {
+ rs->cmd_buf[0] = 0;
+ rs->hist_entry = -1;
+ }
+ rs->cmd_buf_index = rs->cmd_buf_size = strlen(rs->cmd_buf);
+}
+
+static void readline_hist_add(ReadLineState *rs, const char *cmdline)
+{
+ char *hist_entry, *new_entry;
+ int idx;
+
+ if (cmdline[0] == '\0') {
+ return;
+ }
+ new_entry = NULL;
+ if (rs->hist_entry != -1) {
+ /* We were editing an existing history entry: replace it */
+ hist_entry = rs->history[rs->hist_entry];
+ idx = rs->hist_entry;
+ if (strcmp(hist_entry, cmdline) == 0) {
+ goto same_entry;
+ }
+ }
+ /* Search cmdline in history buffers */
+ for (idx = 0; idx < READLINE_MAX_CMDS; idx++) {
+ hist_entry = rs->history[idx];
+ if (hist_entry == NULL) {
+ break;
+ }
+ if (strcmp(hist_entry, cmdline) == 0) {
+ same_entry:
+ if (idx == READLINE_MAX_CMDS - 1) {
+ return;
+ }
+ new_entry = hist_entry;
+ /* Put this entry at the end of history */
+ memmove(&rs->history[idx], &rs->history[idx + 1],
+ (READLINE_MAX_CMDS - (idx + 1)) * sizeof(char *));
+ rs->history[READLINE_MAX_CMDS - 1] = NULL;
+ for (; idx < READLINE_MAX_CMDS; idx++) {
+ if (rs->history[idx] == NULL) {
+ break;
+ }
+ }
+ break;
+ }
+ }
+ if (idx == READLINE_MAX_CMDS) {
+ /* Need to get one free slot */
+ g_free(rs->history[0]);
+ memmove(rs->history, &rs->history[1],
+ (READLINE_MAX_CMDS - 1) * sizeof(char *));
+ rs->history[READLINE_MAX_CMDS - 1] = NULL;
+ idx = READLINE_MAX_CMDS - 1;
+ }
+ if (new_entry == NULL) {
+ new_entry = g_strdup(cmdline);
+ }
+ rs->history[idx] = new_entry;
+ rs->hist_entry = -1;
+}
+
+/* completion support */
+
+void readline_add_completion(ReadLineState *rs, const char *str)
+{
+ if (rs->nb_completions < READLINE_MAX_COMPLETIONS) {
+ int i;
+ for (i = 0; i < rs->nb_completions; i++) {
+ if (!strcmp(rs->completions[i], str)) {
+ return;
+ }
+ }
+ rs->completions[rs->nb_completions++] = g_strdup(str);
+ }
+}
+
+void readline_set_completion_index(ReadLineState *rs, int index)
+{
+ rs->completion_index = index;
+}
+
+static int completion_comp(const void *a, const void *b)
+{
+ return strcmp(*(const char **) a, *(const char **) b);
+}
+
+static void readline_completion(ReadLineState *rs)
+{
+ int len, i, j, max_width, nb_cols, max_prefix;
+ char *cmdline;
+
+ rs->nb_completions = 0;
+
+ cmdline = g_strndup(rs->cmd_buf, rs->cmd_buf_index);
+ rs->completion_finder(rs->opaque, cmdline);
+ g_free(cmdline);
+
+ /* no completion found */
+ if (rs->nb_completions <= 0) {
+ return;
+ }
+ if (rs->nb_completions == 1) {
+ len = strlen(rs->completions[0]);
+ for (i = rs->completion_index; i < len; i++) {
+ readline_insert_char(rs, rs->completions[0][i]);
+ }
+ /* extra space for next argument. XXX: make it more generic */
+ if (len > 0 && rs->completions[0][len - 1] != '/') {
+ readline_insert_char(rs, ' ');
+ }
+ } else {
+ qsort(rs->completions, rs->nb_completions, sizeof(char *),
+ completion_comp);
+ rs->printf_func(rs->opaque, "\n");
+ max_width = 0;
+ max_prefix = 0;
+ for (i = 0; i < rs->nb_completions; i++) {
+ len = strlen(rs->completions[i]);
+ if (i == 0) {
+ max_prefix = len;
+ } else {
+ if (len < max_prefix) {
+ max_prefix = len;
+ }
+ for (j = 0; j < max_prefix; j++) {
+ if (rs->completions[i][j] != rs->completions[0][j]) {
+ max_prefix = j;
+ }
+ }
+ }
+ if (len > max_width) {
+ max_width = len;
+ }
+ }
+ if (max_prefix > 0) {
+ for (i = rs->completion_index; i < max_prefix; i++) {
+ readline_insert_char(rs, rs->completions[0][i]);
+ }
+ }
+ max_width += 2;
+ if (max_width < 10) {
+ max_width = 10;
+ } else if (max_width > 80) {
+ max_width = 80;
+ }
+ nb_cols = 80 / max_width;
+ j = 0;
+ for (i = 0; i < rs->nb_completions; i++) {
+ rs->printf_func(rs->opaque, "%-*s", max_width, rs->completions[i]);
+ if (++j == nb_cols || i == (rs->nb_completions - 1)) {
+ rs->printf_func(rs->opaque, "\n");
+ j = 0;
+ }
+ }
+ readline_show_prompt(rs);
+ }
+ for (i = 0; i < rs->nb_completions; i++) {
+ g_free(rs->completions[i]);
+ }
+}
+
+static void readline_clear_screen(ReadLineState *rs)
+{
+ rs->printf_func(rs->opaque, "\033[2J\033[1;1H");
+ readline_show_prompt(rs);
+}
+
+/* process one input byte and update the command line accordingly */
+void readline_handle_byte(ReadLineState *rs, int ch)
+{
+ switch (rs->esc_state) {
+ case IS_NORM:
+ switch (ch) {
+ case 1:
+ readline_bol(rs);
+ break;
+ case 4:
+ readline_delete_char(rs);
+ break;
+ case 5:
+ readline_eol(rs);
+ break;
+ case 9:
+ readline_completion(rs);
+ break;
+ case 12:
+ readline_clear_screen(rs);
+ break;
+ case 10:
+ case 13:
+ rs->cmd_buf[rs->cmd_buf_size] = '\0';
+ if (!rs->read_password) {
+ readline_hist_add(rs, rs->cmd_buf);
+ }
+ rs->printf_func(rs->opaque, "\n");
+ rs->cmd_buf_index = 0;
+ rs->cmd_buf_size = 0;
+ rs->last_cmd_buf_index = 0;
+ rs->last_cmd_buf_size = 0;
+ rs->readline_func(rs->opaque, rs->cmd_buf, rs->readline_opaque);
+ break;
+ case 23:
+ /* ^W */
+ readline_backword(rs);
+ break;
+ case 27:
+ rs->esc_state = IS_ESC;
+ break;
+ case 127:
+ case 8:
+ readline_backspace(rs);
+ break;
+ case 155:
+ rs->esc_state = IS_CSI;
+ break;
+ default:
+ if (ch >= 32) {
+ readline_insert_char(rs, ch);
+ }
+ break;
+ }
+ break;
+ case IS_ESC:
+ if (ch == '[') {
+ rs->esc_state = IS_CSI;
+ rs->esc_param = 0;
+ } else if (ch == 'O') {
+ rs->esc_state = IS_SS3;
+ rs->esc_param = 0;
+ } else {
+ rs->esc_state = IS_NORM;
+ }
+ break;
+ case IS_CSI:
+ switch (ch) {
+ case 'A':
+ case 'F':
+ readline_up_char(rs);
+ break;
+ case 'B':
+ case 'E':
+ readline_down_char(rs);
+ break;
+ case 'D':
+ readline_backward_char(rs);
+ break;
+ case 'C':
+ readline_forward_char(rs);
+ break;
+ case '0' ... '9':
+ rs->esc_param = rs->esc_param * 10 + (ch - '0');
+ goto the_end;
+ case '~':
+ switch (rs->esc_param) {
+ case 1:
+ readline_bol(rs);
+ break;
+ case 3:
+ readline_delete_char(rs);
+ break;
+ case 4:
+ readline_eol(rs);
+ break;
+ }
+ break;
+ default:
+ break;
+ }
+ rs->esc_state = IS_NORM;
+ the_end:
+ break;
+ case IS_SS3:
+ switch (ch) {
+ case 'F':
+ readline_eol(rs);
+ break;
+ case 'H':
+ readline_bol(rs);
+ break;
+ }
+ rs->esc_state = IS_NORM;
+ break;
+ }
+ readline_update(rs);
+}
+
+void readline_start(ReadLineState *rs, const char *prompt, int read_password,
+ ReadLineFunc *readline_func, void *opaque)
+{
+ pstrcpy(rs->prompt, sizeof(rs->prompt), prompt);
+ rs->readline_func = readline_func;
+ rs->readline_opaque = opaque;
+ rs->read_password = read_password;
+ readline_restart(rs);
+}
+
+void readline_restart(ReadLineState *rs)
+{
+ rs->cmd_buf_index = 0;
+ rs->cmd_buf_size = 0;
+}
+
+const char *readline_get_history(ReadLineState *rs, unsigned int index)
+{
+ if (index >= READLINE_MAX_CMDS) {
+ return NULL;
+ }
+ return rs->history[index];
+}
+
+void readline_free(ReadLineState *rs)
+{
+ int i;
+
+ if (!rs) {
+ return;
+ }
+ for (i = 0; i < READLINE_MAX_CMDS; i++) {
+ g_free(rs->history[i]);
+ }
+ g_free(rs);
+}
+
+ReadLineState *readline_init(ReadLinePrintfFunc *printf_func,
+ ReadLineFlushFunc *flush_func,
+ void *opaque,
+ ReadLineCompletionFunc *completion_finder)
+{
+ ReadLineState *rs = g_new0(ReadLineState, 1);
+
+ rs->hist_entry = -1;
+ rs->opaque = opaque;
+ rs->printf_func = printf_func;
+ rs->flush_func = flush_func;
+ rs->completion_finder = completion_finder;
+
+ return rs;
+}
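+
+/*
+ * Usage sketch (illustrative only; the monitor_* callbacks and the mon
+ * pointer are hypothetical):
+ *
+ *     ReadLineState *rs = readline_init(monitor_printf_cb,
+ *                                       monitor_flush_cb,
+ *                                       mon, monitor_find_completion);
+ *
+ *     readline_start(rs, "(qemu) ", 0, handle_command, mon);
+ *     readline_show_prompt(rs);
+ *     // then feed input bytes with readline_handle_byte(rs, ch)
+ */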
diff --git a/util/selfmap.c b/util/selfmap.c
new file mode 100644
index 000000000..2c14f019c
--- /dev/null
+++ b/util/selfmap.c
@@ -0,0 +1,83 @@
+/*
+ * Utility function to get QEMU's own process map
+ *
+ * Copyright (c) 2020 Linaro Ltd
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+#include "qemu/selfmap.h"
+
+GSList *read_self_maps(void)
+{
+ gchar *maps;
+ GSList *map_info = NULL;
+
+ if (g_file_get_contents("/proc/self/maps", &maps, NULL, NULL)) {
+ gchar **lines = g_strsplit(maps, "\n", 0);
+ int i, entries = g_strv_length(lines);
+
+ for (i = 0; i < entries; i++) {
+ gchar **fields = g_strsplit(lines[i], " ", 6);
+ if (g_strv_length(fields) > 4) {
+ MapInfo *e = g_new0(MapInfo, 1);
+ int errors = 0;
+ const char *end;
+
+ errors |= qemu_strtoul(fields[0], &end, 16, &e->start);
+ errors |= qemu_strtoul(end + 1, NULL, 16, &e->end);
+
+ e->is_read = fields[1][0] == 'r';
+ e->is_write = fields[1][1] == 'w';
+ e->is_exec = fields[1][2] == 'x';
+ e->is_priv = fields[1][3] == 'p';
+
+ errors |= qemu_strtoul(fields[2], NULL, 16, &e->offset);
+ e->dev = g_strdup(fields[3]);
+ errors |= qemu_strtou64(fields[4], NULL, 10, &e->inode);
+
+ if (!errors) {
+ /*
+ * The last field may have leading spaces which we
+ * need to strip.
+ */
+ if (g_strv_length(fields) == 6) {
+ e->path = g_strdup(g_strchug(fields[5]));
+ }
+ map_info = g_slist_prepend(map_info, e);
+ } else {
+ g_free(e->dev);
+ g_free(e);
+ }
+ }
+
+ g_strfreev(fields);
+ }
+ g_strfreev(lines);
+ g_free(maps);
+ }
+
+ /* ensure the map data is in the same order we collected it */
+ return g_slist_reverse(map_info);
+}
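+
+/*
+ * For reference, a typical /proc/self/maps line parsed above looks like
+ * this (fields: start-end perms offset dev inode path):
+ *
+ *     7f0123400000-7f0123420000 r-xp 00000000 fd:01 131090 /usr/lib/libz.so
+ */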
+
+/**
+ * free_self_maps:
+ * @info: a GSList of MapInfo structures
+ *
+ * Free a list of MapInfo structures.
+ */
+static void free_info(gpointer data)
+{
+ MapInfo *e = (MapInfo *) data;
+ g_free(e->dev);
+ g_free(e->path);
+ g_free(e);
+}
+
+void free_self_maps(GSList *info)
+{
+ g_slist_free_full(info, &free_info);
+}
diff --git a/util/stats64.c b/util/stats64.c
new file mode 100644
index 000000000..897613c94
--- /dev/null
+++ b/util/stats64.c
@@ -0,0 +1,137 @@
+/*
+ * Atomic operations on 64-bit quantities.
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/atomic.h"
+#include "qemu/stats64.h"
+#include "qemu/processor.h"
+
+#ifndef CONFIG_ATOMIC64
+static inline void stat64_rdlock(Stat64 *s)
+{
+ /* Keep out incoming writers to avoid them starving us. */
+ qatomic_add(&s->lock, 2);
+
+ /* If there is a concurrent writer, wait for it. */
+ while (qatomic_read(&s->lock) & 1) {
+ cpu_relax();
+ }
+}
+
+static inline void stat64_rdunlock(Stat64 *s)
+{
+ qatomic_sub(&s->lock, 2);
+}
+
+static inline bool stat64_wrtrylock(Stat64 *s)
+{
+ return qatomic_cmpxchg(&s->lock, 0, 1) == 0;
+}
+
+static inline void stat64_wrunlock(Stat64 *s)
+{
+ qatomic_dec(&s->lock);
+}
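+
+/*
+ * Note (informal): s->lock is a reader count with a writer flag in bit
+ * 0. Readers add 2 and spin while bit 0 is set; writers cmpxchg 0 -> 1,
+ * so they only succeed when the lock is entirely free. Writers never
+ * block: on contention the slow paths below return false and the
+ * callers (in stats64.h) retry.
+ */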
+
+uint64_t stat64_get(const Stat64 *s)
+{
+ uint32_t high, low;
+
+ stat64_rdlock((Stat64 *)s);
+
+ /* 64-bit writes always take the lock, so we can read in
+ * any order.
+ */
+ high = qatomic_read(&s->high);
+ low = qatomic_read(&s->low);
+ stat64_rdunlock((Stat64 *)s);
+
+ return ((uint64_t)high << 32) | low;
+}
+
+bool stat64_add32_carry(Stat64 *s, uint32_t low, uint32_t high)
+{
+ uint32_t old;
+
+ if (!stat64_wrtrylock(s)) {
+ cpu_relax();
+ return false;
+ }
+
+ /* 64-bit reads always take the lock, so they don't care about the
+ * order of our update. By updating s->low first, we can check
+ * whether we have to carry into s->high.
+ */
+ old = qatomic_fetch_add(&s->low, low);
+ high += (old + low) < old;
+ qatomic_add(&s->high, high);
+ stat64_wrunlock(s);
+ return true;
+}
+
+bool stat64_min_slow(Stat64 *s, uint64_t value)
+{
+ uint32_t high, low;
+ uint64_t orig;
+
+ if (!stat64_wrtrylock(s)) {
+ cpu_relax();
+ return false;
+ }
+
+ high = qatomic_read(&s->high);
+ low = qatomic_read(&s->low);
+
+ orig = ((uint64_t)high << 32) | low;
+ if (value < orig) {
+ /* We have to set low before high, just like stat64_min reads
+ * high before low. The value may become higher temporarily, but
+ * stat64_get does not notice (it takes the lock) and the only ill
+ * effect on stat64_min is that the slow path may be triggered
+ * unnecessarily.
+ */
+ qatomic_set(&s->low, (uint32_t)value);
+ smp_wmb();
+ qatomic_set(&s->high, value >> 32);
+ }
+ stat64_wrunlock(s);
+ return true;
+}
+
+bool stat64_max_slow(Stat64 *s, uint64_t value)
+{
+ uint32_t high, low;
+ uint64_t orig;
+
+ if (!stat64_wrtrylock(s)) {
+ cpu_relax();
+ return false;
+ }
+
+ high = qatomic_read(&s->high);
+ low = qatomic_read(&s->low);
+
+ orig = ((uint64_t)high << 32) | low;
+ if (value > orig) {
+ /* We have to set low before high, just like stat64_max reads
+ * high before low. The value may become lower temporarily, but
+ * stat64_get does not notice (it takes the lock) and the only ill
+ * effect on stat64_max is that the slow path may be triggered
+ * unnecessarily.
+ */
+ qatomic_set(&s->low, (uint32_t)value);
+ smp_wmb();
+ qatomic_set(&s->high, value >> 32);
+ }
+ stat64_wrunlock(s);
+ return true;
+}
+#endif
diff --git a/util/sys_membarrier.c b/util/sys_membarrier.c
new file mode 100644
index 000000000..1362c0c4c
--- /dev/null
+++ b/util/sys_membarrier.c
@@ -0,0 +1,50 @@
+/*
+ * Process-global memory barriers
+ *
+ * Copyright (c) 2018 Red Hat, Inc.
+ *
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/sys_membarrier.h"
+#include "qemu/error-report.h"
+
+#ifdef CONFIG_LINUX
+#include <linux/membarrier.h>
+#include <sys/syscall.h>
+
+static int
+membarrier(int cmd, int flags)
+{
+ return syscall(__NR_membarrier, cmd, flags);
+}
+#endif
+
+void smp_mb_global(void)
+{
+#if defined CONFIG_WIN32
+ FlushProcessWriteBuffers();
+#elif defined CONFIG_LINUX
+ membarrier(MEMBARRIER_CMD_SHARED, 0);
+#else
+#error --enable-membarrier is not supported on this operating system.
+#endif
+}
+
+void smp_mb_global_init(void)
+{
+#ifdef CONFIG_LINUX
+ int ret = membarrier(MEMBARRIER_CMD_QUERY, 0);
+ if (ret < 0) {
+ error_report("This QEMU binary requires the membarrier system call.");
+ error_report("Please upgrade your system to a newer version of Linux");
+ exit(1);
+ }
+ if (!(ret & MEMBARRIER_CMD_SHARED)) {
+ error_report("This QEMU binary requires MEMBARRIER_CMD_SHARED support.");
+ error_report("Please upgrade your system to a newer version of Linux");
+ exit(1);
+ }
+#endif
+}
diff --git a/util/systemd.c b/util/systemd.c
new file mode 100644
index 000000000..5bcac9b40
--- /dev/null
+++ b/util/systemd.c
@@ -0,0 +1,79 @@
+/*
+ * systemd socket activation support
+ *
+ * Copyright 2017 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Richard W.M. Jones <rjones@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/systemd.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+
+#ifndef _WIN32
+unsigned int check_socket_activation(void)
+{
+ const char *s;
+ unsigned long pid;
+ unsigned long nr_fds;
+ unsigned int i;
+ int fd;
+ int f;
+ int err;
+
+ s = getenv("LISTEN_PID");
+ if (s == NULL) {
+ return 0;
+ }
+ err = qemu_strtoul(s, NULL, 10, &pid);
+ if (err) {
+ return 0;
+ }
+ if (pid != getpid()) {
+ return 0;
+ }
+
+ s = getenv("LISTEN_FDS");
+ if (s == NULL) {
+ return 0;
+ }
+ err = qemu_strtoul(s, NULL, 10, &nr_fds);
+ if (err) {
+ return 0;
+ }
+ assert(nr_fds <= UINT_MAX);
+
+ /* So these are not passed to any child processes we might start. */
+ unsetenv("LISTEN_FDS");
+ unsetenv("LISTEN_PID");
+
+ /* So the file descriptors don't leak into child processes. */
+ for (i = 0; i < nr_fds; ++i) {
+ fd = FIRST_SOCKET_ACTIVATION_FD + i;
+ f = fcntl(fd, F_GETFD);
+ if (f == -1 || fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1) {
+ /* If we cannot set FD_CLOEXEC then it probably means the file
+ * descriptor is invalid, so socket activation has gone wrong
+ * and we should exit.
+ */
+ error_report("Socket activation failed: "
+ "invalid file descriptor fd = %d: %s",
+ fd, g_strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ return (unsigned int) nr_fds;
+}
+
+#else /* !_WIN32 */
+unsigned int check_socket_activation(void)
+{
+ return 0;
+}
+#endif
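+
+/*
+ * Usage sketch (illustrative only):
+ *
+ *     unsigned int n = check_socket_activation();
+ *
+ *     if (n) {
+ *         // the inherited sockets are FIRST_SOCKET_ACTIVATION_FD,
+ *         // FIRST_SOCKET_ACTIVATION_FD + 1, ... (n fds in total);
+ *         // listen()/accept() on them instead of opening new sockets
+ *     }
+ */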
diff --git a/util/thread-pool.c b/util/thread-pool.c
new file mode 100644
index 000000000..d763cea50
--- /dev/null
+++ b/util/thread-pool.c
@@ -0,0 +1,352 @@
+/*
+ * QEMU block layer thread pool
+ *
+ * Copyright IBM, Corp. 2008
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+#include "qemu/osdep.h"
+#include "qemu/queue.h"
+#include "qemu/thread.h"
+#include "qemu/coroutine.h"
+#include "trace.h"
+#include "block/thread-pool.h"
+#include "qemu/main-loop.h"
+
+static void do_spawn_thread(ThreadPool *pool);
+
+typedef struct ThreadPoolElement ThreadPoolElement;
+
+enum ThreadState {
+ THREAD_QUEUED,
+ THREAD_ACTIVE,
+ THREAD_DONE,
+};
+
+struct ThreadPoolElement {
+ BlockAIOCB common;
+ ThreadPool *pool;
+ ThreadPoolFunc *func;
+ void *arg;
+
+ /* Moving state out of THREAD_QUEUED is protected by lock. After
+ * that, only the worker thread can write to it. Reads and writes
+ * of state and ret are ordered with memory barriers.
+ */
+ enum ThreadState state;
+ int ret;
+
+ /* Access to this list is protected by lock. */
+ QTAILQ_ENTRY(ThreadPoolElement) reqs;
+
+ /* Access to this list is protected by the global mutex. */
+ QLIST_ENTRY(ThreadPoolElement) all;
+};
+
+struct ThreadPool {
+ AioContext *ctx;
+ QEMUBH *completion_bh;
+ QemuMutex lock;
+ QemuCond worker_stopped;
+ QemuSemaphore sem;
+ int max_threads;
+ QEMUBH *new_thread_bh;
+
+ /* The following variables are only accessed from one AioContext. */
+ QLIST_HEAD(, ThreadPoolElement) head;
+
+ /* The following variables are protected by lock. */
+ QTAILQ_HEAD(, ThreadPoolElement) request_list;
+ int cur_threads;
+ int idle_threads;
+ int new_threads; /* backlog of threads we need to create */
+ int pending_threads; /* threads created but not running yet */
+ bool stopping;
+};
+
+static void *worker_thread(void *opaque)
+{
+ ThreadPool *pool = opaque;
+
+ qemu_mutex_lock(&pool->lock);
+ pool->pending_threads--;
+ do_spawn_thread(pool);
+
+ while (!pool->stopping) {
+ ThreadPoolElement *req;
+ int ret;
+
+ do {
+ pool->idle_threads++;
+ qemu_mutex_unlock(&pool->lock);
+ ret = qemu_sem_timedwait(&pool->sem, 10000);
+ qemu_mutex_lock(&pool->lock);
+ pool->idle_threads--;
+ } while (ret == -1 && !QTAILQ_EMPTY(&pool->request_list));
+ if (ret == -1 || pool->stopping) {
+ break;
+ }
+
+ req = QTAILQ_FIRST(&pool->request_list);
+ QTAILQ_REMOVE(&pool->request_list, req, reqs);
+ req->state = THREAD_ACTIVE;
+ qemu_mutex_unlock(&pool->lock);
+
+ ret = req->func(req->arg);
+
+ req->ret = ret;
+ /* Write ret before state. */
+ smp_wmb();
+ req->state = THREAD_DONE;
+
+ qemu_mutex_lock(&pool->lock);
+
+ qemu_bh_schedule(pool->completion_bh);
+ }
+
+ pool->cur_threads--;
+ qemu_cond_signal(&pool->worker_stopped);
+ qemu_mutex_unlock(&pool->lock);
+ return NULL;
+}
+
+static void do_spawn_thread(ThreadPool *pool)
+{
+ QemuThread t;
+
+ /* Runs with lock taken. */
+ if (!pool->new_threads) {
+ return;
+ }
+
+ pool->new_threads--;
+ pool->pending_threads++;
+
+ qemu_thread_create(&t, "worker", worker_thread, pool, QEMU_THREAD_DETACHED);
+}
+
+static void spawn_thread_bh_fn(void *opaque)
+{
+ ThreadPool *pool = opaque;
+
+ qemu_mutex_lock(&pool->lock);
+ do_spawn_thread(pool);
+ qemu_mutex_unlock(&pool->lock);
+}
+
+static void spawn_thread(ThreadPool *pool)
+{
+ pool->cur_threads++;
+ pool->new_threads++;
+ /* If there are threads being created, they will spawn new workers, so
+ * we don't spend time creating many threads in a loop holding a mutex or
+ * starving the current vcpu.
+ *
+ * If there are no idle threads, ask the main thread to create one, so we
+ * inherit the correct affinity instead of the vcpu affinity.
+ */
+ if (!pool->pending_threads) {
+ qemu_bh_schedule(pool->new_thread_bh);
+ }
+}
+
+static void thread_pool_completion_bh(void *opaque)
+{
+ ThreadPool *pool = opaque;
+ ThreadPoolElement *elem, *next;
+
+ aio_context_acquire(pool->ctx);
+restart:
+ QLIST_FOREACH_SAFE(elem, &pool->head, all, next) {
+ if (elem->state != THREAD_DONE) {
+ continue;
+ }
+
+ trace_thread_pool_complete(pool, elem, elem->common.opaque,
+ elem->ret);
+ QLIST_REMOVE(elem, all);
+
+ if (elem->common.cb) {
+ /* Read state before ret. */
+ smp_rmb();
+
+ /* Schedule ourselves in case elem->common.cb() calls aio_poll() to
+ * wait for another request that completed at the same time.
+ */
+ qemu_bh_schedule(pool->completion_bh);
+
+ aio_context_release(pool->ctx);
+ elem->common.cb(elem->common.opaque, elem->ret);
+ aio_context_acquire(pool->ctx);
+
+ /* We can safely cancel the completion_bh here regardless of someone
+ * else having scheduled it meanwhile because we reenter the
+ * completion function anyway (goto restart).
+ */
+ qemu_bh_cancel(pool->completion_bh);
+
+ qemu_aio_unref(elem);
+ goto restart;
+ } else {
+ qemu_aio_unref(elem);
+ }
+ }
+ aio_context_release(pool->ctx);
+}
+
+static void thread_pool_cancel(BlockAIOCB *acb)
+{
+ ThreadPoolElement *elem = (ThreadPoolElement *)acb;
+ ThreadPool *pool = elem->pool;
+
+ trace_thread_pool_cancel(elem, elem->common.opaque);
+
+ QEMU_LOCK_GUARD(&pool->lock);
+ if (elem->state == THREAD_QUEUED &&
+ /* No thread has yet started working on elem; we can try to "steal"
+ * the item from the worker if we can get a signal from the
+ * semaphore. Because this is non-blocking, we can do it with
+ * the lock taken and ensure that elem will remain THREAD_QUEUED.
+ */
+ qemu_sem_timedwait(&pool->sem, 0) == 0) {
+ QTAILQ_REMOVE(&pool->request_list, elem, reqs);
+ qemu_bh_schedule(pool->completion_bh);
+
+ elem->state = THREAD_DONE;
+ elem->ret = -ECANCELED;
+ }
+}
+
+static AioContext *thread_pool_get_aio_context(BlockAIOCB *acb)
+{
+ ThreadPoolElement *elem = (ThreadPoolElement *)acb;
+ ThreadPool *pool = elem->pool;
+ return pool->ctx;
+}
+
+static const AIOCBInfo thread_pool_aiocb_info = {
+ .aiocb_size = sizeof(ThreadPoolElement),
+ .cancel_async = thread_pool_cancel,
+ .get_aio_context = thread_pool_get_aio_context,
+};
+
+BlockAIOCB *thread_pool_submit_aio(ThreadPool *pool,
+ ThreadPoolFunc *func, void *arg,
+ BlockCompletionFunc *cb, void *opaque)
+{
+ ThreadPoolElement *req;
+
+ req = qemu_aio_get(&thread_pool_aiocb_info, NULL, cb, opaque);
+ req->func = func;
+ req->arg = arg;
+ req->state = THREAD_QUEUED;
+ req->pool = pool;
+
+ QLIST_INSERT_HEAD(&pool->head, req, all);
+
+ trace_thread_pool_submit(pool, req, arg);
+
+ qemu_mutex_lock(&pool->lock);
+ if (pool->idle_threads == 0 && pool->cur_threads < pool->max_threads) {
+ spawn_thread(pool);
+ }
+ QTAILQ_INSERT_TAIL(&pool->request_list, req, reqs);
+ qemu_mutex_unlock(&pool->lock);
+ qemu_sem_post(&pool->sem);
+ return &req->common;
+}
+
+typedef struct ThreadPoolCo {
+ Coroutine *co;
+ int ret;
+} ThreadPoolCo;
+
+static void thread_pool_co_cb(void *opaque, int ret)
+{
+ ThreadPoolCo *co = opaque;
+
+ co->ret = ret;
+ aio_co_wake(co->co);
+}
+
+int coroutine_fn thread_pool_submit_co(ThreadPool *pool, ThreadPoolFunc *func,
+ void *arg)
+{
+ ThreadPoolCo tpc = { .co = qemu_coroutine_self(), .ret = -EINPROGRESS };
+ assert(qemu_in_coroutine());
+ thread_pool_submit_aio(pool, func, arg, thread_pool_co_cb, &tpc);
+ qemu_coroutine_yield();
+ return tpc.ret;
+}
+
+void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg)
+{
+ thread_pool_submit_aio(pool, func, arg, NULL, NULL);
+}
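+
+/*
+ * Usage sketch (illustrative only; worker_fn and do_blocking_io are
+ * hypothetical). The function runs in a pool thread and its return
+ * value becomes the request's result:
+ *
+ *     static int worker_fn(void *arg)
+ *     {
+ *         return do_blocking_io(arg);
+ *     }
+ *
+ *     // from coroutine context, given a ThreadPool *pool:
+ *     int ret = thread_pool_submit_co(pool, worker_fn, arg);
+ */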
+
+static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx)
+{
+ if (!ctx) {
+ ctx = qemu_get_aio_context();
+ }
+
+ memset(pool, 0, sizeof(*pool));
+ pool->ctx = ctx;
+ pool->completion_bh = aio_bh_new(ctx, thread_pool_completion_bh, pool);
+ qemu_mutex_init(&pool->lock);
+ qemu_cond_init(&pool->worker_stopped);
+ qemu_sem_init(&pool->sem, 0);
+ pool->max_threads = 64;
+ pool->new_thread_bh = aio_bh_new(ctx, spawn_thread_bh_fn, pool);
+
+ QLIST_INIT(&pool->head);
+ QTAILQ_INIT(&pool->request_list);
+}
+
+ThreadPool *thread_pool_new(AioContext *ctx)
+{
+ ThreadPool *pool = g_new(ThreadPool, 1);
+ thread_pool_init_one(pool, ctx);
+ return pool;
+}
+
+void thread_pool_free(ThreadPool *pool)
+{
+ if (!pool) {
+ return;
+ }
+
+ assert(QLIST_EMPTY(&pool->head));
+
+ qemu_mutex_lock(&pool->lock);
+
+ /* Stop new threads from spawning */
+ qemu_bh_delete(pool->new_thread_bh);
+ pool->cur_threads -= pool->new_threads;
+ pool->new_threads = 0;
+
+ /* Wait for worker threads to terminate */
+ pool->stopping = true;
+ while (pool->cur_threads > 0) {
+ qemu_sem_post(&pool->sem);
+ qemu_cond_wait(&pool->worker_stopped, &pool->lock);
+ }
+
+ qemu_mutex_unlock(&pool->lock);
+
+ qemu_bh_delete(pool->completion_bh);
+ qemu_sem_destroy(&pool->sem);
+ qemu_cond_destroy(&pool->worker_stopped);
+ qemu_mutex_destroy(&pool->lock);
+ g_free(pool);
+}
diff --git a/util/throttle.c b/util/throttle.c
new file mode 100644
index 000000000..81f247a8d
--- /dev/null
+++ b/util/throttle.c
@@ -0,0 +1,637 @@
+/*
+ * QEMU throttling infrastructure
+ *
+ * Copyright (C) Nodalink, EURL. 2013-2014
+ * Copyright (C) Igalia, S.L. 2015
+ *
+ * Authors:
+ * Benoît Canet <benoit.canet@nodalink.com>
+ * Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/throttle.h"
+#include "qemu/timer.h"
+#include "block/aio.h"
+
+/* This function makes a bucket leak
+ *
+ * @bkt: the bucket to leak
+ * @delta_ns: the time delta
+ */
+void throttle_leak_bucket(LeakyBucket *bkt, int64_t delta_ns)
+{
+ double leak;
+
+ /* compute how much to leak */
+ leak = (bkt->avg * (double) delta_ns) / NANOSECONDS_PER_SECOND;
+
+ /* make the bucket leak */
+ bkt->level = MAX(bkt->level - leak, 0);
+
+ /* if we allow bursts for more than one second we also need to
+ * keep track of bkt->burst_level so the bkt->max goal per second
+ * is attained */
+ if (bkt->burst_length > 1) {
+ leak = (bkt->max * (double) delta_ns) / NANOSECONDS_PER_SECOND;
+ bkt->burst_level = MAX(bkt->burst_level - leak, 0);
+ }
+}
+
+/* Calculate the time delta since the last leak and make proportional leaks
+ *
+ * @now: the current timestamp in ns
+ */
+static void throttle_do_leak(ThrottleState *ts, int64_t now)
+{
+ /* compute the time elapsed since the last leak */
+ int64_t delta_ns = now - ts->previous_leak;
+ int i;
+
+ ts->previous_leak = now;
+
+ if (delta_ns <= 0) {
+ return;
+ }
+
+ /* make each bucket leak */
+ for (i = 0; i < BUCKETS_COUNT; i++) {
+ throttle_leak_bucket(&ts->cfg.buckets[i], delta_ns);
+ }
+}
+
+/* do the real job of computing the time to wait
+ *
+ * @limit: the throttling limit
+ * @extra: the number of operations to delay
+ * @ret: the time to wait in ns
+ */
+static int64_t throttle_do_compute_wait(double limit, double extra)
+{
+ double wait = extra * NANOSECONDS_PER_SECOND;
+ wait /= limit;
+ return wait;
+}
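+
+/*
+ * Worked example (informal): with @limit = 100 units/s and @extra = 5
+ * pending units, the wait is 5 * NANOSECONDS_PER_SECOND / 100, i.e.
+ * 50000000 ns or 50 ms.
+ */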
+
+/* This function computes the wait time in ns that a leaky bucket should trigger
+ *
+ * @bkt: the leaky bucket we operate on
+ * @ret: the resulting wait time in ns or 0 if the operation can go through
+ */
+int64_t throttle_compute_wait(LeakyBucket *bkt)
+{
+ double extra; /* the number of extra units blocking the io */
+ double bucket_size; /* I/O before throttling to bkt->avg */
+ double burst_bucket_size; /* Before throttling to bkt->max */
+
+ if (!bkt->avg) {
+ return 0;
+ }
+
+ if (!bkt->max) {
+ /* If bkt->max is 0 we still want to allow short bursts of I/O
+ * from the guest, otherwise every other request will be throttled
+ * and performance will suffer considerably. */
+ bucket_size = (double) bkt->avg / 10;
+ burst_bucket_size = 0;
+ } else {
+ /* If we have a burst limit then we have to wait until all I/O
+ * at burst rate has finished before throttling to bkt->avg */
+ bucket_size = bkt->max * bkt->burst_length;
+ burst_bucket_size = (double) bkt->max / 10;
+ }
+
+ /* If the main bucket is full then we have to wait */
+ extra = bkt->level - bucket_size;
+ if (extra > 0) {
+ return throttle_do_compute_wait(bkt->avg, extra);
+ }
+
+ /* If the main bucket is not full yet we still have to check the
+ * burst bucket in order to enforce the burst limit */
+ if (bkt->burst_length > 1) {
+ assert(bkt->max > 0); /* see throttle_is_valid() */
+ extra = bkt->burst_level - burst_bucket_size;
+ if (extra > 0) {
+ return throttle_do_compute_wait(bkt->max, extra);
+ }
+ }
+
+ return 0;
+}
+
+/* This function computes the time that must be waited for this I/O
+ *
+ * @is_write: true if the current IO is a write, false if it's a read
+ * @ret: time to wait
+ */
+static int64_t throttle_compute_wait_for(ThrottleState *ts,
+ bool is_write)
+{
+ BucketType to_check[2][4] = { {THROTTLE_BPS_TOTAL,
+ THROTTLE_OPS_TOTAL,
+ THROTTLE_BPS_READ,
+ THROTTLE_OPS_READ},
+ {THROTTLE_BPS_TOTAL,
+ THROTTLE_OPS_TOTAL,
+ THROTTLE_BPS_WRITE,
+ THROTTLE_OPS_WRITE}, };
+ int64_t wait, max_wait = 0;
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ BucketType index = to_check[is_write][i];
+ wait = throttle_compute_wait(&ts->cfg.buckets[index]);
+ if (wait > max_wait) {
+ max_wait = wait;
+ }
+ }
+
+ return max_wait;
+}
+
+/* compute the timer for this type of operation
+ *
+ * @is_write: the type of operation
+ * @now: the current clock timestamp
+ * @next_timestamp: the resulting timer
+ * @ret: true if a timer must be set
+ */
+static bool throttle_compute_timer(ThrottleState *ts,
+ bool is_write,
+ int64_t now,
+ int64_t *next_timestamp)
+{
+ int64_t wait;
+
+ /* leak proportionally to the time elapsed */
+ throttle_do_leak(ts, now);
+
+ /* compute the wait time if any */
+ wait = throttle_compute_wait_for(ts, is_write);
+
+ /* if the code must wait compute when the next timer should fire */
+ if (wait) {
+ *next_timestamp = now + wait;
+ return true;
+ }
+
+ /* else no need to wait at all */
+ *next_timestamp = now;
+ return false;
+}
+
+/* Add timers to event loop */
+void throttle_timers_attach_aio_context(ThrottleTimers *tt,
+ AioContext *new_context)
+{
+ tt->timers[0] = aio_timer_new(new_context, tt->clock_type, SCALE_NS,
+ tt->read_timer_cb, tt->timer_opaque);
+ tt->timers[1] = aio_timer_new(new_context, tt->clock_type, SCALE_NS,
+ tt->write_timer_cb, tt->timer_opaque);
+}
+
+/*
+ * Initialize the ThrottleConfig structure to a valid state
+ * @cfg: the config to initialize
+ */
+void throttle_config_init(ThrottleConfig *cfg)
+{
+ unsigned i;
+ memset(cfg, 0, sizeof(*cfg));
+ for (i = 0; i < BUCKETS_COUNT; i++) {
+ cfg->buckets[i].burst_length = 1;
+ }
+}
+
+/* To be called first on the ThrottleState */
+void throttle_init(ThrottleState *ts)
+{
+ memset(ts, 0, sizeof(ThrottleState));
+ throttle_config_init(&ts->cfg);
+}
+
+/* To be called first on the ThrottleTimers */
+void throttle_timers_init(ThrottleTimers *tt,
+ AioContext *aio_context,
+ QEMUClockType clock_type,
+ QEMUTimerCB *read_timer_cb,
+ QEMUTimerCB *write_timer_cb,
+ void *timer_opaque)
+{
+ memset(tt, 0, sizeof(ThrottleTimers));
+
+ tt->clock_type = clock_type;
+ tt->read_timer_cb = read_timer_cb;
+ tt->write_timer_cb = write_timer_cb;
+ tt->timer_opaque = timer_opaque;
+ throttle_timers_attach_aio_context(tt, aio_context);
+}
+
+/* destroy a timer */
+static void throttle_timer_destroy(QEMUTimer **timer)
+{
+ assert(*timer != NULL);
+
+ timer_free(*timer);
+ *timer = NULL;
+}
+
+/* Remove timers from event loop */
+void throttle_timers_detach_aio_context(ThrottleTimers *tt)
+{
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ throttle_timer_destroy(&tt->timers[i]);
+ }
+}
+
+/* To be called last on the ThrottleTimers */
+void throttle_timers_destroy(ThrottleTimers *tt)
+{
+ throttle_timers_detach_aio_context(tt);
+}
+
+/* is any throttling timer configured */
+bool throttle_timers_are_initialized(ThrottleTimers *tt)
+{
+ if (tt->timers[0]) {
+ return true;
+ }
+
+ return false;
+}
+
+/* Check whether any throttling must be done
+ *
+ * @cfg: the throttling configuration to inspect
+ * @ret: true if throttling must be done else false
+ */
+bool throttle_enabled(ThrottleConfig *cfg)
+{
+ int i;
+
+ for (i = 0; i < BUCKETS_COUNT; i++) {
+ if (cfg->buckets[i].avg > 0) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/* check if a throttling configuration is valid
+ * @cfg: the throttling configuration to inspect
+ * @ret: true if valid else false
+ * @errp: error object
+ */
+bool throttle_is_valid(ThrottleConfig *cfg, Error **errp)
+{
+ int i;
+ bool bps_flag, ops_flag;
+ bool bps_max_flag, ops_max_flag;
+
+ bps_flag = cfg->buckets[THROTTLE_BPS_TOTAL].avg &&
+ (cfg->buckets[THROTTLE_BPS_READ].avg ||
+ cfg->buckets[THROTTLE_BPS_WRITE].avg);
+
+ ops_flag = cfg->buckets[THROTTLE_OPS_TOTAL].avg &&
+ (cfg->buckets[THROTTLE_OPS_READ].avg ||
+ cfg->buckets[THROTTLE_OPS_WRITE].avg);
+
+ bps_max_flag = cfg->buckets[THROTTLE_BPS_TOTAL].max &&
+ (cfg->buckets[THROTTLE_BPS_READ].max ||
+ cfg->buckets[THROTTLE_BPS_WRITE].max);
+
+ ops_max_flag = cfg->buckets[THROTTLE_OPS_TOTAL].max &&
+ (cfg->buckets[THROTTLE_OPS_READ].max ||
+ cfg->buckets[THROTTLE_OPS_WRITE].max);
+
+ if (bps_flag || ops_flag || bps_max_flag || ops_max_flag) {
+ error_setg(errp, "bps/iops/max total values and read/write values"
+ " cannot be used at the same time");
+ return false;
+ }
+
+ if (cfg->op_size &&
+ !cfg->buckets[THROTTLE_OPS_TOTAL].avg &&
+ !cfg->buckets[THROTTLE_OPS_READ].avg &&
+ !cfg->buckets[THROTTLE_OPS_WRITE].avg) {
+ error_setg(errp, "iops size requires an iops value to be set");
+ return false;
+ }
+
+ for (i = 0; i < BUCKETS_COUNT; i++) {
+ LeakyBucket *bkt = &cfg->buckets[i];
+ if (bkt->avg > THROTTLE_VALUE_MAX || bkt->max > THROTTLE_VALUE_MAX) {
+ error_setg(errp, "bps/iops/max values must be within [0, %lld]",
+ THROTTLE_VALUE_MAX);
+ return false;
+ }
+
+ if (!bkt->burst_length) {
+ error_setg(errp, "the burst length cannot be 0");
+ return false;
+ }
+
+ if (bkt->burst_length > 1 && !bkt->max) {
+ error_setg(errp, "burst length set without burst rate");
+ return false;
+ }
+
+ if (bkt->max && bkt->burst_length > THROTTLE_VALUE_MAX / bkt->max) {
+ error_setg(errp, "burst length too high for this burst rate");
+ return false;
+ }
+
+ if (bkt->max && !bkt->avg) {
+ error_setg(errp, "bps_max/iops_max require corresponding"
+ " bps/iops values");
+ return false;
+ }
+
+ if (bkt->max && bkt->max < bkt->avg) {
+ error_setg(errp, "bps_max/iops_max cannot be lower than bps/iops");
+ return false;
+ }
+ }
+
+ return true;
+}
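+
+/*
+ * Example (editor's addition, not part of the original patch): the
+ * checks above reject mixing a total rate with a per-direction rate,
+ * so a sketch like the following fails validation:
+ *
+ *     ThrottleConfig cfg;
+ *     Error *err = NULL;
+ *
+ *     throttle_config_init(&cfg);
+ *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 1000;
+ *     cfg.buckets[THROTTLE_BPS_READ].avg = 500;
+ *     assert(!throttle_is_valid(&cfg, &err));  // total + read rejected
+ *     error_free(err);
+ */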
+
+/* Used to configure the throttle
+ *
+ * @ts: the throttle state we are working on
+ * @clock_type: the group's clock_type
+ * @cfg: the config to set
+ */
+void throttle_config(ThrottleState *ts,
+ QEMUClockType clock_type,
+ ThrottleConfig *cfg)
+{
+ int i;
+
+ ts->cfg = *cfg;
+
+ /* Zero bucket level */
+ for (i = 0; i < BUCKETS_COUNT; i++) {
+ ts->cfg.buckets[i].level = 0;
+ ts->cfg.buckets[i].burst_level = 0;
+ }
+
+ ts->previous_leak = qemu_clock_get_ns(clock_type);
+}
+
+/* used to get config
+ *
+ * @ts: the throttle state we are working on
+ * @cfg: the config to write
+ */
+void throttle_get_config(ThrottleState *ts, ThrottleConfig *cfg)
+{
+ *cfg = ts->cfg;
+}
+
+
+/* Schedule the read or write timer if needed
+ *
+ * NOTE: this function is not unit tested due to its usage of timer_mod
+ *
+ * @ts: the throttle state
+ * @tt: the timers structure
+ * @is_write: the type of operation (read/write)
+ * @ret: true if the timer has been scheduled else false
+ */
+bool throttle_schedule_timer(ThrottleState *ts,
+ ThrottleTimers *tt,
+ bool is_write)
+{
+ int64_t now = qemu_clock_get_ns(tt->clock_type);
+ int64_t next_timestamp;
+ bool must_wait;
+
+ must_wait = throttle_compute_timer(ts,
+ is_write,
+ now,
+ &next_timestamp);
+
+ /* request not throttled */
+ if (!must_wait) {
+ return false;
+ }
+
+ /* request throttled and timer pending -> do nothing */
+ if (timer_pending(tt->timers[is_write])) {
+ return true;
+ }
+
+ /* request throttled and timer not pending -> arm timer */
+ timer_mod(tt->timers[is_write], next_timestamp);
+ return true;
+}
+
+/* do the accounting for this operation
+ *
+ * @ts: the throttle state
+ * @is_write: the type of operation (read/write)
+ * @size: the size of the operation
+ */
+void throttle_account(ThrottleState *ts, bool is_write, uint64_t size)
+{
+ const BucketType bucket_types_size[2][2] = {
+ { THROTTLE_BPS_TOTAL, THROTTLE_BPS_READ },
+ { THROTTLE_BPS_TOTAL, THROTTLE_BPS_WRITE }
+ };
+ const BucketType bucket_types_units[2][2] = {
+ { THROTTLE_OPS_TOTAL, THROTTLE_OPS_READ },
+ { THROTTLE_OPS_TOTAL, THROTTLE_OPS_WRITE }
+ };
+ double units = 1.0;
+ unsigned i;
+
+    /* if cfg.op_size is defined and smaller than size, compute the unit count */
+ if (ts->cfg.op_size && size > ts->cfg.op_size) {
+ units = (double) size / ts->cfg.op_size;
+ }
+
+ for (i = 0; i < 2; i++) {
+ LeakyBucket *bkt;
+
+ bkt = &ts->cfg.buckets[bucket_types_size[is_write][i]];
+ bkt->level += size;
+ if (bkt->burst_length > 1) {
+ bkt->burst_level += size;
+ }
+
+ bkt = &ts->cfg.buckets[bucket_types_units[is_write][i]];
+ bkt->level += units;
+ if (bkt->burst_length > 1) {
+ bkt->burst_level += units;
+ }
+ }
+}
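+
+/*
+ * Usage sketch (editor's addition, not part of the original patch):
+ * a typical throttled-I/O flow ties the pieces above together.  The
+ * AioContext `ctx`, callbacks `my_read_cb`/`my_write_cb`, opaque
+ * pointer `obj` and byte count `nbytes` are hypothetical:
+ *
+ *     ThrottleState ts;
+ *     ThrottleTimers tt;
+ *     ThrottleConfig cfg;
+ *     Error *err = NULL;
+ *
+ *     throttle_init(&ts);
+ *     throttle_timers_init(&tt, ctx, QEMU_CLOCK_REALTIME,
+ *                          my_read_cb, my_write_cb, obj);
+ *
+ *     throttle_config_init(&cfg);
+ *     cfg.buckets[THROTTLE_BPS_TOTAL].avg = 10 * 1024 * 1024; // 10 MiB/s
+ *     if (throttle_is_valid(&cfg, &err)) {
+ *         throttle_config(&ts, QEMU_CLOCK_REALTIME, &cfg);
+ *     }
+ *
+ *     // Per request: account the bytes, then arm a timer if throttled.
+ *     throttle_account(&ts, false, nbytes);
+ *     if (throttle_schedule_timer(&ts, &tt, false)) {
+ *         // Throttled: wait for the read timer callback to fire
+ *         // before resuming request processing.
+ *     }
+ */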
+
+/* fill in a ThrottleConfig from the options in a ThrottleLimits
+ *
+ * @arg: the ThrottleLimits object to read from
+ * @cfg: the ThrottleConfig to edit
+ * @errp: error object
+ */
+void throttle_limits_to_config(ThrottleLimits *arg, ThrottleConfig *cfg,
+ Error **errp)
+{
+ if (arg->has_bps_total) {
+ cfg->buckets[THROTTLE_BPS_TOTAL].avg = arg->bps_total;
+ }
+ if (arg->has_bps_read) {
+ cfg->buckets[THROTTLE_BPS_READ].avg = arg->bps_read;
+ }
+ if (arg->has_bps_write) {
+ cfg->buckets[THROTTLE_BPS_WRITE].avg = arg->bps_write;
+ }
+
+ if (arg->has_iops_total) {
+ cfg->buckets[THROTTLE_OPS_TOTAL].avg = arg->iops_total;
+ }
+ if (arg->has_iops_read) {
+ cfg->buckets[THROTTLE_OPS_READ].avg = arg->iops_read;
+ }
+ if (arg->has_iops_write) {
+ cfg->buckets[THROTTLE_OPS_WRITE].avg = arg->iops_write;
+ }
+
+ if (arg->has_bps_total_max) {
+ cfg->buckets[THROTTLE_BPS_TOTAL].max = arg->bps_total_max;
+ }
+ if (arg->has_bps_read_max) {
+ cfg->buckets[THROTTLE_BPS_READ].max = arg->bps_read_max;
+ }
+ if (arg->has_bps_write_max) {
+ cfg->buckets[THROTTLE_BPS_WRITE].max = arg->bps_write_max;
+ }
+ if (arg->has_iops_total_max) {
+ cfg->buckets[THROTTLE_OPS_TOTAL].max = arg->iops_total_max;
+ }
+ if (arg->has_iops_read_max) {
+ cfg->buckets[THROTTLE_OPS_READ].max = arg->iops_read_max;
+ }
+ if (arg->has_iops_write_max) {
+ cfg->buckets[THROTTLE_OPS_WRITE].max = arg->iops_write_max;
+ }
+
+ if (arg->has_bps_total_max_length) {
+ if (arg->bps_total_max_length > UINT_MAX) {
+ error_setg(errp, "bps-total-max-length value must be in"
+ " the range [0, %u]", UINT_MAX);
+ return;
+ }
+ cfg->buckets[THROTTLE_BPS_TOTAL].burst_length = arg->bps_total_max_length;
+ }
+ if (arg->has_bps_read_max_length) {
+ if (arg->bps_read_max_length > UINT_MAX) {
+ error_setg(errp, "bps-read-max-length value must be in"
+ " the range [0, %u]", UINT_MAX);
+ return;
+ }
+ cfg->buckets[THROTTLE_BPS_READ].burst_length = arg->bps_read_max_length;
+ }
+ if (arg->has_bps_write_max_length) {
+ if (arg->bps_write_max_length > UINT_MAX) {
+ error_setg(errp, "bps-write-max-length value must be in"
+ " the range [0, %u]", UINT_MAX);
+ return;
+ }
+ cfg->buckets[THROTTLE_BPS_WRITE].burst_length = arg->bps_write_max_length;
+ }
+ if (arg->has_iops_total_max_length) {
+ if (arg->iops_total_max_length > UINT_MAX) {
+ error_setg(errp, "iops-total-max-length value must be in"
+ " the range [0, %u]", UINT_MAX);
+ return;
+ }
+ cfg->buckets[THROTTLE_OPS_TOTAL].burst_length = arg->iops_total_max_length;
+ }
+ if (arg->has_iops_read_max_length) {
+ if (arg->iops_read_max_length > UINT_MAX) {
+ error_setg(errp, "iops-read-max-length value must be in"
+ " the range [0, %u]", UINT_MAX);
+ return;
+ }
+ cfg->buckets[THROTTLE_OPS_READ].burst_length = arg->iops_read_max_length;
+ }
+ if (arg->has_iops_write_max_length) {
+ if (arg->iops_write_max_length > UINT_MAX) {
+ error_setg(errp, "iops-write-max-length value must be in"
+ " the range [0, %u]", UINT_MAX);
+ return;
+ }
+ cfg->buckets[THROTTLE_OPS_WRITE].burst_length = arg->iops_write_max_length;
+ }
+
+ if (arg->has_iops_size) {
+ cfg->op_size = arg->iops_size;
+ }
+
+ throttle_is_valid(cfg, errp);
+}
+
+/* write the options of a ThrottleConfig to a ThrottleLimits
+ *
+ * @cfg: the ThrottleConfig to read from
+ * @var: the ThrottleLimits to write to
+ */
+void throttle_config_to_limits(ThrottleConfig *cfg, ThrottleLimits *var)
+{
+ var->bps_total = cfg->buckets[THROTTLE_BPS_TOTAL].avg;
+ var->bps_read = cfg->buckets[THROTTLE_BPS_READ].avg;
+ var->bps_write = cfg->buckets[THROTTLE_BPS_WRITE].avg;
+ var->iops_total = cfg->buckets[THROTTLE_OPS_TOTAL].avg;
+ var->iops_read = cfg->buckets[THROTTLE_OPS_READ].avg;
+ var->iops_write = cfg->buckets[THROTTLE_OPS_WRITE].avg;
+ var->bps_total_max = cfg->buckets[THROTTLE_BPS_TOTAL].max;
+ var->bps_read_max = cfg->buckets[THROTTLE_BPS_READ].max;
+ var->bps_write_max = cfg->buckets[THROTTLE_BPS_WRITE].max;
+ var->iops_total_max = cfg->buckets[THROTTLE_OPS_TOTAL].max;
+ var->iops_read_max = cfg->buckets[THROTTLE_OPS_READ].max;
+ var->iops_write_max = cfg->buckets[THROTTLE_OPS_WRITE].max;
+ var->bps_total_max_length = cfg->buckets[THROTTLE_BPS_TOTAL].burst_length;
+ var->bps_read_max_length = cfg->buckets[THROTTLE_BPS_READ].burst_length;
+ var->bps_write_max_length = cfg->buckets[THROTTLE_BPS_WRITE].burst_length;
+ var->iops_total_max_length = cfg->buckets[THROTTLE_OPS_TOTAL].burst_length;
+ var->iops_read_max_length = cfg->buckets[THROTTLE_OPS_READ].burst_length;
+ var->iops_write_max_length = cfg->buckets[THROTTLE_OPS_WRITE].burst_length;
+ var->iops_size = cfg->op_size;
+
+ var->has_bps_total = true;
+ var->has_bps_read = true;
+ var->has_bps_write = true;
+ var->has_iops_total = true;
+ var->has_iops_read = true;
+ var->has_iops_write = true;
+ var->has_bps_total_max = true;
+ var->has_bps_read_max = true;
+ var->has_bps_write_max = true;
+ var->has_iops_total_max = true;
+ var->has_iops_read_max = true;
+ var->has_iops_write_max = true;
+ var->has_bps_read_max_length = true;
+ var->has_bps_total_max_length = true;
+ var->has_bps_write_max_length = true;
+ var->has_iops_total_max_length = true;
+ var->has_iops_read_max_length = true;
+ var->has_iops_write_max_length = true;
+ var->has_iops_size = true;
+}
diff --git a/util/timed-average.c b/util/timed-average.c
new file mode 100644
index 000000000..2b49d532c
--- /dev/null
+++ b/util/timed-average.c
@@ -0,0 +1,231 @@
+/*
+ * QEMU timed average computation
+ *
+ * Copyright (C) Nodalink, EURL. 2014
+ * Copyright (C) Igalia, S.L. 2015
+ *
+ * Authors:
+ * Benoît Canet <benoit.canet@nodalink.com>
+ * Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) version 3 or any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/timed-average.h"
+
+/* This module computes an average of a set of values within a time
+ * window.
+ *
+ * Algorithm:
+ *
+ * - Create two windows with a certain expiration period, and
+ *   offset by period / 2.
+ * - Each time you want to account a new value, do it in both windows.
+ * - The minimum / maximum / average values are always returned from
+ * the oldest window.
+ *
+ * Example:
+ *
+ * t=0 |t=0.5 |t=1 |t=1.5 |t=2
+ * wnd0: [0,0.5)|wnd0: [0.5,1.5) | |wnd0: [1.5,2.5) |
+ * wnd1: [0,1) | |wnd1: [1,2) | |
+ *
+ * Values are returned from:
+ *
+ * wnd0---------|wnd1------------|wnd0---------|wnd1-------------|
+ */
+
+/* Update the expiration of a time window
+ *
+ * @w: the window used
+ * @now: the current time in nanoseconds
+ * @period: the expiration period in nanoseconds
+ */
+static void update_expiration(TimedAverageWindow *w, int64_t now,
+ int64_t period)
+{
+ /* time elapsed since the last theoretical expiration */
+ int64_t elapsed = (now - w->expiration) % period;
+    /* time remaining until the next expiration */
+ int64_t remaining = period - elapsed;
+ /* compute expiration */
+ w->expiration = now + remaining;
+}
+
+/* Reset a window
+ *
+ * @w: the window to reset
+ */
+static void window_reset(TimedAverageWindow *w)
+{
+ w->min = UINT64_MAX;
+ w->max = 0;
+ w->sum = 0;
+ w->count = 0;
+}
+
+/* Get the current window (that is, the one with the earliest
+ * expiration time).
+ *
+ * @ta: the TimedAverage structure
+ * @ret: a pointer to the current window
+ */
+static TimedAverageWindow *current_window(TimedAverage *ta)
+{
+ return &ta->windows[ta->current];
+}
+
+/* Initialize a TimedAverage structure
+ *
+ * @ta: the TimedAverage structure
+ * @clock_type: the type of clock to use
+ * @period: the time window period in nanoseconds
+ */
+void timed_average_init(TimedAverage *ta, QEMUClockType clock_type,
+ uint64_t period)
+{
+ int64_t now = qemu_clock_get_ns(clock_type);
+
+    /* Returned values are from the oldest window, so they belong to
+     * the interval [ta->period/2, ta->period). By scaling the
+     * requested period by 4/3, we guarantee that they're in the
+     * interval [2/3 period, 4/3 period), closer to the requested
+     * period on average */
+ ta->period = (uint64_t) period * 4 / 3;
+ ta->clock_type = clock_type;
+ ta->current = 0;
+
+ window_reset(&ta->windows[0]);
+ window_reset(&ta->windows[1]);
+
+    /* The two windows' expirations are offset by half a period */
+ ta->windows[0].expiration = now + ta->period / 2;
+ ta->windows[1].expiration = now + ta->period;
+}
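+
+/*
+ * Worked example (editor's addition): for a requested period of 900 ms,
+ * ta->period becomes 1200 ms and the two windows expire 600 ms apart,
+ * so the window that answers queries always covers between 600 ms and
+ * 1200 ms of history, i.e. 900 ms on average, as requested.
+ */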
+
+/* Check if the time windows have expired, updating their counters and
+ * expiration time if that's the case.
+ *
+ * @ta: the TimedAverage structure
+ * @elapsed: if non-NULL, the elapsed time (in ns) within the current
+ * window will be stored here
+ */
+static void check_expirations(TimedAverage *ta, uint64_t *elapsed)
+{
+ int64_t now = qemu_clock_get_ns(ta->clock_type);
+ int i;
+
+ assert(ta->period != 0);
+
+ /* Check if the windows have expired */
+ for (i = 0; i < 2; i++) {
+ TimedAverageWindow *w = &ta->windows[i];
+ if (w->expiration <= now) {
+ window_reset(w);
+ update_expiration(w, now, ta->period);
+ }
+ }
+
+ /* Make ta->current point to the oldest window */
+ if (ta->windows[0].expiration < ta->windows[1].expiration) {
+ ta->current = 0;
+ } else {
+ ta->current = 1;
+ }
+
+ /* Calculate the elapsed time within the current window */
+ if (elapsed) {
+ int64_t remaining = ta->windows[ta->current].expiration - now;
+ *elapsed = ta->period - remaining;
+ }
+}
+
+/* Account a value
+ *
+ * @ta: the TimedAverage structure
+ * @value: the value to account
+ */
+void timed_average_account(TimedAverage *ta, uint64_t value)
+{
+ int i;
+ check_expirations(ta, NULL);
+
+ /* Do the accounting in both windows at the same time */
+ for (i = 0; i < 2; i++) {
+ TimedAverageWindow *w = &ta->windows[i];
+
+ w->sum += value;
+ w->count++;
+
+ if (value < w->min) {
+ w->min = value;
+ }
+
+ if (value > w->max) {
+ w->max = value;
+ }
+ }
+}
+
+/* Get the minimum value
+ *
+ * @ta: the TimedAverage structure
+ * @ret: the minimum value
+ */
+uint64_t timed_average_min(TimedAverage *ta)
+{
+ TimedAverageWindow *w;
+ check_expirations(ta, NULL);
+ w = current_window(ta);
+ return w->min < UINT64_MAX ? w->min : 0;
+}
+
+/* Get the average value
+ *
+ * @ta: the TimedAverage structure
+ * @ret: the average value
+ */
+uint64_t timed_average_avg(TimedAverage *ta)
+{
+ TimedAverageWindow *w;
+ check_expirations(ta, NULL);
+ w = current_window(ta);
+ return w->count > 0 ? w->sum / w->count : 0;
+}
+
+/* Get the maximum value
+ *
+ * @ta: the TimedAverage structure
+ * @ret: the maximum value
+ */
+uint64_t timed_average_max(TimedAverage *ta)
+{
+ check_expirations(ta, NULL);
+ return current_window(ta)->max;
+}
+
+/* Get the sum of all accounted values
+ * @ta: the TimedAverage structure
+ * @elapsed: if non-NULL, the elapsed time (in ns) will be stored here
+ * @ret: the sum of all accounted values
+ */
+uint64_t timed_average_sum(TimedAverage *ta, uint64_t *elapsed)
+{
+ TimedAverageWindow *w;
+ check_expirations(ta, elapsed);
+ w = current_window(ta);
+ return w->sum;
+}
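+
+/*
+ * Usage sketch (editor's addition, not part of the original patch):
+ * tracking request latency over a one-second window, where
+ * `latency_ns` is a hypothetical per-request sample:
+ *
+ *     TimedAverage ta;
+ *
+ *     timed_average_init(&ta, QEMU_CLOCK_REALTIME, 1000000000); // 1 s
+ *
+ *     timed_average_account(&ta, latency_ns);  // once per request
+ *
+ *     uint64_t min = timed_average_min(&ta);
+ *     uint64_t avg = timed_average_avg(&ta);
+ *     uint64_t max = timed_average_max(&ta);
+ */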
diff --git a/util/trace-events b/util/trace-events
new file mode 100644
index 000000000..c8f53d7d9
--- /dev/null
+++ b/util/trace-events
@@ -0,0 +1,106 @@
+# See docs/devel/tracing.rst for syntax documentation.
+
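+# Editor's note, assuming QEMU's usual tracetool code generation (this
+# file is its input): each declaration below defines one trace point,
+# callable from C after code generation as trace_<name>(); e.g.
+# socket_listen(int num) is invoked as trace_socket_listen(backlog) and
+# rendered with its printf-style format string.
+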
+# aio-posix.c
+run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_ns %"PRId64 " timeout %"PRId64
+run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64
+poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
+poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64
+poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x"
+poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d"
+
+# async.c
+aio_co_schedule(void *ctx, void *co) "ctx %p co %p"
+aio_co_schedule_bh_cb(void *ctx, void *co) "ctx %p co %p"
+
+# thread-pool.c
+thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p"
+thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d"
+thread_pool_cancel(void *req, void *opaque) "req %p opaque %p"
+
+# buffer.c
+buffer_resize(const char *buf, size_t olen, size_t len) "%s: old %zd, new %zd"
+buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s"
+buffer_move(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s"
+buffer_free(const char *buf, size_t len) "%s: capacity %zd"
+
+# filemonitor-inotify.c
+qemu_file_monitor_add_watch(void *mon, const char *dirpath, const char *filename, void *cb, void *opaque, int64_t id) "File monitor %p add watch dir='%s' file='%s' cb=%p opaque=%p id=%" PRId64
+qemu_file_monitor_remove_watch(void *mon, const char *dirpath, int64_t id) "File monitor %p remove watch dir='%s' id=%" PRId64
+qemu_file_monitor_new(void *mon, int fd) "File monitor %p created fd=%d"
+qemu_file_monitor_enable_watch(void *mon, const char *dirpath, int id) "File monitor %p enable watch dir='%s' id=%u"
+qemu_file_monitor_disable_watch(void *mon, const char *dirpath, int id) "File monitor %p disable watch dir='%s' id=%u"
+qemu_file_monitor_event(void *mon, const char *dirpath, const char *filename, int mask, unsigned int id) "File monitor %p event dir='%s' file='%s' mask=0x%x id=%u"
+qemu_file_monitor_dispatch(void *mon, const char *dirpath, const char *filename, int ev, void *cb, void *opaque, int64_t id) "File monitor %p dispatch dir='%s' file='%s' ev=%d cb=%p opaque=%p id=%" PRId64
+
+# qemu-coroutine.c
+qemu_aio_coroutine_enter(void *ctx, void *from, void *to, void *opaque) "ctx %p from %p to %p opaque %p"
+qemu_coroutine_yield(void *from, void *to) "from %p to %p"
+qemu_coroutine_terminate(void *co) "self %p"
+
+# qemu-coroutine-lock.c
+qemu_co_mutex_lock_uncontended(void *mutex, void *self) "mutex %p self %p"
+qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p"
+qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p"
+qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p"
+qemu_co_mutex_unlock_return(void *mutex, void *self) "mutex %p self %p"
+
+# oslib-posix.c
+# oslib-win32.c
+qemu_memalign(size_t alignment, size_t size, void *ptr) "alignment %zu size %zu ptr %p"
+qemu_anon_ram_alloc(size_t size, void *ptr) "size %zu ptr %p"
+qemu_vfree(void *ptr) "ptr %p"
+qemu_anon_ram_free(void *ptr, size_t size) "ptr %p size %zu"
+
+# hbitmap.c
+hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx"
+hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64
+hbitmap_set(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64
+
+# lockcnt.c
+lockcnt_fast_path_attempt(const void *lockcnt, int expected, int new) "lockcnt %p fast path %d->%d"
+lockcnt_fast_path_success(const void *lockcnt, int expected, int new) "lockcnt %p fast path %d->%d succeeded"
+lockcnt_unlock_attempt(const void *lockcnt, int expected, int new) "lockcnt %p unlock %d->%d"
+lockcnt_unlock_success(const void *lockcnt, int expected, int new) "lockcnt %p unlock %d->%d succeeded"
+lockcnt_futex_wait_prepare(const void *lockcnt, int expected, int new) "lockcnt %p preparing slow path %d->%d"
+lockcnt_futex_wait(const void *lockcnt, int val) "lockcnt %p waiting on %d"
+lockcnt_futex_wait_resume(const void *lockcnt, int new) "lockcnt %p after wait: %d"
+lockcnt_futex_wake(const void *lockcnt) "lockcnt %p waking up one waiter"
+
+# qemu-sockets.c
+socket_listen(int num) "backlog: %d"
+
+# qemu-thread-common.h
+# qemu-thread-posix.c
+# qemu-thread-win32.c
+qemu_mutex_lock(void *mutex, const char *file, const int line) "waiting on mutex %p (%s:%d)"
+qemu_mutex_locked(void *mutex, const char *file, const int line) "taken mutex %p (%s:%d)"
+qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex %p (%s:%d)"
+
+# vfio-helpers.c
+qemu_vfio_dma_reset_temporary(void *s) "s %p"
+qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx"
+qemu_vfio_dump_mapping(void *host, uint64_t iova, size_t size) "vfio mapping %p to iova 0x%08" PRIx64 " size 0x%zx"
+qemu_vfio_find_mapping(void *s, void *p) "s %p host %p"
+qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64
+qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx"
+qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d &iova %p"
+qemu_vfio_dma_mapped(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64" size 0x%zx"
+qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p"
+qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")"
+qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) "region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32
+qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p"
+
+# userfaultfd.c
+uffd_query_features_nosys(int err) "errno: %i"
+uffd_query_features_api_failed(int err) "errno: %i"
+uffd_create_fd_nosys(int err) "errno: %i"
+uffd_create_fd_api_failed(int err) "errno: %i"
+uffd_create_fd_api_noioctl(uint64_t ioctl_req, uint64_t ioctl_supp) "ioctl_req: 0x%" PRIx64 " ioctl_supp: 0x%" PRIx64
+uffd_register_memory_failed(void *addr, uint64_t length, uint64_t mode, int err) "addr: %p length: %" PRIu64 " mode: 0x%" PRIx64 " errno: %i"
+uffd_unregister_memory_failed(void *addr, uint64_t length, int err) "addr: %p length: %" PRIu64 " errno: %i"
+
+# module.c
+module_load_module(const char *name) "file %s"
+module_lookup_object_type(const char *name) "name %s"
diff --git a/util/trace.h b/util/trace.h
new file mode 100644
index 000000000..86ff7a390
--- /dev/null
+++ b/util/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-util.h"
diff --git a/util/transactions.c b/util/transactions.c
new file mode 100644
index 000000000..2dbdedce9
--- /dev/null
+++ b/util/transactions.c
@@ -0,0 +1,100 @@
+/*
+ * Simple transactions API
+ *
+ * Copyright (c) 2021 Virtuozzo International GmbH.
+ *
+ * Author:
+ * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/transactions.h"
+#include "qemu/queue.h"
+
+typedef struct TransactionAction {
+ TransactionActionDrv *drv;
+ void *opaque;
+ QSLIST_ENTRY(TransactionAction) entry;
+} TransactionAction;
+
+struct Transaction {
+ QSLIST_HEAD(, TransactionAction) actions;
+};
+
+Transaction *tran_new(void)
+{
+ Transaction *tran = g_new(Transaction, 1);
+
+ QSLIST_INIT(&tran->actions);
+
+ return tran;
+}
+
+void tran_add(Transaction *tran, TransactionActionDrv *drv, void *opaque)
+{
+ TransactionAction *act;
+
+ act = g_new(TransactionAction, 1);
+ *act = (TransactionAction) {
+ .drv = drv,
+ .opaque = opaque
+ };
+
+ QSLIST_INSERT_HEAD(&tran->actions, act, entry);
+}
+
+void tran_abort(Transaction *tran)
+{
+ TransactionAction *act, *next;
+
+ QSLIST_FOREACH(act, &tran->actions, entry) {
+ if (act->drv->abort) {
+ act->drv->abort(act->opaque);
+ }
+ }
+
+ QSLIST_FOREACH_SAFE(act, &tran->actions, entry, next) {
+ if (act->drv->clean) {
+ act->drv->clean(act->opaque);
+ }
+
+ g_free(act);
+ }
+
+ g_free(tran);
+}
+
+void tran_commit(Transaction *tran)
+{
+ TransactionAction *act, *next;
+
+ QSLIST_FOREACH(act, &tran->actions, entry) {
+ if (act->drv->commit) {
+ act->drv->commit(act->opaque);
+ }
+ }
+
+ QSLIST_FOREACH_SAFE(act, &tran->actions, entry, next) {
+ if (act->drv->clean) {
+ act->drv->clean(act->opaque);
+ }
+
+ g_free(act);
+ }
+
+ g_free(tran);
+}
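+
+/*
+ * Usage sketch (editor's addition, not part of the original patch):
+ * a caller registers an undo action while making a change, then either
+ * commits or rolls back.  All names below are hypothetical:
+ *
+ *     static void set_flag_abort(void *opaque)
+ *     {
+ *         *(bool *)opaque = false;  // undo the change
+ *     }
+ *
+ *     static TransactionActionDrv set_flag_drv = {
+ *         .abort = set_flag_abort,
+ *     };
+ *
+ *     bool flag = false;
+ *     Transaction *tran = tran_new();
+ *
+ *     flag = true;                           // perform the action
+ *     tran_add(tran, &set_flag_drv, &flag);  // register its undo
+ *
+ *     if (ok) {
+ *         tran_commit(tran);  // keep the change, run .clean handlers
+ *     } else {
+ *         tran_abort(tran);   // run .abort handlers, newest first
+ *     }
+ */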
diff --git a/util/unicode.c b/util/unicode.c
new file mode 100644
index 000000000..8580bc598
--- /dev/null
+++ b/util/unicode.c
@@ -0,0 +1,156 @@
+/*
+ * Dealing with Unicode
+ *
+ * Copyright (C) 2013 Red Hat, Inc.
+ *
+ * Authors:
+ * Markus Armbruster <armbru@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/unicode.h"
+
+static bool is_valid_codepoint(int codepoint)
+{
+ if (codepoint > 0x10FFFFu) {
+ return false; /* beyond Unicode range */
+ }
+ if ((codepoint >= 0xFDD0 && codepoint <= 0xFDEF)
+ || (codepoint & 0xFFFE) == 0xFFFE) {
+ return false; /* noncharacter */
+ }
+ if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
+ return false; /* surrogate code point */
+ }
+ return true;
+}
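+
+/*
+ * For example (editor's note): 0x41 ('A') and 0x10FFFD are valid, while
+ * 0xD800 (a surrogate), 0xFFFE and 0x10FFFF (noncharacters) and
+ * 0x110000 (beyond the Unicode range) are rejected by the checks above.
+ */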
+
+/**
+ * mod_utf8_codepoint:
+ * @s: string encoded in modified UTF-8
+ * @n: maximum number of bytes to read from @s (at most 6 are needed)
+ * @end: set to end of sequence on return
+ *
+ * Convert the modified UTF-8 sequence at the start of @s. Modified
+ * UTF-8 is exactly like UTF-8, except U+0000 is encoded as
+ * "\xC0\x80".
+ *
+ * If @n is zero or @s points to a zero byte, the sequence is invalid,
+ * and @end is set to @s.
+ *
+ * If @s points to an impossible byte (0xFE or 0xFF) or a continuation
+ * byte, the sequence is invalid, and @end is set to @s + 1.
+ *
+ * Else, the first byte determines how many continuation bytes are
+ * expected. If there are fewer, the sequence is invalid, and @end is
+ * set to @s + 1 + actual number of continuation bytes. Else, the
+ * sequence is well-formed, and @end is set to @s + 1 + expected
+ * number of continuation bytes.
+ *
+ * A well-formed sequence is valid unless it encodes a codepoint
+ * outside the Unicode range U+0000..U+10FFFF, one of Unicode's 66
+ * noncharacters, a surrogate codepoint, or is overlong.  As the one
+ * exception, the overlong sequence "\xC0\x80" is valid.
+ *
+ * Conversion succeeds if and only if the sequence is valid.
+ *
+ * Returns: the Unicode codepoint on success, -1 on failure.
+ */
+int mod_utf8_codepoint(const char *s, size_t n, char **end)
+{
+ static int min_cp[5] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 };
+ const unsigned char *p;
+ unsigned byte, mask, len, i;
+ int cp;
+
+ if (n == 0 || *s == 0) {
+ /* empty sequence */
+ *end = (char *)s;
+ return -1;
+ }
+
+ p = (const unsigned char *)s;
+ byte = *p++;
+ if (byte < 0x80) {
+ cp = byte; /* one byte sequence */
+ } else if (byte >= 0xFE) {
+ cp = -1; /* impossible bytes 0xFE, 0xFF */
+ } else if ((byte & 0x40) == 0) {
+ cp = -1; /* unexpected continuation byte */
+ } else {
+ /* multi-byte sequence */
+ len = 0;
+ for (mask = 0x80; byte & mask; mask >>= 1) {
+ len++;
+ }
+ assert(len > 1 && len < 7);
+ cp = byte & (mask - 1);
+ for (i = 1; i < len; i++) {
+ byte = i < n ? *p : 0;
+ if ((byte & 0xC0) != 0x80) {
+ cp = -1; /* continuation byte missing */
+ goto out;
+ }
+ p++;
+ cp <<= 6;
+ cp |= byte & 0x3F;
+ }
+ if (!is_valid_codepoint(cp)) {
+ cp = -1;
+ } else if (cp < min_cp[len - 2] && !(cp == 0 && len == 2)) {
+ cp = -1; /* overlong, not \xC0\x80 */
+ }
+ }
+
+out:
+ *end = (char *)p;
+ return cp;
+}
+
+/**
+ * mod_utf8_encode:
+ * @buf: Destination buffer
+ * @bufsz: size of @buf, at least 5.
+ * @codepoint: Unicode codepoint to encode
+ *
+ * Convert Unicode codepoint @codepoint to modified UTF-8.
+ *
+ * Returns: the length of the UTF-8 sequence on success, -1 when
+ * @codepoint is invalid.
+ */
+ssize_t mod_utf8_encode(char buf[], size_t bufsz, int codepoint)
+{
+ assert(bufsz >= 5);
+
+ if (!is_valid_codepoint(codepoint)) {
+ return -1;
+ }
+
+ if (codepoint > 0 && codepoint <= 0x7F) {
+ buf[0] = codepoint & 0x7F;
+ buf[1] = 0;
+ return 1;
+ }
+ if (codepoint <= 0x7FF) {
+ buf[0] = 0xC0 | ((codepoint >> 6) & 0x1F);
+ buf[1] = 0x80 | (codepoint & 0x3F);
+ buf[2] = 0;
+ return 2;
+ }
+ if (codepoint <= 0xFFFF) {
+ buf[0] = 0xE0 | ((codepoint >> 12) & 0x0F);
+ buf[1] = 0x80 | ((codepoint >> 6) & 0x3F);
+ buf[2] = 0x80 | (codepoint & 0x3F);
+ buf[3] = 0;
+ return 3;
+ }
+ buf[0] = 0xF0 | ((codepoint >> 18) & 0x07);
+ buf[1] = 0x80 | ((codepoint >> 12) & 0x3F);
+ buf[2] = 0x80 | ((codepoint >> 6) & 0x3F);
+ buf[3] = 0x80 | (codepoint & 0x3F);
+ buf[4] = 0;
+ return 4;
+}
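+
+/*
+ * Round-trip sketch (editor's addition, not part of the original
+ * patch), encoding U+20AC (the euro sign) and decoding it back:
+ *
+ *     char buf[5];
+ *     char *end;
+ *
+ *     ssize_t len = mod_utf8_encode(buf, sizeof(buf), 0x20AC);
+ *     assert(len == 3);  // "\xE2\x82\xAC"
+ *
+ *     int cp = mod_utf8_codepoint(buf, len, &end);
+ *     assert(cp == 0x20AC && end == buf + 3);
+ */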
diff --git a/util/uri.c b/util/uri.c
new file mode 100644
index 000000000..ff72c6005
--- /dev/null
+++ b/util/uri.c
@@ -0,0 +1,2314 @@
+/**
+ * uri.c: set of generic URI related routines
+ *
+ * Reference: RFCs 3986, 2732 and 2373
+ *
+ * Copyright (C) 1998-2003 Daniel Veillard. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * DANIEL VEILLARD BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Except as contained in this notice, the name of Daniel Veillard shall not
+ * be used in advertising or otherwise to promote the sale, use or other
+ * dealings in this Software without prior written authorization from him.
+ *
+ * daniel@veillard.com
+ *
+ **
+ *
+ * Copyright (C) 2007, 2009-2010 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Authors:
+ * Richard W.M. Jones <rjones@redhat.com>
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/cutils.h"
+
+#include "qemu/uri.h"
+
+static void uri_clean(URI *uri);
+
+/*
+ * Old rule from 2396 used in legacy handling code
+ * alpha = lowalpha | upalpha
+ */
+#define IS_ALPHA(x) (IS_LOWALPHA(x) || IS_UPALPHA(x))
+
+/*
+ * lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" |
+ * "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" |
+ * "u" | "v" | "w" | "x" | "y" | "z"
+ */
+
+#define IS_LOWALPHA(x) (((x) >= 'a') && ((x) <= 'z'))
+
+/*
+ * upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" |
+ * "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" |
+ * "U" | "V" | "W" | "X" | "Y" | "Z"
+ */
+#define IS_UPALPHA(x) (((x) >= 'A') && ((x) <= 'Z'))
+
+#ifdef IS_DIGIT
+#undef IS_DIGIT
+#endif
+/*
+ * digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
+ */
+#define IS_DIGIT(x) (((x) >= '0') && ((x) <= '9'))
+
+/*
+ * alphanum = alpha | digit
+ */
+
+#define IS_ALPHANUM(x) (IS_ALPHA(x) || IS_DIGIT(x))
+
+/*
+ * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")"
+ */
+
+#define IS_MARK(x) (((x) == '-') || ((x) == '_') || ((x) == '.') || \
+ ((x) == '!') || ((x) == '~') || ((x) == '*') || ((x) == '\'') || \
+ ((x) == '(') || ((x) == ')'))
+
+/*
+ * unwise = "{" | "}" | "|" | "\" | "^" | "[" | "]" | "`"
+ */
+
+#define IS_UNWISE(p) \
+ (((*(p) == '{')) || ((*(p) == '}')) || ((*(p) == '|')) || \
+ ((*(p) == '\\')) || ((*(p) == '^')) || ((*(p) == '[')) || \
+ ((*(p) == ']')) || ((*(p) == '`')))
+/*
+ * reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | "," |
+ * "[" | "]"
+ */
+
+#define IS_RESERVED(x) (((x) == ';') || ((x) == '/') || ((x) == '?') || \
+ ((x) == ':') || ((x) == '@') || ((x) == '&') || ((x) == '=') || \
+ ((x) == '+') || ((x) == '$') || ((x) == ',') || ((x) == '[') || \
+ ((x) == ']'))
+
+/*
+ * unreserved = alphanum | mark
+ */
+
+#define IS_UNRESERVED(x) (IS_ALPHANUM(x) || IS_MARK(x))
+
+/*
+ * Advance the pointer to the next character, treating a
+ * percent-escaped sequence ("%XX") as a single character
+ */
+
+#define NEXT(p) ((*p == '%') ? p += 3 : p++)
+
+/*
+ * Productions from the spec.
+ *
+ * authority = server | reg_name
+ * reg_name = 1*( unreserved | escaped | "$" | "," |
+ * ";" | ":" | "@" | "&" | "=" | "+" )
+ *
+ * path = [ abs_path | opaque_part ]
+ */
+
+/************************************************************************
+ * *
+ * RFC 3986 parser *
+ * *
+ ************************************************************************/
+
+#define ISA_DIGIT(p) ((*(p) >= '0') && (*(p) <= '9'))
+#define ISA_ALPHA(p) (((*(p) >= 'a') && (*(p) <= 'z')) || \
+ ((*(p) >= 'A') && (*(p) <= 'Z')))
+#define ISA_HEXDIG(p) \
+ (ISA_DIGIT(p) || ((*(p) >= 'a') && (*(p) <= 'f')) || \
+ ((*(p) >= 'A') && (*(p) <= 'F')))
+
+/*
+ * sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
+ * / "*" / "+" / "," / ";" / "="
+ */
+#define ISA_SUB_DELIM(p) \
+ (((*(p) == '!')) || ((*(p) == '$')) || ((*(p) == '&')) || \
+ ((*(p) == '(')) || ((*(p) == ')')) || ((*(p) == '*')) || \
+ ((*(p) == '+')) || ((*(p) == ',')) || ((*(p) == ';')) || \
+ ((*(p) == '=')) || ((*(p) == '\'')))
+
+/*
+ * gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+ */
+#define ISA_GEN_DELIM(p) \
+ (((*(p) == ':')) || ((*(p) == '/')) || ((*(p) == '?')) || \
+ ((*(p) == '#')) || ((*(p) == '[')) || ((*(p) == ']')) || \
+ ((*(p) == '@')))
+
+/*
+ * reserved = gen-delims / sub-delims
+ */
+#define ISA_RESERVED(p) (ISA_GEN_DELIM(p) || (ISA_SUB_DELIM(p)))
+
+/*
+ * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
+ */
+#define ISA_UNRESERVED(p) \
+ ((ISA_ALPHA(p)) || (ISA_DIGIT(p)) || ((*(p) == '-')) || \
+ ((*(p) == '.')) || ((*(p) == '_')) || ((*(p) == '~')))
+
+/*
+ * pct-encoded = "%" HEXDIG HEXDIG
+ */
+#define ISA_PCT_ENCODED(p) \
+ ((*(p) == '%') && (ISA_HEXDIG(p + 1)) && (ISA_HEXDIG(p + 2)))
+
+/*
+ * pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
+ */
+#define ISA_PCHAR(p) \
+ (ISA_UNRESERVED(p) || ISA_PCT_ENCODED(p) || ISA_SUB_DELIM(p) || \
+ ((*(p) == ':')) || ((*(p) == '@')))
+
+/**
+ * rfc3986_parse_scheme:
+ * @uri: pointer to an URI structure
+ * @str: pointer to the string to analyze
+ *
+ * Parse a URI scheme
+ *
+ * ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_scheme(URI *uri, const char **str)
+{
+ const char *cur;
+
+ if (str == NULL) {
+ return -1;
+ }
+
+ cur = *str;
+ if (!ISA_ALPHA(cur)) {
+ return 2;
+ }
+ cur++;
+ while (ISA_ALPHA(cur) || ISA_DIGIT(cur) || (*cur == '+') || (*cur == '-') ||
+ (*cur == '.')) {
+ cur++;
+ }
+ if (uri != NULL) {
+ g_free(uri->scheme);
+ uri->scheme = g_strndup(*str, cur - *str);
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_fragment:
+ * @uri: pointer to an URI structure
+ * @str: pointer to the string to analyze
+ *
+ * Parse the fragment part of a URI
+ *
+ * fragment = *( pchar / "/" / "?" )
+ * NOTE: the strict syntax as defined by 3986 does not allow '[' and ']'
+ * in the fragment identifier but this is used very broadly for
+ * xpointer scheme selection, so we are allowing it here to not break
+ * for example all the DocBook processing chains.
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_fragment(URI *uri, const char **str)
+{
+ const char *cur;
+
+ if (str == NULL) {
+ return -1;
+ }
+
+ cur = *str;
+
+ while ((ISA_PCHAR(cur)) || (*cur == '/') || (*cur == '?') ||
+ (*cur == '[') || (*cur == ']') ||
+ ((uri != NULL) && (uri->cleanup & 1) && (IS_UNWISE(cur)))) {
+ NEXT(cur);
+ }
+ if (uri != NULL) {
+ g_free(uri->fragment);
+ if (uri->cleanup & 2) {
+ uri->fragment = g_strndup(*str, cur - *str);
+ } else {
+ uri->fragment = uri_string_unescape(*str, cur - *str, NULL);
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_query:
+ * @uri: pointer to an URI structure
+ * @str: pointer to the string to analyze
+ *
+ * Parse the query part of a URI
+ *
+ * query = *uric
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_query(URI *uri, const char **str)
+{
+ const char *cur;
+
+ if (str == NULL) {
+ return -1;
+ }
+
+ cur = *str;
+
+ while ((ISA_PCHAR(cur)) || (*cur == '/') || (*cur == '?') ||
+ ((uri != NULL) && (uri->cleanup & 1) && (IS_UNWISE(cur)))) {
+ NEXT(cur);
+ }
+ if (uri != NULL) {
+ g_free(uri->query);
+ uri->query = g_strndup(*str, cur - *str);
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_port:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a port part and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * port = *DIGIT
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_port(URI *uri, const char **str)
+{
+ const char *cur = *str;
+ int port = 0;
+
+ if (ISA_DIGIT(cur)) {
+ while (ISA_DIGIT(cur)) {
+ port = port * 10 + (*cur - '0');
+ if (port > 65535) {
+ return 1;
+ }
+ cur++;
+ }
+ if (uri) {
+ uri->port = port;
+ }
+ *str = cur;
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * rfc3986_parse_user_info:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a user information part and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_user_info(URI *uri, const char **str)
+{
+ const char *cur;
+
+ cur = *str;
+ while (ISA_UNRESERVED(cur) || ISA_PCT_ENCODED(cur) || ISA_SUB_DELIM(cur) ||
+ (*cur == ':')) {
+ NEXT(cur);
+ }
+ if (*cur == '@') {
+ if (uri != NULL) {
+ g_free(uri->user);
+ if (uri->cleanup & 2) {
+ uri->user = g_strndup(*str, cur - *str);
+ } else {
+ uri->user = uri_string_unescape(*str, cur - *str, NULL);
+ }
+ }
+ *str = cur;
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * rfc3986_parse_dec_octet:
+ * @str: the string to analyze
+ *
+ * dec-octet = DIGIT ; 0-9
+ * / %x31-39 DIGIT ; 10-99
+ * / "1" 2DIGIT ; 100-199
+ * / "2" %x30-34 DIGIT ; 200-249
+ * / "25" %x30-35 ; 250-255
+ *
+ * Skip a dec-octet.
+ *
+ * Returns 0 if found and skipped, 1 otherwise
+ */
+static int rfc3986_parse_dec_octet(const char **str)
+{
+ const char *cur = *str;
+
+ if (!(ISA_DIGIT(cur))) {
+ return 1;
+ }
+ if (!ISA_DIGIT(cur + 1)) {
+ cur++;
+ } else if ((*cur != '0') && (ISA_DIGIT(cur + 1)) && (!ISA_DIGIT(cur + 2))) {
+ cur += 2;
+ } else if ((*cur == '1') && (ISA_DIGIT(cur + 1)) && (ISA_DIGIT(cur + 2))) {
+ cur += 3;
+ } else if ((*cur == '2') && (*(cur + 1) >= '0') && (*(cur + 1) <= '4') &&
+ (ISA_DIGIT(cur + 2))) {
+ cur += 3;
+    } else if ((*cur == '2') && (*(cur + 1) == '5') && (*(cur + 2) >= '0') &&
+               (*(cur + 2) <= '5')) {
+ cur += 3;
+ } else {
+ return 1;
+ }
+ *str = cur;
+ return 0;
+}
+/**
+ * rfc3986_parse_host:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a host part and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * host = IP-literal / IPv4address / reg-name
+ * IP-literal = "[" ( IPv6address / IPvFuture ) "]"
+ * IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
+ * reg-name = *( unreserved / pct-encoded / sub-delims )
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_host(URI *uri, const char **str)
+{
+ const char *cur = *str;
+ const char *host;
+
+ host = cur;
+ /*
+     * IPv6 and future addressing schemes are enclosed in brackets
+ */
+ if (*cur == '[') {
+ cur++;
+ while ((*cur != ']') && (*cur != 0)) {
+ cur++;
+ }
+ if (*cur != ']') {
+ return 1;
+ }
+ cur++;
+ goto found;
+ }
+ /*
+ * try to parse an IPv4
+ */
+ if (ISA_DIGIT(cur)) {
+ if (rfc3986_parse_dec_octet(&cur) != 0) {
+ goto not_ipv4;
+ }
+ if (*cur != '.') {
+ goto not_ipv4;
+ }
+ cur++;
+ if (rfc3986_parse_dec_octet(&cur) != 0) {
+ goto not_ipv4;
+ }
+        if (*cur != '.') {
+            goto not_ipv4;
+        }
+        cur++;
+        if (rfc3986_parse_dec_octet(&cur) != 0) {
+            goto not_ipv4;
+        }
+        if (*cur != '.') {
+            goto not_ipv4;
+        }
+        cur++;
+ if (rfc3986_parse_dec_octet(&cur) != 0) {
+ goto not_ipv4;
+ }
+ goto found;
+ not_ipv4:
+ cur = *str;
+ }
+ /*
+ * then this should be a hostname which can be empty
+ */
+ while (ISA_UNRESERVED(cur) || ISA_PCT_ENCODED(cur) || ISA_SUB_DELIM(cur)) {
+ NEXT(cur);
+ }
+found:
+ if (uri != NULL) {
+ g_free(uri->authority);
+ uri->authority = NULL;
+ g_free(uri->server);
+ if (cur != host) {
+ if (uri->cleanup & 2) {
+ uri->server = g_strndup(host, cur - host);
+ } else {
+ uri->server = uri_string_unescape(host, cur - host, NULL);
+ }
+ } else {
+ uri->server = NULL;
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_authority:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse an authority part and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * authority = [ userinfo "@" ] host [ ":" port ]
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_authority(URI *uri, const char **str)
+{
+ const char *cur;
+ int ret;
+
+ cur = *str;
+ /*
+ * try to parse a userinfo and check for the trailing @
+ */
+ ret = rfc3986_parse_user_info(uri, &cur);
+ if ((ret != 0) || (*cur != '@')) {
+ cur = *str;
+ } else {
+ cur++;
+ }
+ ret = rfc3986_parse_host(uri, &cur);
+ if (ret != 0) {
+ return ret;
+ }
+ if (*cur == ':') {
+ cur++;
+ ret = rfc3986_parse_port(uri, &cur);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_segment:
+ * @str: the string to analyze
+ * @forbid: an optional forbidden character
+ * @empty: allow an empty segment
+ *
+ * Parse a segment, advancing the string cursor past it; this
+ * helper does not fill in any @uri fields
+ *
+ * segment = *pchar
+ * segment-nz = 1*pchar
+ * segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
+ * ; non-zero-length segment without any colon ":"
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_segment(const char **str, char forbid, int empty)
+{
+ const char *cur;
+
+ cur = *str;
+ if (!ISA_PCHAR(cur)) {
+ if (empty) {
+ return 0;
+ }
+ return 1;
+ }
+ while (ISA_PCHAR(cur) && (*cur != forbid)) {
+ NEXT(cur);
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_path_ab_empty:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a path-abempty (absolute or empty path) and fill in the
+ * appropriate fields of the @uri structure
+ *
+ * path-abempty = *( "/" segment )
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_path_ab_empty(URI *uri, const char **str)
+{
+ const char *cur;
+ int ret;
+
+ cur = *str;
+
+ while (*cur == '/') {
+ cur++;
+ ret = rfc3986_parse_segment(&cur, 0, 1);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (uri != NULL) {
+ g_free(uri->path);
+ if (*str != cur) {
+ if (uri->cleanup & 2) {
+ uri->path = g_strndup(*str, cur - *str);
+ } else {
+ uri->path = uri_string_unescape(*str, cur - *str, NULL);
+ }
+ } else {
+ uri->path = NULL;
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_path_absolute:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse an absolute path and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * path-absolute = "/" [ segment-nz *( "/" segment ) ]
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_path_absolute(URI *uri, const char **str)
+{
+ const char *cur;
+ int ret;
+
+ cur = *str;
+
+ if (*cur != '/') {
+ return 1;
+ }
+ cur++;
+ ret = rfc3986_parse_segment(&cur, 0, 0);
+ if (ret == 0) {
+ while (*cur == '/') {
+ cur++;
+ ret = rfc3986_parse_segment(&cur, 0, 1);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ }
+ if (uri != NULL) {
+ g_free(uri->path);
+ if (cur != *str) {
+ if (uri->cleanup & 2) {
+ uri->path = g_strndup(*str, cur - *str);
+ } else {
+ uri->path = uri_string_unescape(*str, cur - *str, NULL);
+ }
+ } else {
+ uri->path = NULL;
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_path_rootless:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a rootless path and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * path-rootless = segment-nz *( "/" segment )
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_path_rootless(URI *uri, const char **str)
+{
+ const char *cur;
+ int ret;
+
+ cur = *str;
+
+ ret = rfc3986_parse_segment(&cur, 0, 0);
+ if (ret != 0) {
+ return ret;
+ }
+ while (*cur == '/') {
+ cur++;
+ ret = rfc3986_parse_segment(&cur, 0, 1);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (uri != NULL) {
+ g_free(uri->path);
+ if (cur != *str) {
+ if (uri->cleanup & 2) {
+ uri->path = g_strndup(*str, cur - *str);
+ } else {
+ uri->path = uri_string_unescape(*str, cur - *str, NULL);
+ }
+ } else {
+ uri->path = NULL;
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_path_no_scheme:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a path-noscheme (a path that cannot start with a "segment:"
+ * prefix) and fill in the appropriate fields of the @uri structure
+ *
+ * path-noscheme = segment-nz-nc *( "/" segment )
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_path_no_scheme(URI *uri, const char **str)
+{
+ const char *cur;
+ int ret;
+
+ cur = *str;
+
+ ret = rfc3986_parse_segment(&cur, ':', 0);
+ if (ret != 0) {
+ return ret;
+ }
+ while (*cur == '/') {
+ cur++;
+ ret = rfc3986_parse_segment(&cur, 0, 1);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (uri != NULL) {
+ g_free(uri->path);
+ if (cur != *str) {
+ if (uri->cleanup & 2) {
+ uri->path = g_strndup(*str, cur - *str);
+ } else {
+ uri->path = uri_string_unescape(*str, cur - *str, NULL);
+ }
+ } else {
+ uri->path = NULL;
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_hier_part:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a hierarchical part and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * hier-part = "//" authority path-abempty
+ * / path-absolute
+ * / path-rootless
+ * / path-empty
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_hier_part(URI *uri, const char **str)
+{
+ const char *cur;
+ int ret;
+
+ cur = *str;
+
+ if ((*cur == '/') && (*(cur + 1) == '/')) {
+ cur += 2;
+ ret = rfc3986_parse_authority(uri, &cur);
+ if (ret != 0) {
+ return ret;
+ }
+ ret = rfc3986_parse_path_ab_empty(uri, &cur);
+ if (ret != 0) {
+ return ret;
+ }
+ *str = cur;
+ return 0;
+ } else if (*cur == '/') {
+ ret = rfc3986_parse_path_absolute(uri, &cur);
+ if (ret != 0) {
+ return ret;
+ }
+ } else if (ISA_PCHAR(cur)) {
+ ret = rfc3986_parse_path_rootless(uri, &cur);
+ if (ret != 0) {
+ return ret;
+ }
+ } else {
+ /* path-empty is effectively empty */
+ if (uri != NULL) {
+ g_free(uri->path);
+ uri->path = NULL;
+ }
+ }
+ *str = cur;
+ return 0;
+}
+
+/**
+ * rfc3986_parse_relative_ref:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a relative reference and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * relative-ref = relative-part [ "?" query ] [ "#" fragment ]
+ * relative-part = "//" authority path-abempty
+ * / path-absolute
+ * / path-noscheme
+ * / path-empty
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_relative_ref(URI *uri, const char *str)
+{
+ int ret;
+
+ if ((*str == '/') && (*(str + 1) == '/')) {
+ str += 2;
+ ret = rfc3986_parse_authority(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ ret = rfc3986_parse_path_ab_empty(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ } else if (*str == '/') {
+ ret = rfc3986_parse_path_absolute(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ } else if (ISA_PCHAR(str)) {
+ ret = rfc3986_parse_path_no_scheme(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ } else {
+ /* path-empty is effectively empty */
+ if (uri != NULL) {
+ g_free(uri->path);
+ uri->path = NULL;
+ }
+ }
+
+ if (*str == '?') {
+ str++;
+ ret = rfc3986_parse_query(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (*str == '#') {
+ str++;
+ ret = rfc3986_parse_fragment(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (*str != 0) {
+ uri_clean(uri);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * rfc3986_parse:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a URI string and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse(URI *uri, const char *str)
+{
+ int ret;
+
+ ret = rfc3986_parse_scheme(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ if (*str != ':') {
+ return 1;
+ }
+ str++;
+ ret = rfc3986_parse_hier_part(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ if (*str == '?') {
+ str++;
+ ret = rfc3986_parse_query(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (*str == '#') {
+ str++;
+ ret = rfc3986_parse_fragment(uri, &str);
+ if (ret != 0) {
+ return ret;
+ }
+ }
+ if (*str != 0) {
+ uri_clean(uri);
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * rfc3986_parse_uri_reference:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a URI reference string and fill in the appropriate fields
+ * of the @uri structure
+ *
+ * URI-reference = URI / relative-ref
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_uri_reference(URI *uri, const char *str)
+{
+ int ret;
+
+ if (str == NULL) {
+ return -1;
+ }
+ uri_clean(uri);
+
+ /*
+ * Try first to parse absolute refs, then fallback to relative if
+ * it fails.
+ */
+ ret = rfc3986_parse(uri, str);
+ if (ret != 0) {
+ uri_clean(uri);
+ ret = rfc3986_parse_relative_ref(uri, str);
+ if (ret != 0) {
+ uri_clean(uri);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/**
+ * uri_parse:
+ * @str: the URI string to analyze
+ *
+ * Parse a URI based on RFC 3986
+ *
+ * URI-reference = URI / relative-ref
+ *
+ * Returns a newly built URI or NULL in case of error
+ */
+URI *uri_parse(const char *str)
+{
+ URI *uri;
+ int ret;
+
+ if (str == NULL) {
+ return NULL;
+ }
+ uri = uri_new();
+ ret = rfc3986_parse_uri_reference(uri, str);
+ if (ret) {
+ uri_free(uri);
+ return NULL;
+ }
+ return uri;
+}
+
+/**
+ * uri_parse_into:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse a URI reference string based on RFC 3986 and fill in the
+ * appropriate fields of the @uri structure
+ *
+ * URI-reference = URI / relative-ref
+ *
+ * Returns 0 or the error code
+ */
+int uri_parse_into(URI *uri, const char *str)
+{
+ return rfc3986_parse_uri_reference(uri, str);
+}
+
+/**
+ * uri_parse_raw:
+ * @str: the URI string to analyze
+ * @raw: if 1, unescaping of URI pieces is disabled
+ *
+ * Parse a URI, optionally keeping the original pieces intact
+ * (no unescaping).
+ *
+ * URI-reference = URI / relative-ref
+ *
+ * Returns a newly built URI or NULL in case of error
+ */
+URI *uri_parse_raw(const char *str, int raw)
+{
+ URI *uri;
+ int ret;
+
+ if (str == NULL) {
+ return NULL;
+ }
+ uri = uri_new();
+ if (raw) {
+ uri->cleanup |= 2;
+ }
+ ret = uri_parse_into(uri, str);
+ if (ret) {
+ uri_free(uri);
+ return NULL;
+ }
+ return uri;
+}
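+
+/*
+ * Usage sketch (editor's addition, not part of the original patch):
+ * parsing a URI, inspecting its fields and re-serializing it with
+ * uri_to_string() (defined below):
+ *
+ *     URI *uri = uri_parse("http://alice@example.com:8080/a/b?q=1#frag");
+ *
+ *     if (uri) {
+ *         // uri->scheme is "http", uri->user "alice", uri->server
+ *         // "example.com", uri->port 8080, uri->path "/a/b",
+ *         // uri->query "q=1" and uri->fragment "frag".
+ *         char *s = uri_to_string(uri);
+ *         g_free(s);
+ *         uri_free(uri);
+ *     }
+ */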
+
+/************************************************************************
+ * *
+ * Generic URI structure functions *
+ * *
+ ************************************************************************/
+
+/**
+ * uri_new:
+ *
+ * Simply creates an empty URI
+ *
+ * Returns the new structure or NULL in case of error
+ */
+URI *uri_new(void)
+{
+ return g_new0(URI, 1);
+}
+
+/**
+ * realloc2n:
+ *
+ * Double the buffer used when saving a URI string, handling the
+ * reallocation properly as the escaped output grows
+ */
+static char *realloc2n(char *ret, int *max)
+{
+ char *temp;
+ int tmp;
+
+ tmp = *max * 2;
+ temp = g_realloc(ret, (tmp + 1));
+ *max = tmp;
+ return temp;
+}
+
+/**
+ * uri_to_string:
+ * @uri: pointer to an URI
+ *
+ * Save the URI as an escaped string
+ *
+ * Returns a new string (to be deallocated by caller)
+ */
+char *uri_to_string(URI *uri)
+{
+ char *ret = NULL;
+ char *temp;
+ const char *p;
+ int len;
+ int max;
+
+ if (uri == NULL) {
+ return NULL;
+ }
+
+ max = 80;
+ ret = g_malloc(max + 1);
+ len = 0;
+
+ if (uri->scheme != NULL) {
+ p = uri->scheme;
+ while (*p != 0) {
+ if (len >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = *p++;
+ }
+ if (len >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = ':';
+ }
+ if (uri->opaque != NULL) {
+ p = uri->opaque;
+ while (*p != 0) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ if (IS_RESERVED(*(p)) || IS_UNRESERVED(*(p))) {
+ ret[len++] = *p++;
+ } else {
+ int val = *(unsigned char *)p++;
+ int hi = val / 0x10, lo = val % 0x10;
+ ret[len++] = '%';
+ ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0');
+ ret[len++] = lo + (lo > 9 ? 'A' - 10 : '0');
+ }
+ }
+ } else {
+ if (uri->server != NULL) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = '/';
+ ret[len++] = '/';
+ if (uri->user != NULL) {
+ p = uri->user;
+ while (*p != 0) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ if ((IS_UNRESERVED(*(p))) || ((*(p) == ';')) ||
+ ((*(p) == ':')) || ((*(p) == '&')) || ((*(p) == '=')) ||
+ ((*(p) == '+')) || ((*(p) == '$')) || ((*(p) == ','))) {
+ ret[len++] = *p++;
+ } else {
+ int val = *(unsigned char *)p++;
+ int hi = val / 0x10, lo = val % 0x10;
+ ret[len++] = '%';
+ ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0');
+ ret[len++] = lo + (lo > 9 ? 'A' - 10 : '0');
+ }
+ }
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = '@';
+ }
+ p = uri->server;
+ while (*p != 0) {
+ if (len >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = *p++;
+ }
+ if (uri->port > 0) {
+ if (len + 10 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ len += snprintf(&ret[len], max - len, ":%d", uri->port);
+ }
+ } else if (uri->authority != NULL) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = '/';
+ ret[len++] = '/';
+ p = uri->authority;
+ while (*p != 0) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ if ((IS_UNRESERVED(*(p))) || ((*(p) == '$')) ||
+ ((*(p) == ',')) || ((*(p) == ';')) || ((*(p) == ':')) ||
+ ((*(p) == '@')) || ((*(p) == '&')) || ((*(p) == '=')) ||
+ ((*(p) == '+'))) {
+ ret[len++] = *p++;
+ } else {
+ int val = *(unsigned char *)p++;
+ int hi = val / 0x10, lo = val % 0x10;
+ ret[len++] = '%';
+ ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0');
+ ret[len++] = lo + (lo > 9 ? 'A' - 10 : '0');
+ }
+ }
+ } else if (uri->scheme != NULL) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = '/';
+ ret[len++] = '/';
+ }
+ if (uri->path != NULL) {
+ p = uri->path;
+ /*
+ * the colon in file:///d: should not be escaped or
+ * Windows accesses fail later.
+ */
+ if ((uri->scheme != NULL) && (p[0] == '/') &&
+ (((p[1] >= 'a') && (p[1] <= 'z')) ||
+ ((p[1] >= 'A') && (p[1] <= 'Z'))) &&
+ (p[2] == ':') && (!strcmp(uri->scheme, "file"))) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = *p++;
+ ret[len++] = *p++;
+ ret[len++] = *p++;
+ }
+ while (*p != 0) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ if ((IS_UNRESERVED(*(p))) || ((*(p) == '/')) ||
+ ((*(p) == ';')) || ((*(p) == '@')) || ((*(p) == '&')) ||
+ ((*(p) == '=')) || ((*(p) == '+')) || ((*(p) == '$')) ||
+ ((*(p) == ','))) {
+ ret[len++] = *p++;
+ } else {
+ int val = *(unsigned char *)p++;
+ int hi = val / 0x10, lo = val % 0x10;
+ ret[len++] = '%';
+ ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0');
+ ret[len++] = lo + (lo > 9 ? 'A' - 10 : '0');
+ }
+ }
+ }
+ if (uri->query != NULL) {
+ if (len + 1 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = '?';
+ p = uri->query;
+ while (*p != 0) {
+ if (len + 1 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = *p++;
+ }
+ }
+ }
+ if (uri->fragment != NULL) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len++] = '#';
+ p = uri->fragment;
+ while (*p != 0) {
+ if (len + 3 >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ if ((IS_UNRESERVED(*(p))) || (IS_RESERVED(*(p)))) {
+ ret[len++] = *p++;
+ } else {
+ int val = *(unsigned char *)p++;
+ int hi = val / 0x10, lo = val % 0x10;
+ ret[len++] = '%';
+ ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0');
+ ret[len++] = lo + (lo > 9 ? 'A' - 10 : '0');
+ }
+ }
+ }
+ if (len >= max) {
+ temp = realloc2n(ret, &max);
+ ret = temp;
+ }
+ ret[len] = 0;
+ return ret;
+}
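+
+/*
+ * A minimal usage sketch (assuming uri_parse_into() accepts the example
+ * string); the serialized form must be released with g_free():
+ *
+ *     URI *u = uri_new();
+ *     if (uri_parse_into(u, "http://example.org/dir/file?x=1") == 0) {
+ *         char *s = uri_to_string(u);
+ *         // s holds an equivalent, escaped form of the input
+ *         g_free(s);
+ *     }
+ *     uri_free(u);
+ */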
+
+/**
+ * uri_clean:
+ * @uri: pointer to an URI
+ *
+ * Make sure the URI struct is free of content
+ */
+static void uri_clean(URI *uri)
+{
+ if (uri == NULL) {
+ return;
+ }
+
+ g_free(uri->scheme);
+ uri->scheme = NULL;
+ g_free(uri->server);
+ uri->server = NULL;
+ g_free(uri->user);
+ uri->user = NULL;
+ g_free(uri->path);
+ uri->path = NULL;
+ g_free(uri->fragment);
+ uri->fragment = NULL;
+ g_free(uri->opaque);
+ uri->opaque = NULL;
+ g_free(uri->authority);
+ uri->authority = NULL;
+ g_free(uri->query);
+ uri->query = NULL;
+}
+
+/**
+ * uri_free:
+ * @uri: pointer to an URI, NULL is ignored
+ *
+ * Free up the URI struct
+ */
+void uri_free(URI *uri)
+{
+ uri_clean(uri);
+ g_free(uri);
+}
+
+/************************************************************************
+ * *
+ * Helper functions *
+ * *
+ ************************************************************************/
+
+/**
+ * normalize_uri_path:
+ * @path: pointer to the path string
+ *
+ * Applies the 5 normalization steps to a path string--that is, RFC 2396
+ * Section 5.2, steps 6.c through 6.g.
+ *
+ * Normalization occurs directly on the string, no new allocation is done
+ *
+ * Returns 0 or an error code
+ */
+static int normalize_uri_path(char *path)
+{
+ char *cur, *out;
+
+ if (path == NULL) {
+ return -1;
+ }
+
+ /* Skip all initial "/" chars. We want to get to the beginning of the
+ * first non-empty segment.
+ */
+ cur = path;
+ while (cur[0] == '/') {
+ ++cur;
+ }
+ if (cur[0] == '\0') {
+ return 0;
+ }
+
+ /* Keep everything we've seen so far. */
+ out = cur;
+
+ /*
+ * Analyze each segment in sequence for cases (c) and (d).
+ */
+ while (cur[0] != '\0') {
+ /*
+ * c) All occurrences of "./", where "." is a complete path segment,
+ * are removed from the buffer string.
+ */
+ if ((cur[0] == '.') && (cur[1] == '/')) {
+ cur += 2;
+ /* '//' normalization should be done at this point too */
+ while (cur[0] == '/') {
+ cur++;
+ }
+ continue;
+ }
+
+ /*
+ * d) If the buffer string ends with "." as a complete path segment,
+ * that "." is removed.
+ */
+ if ((cur[0] == '.') && (cur[1] == '\0')) {
+ break;
+ }
+
+ /* Otherwise keep the segment. */
+ while (cur[0] != '/') {
+ if (cur[0] == '\0') {
+ goto done_cd;
+ }
+ (out++)[0] = (cur++)[0];
+ }
+ /* normalize '//' */
+ while ((cur[0] == '/') && (cur[1] == '/')) {
+ cur++;
+ }
+
+ (out++)[0] = (cur++)[0];
+ }
+done_cd:
+ out[0] = '\0';
+
+ /* Reset to the beginning of the first segment for the next sequence. */
+ cur = path;
+ while (cur[0] == '/') {
+ ++cur;
+ }
+ if (cur[0] == '\0') {
+ return 0;
+ }
+
+ /*
+ * Analyze each segment in sequence for cases (e) and (f).
+ *
+ * e) All occurrences of "<segment>/../", where <segment> is a
+ * complete path segment not equal to "..", are removed from the
+ * buffer string. Removal of these path segments is performed
+ * iteratively, removing the leftmost matching pattern on each
+ * iteration, until no matching pattern remains.
+ *
+ * f) If the buffer string ends with "<segment>/..", where <segment>
+ * is a complete path segment not equal to "..", that
+ * "<segment>/.." is removed.
+ *
+ * To satisfy the "iterative" clause in (e), we need to collapse the
+ * string every time we find something that needs to be removed. Thus,
+ * we don't need to keep two pointers into the string: we only need a
+ * "current position" pointer.
+ */
+ while (1) {
+ char *segp, *tmp;
+
+ /* At the beginning of each iteration of this loop, "cur" points to
+ * the first character of the segment we want to examine.
+ */
+
+ /* Find the end of the current segment. */
+ segp = cur;
+ while ((segp[0] != '/') && (segp[0] != '\0')) {
+ ++segp;
+ }
+
+ /* If this is the last segment, we're done (we need at least two
+ * segments to meet the criteria for the (e) and (f) cases).
+ */
+ if (segp[0] == '\0') {
+ break;
+ }
+
+ /* If the first segment is "..", or if the next segment _isn't_ "..",
+ * keep this segment and try the next one.
+ */
+ ++segp;
+ if (((cur[0] == '.') && (cur[1] == '.') && (segp == cur + 3)) ||
+ ((segp[0] != '.') || (segp[1] != '.') ||
+ ((segp[2] != '/') && (segp[2] != '\0')))) {
+ cur = segp;
+ continue;
+ }
+
+ /* If we get here, remove this segment and the next one and back up
+ * to the previous segment (if there is one), to implement the
+ * "iteratively" clause. It's pretty much impossible to back up
+ * while maintaining two pointers into the buffer, so just compact
+ * the whole buffer now.
+ */
+
+ /* If this is the end of the buffer, we're done. */
+ if (segp[2] == '\0') {
+ cur[0] = '\0';
+ break;
+ }
+ /* Valgrind complained, strcpy(cur, segp + 3); */
+ /* string will overlap, do not use strcpy */
+ tmp = cur;
+ segp += 3;
+ while ((*tmp++ = *segp++) != 0) {
+ /* No further work */
+ }
+
+ /* If there are no previous segments, then keep going from here. */
+ segp = cur;
+ while ((segp > path) && ((--segp)[0] == '/')) {
+ /* No further work */
+ }
+ if (segp == path) {
+ continue;
+ }
+
+ /* "segp" is pointing to the end of a previous segment; find it's
+ * start. We need to back up to the previous segment and start
+ * over with that to handle things like "foo/bar/../..". If we
+ * don't do this, then on the first pass we'll remove the "bar/..",
+ * but be pointing at the second ".." so we won't realize we can also
+ * remove the "foo/..".
+ */
+ cur = segp;
+ while ((cur > path) && (cur[-1] != '/')) {
+ --cur;
+ }
+ }
+ out[0] = '\0';
+
+ /*
+ * g) If the resulting buffer string still begins with one or more
+ * complete path segments of "..", then the reference is
+ * considered to be in error. Implementations may handle this
+ * error by retaining these components in the resolved path (i.e.,
+ * treating them as part of the final URI), by removing them from
+ * the resolved path (i.e., discarding relative levels above the
+ * root), or by avoiding traversal of the reference.
+ *
+ * We discard them from the final path.
+ */
+ if (path[0] == '/') {
+ cur = path;
+ while ((cur[0] == '/') && (cur[1] == '.') && (cur[2] == '.') &&
+ ((cur[3] == '/') || (cur[3] == '\0'))) {
+ cur += 3;
+ }
+
+ if (cur != path) {
+ out = path;
+ while (cur[0] != '\0') {
+ (out++)[0] = (cur++)[0];
+ }
+ out[0] = 0;
+ }
+ }
+
+ return 0;
+}
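+
+/*
+ * Illustration of the transformation performed (a sketch; the helper is
+ * static and rewrites a writable buffer in place):
+ *
+ *     char p[] = "/a/./b/../c//d";
+ *     normalize_uri_path(p);
+ *     // p is now "/a/c/d"
+ */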
+
+static int is_hex(char c)
+{
+ if (((c >= '0') && (c <= '9')) || ((c >= 'a') && (c <= 'f')) ||
+ ((c >= 'A') && (c <= 'F'))) {
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * uri_string_unescape:
+ * @str: the string to unescape
+ * @len: the length in bytes to unescape (or <= 0 to indicate full string)
+ * @target: optional destination buffer
+ *
+ * Unescaping routine, but does not check that the string is an URI. The
+ * output is a direct unsigned char translation of %XX values (no encoding)
+ * Note that the result can only be the same size as or smaller than the
+ * input string.
+ *
+ * Returns a copy of the string, but unescaped, will return NULL only in case
+ * of error
+ */
+char *uri_string_unescape(const char *str, int len, char *target)
+{
+ char *ret, *out;
+ const char *in;
+
+ if (str == NULL) {
+ return NULL;
+ }
+ if (len <= 0) {
+ len = strlen(str);
+ }
+ if (len < 0) {
+ return NULL;
+ }
+
+ if (target == NULL) {
+ ret = g_malloc(len + 1);
+ } else {
+ ret = target;
+ }
+ in = str;
+ out = ret;
+ while (len > 0) {
+ if ((len > 2) && (*in == '%') && (is_hex(in[1])) && (is_hex(in[2]))) {
+ in++;
+ if ((*in >= '0') && (*in <= '9')) {
+ *out = (*in - '0');
+ } else if ((*in >= 'a') && (*in <= 'f')) {
+ *out = (*in - 'a') + 10;
+ } else if ((*in >= 'A') && (*in <= 'F')) {
+ *out = (*in - 'A') + 10;
+ }
+ in++;
+ if ((*in >= '0') && (*in <= '9')) {
+ *out = *out * 16 + (*in - '0');
+ } else if ((*in >= 'a') && (*in <= 'f')) {
+ *out = *out * 16 + (*in - 'a') + 10;
+ } else if ((*in >= 'A') && (*in <= 'F')) {
+ *out = *out * 16 + (*in - 'A') + 10;
+ }
+ in++;
+ len -= 3;
+ out++;
+ } else {
+ *out++ = *in++;
+ len--;
+ }
+ }
+ *out = 0;
+ return ret;
+}
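+
+/*
+ * Sketch: decode a percent-escaped component; with @target == NULL a
+ * fresh buffer is allocated and must be g_free()'d:
+ *
+ *     char *s = uri_string_unescape("a%20b%2Fc", -1, NULL);
+ *     // s == "a b/c"
+ *     g_free(s);
+ */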
+
+/**
+ * uri_string_escape:
+ * @str: string to escape
+ * @list: exception list string of chars not to escape
+ *
+ * This routine percent-escapes a string, leaving '@', unreserved
+ * characters, and the characters in the exception list untouched.
+ *
+ * Returns a new escaped string or NULL in case of error.
+ */
+char *uri_string_escape(const char *str, const char *list)
+{
+ char *ret, ch;
+ char *temp;
+ const char *in;
+ int len, out;
+
+ if (str == NULL) {
+ return NULL;
+ }
+ if (str[0] == 0) {
+ return g_strdup(str);
+ }
+ len = strlen(str);
+ if (!(len > 0)) {
+ return NULL;
+ }
+
+ len += 20;
+ ret = g_malloc(len);
+ in = str;
+ out = 0;
+ while (*in != 0) {
+ if (len - out <= 3) {
+ temp = realloc2n(ret, &len);
+ ret = temp;
+ }
+
+ ch = *in;
+
+ if ((ch != '@') && (!IS_UNRESERVED(ch)) && (!strchr(list, ch))) {
+ unsigned char val;
+ ret[out++] = '%';
+ val = ch >> 4;
+ if (val <= 9) {
+ ret[out++] = '0' + val;
+ } else {
+ ret[out++] = 'A' + val - 0xA;
+ }
+ val = ch & 0xF;
+ if (val <= 9) {
+ ret[out++] = '0' + val;
+ } else {
+ ret[out++] = 'A' + val - 0xA;
+ }
+ in++;
+ } else {
+ ret[out++] = *in++;
+ }
+ }
+ ret[out] = 0;
+ return ret;
+}
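+
+/*
+ * Sketch: '@', unreserved characters, and the exception list (here just
+ * '/') pass through; everything else becomes %XX:
+ *
+ *     char *s = uri_string_escape("a b/c", "/");
+ *     // s == "a%20b/c"
+ *     g_free(s);
+ */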
+
+/************************************************************************
+ * *
+ * Public functions *
+ * *
+ ************************************************************************/
+
+/**
+ * uri_resolve:
+ * @uri: the URI instance found in the document
+ * @base: the base value
+ *
+ * Computes the final URI of the reference by checking that
+ * the given URI is valid and building the final URI using the
+ * base URI. This is processed according to Section 5.2 of
+ * RFC 2396:
+ *
+ * 5.2. Resolving Relative References to Absolute Form
+ *
+ * Returns a new URI string (to be freed by the caller) or NULL in case
+ * of error.
+ */
+char *uri_resolve(const char *uri, const char *base)
+{
+ char *val = NULL;
+ int ret, len, indx, cur, out;
+ URI *ref = NULL;
+ URI *bas = NULL;
+ URI *res = NULL;
+
+ /*
+ * 1) The URI reference is parsed into the potential four components and
+ * fragment identifier, as described in Section 4.3.
+ *
+ * NOTE that a completely empty URI is treated by modern browsers
+ * as a reference to "." rather than as a synonym for the current
+ * URI. Should we do that here?
+ */
+ if (uri == NULL) {
+ ret = -1;
+ } else {
+ if (*uri) {
+ ref = uri_new();
+ ret = uri_parse_into(ref, uri);
+ } else {
+ ret = 0;
+ }
+ }
+ if (ret != 0) {
+ goto done;
+ }
+ if ((ref != NULL) && (ref->scheme != NULL)) {
+ /*
+ * The URI is absolute don't modify.
+ */
+ val = g_strdup(uri);
+ goto done;
+ }
+ if (base == NULL) {
+ ret = -1;
+ } else {
+ bas = uri_new();
+ ret = uri_parse_into(bas, base);
+ }
+ if (ret != 0) {
+ if (ref) {
+ val = uri_to_string(ref);
+ }
+ goto done;
+ }
+ if (ref == NULL) {
+ /*
+ * the base fragment must be ignored
+ */
+ g_free(bas->fragment);
+ bas->fragment = NULL;
+ val = uri_to_string(bas);
+ goto done;
+ }
+
+ /*
+ * 2) If the path component is empty and the scheme, authority, and
+ * query components are undefined, then it is a reference to the
+ * current document and we are done. Otherwise, the reference URI's
+ * query and fragment components are defined as found (or not found)
+ * within the URI reference and not inherited from the base URI.
+ *
+ * NOTE that in modern browsers, the parsing differs from the above
+ * in the following aspect: the query component is allowed to be
+ * defined while still treating this as a reference to the current
+ * document.
+ */
+ res = uri_new();
+ if ((ref->scheme == NULL) && (ref->path == NULL) &&
+ ((ref->authority == NULL) && (ref->server == NULL))) {
+ res->scheme = g_strdup(bas->scheme);
+ if (bas->authority != NULL) {
+ res->authority = g_strdup(bas->authority);
+ } else if (bas->server != NULL) {
+ res->server = g_strdup(bas->server);
+ res->user = g_strdup(bas->user);
+ res->port = bas->port;
+ }
+ res->path = g_strdup(bas->path);
+ if (ref->query != NULL) {
+ res->query = g_strdup(ref->query);
+ } else {
+ res->query = g_strdup(bas->query);
+ }
+ res->fragment = g_strdup(ref->fragment);
+ goto step_7;
+ }
+
+ /*
+ * 3) If the scheme component is defined, indicating that the reference
+ * starts with a scheme name, then the reference is interpreted as an
+ * absolute URI and we are done. Otherwise, the reference URI's
+ * scheme is inherited from the base URI's scheme component.
+ */
+ if (ref->scheme != NULL) {
+ val = uri_to_string(ref);
+ goto done;
+ }
+ res->scheme = g_strdup(bas->scheme);
+
+ res->query = g_strdup(ref->query);
+ res->fragment = g_strdup(ref->fragment);
+
+ /*
+ * 4) If the authority component is defined, then the reference is a
+ * network-path and we skip to step 7. Otherwise, the reference
+ * URI's authority is inherited from the base URI's authority
+ * component, which will also be undefined if the URI scheme does not
+ * use an authority component.
+ */
+ if ((ref->authority != NULL) || (ref->server != NULL)) {
+ if (ref->authority != NULL) {
+ res->authority = g_strdup(ref->authority);
+ } else {
+ res->server = g_strdup(ref->server);
+ res->user = g_strdup(ref->user);
+ res->port = ref->port;
+ }
+ res->path = g_strdup(ref->path);
+ goto step_7;
+ }
+ if (bas->authority != NULL) {
+ res->authority = g_strdup(bas->authority);
+ } else if (bas->server != NULL) {
+ res->server = g_strdup(bas->server);
+ res->user = g_strdup(bas->user);
+ res->port = bas->port;
+ }
+
+ /*
+ * 5) If the path component begins with a slash character ("/"), then
+ * the reference is an absolute-path and we skip to step 7.
+ */
+ if ((ref->path != NULL) && (ref->path[0] == '/')) {
+ res->path = g_strdup(ref->path);
+ goto step_7;
+ }
+
+ /*
+ * 6) If this step is reached, then we are resolving a relative-path
+ * reference. The relative path needs to be merged with the base
+ * URI's path. Although there are many ways to do this, we will
+ * describe a simple method using a separate string buffer.
+ *
+ * Allocate a buffer large enough for the result string.
+ */
+ len = 2; /* extra / and 0 */
+ if (ref->path != NULL) {
+ len += strlen(ref->path);
+ }
+ if (bas->path != NULL) {
+ len += strlen(bas->path);
+ }
+ res->path = g_malloc(len);
+ res->path[0] = 0;
+
+ /*
+ * a) All but the last segment of the base URI's path component is
+ * copied to the buffer. In other words, any characters after the
+ * last (right-most) slash character, if any, are excluded.
+ */
+ cur = 0;
+ out = 0;
+ if (bas->path != NULL) {
+ while (bas->path[cur] != 0) {
+ while ((bas->path[cur] != 0) && (bas->path[cur] != '/')) {
+ cur++;
+ }
+ if (bas->path[cur] == 0) {
+ break;
+ }
+
+ cur++;
+ while (out < cur) {
+ res->path[out] = bas->path[out];
+ out++;
+ }
+ }
+ }
+ res->path[out] = 0;
+
+ /*
+ * b) The reference's path component is appended to the buffer
+ * string.
+ */
+ if (ref->path != NULL && ref->path[0] != 0) {
+ indx = 0;
+ /*
+ * Ensure the path includes a '/'
+ */
+ if ((out == 0) && (bas->server != NULL)) {
+ res->path[out++] = '/';
+ }
+ while (ref->path[indx] != 0) {
+ res->path[out++] = ref->path[indx++];
+ }
+ }
+ res->path[out] = 0;
+
+ /*
+ * Steps c) to h) are really path normalization steps
+ */
+ normalize_uri_path(res->path);
+
+step_7:
+
+ /*
+ * 7) The resulting URI components, including any inherited from the
+ * base URI, are recombined to give the absolute form of the URI
+ * reference.
+ */
+ val = uri_to_string(res);
+
+done:
+ uri_free(ref);
+ uri_free(bas);
+ uri_free(res);
+ return val;
+}
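+
+/*
+ * Sketch, using a relative-reference example from RFC 2396 Appendix C:
+ *
+ *     char *s = uri_resolve("../g", "http://a/b/c/d;p?q");
+ *     // s == "http://a/b/g"
+ *     g_free(s);
+ */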
+
+/**
+ * uri_resolve_relative:
+ * @uri: the URI reference under consideration
+ * @base: the base value
+ *
+ * Expresses the URI of the reference in terms relative to the
+ * base. Some examples of this operation include:
+ * base = "http://site1.com/docs/book1.html"
+ * URI input URI returned
+ * docs/pic1.gif pic1.gif
+ * docs/img/pic1.gif img/pic1.gif
+ * img/pic1.gif ../img/pic1.gif
+ * http://site1.com/docs/pic1.gif pic1.gif
+ * http://site2.com/docs/pic1.gif http://site2.com/docs/pic1.gif
+ *
+ * base = "docs/book1.html"
+ * URI input URI returned
+ * docs/pic1.gif pic1.gif
+ * docs/img/pic1.gif img/pic1.gif
+ * img/pic1.gif ../img/pic1.gif
+ * http://site1.com/docs/pic1.gif http://site1.com/docs/pic1.gif
+ *
+ *
+ * Note: if the URI reference is really weird or complicated, it may be
+ * worthwhile to first convert it into a "nice" one by calling
+ * uri_resolve (using 'base') before calling this routine,
+ * since this routine (for reasonable efficiency) assumes URI has
+ * already been through some validation.
+ *
+ * Returns a new URI string (to be freed by the caller) or NULL in case
+ * of error.
+ */
+char *uri_resolve_relative(const char *uri, const char *base)
+{
+ char *val = NULL;
+ int ret;
+ int ix;
+ int pos = 0;
+ int nbslash = 0;
+ int len;
+ URI *ref = NULL;
+ URI *bas = NULL;
+ char *bptr, *uptr, *vptr;
+ int remove_path = 0;
+
+ if ((uri == NULL) || (*uri == 0)) {
+ return NULL;
+ }
+
+ /*
+ * First parse URI into a standard form
+ */
+ ref = uri_new();
+ /* If URI not already in "relative" form */
+ if (uri[0] != '.') {
+ ret = uri_parse_into(ref, uri);
+ if (ret != 0) {
+ goto done; /* Error in URI, return NULL */
+ }
+ } else {
+ ref->path = g_strdup(uri);
+ }
+
+ /*
+ * Next parse base into the same standard form
+ */
+ if ((base == NULL) || (*base == 0)) {
+ val = g_strdup(uri);
+ goto done;
+ }
+ bas = uri_new();
+ if (base[0] != '.') {
+ ret = uri_parse_into(bas, base);
+ if (ret != 0) {
+ goto done; /* Error in base, return NULL */
+ }
+ } else {
+ bas->path = g_strdup(base);
+ }
+
+ /*
+ * If the scheme / server on the URI differs from the base,
+ * just return the URI
+ */
+ if ((ref->scheme != NULL) &&
+ ((bas->scheme == NULL) || (strcmp(bas->scheme, ref->scheme)) ||
+ (strcmp(bas->server, ref->server)))) {
+ val = g_strdup(uri);
+ goto done;
+ }
+ if (bas->path == ref->path ||
+ (bas->path && ref->path && !strcmp(bas->path, ref->path))) {
+ val = g_strdup("");
+ goto done;
+ }
+ if (bas->path == NULL) {
+ val = g_strdup(ref->path);
+ goto done;
+ }
+ if (ref->path == NULL) {
+ ref->path = (char *)"/";
+ remove_path = 1;
+ }
+
+ /*
+ * At this point (at last!) we can compare the two paths
+ *
+ * First we take care of the special case where either of the
+ * two path components may be missing (bug 316224)
+ */
+ if (bas->path == NULL) {
+ if (ref->path != NULL) {
+ uptr = ref->path;
+ if (*uptr == '/') {
+ uptr++;
+ }
+ /* exception characters from uri_to_string */
+ val = uri_string_escape(uptr, "/;&=+$,");
+ }
+ goto done;
+ }
+ bptr = bas->path;
+ if (ref->path == NULL) {
+ for (ix = 0; bptr[ix] != 0; ix++) {
+ if (bptr[ix] == '/') {
+ nbslash++;
+ }
+ }
+ uptr = NULL;
+ len = 1; /* this is for a string terminator only */
+ } else {
+ /*
+ * Next we compare the two strings and find where they first differ
+ */
+ if ((ref->path[pos] == '.') && (ref->path[pos + 1] == '/')) {
+ pos += 2;
+ }
+ if ((*bptr == '.') && (bptr[1] == '/')) {
+ bptr += 2;
+ } else if ((*bptr == '/') && (ref->path[pos] != '/')) {
+ bptr++;
+ }
+ while ((bptr[pos] == ref->path[pos]) && (bptr[pos] != 0)) {
+ pos++;
+ }
+
+ if (bptr[pos] == ref->path[pos]) {
+ val = g_strdup("");
+ goto done; /* (I can't imagine why anyone would do this) */
+ }
+
+ /*
+ * In URI, "back up" to the last '/' encountered. This will be the
+ * beginning of the "unique" suffix of URI
+ */
+ ix = pos;
+ if ((ref->path[ix] == '/') && (ix > 0)) {
+ ix--;
+ } else if ((ref->path[ix] == 0) && (ix > 1)
+ && (ref->path[ix - 1] == '/')) {
+ ix -= 2;
+ }
+ for (; ix > 0; ix--) {
+ if (ref->path[ix] == '/') {
+ break;
+ }
+ }
+ if (ix == 0) {
+ uptr = ref->path;
+ } else {
+ ix++;
+ uptr = &ref->path[ix];
+ }
+
+ /*
+ * In base, count the number of '/' from the differing point
+ */
+ if (bptr[pos] != ref->path[pos]) { /* check for trivial URI == base */
+ for (; bptr[ix] != 0; ix++) {
+ if (bptr[ix] == '/') {
+ nbslash++;
+ }
+ }
+ }
+ len = strlen(uptr) + 1;
+ }
+
+ if (nbslash == 0) {
+ if (uptr != NULL) {
+ /* exception characters from uri_to_string */
+ val = uri_string_escape(uptr, "/;&=+$,");
+ }
+ goto done;
+ }
+
+ /*
+ * Allocate just enough space for the returned string -
+ * length of the remainder of the URI, plus enough space
+ * for the "../" groups, plus one for the terminator
+ */
+ val = g_malloc(len + 3 * nbslash);
+ vptr = val;
+ /*
+ * Put in as many "../" as needed
+ */
+ for (; nbslash > 0; nbslash--) {
+ *vptr++ = '.';
+ *vptr++ = '.';
+ *vptr++ = '/';
+ }
+ /*
+ * Finish up with the end of the URI
+ */
+ if (uptr != NULL) {
+ if ((vptr > val) && (len > 0) && (uptr[0] == '/') &&
+ (vptr[-1] == '/')) {
+ memcpy(vptr, uptr + 1, len - 1);
+ vptr[len - 2] = 0;
+ } else {
+ memcpy(vptr, uptr, len);
+ vptr[len - 1] = 0;
+ }
+ } else {
+ vptr[len - 1] = 0;
+ }
+
+ /* escape the freshly-built path */
+ vptr = val;
+ /* exception characters from uri_to_string */
+ val = uri_string_escape(vptr, "/;&=+$,");
+ g_free(vptr);
+
+done:
+ /*
+ * Free the working variables
+ */
+ if (remove_path != 0) {
+ ref->path = NULL;
+ }
+ uri_free(ref);
+ uri_free(bas);
+
+ return val;
+}
+
+/*
+ * Utility functions to help parse and assemble query strings.
+ */
+
+struct QueryParams *query_params_new(int init_alloc)
+{
+ struct QueryParams *ps;
+
+ if (init_alloc <= 0) {
+ init_alloc = 1;
+ }
+
+ ps = g_new(QueryParams, 1);
+ ps->n = 0;
+ ps->alloc = init_alloc;
+ ps->p = g_new(QueryParam, ps->alloc);
+
+ return ps;
+}
+
+/* Append one parameter to the set, growing the
+ * backing array if necessary.
+ */
+static int query_params_append(struct QueryParams *ps, const char *name,
+ const char *value)
+{
+ if (ps->n >= ps->alloc) {
+ ps->p = g_renew(QueryParam, ps->p, ps->alloc * 2);
+ ps->alloc *= 2;
+ }
+
+ ps->p[ps->n].name = g_strdup(name);
+ ps->p[ps->n].value = g_strdup(value);
+ ps->p[ps->n].ignore = 0;
+ ps->n++;
+
+ return 0;
+}
+
+void query_params_free(struct QueryParams *ps)
+{
+ int i;
+
+ for (i = 0; i < ps->n; ++i) {
+ g_free(ps->p[i].name);
+ g_free(ps->p[i].value);
+ }
+ g_free(ps->p);
+ g_free(ps);
+}
+
+struct QueryParams *query_params_parse(const char *query)
+{
+ struct QueryParams *ps;
+ const char *end, *eq;
+
+ ps = query_params_new(0);
+ if (!query || query[0] == '\0') {
+ return ps;
+ }
+
+ while (*query) {
+ char *name = NULL, *value = NULL;
+
+ /* Find the next separator, or end of the string. */
+ end = strchr(query, '&');
+ if (!end) {
+ end = qemu_strchrnul(query, ';');
+ }
+
+ /* Find the first '=' character between here and end. */
+ eq = strchr(query, '=');
+ if (eq && eq >= end) {
+ eq = NULL;
+ }
+
+ /* Empty section (eg. "&&"). */
+ if (end == query) {
+ goto next;
+ }
+
+ /* If there is no '=' character, then we have just "name"
+ * and consistent with CGI.pm we assume value is "".
+ */
+ else if (!eq) {
+ name = uri_string_unescape(query, end - query, NULL);
+ value = NULL;
+ }
+ /* Or if we have "name=" here (works around annoying
+ * problem when calling uri_string_unescape with len = 0).
+ */
+ else if (eq + 1 == end) {
+ name = uri_string_unescape(query, eq - query, NULL);
+ value = g_new0(char, 1);
+ }
+ /* If the '=' character is at the beginning then we have
+ * "=value" and consistent with CGI.pm we _ignore_ this.
+ */
+ else if (query == eq) {
+ goto next;
+ }
+
+ /* Otherwise it's "name=value". */
+ else {
+ name = uri_string_unescape(query, eq - query, NULL);
+ value = uri_string_unescape(eq + 1, end - (eq + 1), NULL);
+ }
+
+ /* Append to the parameter set. */
+ query_params_append(ps, name, value);
+ g_free(name);
+ g_free(value);
+
+ next:
+ query = end;
+ if (*query) {
+ query++; /* skip '&' separator */
+ }
+ }
+
+ return ps;
+}
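+
+/*
+ * Sketch: a bare name with no '=' yields a NULL value:
+ *
+ *     QueryParams *qp = query_params_parse("a=1&b=2&flag");
+ *     // qp->n == 3; qp->p[0] is {"a", "1"}, qp->p[2] is {"flag", NULL}
+ *     query_params_free(qp);
+ */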
diff --git a/util/userfaultfd.c b/util/userfaultfd.c
new file mode 100644
index 000000000..f1cd6af2b
--- /dev/null
+++ b/util/userfaultfd.c
@@ -0,0 +1,345 @@
+/*
+ * Linux UFFD-WP support
+ *
+ * Copyright Virtuozzo GmbH, 2020
+ *
+ * Authors:
+ * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/bitops.h"
+#include "qemu/error-report.h"
+#include "qemu/userfaultfd.h"
+#include "trace.h"
+#include <poll.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+
+/**
+ * uffd_query_features: query UFFD features
+ *
+ * Returns: 0 on success, negative value in case of an error
+ *
+ * @features: parameter to receive 'uffdio_api.features'
+ */
+int uffd_query_features(uint64_t *features)
+{
+ int uffd_fd;
+ struct uffdio_api api_struct = { 0 };
+ int ret = -1;
+
+ uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC);
+ if (uffd_fd < 0) {
+ trace_uffd_query_features_nosys(errno);
+ return -1;
+ }
+
+ api_struct.api = UFFD_API;
+ api_struct.features = 0;
+
+ if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
+ trace_uffd_query_features_api_failed(errno);
+ goto out;
+ }
+ *features = api_struct.features;
+ ret = 0;
+
+out:
+ close(uffd_fd);
+ return ret;
+}
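+
+/*
+ * Sketch: probe for write-protect fault support before relying on
+ * UFFD-WP (UFFD_FEATURE_PAGEFAULT_FLAG_WP is from <linux/userfaultfd.h>):
+ *
+ *     uint64_t features;
+ *     if (!uffd_query_features(&features) &&
+ *         (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) {
+ *         // wr-protect faults are available
+ *     }
+ */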
+
+/**
+ * uffd_create_fd: create UFFD file descriptor
+ *
+ * Returns non-negative file descriptor or negative value in case of an error
+ *
+ * @features: UFFD features to request
+ * @non_blocking: create UFFD file descriptor for non-blocking operation
+ */
+int uffd_create_fd(uint64_t features, bool non_blocking)
+{
+ int uffd_fd;
+ int flags;
+ struct uffdio_api api_struct = { 0 };
+ uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
+
+ flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
+ uffd_fd = syscall(__NR_userfaultfd, flags);
+ if (uffd_fd < 0) {
+ trace_uffd_create_fd_nosys(errno);
+ return -1;
+ }
+
+ api_struct.api = UFFD_API;
+ api_struct.features = features;
+ if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
+ trace_uffd_create_fd_api_failed(errno);
+ goto fail;
+ }
+ if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
+ trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
+ goto fail;
+ }
+
+ return uffd_fd;
+
+fail:
+ close(uffd_fd);
+ return -1;
+}
+
+/**
+ * uffd_close_fd: close UFFD file descriptor
+ *
+ * @uffd_fd: UFFD file descriptor
+ */
+void uffd_close_fd(int uffd_fd)
+{
+ assert(uffd_fd >= 0);
+ close(uffd_fd);
+}
+
+/**
+ * uffd_register_memory: register memory range via UFFD-IO
+ *
+ * Returns 0 in case of success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address of memory range
+ * @length: length of memory range
+ * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
+ * @ioctls: optional pointer to receive supported IOCTL mask
+ */
+int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
+ uint64_t mode, uint64_t *ioctls)
+{
+ struct uffdio_register uffd_register;
+
+ uffd_register.range.start = (uintptr_t) addr;
+ uffd_register.range.len = length;
+ uffd_register.mode = mode;
+
+ if (ioctl(uffd_fd, UFFDIO_REGISTER, &uffd_register)) {
+ trace_uffd_register_memory_failed(addr, length, mode, errno);
+ return -1;
+ }
+ if (ioctls) {
+ *ioctls = uffd_register.ioctls;
+ }
+
+ return 0;
+}
+
+/**
+ * uffd_unregister_memory: un-register memory range with UFFD-IO
+ *
+ * Returns 0 in case of success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address of memory range
+ * @length: length of memory range
+ */
+int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
+{
+ struct uffdio_range uffd_range;
+
+ uffd_range.start = (uintptr_t) addr;
+ uffd_range.len = length;
+
+ if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &uffd_range)) {
+ trace_uffd_unregister_memory_failed(addr, length, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
+ *
+ * Returns 0 on success, negative value in case of error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address of memory range
+ * @length: length of memory range
+ * @wp: write-protect/unprotect
+ * @dont_wake: do not wake threads waiting on wr-protected page
+ */
+int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
+ bool wp, bool dont_wake)
+{
+ struct uffdio_writeprotect uffd_writeprotect;
+
+ uffd_writeprotect.range.start = (uintptr_t) addr;
+ uffd_writeprotect.range.len = length;
+ if (!wp && dont_wake) {
+ /* DONTWAKE is meaningful only on protection release */
+ uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
+ } else {
+ uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0);
+ }
+
+ if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
+ error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
+ " mode=%" PRIx64 " errno=%i", addr, length,
+ (uint64_t) uffd_writeprotect.mode, errno);
+ return -1;
+ }
+
+ return 0;
+}
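+
+/*
+ * Sketch of the typical write-protect round trip (error handling
+ * omitted; fd, addr, and len are illustrative):
+ *
+ *     int fd = uffd_create_fd(UFFD_FEATURE_PAGEFAULT_FLAG_WP, true);
+ *     uffd_register_memory(fd, addr, len, UFFDIO_REGISTER_MODE_WP, NULL);
+ *     uffd_change_protection(fd, addr, len, true, false);   // protect
+ *     ...
+ *     uffd_change_protection(fd, addr, len, false, false);  // unprotect
+ *     uffd_unregister_memory(fd, addr, len);
+ *     uffd_close_fd(fd);
+ */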
+
+/**
+ * uffd_copy_page: copy range of pages to destination via UFFD-IO
+ *
+ * Copy range of source pages to the destination to resolve
+ * missing page fault somewhere in the destination range.
+ *
+ * Returns 0 on success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @dst_addr: destination base address
+ * @src_addr: source base address
+ * @length: length of the range to copy
+ * @dont_wake: do not wake threads waiting on missing page
+ */
+int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
+ uint64_t length, bool dont_wake)
+{
+ struct uffdio_copy uffd_copy;
+
+ uffd_copy.dst = (uintptr_t) dst_addr;
+ uffd_copy.src = (uintptr_t) src_addr;
+ uffd_copy.len = length;
+ uffd_copy.mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0;
+
+ if (ioctl(uffd_fd, UFFDIO_COPY, &uffd_copy)) {
+ error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
+ " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
+ length, (uint64_t) uffd_copy.mode, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
+ *
+ * Fill a range of pages with zeroes to resolve a missing page fault within the range.
+ *
+ * Returns 0 on success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address
+ * @length: length of the range to fill with zeroes
+ * @dont_wake: do not wake threads waiting on missing page
+ */
+int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
+{
+ struct uffdio_zeropage uffd_zeropage;
+
+ uffd_zeropage.range.start = (uintptr_t) addr;
+ uffd_zeropage.range.len = length;
+ uffd_zeropage.mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0;
+
+ if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &uffd_zeropage)) {
+ error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
+ " mode=%" PRIx64 " errno=%i", addr, length,
+ (uint64_t) uffd_zeropage.mode, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * uffd_wakeup: wake up threads waiting on UFFD-managed page fault resolution
+ *
+ * Wake up threads waiting on any page/pages from the designated range.
+ * The main use case is when during some period, page faults are resolved
+ * via UFFD-IO IOCTLs with MODE_DONTWAKE flag set, then after that all waits
+ * for the whole memory range are satisfied in a single call to uffd_wakeup().
+ *
+ * Returns 0 on success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address
+ * @length: length of the range
+ */
+int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
+{
+ struct uffdio_range uffd_range;
+
+ uffd_range.start = (uintptr_t) addr;
+ uffd_range.len = length;
+
+ if (ioctl(uffd_fd, UFFDIO_WAKE, &uffd_range)) {
+ error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
+ addr, length, errno);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * uffd_read_events: read pending UFFD events
+ *
+ * Returns the number of fetched messages, 0 if none are available, or a
+ * negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @msgs: pointer to message buffer
+ * @count: number of messages that can fit in the buffer
+ */
+int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
+{
+ ssize_t res;
+ do {
+ res = read(uffd_fd, msgs, count * sizeof(struct uffd_msg));
+ } while (res < 0 && errno == EINTR);
+
+ if ((res < 0 && errno == EAGAIN)) {
+ return 0;
+ }
+ if (res < 0) {
+ error_report("uffd_read_events() failed: errno=%i", errno);
+ return -1;
+ }
+
+ return (int) (res / sizeof(struct uffd_msg));
+}
+
+/**
+ * uffd_poll_events: poll UFFD file descriptor for read
+ *
+ * Returns true if events are available for read, false otherwise
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @tmo: timeout value
+ */
+bool uffd_poll_events(int uffd_fd, int tmo)
+{
+ int res;
+ struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
+
+ do {
+ res = poll(&poll_fd, 1, tmo);
+ } while (res < 0 && errno == EINTR);
+
+ if (res == 0) {
+ return false;
+ }
+ if (res < 0) {
+ error_report("uffd_poll_events() failed: errno=%i", errno);
+ return false;
+ }
+
+ return (poll_fd.revents & POLLIN) != 0;
+}
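+
+/*
+ * Sketch of a minimal fault-draining loop; MAX_MSGS and the 100 ms
+ * timeout are arbitrary:
+ *
+ *     struct uffd_msg msgs[MAX_MSGS];
+ *     while (uffd_poll_events(fd, 100)) {
+ *         int n = uffd_read_events(fd, msgs, MAX_MSGS);
+ *         for (int i = 0; i < n; i++) {
+ *             if (msgs[i].event == UFFD_EVENT_PAGEFAULT) {
+ *                 // resolve with uffd_copy_page() or uffd_zero_page()
+ *             }
+ *         }
+ *     }
+ */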
diff --git a/util/uuid.c b/util/uuid.c
new file mode 100644
index 000000000..b1108dde7
--- /dev/null
+++ b/util/uuid.c
@@ -0,0 +1,118 @@
+/*
+ * QEMU UUID functions
+ *
+ * Copyright 2016 Red Hat, Inc.
+ *
+ * Authors:
+ * Fam Zheng <famz@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/uuid.h"
+#include "qemu/bswap.h"
+
+void qemu_uuid_generate(QemuUUID *uuid)
+{
+ int i;
+ uint32_t tmp[4];
+
+ QEMU_BUILD_BUG_ON(sizeof(QemuUUID) != 16);
+
+ for (i = 0; i < 4; ++i) {
+ tmp[i] = g_random_int();
+ }
+ memcpy(uuid, tmp, sizeof(tmp));
+ /* Set the two most significant bits (bits 6 and 7) of the
+ clock_seq_hi_and_reserved to zero and one, respectively. */
+ uuid->data[8] = (uuid->data[8] & 0x3f) | 0x80;
+ /* Set the four most significant bits (bits 12 through 15) of the
+ time_hi_and_version field to the 4-bit version number.
+ */
+ uuid->data[6] = (uuid->data[6] & 0xf) | 0x40;
+}
+
+int qemu_uuid_is_null(const QemuUUID *uu)
+{
+ static QemuUUID null_uuid;
+ return qemu_uuid_is_equal(uu, &null_uuid);
+}
+
+int qemu_uuid_is_equal(const QemuUUID *lhv, const QemuUUID *rhv)
+{
+ return memcmp(lhv, rhv, sizeof(QemuUUID)) == 0;
+}
+
+void qemu_uuid_unparse(const QemuUUID *uuid, char *out)
+{
+ const unsigned char *uu = &uuid->data[0];
+ snprintf(out, UUID_FMT_LEN + 1, UUID_FMT,
+ uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7],
+ uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]);
+}
+
+char *qemu_uuid_unparse_strdup(const QemuUUID *uuid)
+{
+ const unsigned char *uu = &uuid->data[0];
+ return g_strdup_printf(UUID_FMT,
+ uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6],
+ uu[7], uu[8], uu[9], uu[10], uu[11], uu[12],
+ uu[13], uu[14], uu[15]);
+}
+
+static bool qemu_uuid_is_valid(const char *str)
+{
+ int i;
+
+ for (i = 0; i < strlen(str); i++) {
+ const char c = str[i];
+ if (i == 8 || i == 13 || i == 18 || i == 23) {
+ if (str[i] != '-') {
+ return false;
+ }
+ } else {
+ if ((c >= '0' && c <= '9') ||
+ (c >= 'A' && c <= 'F') ||
+ (c >= 'a' && c <= 'f')) {
+ continue;
+ }
+ return false;
+ }
+ }
+ return i == 36;
+}
+
+int qemu_uuid_parse(const char *str, QemuUUID *uuid)
+{
+ unsigned char *uu = &uuid->data[0];
+ int ret;
+
+ if (!qemu_uuid_is_valid(str)) {
+ return -1;
+ }
+
+ ret = sscanf(str, UUID_FMT, &uu[0], &uu[1], &uu[2], &uu[3],
+ &uu[4], &uu[5], &uu[6], &uu[7], &uu[8], &uu[9],
+ &uu[10], &uu[11], &uu[12], &uu[13], &uu[14],
+ &uu[15]);
+
+ if (ret != 16) {
+ return -1;
+ }
+ return 0;
+}
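+
+/*
+ * Sketch: generate/unparse/parse round trip; UUID_FMT_LEN comes from
+ * "qemu/uuid.h":
+ *
+ *     QemuUUID u, v;
+ *     char buf[UUID_FMT_LEN + 1];
+ *     qemu_uuid_generate(&u);
+ *     qemu_uuid_unparse(&u, buf);
+ *     if (!qemu_uuid_parse(buf, &v)) {
+ *         assert(qemu_uuid_is_equal(&u, &v));
+ *     }
+ */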
+
+/* Swap from UUID format endianness (BE) to the opposite, or vice versa.
+ */
+QemuUUID qemu_uuid_bswap(QemuUUID uuid)
+{
+ bswap32s(&uuid.fields.time_low);
+ bswap16s(&uuid.fields.time_mid);
+ bswap16s(&uuid.fields.time_high_and_version);
+ return uuid;
+}
diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c
new file mode 100644
index 000000000..00a80431a
--- /dev/null
+++ b/util/vfio-helpers.c
@@ -0,0 +1,861 @@
+/*
+ * VFIO utility
+ *
+ * Copyright 2016 - 2018 Red Hat, Inc.
+ *
+ * Authors:
+ * Fam Zheng <famz@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include <sys/ioctl.h>
+#include <linux/vfio.h>
+#include "qapi/error.h"
+#include "exec/ramlist.h"
+#include "exec/cpu-common.h"
+#include "exec/memory.h"
+#include "trace.h"
+#include "qemu/error-report.h"
+#include "standard-headers/linux/pci_regs.h"
+#include "qemu/event_notifier.h"
+#include "qemu/vfio-helpers.h"
+#include "qemu/lockable.h"
+#include "trace.h"
+
+#define QEMU_VFIO_DEBUG 0
+
+#define QEMU_VFIO_IOVA_MIN 0x10000ULL
+/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface,
+ * we can use a runtime limit; alternatively it's also possible to do platform
+ * specific detection by reading sysfs entries. Until then, 39 is a safe bet.
+ **/
+#define QEMU_VFIO_IOVA_MAX (1ULL << 39)
+
+typedef struct {
+ /* Page aligned addr. */
+ void *host;
+ size_t size;
+ uint64_t iova;
+} IOVAMapping;
+
+struct IOVARange {
+ uint64_t start;
+ uint64_t end;
+};
+
+struct QEMUVFIOState {
+ QemuMutex lock;
+
+ /* These fields are protected by BQL */
+ int container;
+ int group;
+ int device;
+ RAMBlockNotifier ram_notifier;
+ struct vfio_region_info config_region_info, bar_region_info[6];
+ struct IOVARange *usable_iova_ranges;
+ uint8_t nb_iova_ranges;
+
+ /* These fields are protected by @lock */
+ /* VFIO's IO virtual address space is managed by splitting it into a few
+ * sections:
+ *
+ * --------------- <= 0
+ * |xxxxxxxxxxxxx|
+ * |-------------| <= QEMU_VFIO_IOVA_MIN
+ * | |
+ * | Fixed |
+ * | |
+ * |-------------| <= low_water_mark
+ * | |
+ * | Free |
+ * | |
+ * |-------------| <= high_water_mark
+ * | |
+ * | Temp |
+ * | |
+ * |-------------| <= QEMU_VFIO_IOVA_MAX
+ * |xxxxxxxxxxxxx|
+ * |xxxxxxxxxxxxx|
+ * ---------------
+ *
+ * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid;
+ *
+ * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of
+ * [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be
+ * reclaimed - low_water_mark never shrinks;
+ *
+ * - IOVAs in range [low_water_mark, high_water_mark) are free;
+ *
+ * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile
+ * mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area
+ * is recycled. The caller should make sure I/O's depending on these
+ * mappings are completed before calling.
+ **/
+ uint64_t low_water_mark;
+ uint64_t high_water_mark;
+ IOVAMapping *mappings;
+ int nr_mappings;
+};
+
+/**
+ * Find group file by PCI device address as specified @device, and return the
+ * path. The returned string is owned by caller and should be g_free'ed later.
+ */
+static char *sysfs_find_group_file(const char *device, Error **errp)
+{
+ char *sysfs_link;
+ char *sysfs_group;
+ char *p;
+ char *path = NULL;
+
+ sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device);
+ sysfs_group = g_malloc0(PATH_MAX);
+ if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) {
+ error_setg_errno(errp, errno, "Failed to find iommu group sysfs path");
+ goto out;
+ }
+ p = strrchr(sysfs_group, '/');
+ if (!p) {
+ error_setg(errp, "Failed to find iommu group number");
+ goto out;
+ }
+
+ path = g_strdup_printf("/dev/vfio/%s", p + 1);
+out:
+ g_free(sysfs_link);
+ g_free(sysfs_group);
+ return path;
+}
+
+static inline void assert_bar_index_valid(QEMUVFIOState *s, int index)
+{
+ assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info));
+}
+
+static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp)
+{
+ g_autofree char *barname = NULL;
+ assert_bar_index_valid(s, index);
+ s->bar_region_info[index] = (struct vfio_region_info) {
+ .index = VFIO_PCI_BAR0_REGION_INDEX + index,
+ .argsz = sizeof(struct vfio_region_info),
+ };
+ if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) {
+ error_setg_errno(errp, errno, "Failed to get BAR region info");
+ return -errno;
+ }
+ barname = g_strdup_printf("bar[%d]", index);
+ trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset,
+ s->bar_region_info[index].size,
+ s->bar_region_info[index].cap_offset);
+
+ return 0;
+}
+
+/**
+ * Map a PCI bar area.
+ */
+void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index,
+ uint64_t offset, uint64_t size, int prot,
+ Error **errp)
+{
+ void *p;
+ assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size));
+ assert_bar_index_valid(s, index);
+ p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset),
+ prot, MAP_SHARED,
+ s->device, s->bar_region_info[index].offset + offset);
+ trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset ,
+ size, offset, p);
+ if (p == MAP_FAILED) {
+ error_setg_errno(errp, errno, "Failed to map BAR region");
+ p = NULL;
+ }
+ return p;
+}
+
+/**
+ * Unmap a PCI bar area.
+ */
+void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar,
+ uint64_t offset, uint64_t size)
+{
+ if (bar) {
+ munmap(bar, MIN(size, s->bar_region_info[index].size - offset));
+ }
+}
+
+/**
+ * Initialize device IRQ with @irq_type and register an event notifier.
+ */
+int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e,
+ int irq_type, Error **errp)
+{
+ int r;
+ struct vfio_irq_set *irq_set;
+ size_t irq_set_size;
+ struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) };
+
+ irq_info.index = irq_type;
+ if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) {
+ error_setg_errno(errp, errno, "Failed to get device interrupt info");
+ return -errno;
+ }
+ if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) {
+ error_setg(errp, "Device interrupt doesn't support eventfd");
+ return -EINVAL;
+ }
+
+ irq_set_size = sizeof(*irq_set) + sizeof(int);
+ irq_set = g_malloc0(irq_set_size);
+
+ /* Get to a known IRQ state */
+ *irq_set = (struct vfio_irq_set) {
+ .argsz = irq_set_size,
+ .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER,
+ .index = irq_info.index,
+ .start = 0,
+ .count = 1,
+ };
+
+ *(int *)&irq_set->data = event_notifier_get_fd(e);
+ r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set);
+ g_free(irq_set);
+ if (r) {
+ error_setg_errno(errp, errno, "Failed to setup device interrupt");
+ return -errno;
+ }
+ return 0;
+}
+
+static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf,
+ int size, int ofs)
+{
+ int ret;
+
+ trace_qemu_vfio_pci_read_config(buf, ofs, size,
+ s->config_region_info.offset,
+ s->config_region_info.size);
+ assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
+ do {
+ ret = pread(s->device, buf, size, s->config_region_info.offset + ofs);
+ } while (ret == -1 && errno == EINTR);
+ return ret == size ? 0 : -errno;
+}
+
+static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs)
+{
+ int ret;
+
+ trace_qemu_vfio_pci_write_config(buf, ofs, size,
+ s->config_region_info.offset,
+ s->config_region_info.size);
+ assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size));
+ do {
+ ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs);
+ } while (ret == -1 && errno == EINTR);
+ return ret == size ? 0 : -errno;
+}
+
+static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf)
+{
+ struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf;
+ struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset;
+ struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range;
+ int i;
+
+ while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
+ if (!cap->next) {
+ return;
+ }
+ cap = (struct vfio_info_cap_header *)(buf + cap->next);
+ }
+
+ cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap;
+
+ s->nb_iova_ranges = cap_iova_range->nr_iovas;
+ if (s->nb_iova_ranges > 1) {
+ s->usable_iova_ranges =
+ g_realloc(s->usable_iova_ranges,
+ s->nb_iova_ranges * sizeof(struct IOVARange));
+ }
+
+ for (i = 0; i < s->nb_iova_ranges; i++) {
+ s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start;
+ s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end;
+ }
+}
+
+static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device,
+ Error **errp)
+{
+ int ret;
+ int i;
+ uint16_t pci_cmd;
+ struct vfio_group_status group_status = { .argsz = sizeof(group_status) };
+ struct vfio_iommu_type1_info *iommu_info = NULL;
+ size_t iommu_info_size = sizeof(*iommu_info);
+ struct vfio_device_info device_info = { .argsz = sizeof(device_info) };
+ char *group_file = NULL;
+
+ s->usable_iova_ranges = NULL;
+
+ /* Create a new container */
+ s->container = open("/dev/vfio/vfio", O_RDWR);
+
+ if (s->container == -1) {
+ error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio");
+ return -errno;
+ }
+ if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) {
+ error_setg(errp, "Invalid VFIO version");
+ ret = -EINVAL;
+ goto fail_container;
+ }
+
+ if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) {
+ error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported");
+ ret = -EINVAL;
+ goto fail_container;
+ }
+
+ /* Open the group */
+ group_file = sysfs_find_group_file(device, errp);
+ if (!group_file) {
+ ret = -EINVAL;
+ goto fail_container;
+ }
+
+ s->group = open(group_file, O_RDWR);
+ if (s->group == -1) {
+ error_setg_errno(errp, errno, "Failed to open VFIO group file: %s",
+ group_file);
+ g_free(group_file);
+ ret = -errno;
+ goto fail_container;
+ }
+ g_free(group_file);
+
+ /* Test the group is viable and available */
+ if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) {
+ error_setg_errno(errp, errno, "Failed to get VFIO group status");
+ ret = -errno;
+ goto fail;
+ }
+
+ if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
+ error_setg(errp, "VFIO group is not viable");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ /* Add the group to the container */
+ if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) {
+ error_setg_errno(errp, errno, "Failed to add group to VFIO container");
+ ret = -errno;
+ goto fail;
+ }
+
+ /* Enable the IOMMU model we want */
+ if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) {
+ error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type");
+ ret = -errno;
+ goto fail;
+ }
+
+ iommu_info = g_malloc0(iommu_info_size);
+ iommu_info->argsz = iommu_info_size;
+
+ /* Get additional IOMMU info */
+ if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
+ error_setg_errno(errp, errno, "Failed to get IOMMU info");
+ ret = -errno;
+ goto fail;
+ }
+
+ /*
+ * if the kernel does not report usable IOVA regions, choose
+ * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region
+ */
+ s->nb_iova_ranges = 1;
+ s->usable_iova_ranges = g_new0(struct IOVARange, 1);
+ s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
+ s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
+
+ if (iommu_info->argsz > iommu_info_size) {
+ iommu_info_size = iommu_info->argsz;
+ iommu_info = g_realloc(iommu_info, iommu_info_size);
+ if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
+ ret = -errno;
+ goto fail;
+ }
+ collect_usable_iova_ranges(s, iommu_info);
+ }
+
+ s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
+
+ if (s->device < 0) {
+ error_setg_errno(errp, errno, "Failed to get device fd");
+ ret = -errno;
+ goto fail;
+ }
+
+ /* Test and setup the device */
+ if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
+ error_setg_errno(errp, errno, "Failed to get device info");
+ ret = -errno;
+ goto fail;
+ }
+
+ if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
+ error_setg(errp, "Invalid device regions");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ s->config_region_info = (struct vfio_region_info) {
+ .index = VFIO_PCI_CONFIG_REGION_INDEX,
+ .argsz = sizeof(struct vfio_region_info),
+ };
+ if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
+ error_setg_errno(errp, errno, "Failed to get config region info");
+ ret = -errno;
+ goto fail;
+ }
+ trace_qemu_vfio_region_info("config", s->config_region_info.offset,
+ s->config_region_info.size,
+ s->config_region_info.cap_offset);
+
+ for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
+ ret = qemu_vfio_pci_init_bar(s, i, errp);
+ if (ret) {
+ goto fail;
+ }
+ }
+
+ /* Enable bus master */
+ ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
+ if (ret) {
+ goto fail;
+ }
+ pci_cmd |= PCI_COMMAND_MASTER;
+ ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
+ if (ret) {
+ goto fail;
+ }
+ g_free(iommu_info);
+ return 0;
+fail:
+ g_free(s->usable_iova_ranges);
+ s->usable_iova_ranges = NULL;
+ s->nb_iova_ranges = 0;
+ g_free(iommu_info);
+ close(s->group);
+fail_container:
+ close(s->container);
+ return ret;
+}
+
+static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, void *host,
+ size_t size, size_t max_size)
+{
+ QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
+ Error *local_err = NULL;
+ int ret;
+
+ trace_qemu_vfio_ram_block_added(s, host, max_size);
+ ret = qemu_vfio_dma_map(s, host, max_size, false, NULL, &local_err);
+ if (ret) {
+ error_reportf_err(local_err,
+ "qemu_vfio_dma_map(%p, %zu) failed: ",
+ host, max_size);
+ }
+}
+
+static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, void *host,
+ size_t size, size_t max_size)
+{
+ QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
+ if (host) {
+ trace_qemu_vfio_ram_block_removed(s, host, max_size);
+ qemu_vfio_dma_unmap(s, host);
+ }
+}
+
+static void qemu_vfio_open_common(QEMUVFIOState *s)
+{
+ qemu_mutex_init(&s->lock);
+ s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
+ s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
+ s->low_water_mark = QEMU_VFIO_IOVA_MIN;
+ s->high_water_mark = QEMU_VFIO_IOVA_MAX;
+ ram_block_notifier_add(&s->ram_notifier);
+}
+
+/**
+ * Open a PCI device, e.g. "0000:00:01.0".
+ */
+QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
+{
+ int r;
+ QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);
+
+ /*
+ * VFIO may pin all memory inside mappings, resulting in it pinning
+ * all memory inside RAM blocks unconditionally.
+ */
+ r = ram_block_discard_disable(true);
+ if (r) {
+ error_setg_errno(errp, -r, "Cannot set discarding of RAM broken");
+ g_free(s);
+ return NULL;
+ }
+
+ r = qemu_vfio_init_pci(s, device, errp);
+ if (r) {
+ ram_block_discard_disable(false);
+ g_free(s);
+ return NULL;
+ }
+ qemu_vfio_open_common(s);
+ return s;
+}
+
+static void qemu_vfio_dump_mappings(QEMUVFIOState *s)
+{
+ for (int i = 0; i < s->nr_mappings; ++i) {
+ trace_qemu_vfio_dump_mapping(s->mappings[i].host,
+ s->mappings[i].iova,
+ s->mappings[i].size);
+ }
+}
+
+/**
+ * Find the mapping entry that contains [host, host + size) and set @index to
+ * the position. If no entry contains it, @index is the position _after_ which
+ * to insert the new mapping. IOW, it is the index of the largest element that
+ * is smaller than @host, or -1 if no entry is.
+ */
+static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host,
+ int *index)
+{
+ IOVAMapping *p = s->mappings;
+ IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL;
+ IOVAMapping *mid;
+ trace_qemu_vfio_find_mapping(s, host);
+ if (!p) {
+ *index = -1;
+ return NULL;
+ }
+ while (true) {
+ mid = p + (q - p) / 2;
+ if (mid == p) {
+ break;
+ }
+ if (mid->host > host) {
+ q = mid;
+ } else if (mid->host < host) {
+ p = mid;
+ } else {
+ break;
+ }
+ }
+ if (mid->host > host) {
+ mid--;
+ } else if (mid < &s->mappings[s->nr_mappings - 1]
+ && (mid + 1)->host <= host) {
+ mid++;
+ }
+ *index = mid - &s->mappings[0];
+ if (mid >= &s->mappings[0] &&
+ mid->host <= host && mid->host + mid->size > host) {
+ assert(mid < &s->mappings[s->nr_mappings]);
+ return mid;
+ }
+ /* At this point *index + 1 is the right position to insert the new
+ * mapping. */
+ return NULL;
+}
+
+/**
+ * Allocate IOVA and create a new mapping record and insert it in @s.
+ */
+static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s,
+ void *host, size_t size,
+ int index, uint64_t iova)
+{
+ int shift;
+ IOVAMapping m = {.host = host, .size = size, .iova = iova};
+ IOVAMapping *insert;
+
+ assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
+ assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size));
+ assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size));
+ trace_qemu_vfio_new_mapping(s, host, size, index, iova);
+
+ assert(index >= 0);
+ s->nr_mappings++;
+ s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
+ insert = &s->mappings[index];
+ shift = s->nr_mappings - index - 1;
+ if (shift) {
+ memmove(insert + 1, insert, shift * sizeof(s->mappings[0]));
+ }
+ *insert = m;
+ return insert;
+}
+
+/* Do the DMA mapping with VFIO. */
+static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size,
+ uint64_t iova, Error **errp)
+{
+ struct vfio_iommu_type1_dma_map dma_map = {
+ .argsz = sizeof(dma_map),
+ .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
+ .iova = iova,
+ .vaddr = (uintptr_t)host,
+ .size = size,
+ };
+ trace_qemu_vfio_do_mapping(s, host, iova, size);
+
+ if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) {
+ error_setg_errno(errp, errno, "VFIO_MAP_DMA failed");
+ return -errno;
+ }
+ return 0;
+}
+
+/**
+ * Undo the DMA mapping from @s with VFIO, and remove from mapping list.
+ */
+static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
+ Error **errp)
+{
+ int index;
+ struct vfio_iommu_type1_dma_unmap unmap = {
+ .argsz = sizeof(unmap),
+ .flags = 0,
+ .iova = mapping->iova,
+ .size = mapping->size,
+ };
+
+ index = mapping - s->mappings;
+ assert(mapping->size > 0);
+ assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size));
+ assert(index >= 0 && index < s->nr_mappings);
+ if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+ error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
+ }
+ memmove(mapping, &s->mappings[index + 1],
+ sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
+ s->nr_mappings--;
+ s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
+}
+
+/* Check if the mapping list is (ascending) ordered. */
+static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
+{
+ int i;
+ if (QEMU_VFIO_DEBUG) {
+ for (i = 0; i < s->nr_mappings - 1; ++i) {
+ if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
+ error_report("item %d not sorted!", i);
+ qemu_vfio_dump_mappings(s);
+ return false;
+ }
+ if (!(s->mappings[i].host + s->mappings[i].size <=
+ s->mappings[i + 1].host)) {
+ error_report("item %d overlap with next!", i);
+ qemu_vfio_dump_mappings(s);
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static bool qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size,
+ uint64_t *iova, Error **errp)
+{
+ int i;
+
+ for (i = 0; i < s->nb_iova_ranges; i++) {
+ if (s->usable_iova_ranges[i].end < s->low_water_mark) {
+ continue;
+ }
+ s->low_water_mark =
+ MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
+
+ if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
+ s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
+ *iova = s->low_water_mark;
+ s->low_water_mark += size;
+ return true;
+ }
+ }
+ error_setg(errp, "fixed iova range not found");
+
+ return false;
+}
+
+static bool qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size,
+ uint64_t *iova, Error **errp)
+{
+ int i;
+
+ for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
+ if (s->usable_iova_ranges[i].start > s->high_water_mark) {
+ continue;
+ }
+ s->high_water_mark =
+ MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);
+
+ if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
+ s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
+ *iova = s->high_water_mark - size;
+ s->high_water_mark = *iova;
+ return true;
+ }
+ }
+ error_setg(errp, "temporary iova range not found");
+
+ return false;
+}
+
+/**
+ * qemu_vfio_water_mark_reached:
+ *
+ * Returns %true if an allocation of @size bytes would make the low and high
+ * water marks cross, i.e. the free IOVA space is exhausted, %false otherwise.
+ */
+static bool qemu_vfio_water_mark_reached(QEMUVFIOState *s, size_t size,
+ Error **errp)
+{
+ if (s->high_water_mark - s->low_water_mark + 1 < size) {
+ error_setg(errp, "iova exhausted (water mark reached)");
+ return true;
+ }
+ return false;
+}
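+
+/*
+ * Informal sketch of the IOVA allocation scheme used by qemu_vfio_dma_map()
+ * below: fixed mappings are carved out bottom-up and advance the low water
+ * mark, temporary mappings are carved out top-down and lower the high water
+ * mark, and allocation fails once the two marks would cross:
+ *
+ *   [fixed ...] low_water_mark --> free <-- high_water_mark [... temporary]
+ */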
+
+/* Map the [host, host + size) area into a contiguous IOVA address space, and
+ * store the result in @iova if not NULL. The caller needs to make sure the
+ * area is aligned to page size and doesn't overlap existing mapping areas
+ * (partially overlapping an existing mapping is not allowed).
+ */
+int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size,
+ bool temporary, uint64_t *iova, Error **errp)
+{
+ int index;
+ IOVAMapping *mapping;
+ uint64_t iova0;
+
+ assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size));
+ assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size));
+ trace_qemu_vfio_dma_map(s, host, size, temporary, iova);
+ QEMU_LOCK_GUARD(&s->lock);
+ mapping = qemu_vfio_find_mapping(s, host, &index);
+ if (mapping) {
+ iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host);
+ } else {
+ int ret;
+
+ if (qemu_vfio_water_mark_reached(s, size, errp)) {
+ return -ENOMEM;
+ }
+ if (!temporary) {
+ if (!qemu_vfio_find_fixed_iova(s, size, &iova0, errp)) {
+ return -ENOMEM;
+ }
+
+ mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0);
+ assert(qemu_vfio_verify_mappings(s));
+ ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
+ if (ret < 0) {
+ qemu_vfio_undo_mapping(s, mapping, NULL);
+ return ret;
+ }
+ qemu_vfio_dump_mappings(s);
+ } else {
+ if (!qemu_vfio_find_temp_iova(s, size, &iova0, errp)) {
+ return -ENOMEM;
+ }
+ ret = qemu_vfio_do_mapping(s, host, size, iova0, errp);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+ }
+ trace_qemu_vfio_dma_mapped(s, host, iova0, size);
+ if (iova) {
+ *iova = iova0;
+ }
+ return 0;
+}
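+
+/*
+ * Example call sequence (a sketch, not taken from real code; the buffer and
+ * its size are hypothetical):
+ *
+ *   uint64_t iova;
+ *   void *buf = qemu_memalign(qemu_real_host_page_size, 8192);
+ *
+ *   if (qemu_vfio_dma_map(s, buf, 8192, false, &iova, errp) == 0) {
+ *       ... program the device with iova ...
+ *       qemu_vfio_dma_unmap(s, buf);
+ *   }
+ */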
+
+/* Reset the high water mark and free all "temporary" mappings. */
+int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s)
+{
+ struct vfio_iommu_type1_dma_unmap unmap = {
+ .argsz = sizeof(unmap),
+ .flags = 0,
+ .iova = s->high_water_mark,
+ .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark,
+ };
+ trace_qemu_vfio_dma_reset_temporary(s);
+ QEMU_LOCK_GUARD(&s->lock);
+ if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+ error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno));
+ return -errno;
+ }
+ s->high_water_mark = QEMU_VFIO_IOVA_MAX;
+ return 0;
+}
+
+/* Unmap the whole area that was previously mapped with
+ * qemu_vfio_dma_map(). */
+void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host)
+{
+ int index = 0;
+ IOVAMapping *m;
+
+ if (!host) {
+ return;
+ }
+
+ trace_qemu_vfio_dma_unmap(s, host);
+ QEMU_LOCK_GUARD(&s->lock);
+ m = qemu_vfio_find_mapping(s, host, &index);
+ if (!m) {
+ return;
+ }
+ qemu_vfio_undo_mapping(s, m, NULL);
+}
+
+static void qemu_vfio_reset(QEMUVFIOState *s)
+{
+ ioctl(s->device, VFIO_DEVICE_RESET);
+}
+
+/* Close and free the VFIO resources. */
+void qemu_vfio_close(QEMUVFIOState *s)
+{
+    if (!s) {
+        return;
+    }
+    /*
+     * qemu_vfio_undo_mapping() removes the entry and shifts the remaining
+     * ones down, so keep removing the last mapping until none are left
+     * instead of indexing forward (which would skip entries).
+     */
+    while (s->nr_mappings) {
+        qemu_vfio_undo_mapping(s, &s->mappings[s->nr_mappings - 1], NULL);
+    }
+ ram_block_notifier_remove(&s->ram_notifier);
+ g_free(s->usable_iova_ranges);
+ s->nb_iova_ranges = 0;
+ qemu_vfio_reset(s);
+ close(s->device);
+ close(s->group);
+ close(s->container);
+ ram_block_discard_disable(false);
+}
diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c
new file mode 100644
index 000000000..783d847a6
--- /dev/null
+++ b/util/vhost-user-server.c
@@ -0,0 +1,446 @@
+/*
+ * Sharing QEMU devices via vhost-user protocol
+ *
+ * Copyright (c) Coiby Xu <coiby.xu@gmail.com>.
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qemu/main-loop.h"
+#include "qemu/vhost-user-server.h"
+#include "block/aio-wait.h"
+
+/*
+ * Theory of operation:
+ *
+ * VuServer is started and stopped by vhost_user_server_start() and
+ * vhost_user_server_stop() from the main loop thread. Starting the server
+ * opens a vhost-user UNIX domain socket and listens for incoming connections.
+ * Only one connection is allowed at a time.
+ *
+ * The connection is handled by the vu_client_trip() coroutine in the
+ * VuServer->ctx AioContext. The coroutine consists of a vu_dispatch() loop
+ * where libvhost-user calls vu_message_read() to receive the next vhost-user
+ * protocol message over the UNIX domain socket.
+ *
+ * When virtqueues are set up, libvhost-user calls set_watch() to monitor kick
+ * fds. These fds are also handled in the VuServer->ctx AioContext.
+ *
+ * Both vu_client_trip() and kick fd monitoring can be stopped by shutting down
+ * the socket connection. Shutting down the socket connection causes
+ * vu_message_read() to fail since no more data can be received from the socket.
+ * After vu_dispatch() fails, vu_client_trip() calls vu_deinit() to stop
+ * libvhost-user before terminating the coroutine. vu_deinit() calls
+ * remove_watch() to stop monitoring kick fds and this stops virtqueue
+ * processing.
+ *
+ * When vu_client_trip() has finished cleaning up it schedules a BH in the main
+ * loop thread to accept the next client connection.
+ *
+ * When libvhost-user detects an error it calls panic_cb() and sets the
+ * dev->broken flag. Both vu_client_trip() and kick fd processing stop when
+ * the dev->broken flag is set.
+ *
+ * It is possible to switch AioContexts using
+ * vhost_user_server_detach_aio_context() and
+ * vhost_user_server_attach_aio_context(). They stop monitoring fds in the old
+ * AioContext and resume monitoring in the new AioContext. The vu_client_trip()
+ * coroutine remains in a yielded state during the switch. This is made
+ * possible by QIOChannel's support for spurious coroutine re-entry in
+ * qio_channel_yield(). The coroutine will restart I/O when re-entered from the
+ * new AioContext.
+ */
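+
+/*
+ * Typical usage from the main loop thread (a sketch under assumed names:
+ * "addr" is a caller-provided UNIX SocketAddress and "iface" a VuDevIface):
+ *
+ *   static VuServer server;
+ *
+ *   if (!vhost_user_server_start(&server, addr, ctx, num_queues, &iface,
+ *                                errp)) {
+ *       return;
+ *   }
+ *   ... serve clients; when shutting down: ...
+ *   vhost_user_server_stop(&server);
+ */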
+
+static void vmsg_close_fds(VhostUserMsg *vmsg)
+{
+ int i;
+ for (i = 0; i < vmsg->fd_num; i++) {
+ close(vmsg->fds[i]);
+ }
+}
+
+static void vmsg_unblock_fds(VhostUserMsg *vmsg)
+{
+ int i;
+ for (i = 0; i < vmsg->fd_num; i++) {
+ qemu_set_nonblock(vmsg->fds[i]);
+ }
+}
+
+static void panic_cb(VuDev *vu_dev, const char *buf)
+{
+ error_report("vu_panic: %s", buf);
+}
+
+static bool coroutine_fn
+vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
+{
+ struct iovec iov = {
+ .iov_base = (char *)vmsg,
+ .iov_len = VHOST_USER_HDR_SIZE,
+ };
+ int rc, read_bytes = 0;
+ Error *local_err = NULL;
+ const size_t max_fds = G_N_ELEMENTS(vmsg->fds);
+ VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+ QIOChannel *ioc = server->ioc;
+
+ vmsg->fd_num = 0;
+ if (!ioc) {
+        error_report("vhost-user channel is not available");
+ goto fail;
+ }
+
+ assert(qemu_in_coroutine());
+ do {
+ size_t nfds = 0;
+ int *fds = NULL;
+
+        /*
+         * qio_channel_readv_full() may return a short read; keep calling it
+         * until we have received VHOST_USER_HDR_SIZE bytes in total, or 0
+         * bytes (socket closed).
+         */
+ rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, &local_err);
+ if (rc < 0) {
+ if (rc == QIO_CHANNEL_ERR_BLOCK) {
+ assert(local_err == NULL);
+ qio_channel_yield(ioc, G_IO_IN);
+ continue;
+ } else {
+ error_report_err(local_err);
+ goto fail;
+ }
+ }
+
+ if (nfds > 0) {
+ if (vmsg->fd_num + nfds > max_fds) {
+ error_report("A maximum of %zu fds are allowed, "
+ "however got %zu fds now",
+ max_fds, vmsg->fd_num + nfds);
+ g_free(fds);
+ goto fail;
+ }
+ memcpy(vmsg->fds + vmsg->fd_num, fds, nfds * sizeof(vmsg->fds[0]));
+ vmsg->fd_num += nfds;
+ g_free(fds);
+ }
+
+ if (rc == 0) { /* socket closed */
+ goto fail;
+ }
+
+ iov.iov_base += rc;
+ iov.iov_len -= rc;
+ read_bytes += rc;
+ } while (read_bytes != VHOST_USER_HDR_SIZE);
+
+    /* qio_channel_readv_full() makes the received fds blocking; unblock them */
+ vmsg_unblock_fds(vmsg);
+ if (vmsg->size > sizeof(vmsg->payload)) {
+ error_report("Error: too big message request: %d, "
+ "size: vmsg->size: %u, "
+ "while sizeof(vmsg->payload) = %zu",
+ vmsg->request, vmsg->size, sizeof(vmsg->payload));
+ goto fail;
+ }
+
+ struct iovec iov_payload = {
+ .iov_base = (char *)&vmsg->payload,
+ .iov_len = vmsg->size,
+ };
+ if (vmsg->size) {
+ rc = qio_channel_readv_all_eof(ioc, &iov_payload, 1, &local_err);
+ if (rc != 1) {
+ if (local_err) {
+ error_report_err(local_err);
+ }
+ goto fail;
+ }
+ }
+
+ return true;
+
+fail:
+ vmsg_close_fds(vmsg);
+
+ return false;
+}
+
+static coroutine_fn void vu_client_trip(void *opaque)
+{
+ VuServer *server = opaque;
+ VuDev *vu_dev = &server->vu_dev;
+
+ while (!vu_dev->broken && vu_dispatch(vu_dev)) {
+ /* Keep running */
+ }
+
+ vu_deinit(vu_dev);
+
+ /* vu_deinit() should have called remove_watch() */
+ assert(QTAILQ_EMPTY(&server->vu_fd_watches));
+
+ object_unref(OBJECT(server->sioc));
+ server->sioc = NULL;
+
+ object_unref(OBJECT(server->ioc));
+ server->ioc = NULL;
+
+ server->co_trip = NULL;
+ if (server->restart_listener_bh) {
+ qemu_bh_schedule(server->restart_listener_bh);
+ }
+ aio_wait_kick();
+}
+
+/*
+ * A wrapper for vu_kick_cb.
+ *
+ * Since aio_dispatch() can only pass one user data pointer to the callback
+ * function, VuDev and pvt are packed into a VuFdWatch struct; unpack it here
+ * and pass both on to vu_kick_cb().
+ */
+static void kick_handler(void *opaque)
+{
+ VuFdWatch *vu_fd_watch = opaque;
+ VuDev *vu_dev = vu_fd_watch->vu_dev;
+
+ vu_fd_watch->cb(vu_dev, 0, vu_fd_watch->pvt);
+
+ /* Stop vu_client_trip() if an error occurred in vu_fd_watch->cb() */
+ if (vu_dev->broken) {
+ VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+
+ qio_channel_shutdown(server->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+ }
+}
+
+static VuFdWatch *find_vu_fd_watch(VuServer *server, int fd)
+{
+    VuFdWatch *vu_fd_watch;
+
+    QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) {
+ if (vu_fd_watch->fd == fd) {
+ return vu_fd_watch;
+ }
+ }
+ return NULL;
+}
+
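+/*
+ * set_watch() and remove_watch() are the libvhost-user callbacks for starting
+ * and stopping kick fd monitoring. They translate those requests into
+ * aio_set_fd_handler() calls on the server's AioContext.
+ */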
+static void
+set_watch(VuDev *vu_dev, int fd, int vu_evt,
+ vu_watch_cb cb, void *pvt)
+{
+ VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+ g_assert(vu_dev);
+ g_assert(fd >= 0);
+ g_assert(cb);
+
+ VuFdWatch *vu_fd_watch = find_vu_fd_watch(server, fd);
+
+ if (!vu_fd_watch) {
+        vu_fd_watch = g_new0(VuFdWatch, 1);
+
+ QTAILQ_INSERT_TAIL(&server->vu_fd_watches, vu_fd_watch, next);
+
+ vu_fd_watch->fd = fd;
+ vu_fd_watch->cb = cb;
+ qemu_set_nonblock(fd);
+ aio_set_fd_handler(server->ioc->ctx, fd, true, kick_handler,
+ NULL, NULL, vu_fd_watch);
+ vu_fd_watch->vu_dev = vu_dev;
+ vu_fd_watch->pvt = pvt;
+ }
+}
+
+static void remove_watch(VuDev *vu_dev, int fd)
+{
+ VuServer *server;
+ g_assert(vu_dev);
+ g_assert(fd >= 0);
+
+ server = container_of(vu_dev, VuServer, vu_dev);
+
+ VuFdWatch *vu_fd_watch = find_vu_fd_watch(server, fd);
+
+ if (!vu_fd_watch) {
+ return;
+ }
+ aio_set_fd_handler(server->ioc->ctx, fd, true, NULL, NULL, NULL, NULL);
+
+ QTAILQ_REMOVE(&server->vu_fd_watches, vu_fd_watch, next);
+ g_free(vu_fd_watch);
+}
+
+static void vu_accept(QIONetListener *listener, QIOChannelSocket *sioc,
+ gpointer opaque)
+{
+ VuServer *server = opaque;
+
+ if (server->sioc) {
+ warn_report("Only one vhost-user client is allowed to "
+ "connect the server one time");
+ return;
+ }
+
+ if (!vu_init(&server->vu_dev, server->max_queues, sioc->fd, panic_cb,
+ vu_message_read, set_watch, remove_watch, server->vu_iface)) {
+ error_report("Failed to initialize libvhost-user");
+ return;
+ }
+
+    /*
+     * Unset the callback function for the network listener so that other
+     * vhost-user clients keep waiting until this client disconnects.
+     */
+ qio_net_listener_set_client_func(server->listener,
+ NULL,
+ NULL,
+ NULL);
+ server->sioc = sioc;
+    /*
+     * Increase the object reference, so that sioc is not freed by
+     * qio_net_listener_channel_func(), which calls object_unref(OBJECT(sioc)).
+     */
+ object_ref(OBJECT(server->sioc));
+ qio_channel_set_name(QIO_CHANNEL(sioc), "vhost-user client");
+ server->ioc = QIO_CHANNEL(sioc);
+ object_ref(OBJECT(server->ioc));
+
+ /* TODO vu_message_write() spins if non-blocking! */
+ qio_channel_set_blocking(server->ioc, false, NULL);
+
+ server->co_trip = qemu_coroutine_create(vu_client_trip, server);
+
+ aio_context_acquire(server->ctx);
+ vhost_user_server_attach_aio_context(server, server->ctx);
+ aio_context_release(server->ctx);
+}
+
+void vhost_user_server_stop(VuServer *server)
+{
+ aio_context_acquire(server->ctx);
+
+ qemu_bh_delete(server->restart_listener_bh);
+ server->restart_listener_bh = NULL;
+
+ if (server->sioc) {
+ VuFdWatch *vu_fd_watch;
+
+ QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) {
+ aio_set_fd_handler(server->ctx, vu_fd_watch->fd, true,
+ NULL, NULL, NULL, vu_fd_watch);
+ }
+
+ qio_channel_shutdown(server->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+
+ AIO_WAIT_WHILE(server->ctx, server->co_trip);
+ }
+
+ aio_context_release(server->ctx);
+
+ if (server->listener) {
+ qio_net_listener_disconnect(server->listener);
+ object_unref(OBJECT(server->listener));
+ }
+}
+
+/*
+ * Allow the next client to connect to the server. Called from a BH in the main
+ * loop.
+ */
+static void restart_listener_bh(void *opaque)
+{
+ VuServer *server = opaque;
+
+ qio_net_listener_set_client_func(server->listener, vu_accept, server,
+ NULL);
+}
+
+/* Called with ctx acquired */
+void vhost_user_server_attach_aio_context(VuServer *server, AioContext *ctx)
+{
+ VuFdWatch *vu_fd_watch;
+
+ server->ctx = ctx;
+
+ if (!server->sioc) {
+ return;
+ }
+
+ qio_channel_attach_aio_context(server->ioc, ctx);
+
+ QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) {
+ aio_set_fd_handler(ctx, vu_fd_watch->fd, true, kick_handler, NULL,
+ NULL, vu_fd_watch);
+ }
+
+ aio_co_schedule(ctx, server->co_trip);
+}
+
+/* Called with server->ctx acquired */
+void vhost_user_server_detach_aio_context(VuServer *server)
+{
+ if (server->sioc) {
+ VuFdWatch *vu_fd_watch;
+
+ QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) {
+ aio_set_fd_handler(server->ctx, vu_fd_watch->fd, true,
+ NULL, NULL, NULL, vu_fd_watch);
+ }
+
+ qio_channel_detach_aio_context(server->ioc);
+ }
+
+ server->ctx = NULL;
+}
+
+bool vhost_user_server_start(VuServer *server,
+ SocketAddress *socket_addr,
+ AioContext *ctx,
+ uint16_t max_queues,
+ const VuDevIface *vu_iface,
+ Error **errp)
+{
+ QEMUBH *bh;
+ QIONetListener *listener;
+
+ if (socket_addr->type != SOCKET_ADDRESS_TYPE_UNIX &&
+ socket_addr->type != SOCKET_ADDRESS_TYPE_FD) {
+ error_setg(errp, "Only socket address types 'unix' and 'fd' are supported");
+ return false;
+ }
+
+ listener = qio_net_listener_new();
+ if (qio_net_listener_open_sync(listener, socket_addr, 1,
+ errp) < 0) {
+ object_unref(OBJECT(listener));
+ return false;
+ }
+
+ bh = qemu_bh_new(restart_listener_bh, server);
+
+ /* zero out unspecified fields */
+ *server = (VuServer) {
+ .listener = listener,
+ .restart_listener_bh = bh,
+ .vu_iface = vu_iface,
+ .max_queues = max_queues,
+ .ctx = ctx,
+ };
+
+ qio_net_listener_set_name(server->listener, "vhost-user-backend-listener");
+
+ qio_net_listener_set_client_func(server->listener,
+ vu_accept,
+ server,
+ NULL);
+
+ QTAILQ_INIT(&server->vu_fd_watches);
+ return true;
+}
diff --git a/util/yank.c b/util/yank.c
new file mode 100644
index 000000000..abf47c346
--- /dev/null
+++ b/util/yank.c
@@ -0,0 +1,199 @@
+/*
+ * QEMU yank feature
+ *
+ * Copyright (c) Lukas Straub <lukasstraub2@web.de>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/thread.h"
+#include "qemu/queue.h"
+#include "qemu/lockable.h"
+#include "qapi/qapi-commands-yank.h"
+#include "qapi/qapi-visit-yank.h"
+#include "qapi/clone-visitor.h"
+#include "qemu/yank.h"
+
+struct YankFuncAndParam {
+ YankFn *func;
+ void *opaque;
+ QLIST_ENTRY(YankFuncAndParam) next;
+};
+
+struct YankInstanceEntry {
+ YankInstance *instance;
+ QLIST_HEAD(, YankFuncAndParam) yankfns;
+ QLIST_ENTRY(YankInstanceEntry) next;
+};
+
+typedef struct YankFuncAndParam YankFuncAndParam;
+typedef struct YankInstanceEntry YankInstanceEntry;
+
+/*
+ * This lock protects the yank_instance_list below. Because it's taken by
+ * OOB-capable commands, it must be "fast", i.e. it may only be held for a
+ * bounded, short time. See docs/devel/qapi-code-gen.txt for additional
+ * information.
+ */
+static QemuMutex yank_lock;
+
+static QLIST_HEAD(, YankInstanceEntry) yank_instance_list
+ = QLIST_HEAD_INITIALIZER(yank_instance_list);
+
+static bool yank_instance_equal(const YankInstance *a, const YankInstance *b)
+{
+ if (a->type != b->type) {
+ return false;
+ }
+
+ switch (a->type) {
+ case YANK_INSTANCE_TYPE_BLOCK_NODE:
+ return g_str_equal(a->u.block_node.node_name,
+ b->u.block_node.node_name);
+
+ case YANK_INSTANCE_TYPE_CHARDEV:
+ return g_str_equal(a->u.chardev.id, b->u.chardev.id);
+
+ case YANK_INSTANCE_TYPE_MIGRATION:
+ return true;
+
+ default:
+ abort();
+ }
+}
+
+static YankInstanceEntry *yank_find_entry(const YankInstance *instance)
+{
+ YankInstanceEntry *entry;
+
+ QLIST_FOREACH(entry, &yank_instance_list, next) {
+ if (yank_instance_equal(entry->instance, instance)) {
+ return entry;
+ }
+ }
+ return NULL;
+}
+
+bool yank_register_instance(const YankInstance *instance, Error **errp)
+{
+ YankInstanceEntry *entry;
+
+ QEMU_LOCK_GUARD(&yank_lock);
+
+ if (yank_find_entry(instance)) {
+ error_setg(errp, "duplicate yank instance");
+ return false;
+ }
+
+ entry = g_new0(YankInstanceEntry, 1);
+ entry->instance = QAPI_CLONE(YankInstance, instance);
+ QLIST_INIT(&entry->yankfns);
+ QLIST_INSERT_HEAD(&yank_instance_list, entry, next);
+
+ return true;
+}
+
+void yank_unregister_instance(const YankInstance *instance)
+{
+ YankInstanceEntry *entry;
+
+ QEMU_LOCK_GUARD(&yank_lock);
+ entry = yank_find_entry(instance);
+ assert(entry);
+
+ assert(QLIST_EMPTY(&entry->yankfns));
+ QLIST_REMOVE(entry, next);
+ qapi_free_YankInstance(entry->instance);
+ g_free(entry);
+}
+
+void yank_register_function(const YankInstance *instance,
+ YankFn *func,
+ void *opaque)
+{
+ YankInstanceEntry *entry;
+ YankFuncAndParam *func_entry;
+
+ QEMU_LOCK_GUARD(&yank_lock);
+ entry = yank_find_entry(instance);
+ assert(entry);
+
+ func_entry = g_new0(YankFuncAndParam, 1);
+ func_entry->func = func;
+ func_entry->opaque = opaque;
+
+ QLIST_INSERT_HEAD(&entry->yankfns, func_entry, next);
+}
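+
+/*
+ * Typical registration sequence (a sketch; the chardev id and the shutdown
+ * helper are hypothetical):
+ *
+ *   YankInstance instance = {
+ *       .type = YANK_INSTANCE_TYPE_CHARDEV,
+ *       .u.chardev.id = (char *)"chardev0",
+ *   };
+ *
+ *   if (yank_register_instance(&instance, errp)) {
+ *       yank_register_function(&instance, my_shutdown_fn, state);
+ *   }
+ */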
+
+void yank_unregister_function(const YankInstance *instance,
+ YankFn *func,
+ void *opaque)
+{
+ YankInstanceEntry *entry;
+ YankFuncAndParam *func_entry;
+
+ QEMU_LOCK_GUARD(&yank_lock);
+ entry = yank_find_entry(instance);
+ assert(entry);
+
+ QLIST_FOREACH(func_entry, &entry->yankfns, next) {
+ if (func_entry->func == func && func_entry->opaque == opaque) {
+ QLIST_REMOVE(func_entry, next);
+ g_free(func_entry);
+ return;
+ }
+ }
+
+ abort();
+}
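+
+/*
+ * qmp_yank() makes two passes over @instances: the first only verifies that
+ * every requested instance exists, the second invokes the registered yank
+ * functions. This keeps the command all-or-nothing: if any instance is
+ * missing, no yank function runs.
+ */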
+
+void qmp_yank(YankInstanceList *instances,
+ Error **errp)
+{
+ YankInstanceList *tail;
+ YankInstanceEntry *entry;
+ YankFuncAndParam *func_entry;
+
+ QEMU_LOCK_GUARD(&yank_lock);
+ for (tail = instances; tail; tail = tail->next) {
+ entry = yank_find_entry(tail->value);
+ if (!entry) {
+ error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, "Instance not found");
+ return;
+ }
+ }
+ for (tail = instances; tail; tail = tail->next) {
+ entry = yank_find_entry(tail->value);
+ assert(entry);
+ QLIST_FOREACH(func_entry, &entry->yankfns, next) {
+ func_entry->func(func_entry->opaque);
+ }
+ }
+}
+
+YankInstanceList *qmp_query_yank(Error **errp)
+{
+ YankInstanceEntry *entry;
+ YankInstanceList *ret;
+
+ ret = NULL;
+
+ QEMU_LOCK_GUARD(&yank_lock);
+ QLIST_FOREACH(entry, &yank_instance_list, next) {
+ YankInstanceList *new_entry;
+ new_entry = g_new0(YankInstanceList, 1);
+ new_entry->value = QAPI_CLONE(YankInstance, entry->instance);
+ new_entry->next = ret;
+ ret = new_entry;
+ }
+
+ return ret;
+}
+
+static void __attribute__((__constructor__)) yank_init(void)
+{
+ qemu_mutex_init(&yank_lock);
+}