author | 2023-10-10 11:40:56 +0000
committer | 2023-10-10 11:40:56 +0000
commit | e02cda008591317b1625707ff8e115a4841aa889 (patch)
tree | aee302e3cf8b59ec2d32ec481be3d1afddfc8968 /util
parent | cc668e6b7e0ffd8c9d130513d12053cf5eda1d3b (diff)
Introduce Virtio-loopback epsilon release:
The epsilon release introduces a new compatibility layer which makes the
virtio-loopback design work with QEMU and the rust-vmm vhost-user backend
without requiring any changes.
Signed-off-by: Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>
Change-Id: I52e57563e08a7d0bdc002f8e928ee61ba0c53dd9
Diffstat (limited to 'util')
98 files changed, 31989 insertions, 0 deletions
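
Before the diff itself, a rough sketch of how the AioContext event-loop API imported by this patch is typically driven by its callers inside the QEMU tree. This is an illustrative fragment under stated assumptions, not part of the patch: example_loop, read_ready_cb and the fd are placeholders, and it assumes it runs in the context's home thread.

    #include "qemu/osdep.h"
    #include "qapi/error.h"
    #include "block/aio.h"

    /* Hypothetical read callback, invoked from aio_poll() when the fd
     * becomes readable. */
    static void read_ready_cb(void *opaque)
    {
    }

    static void example_loop(int fd)
    {
        AioContext *ctx = aio_context_new(&error_fatal);

        /* Register a read handler; this version of aio_set_fd_handler()
         * still carries the is_external flag. NULL io_write/io_poll
         * means "no handler". */
        aio_set_fd_handler(ctx, fd, false, read_ready_cb, NULL, NULL, NULL);

        while (aio_poll(ctx, true)) {
            /* keep dispatching while progress is being made */
        }

        /* Passing all-NULL handlers unregisters the fd again */
        aio_set_fd_handler(ctx, fd, false, NULL, NULL, NULL, NULL);
        aio_context_unref(ctx);
    }
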
diff --git a/util/aio-posix.c b/util/aio-posix.c new file mode 100644 index 000000000..2b86777e9 --- /dev/null +++ b/util/aio-posix.c @@ -0,0 +1,730 @@ +/* + * QEMU aio implementation + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" +#include "block/block.h" +#include "qemu/main-loop.h" +#include "qemu/rcu.h" +#include "qemu/rcu_queue.h" +#include "qemu/sockets.h" +#include "qemu/cutils.h" +#include "trace.h" +#include "aio-posix.h" + +/* Stop userspace polling on a handler if it isn't active for some time */ +#define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND) + +bool aio_poll_disabled(AioContext *ctx) +{ + return qatomic_read(&ctx->poll_disable_cnt); +} + +void aio_add_ready_handler(AioHandlerList *ready_list, + AioHandler *node, + int revents) +{ + QLIST_SAFE_REMOVE(node, node_ready); /* remove from nested parent's list */ + node->pfd.revents = revents; + QLIST_INSERT_HEAD(ready_list, node, node_ready); +} + +static AioHandler *find_aio_handler(AioContext *ctx, int fd) +{ + AioHandler *node; + + QLIST_FOREACH(node, &ctx->aio_handlers, node) { + if (node->pfd.fd == fd) { + if (!QLIST_IS_INSERTED(node, node_deleted)) { + return node; + } + } + } + + return NULL; +} + +static bool aio_remove_fd_handler(AioContext *ctx, AioHandler *node) +{ + /* If the GSource is in the process of being destroyed then + * g_source_remove_poll() causes an assertion failure. Skip + * removal in that case, because glib cleans up its state during + * destruction anyway. + */ + if (!g_source_is_destroyed(&ctx->source)) { + g_source_remove_poll(&ctx->source, &node->pfd); + } + + node->pfd.revents = 0; + + /* If the fd monitor has already marked it deleted, leave it alone */ + if (QLIST_IS_INSERTED(node, node_deleted)) { + return false; + } + + /* If a read is in progress, just mark the node as deleted */ + if (qemu_lockcnt_count(&ctx->list_lock)) { + QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted); + return false; + } + /* Otherwise, delete it for real. We can't just mark it as + * deleted because deleted nodes are only cleaned up while + * no one is walking the handlers list. + */ + QLIST_SAFE_REMOVE(node, node_poll); + QLIST_REMOVE(node, node); + return true; +} + +void aio_set_fd_handler(AioContext *ctx, + int fd, + bool is_external, + IOHandler *io_read, + IOHandler *io_write, + AioPollFn *io_poll, + void *opaque) +{ + AioHandler *node; + AioHandler *new_node = NULL; + bool is_new = false; + bool deleted = false; + int poll_disable_change; + + qemu_lockcnt_lock(&ctx->list_lock); + + node = find_aio_handler(ctx, fd); + + /* Are we deleting the fd handler? */ + if (!io_read && !io_write && !io_poll) { + if (node == NULL) { + qemu_lockcnt_unlock(&ctx->list_lock); + return; + } + /* Clean events in order to unregister fd from the ctx epoll. 
*/ + node->pfd.events = 0; + + poll_disable_change = -!node->io_poll; + } else { + poll_disable_change = !io_poll - (node && !node->io_poll); + if (node == NULL) { + is_new = true; + } + /* Alloc and insert if it's not already there */ + new_node = g_new0(AioHandler, 1); + + /* Update handler with latest information */ + new_node->io_read = io_read; + new_node->io_write = io_write; + new_node->io_poll = io_poll; + new_node->opaque = opaque; + new_node->is_external = is_external; + + if (is_new) { + new_node->pfd.fd = fd; + } else { + new_node->pfd = node->pfd; + } + g_source_add_poll(&ctx->source, &new_node->pfd); + + new_node->pfd.events = (io_read ? G_IO_IN | G_IO_HUP | G_IO_ERR : 0); + new_node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0); + + QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, new_node, node); + } + + /* No need to order poll_disable_cnt writes against other updates; + * the counter is only used to avoid wasting time and latency on + * iterated polling when the system call will be ultimately necessary. + * Changing handlers is a rare event, and a little wasted polling until + * the aio_notify below is not an issue. + */ + qatomic_set(&ctx->poll_disable_cnt, + qatomic_read(&ctx->poll_disable_cnt) + poll_disable_change); + + ctx->fdmon_ops->update(ctx, node, new_node); + if (node) { + deleted = aio_remove_fd_handler(ctx, node); + } + qemu_lockcnt_unlock(&ctx->list_lock); + aio_notify(ctx); + + if (deleted) { + g_free(node); + } +} + +void aio_set_fd_poll(AioContext *ctx, int fd, + IOHandler *io_poll_begin, + IOHandler *io_poll_end) +{ + AioHandler *node = find_aio_handler(ctx, fd); + + if (!node) { + return; + } + + node->io_poll_begin = io_poll_begin; + node->io_poll_end = io_poll_end; +} + +void aio_set_event_notifier(AioContext *ctx, + EventNotifier *notifier, + bool is_external, + EventNotifierHandler *io_read, + AioPollFn *io_poll) +{ + aio_set_fd_handler(ctx, event_notifier_get_fd(notifier), is_external, + (IOHandler *)io_read, NULL, io_poll, notifier); +} + +void aio_set_event_notifier_poll(AioContext *ctx, + EventNotifier *notifier, + EventNotifierHandler *io_poll_begin, + EventNotifierHandler *io_poll_end) +{ + aio_set_fd_poll(ctx, event_notifier_get_fd(notifier), + (IOHandler *)io_poll_begin, + (IOHandler *)io_poll_end); +} + +static bool poll_set_started(AioContext *ctx, bool started) +{ + AioHandler *node; + bool progress = false; + + if (started == ctx->poll_started) { + return false; + } + + ctx->poll_started = started; + + qemu_lockcnt_inc(&ctx->list_lock); + QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) { + IOHandler *fn; + + if (QLIST_IS_INSERTED(node, node_deleted)) { + continue; + } + + if (started) { + fn = node->io_poll_begin; + } else { + fn = node->io_poll_end; + } + + if (fn) { + fn(node->opaque); + } + + /* Poll one last time in case ->io_poll_end() raced with the event */ + if (!started) { + progress = node->io_poll(node->opaque) || progress; + } + } + qemu_lockcnt_dec(&ctx->list_lock); + + return progress; +} + + +bool aio_prepare(AioContext *ctx) +{ + /* Poll mode cannot be used with glib's event loop, disable it. */ + poll_set_started(ctx, false); + + return false; +} + +bool aio_pending(AioContext *ctx) +{ + AioHandler *node; + bool result = false; + + /* + * We have to walk very carefully in case aio_set_fd_handler is + * called while we're walking. 
+ */ + qemu_lockcnt_inc(&ctx->list_lock); + + QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { + int revents; + + revents = node->pfd.revents & node->pfd.events; + if (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR) && node->io_read && + aio_node_check(ctx, node->is_external)) { + result = true; + break; + } + if (revents & (G_IO_OUT | G_IO_ERR) && node->io_write && + aio_node_check(ctx, node->is_external)) { + result = true; + break; + } + } + qemu_lockcnt_dec(&ctx->list_lock); + + return result; +} + +static void aio_free_deleted_handlers(AioContext *ctx) +{ + AioHandler *node; + + if (QLIST_EMPTY_RCU(&ctx->deleted_aio_handlers)) { + return; + } + if (!qemu_lockcnt_dec_if_lock(&ctx->list_lock)) { + return; /* we are nested, let the parent do the freeing */ + } + + while ((node = QLIST_FIRST_RCU(&ctx->deleted_aio_handlers))) { + QLIST_REMOVE(node, node); + QLIST_REMOVE(node, node_deleted); + QLIST_SAFE_REMOVE(node, node_poll); + g_free(node); + } + + qemu_lockcnt_inc_and_unlock(&ctx->list_lock); +} + +static bool aio_dispatch_handler(AioContext *ctx, AioHandler *node) +{ + bool progress = false; + int revents; + + revents = node->pfd.revents & node->pfd.events; + node->pfd.revents = 0; + + /* + * Start polling AioHandlers when they become ready because activity is + * likely to continue. Note that starvation is theoretically possible when + * fdmon_supports_polling(), but only until the fd fires for the first + * time. + */ + if (!QLIST_IS_INSERTED(node, node_deleted) && + !QLIST_IS_INSERTED(node, node_poll) && + node->io_poll) { + trace_poll_add(ctx, node, node->pfd.fd, revents); + if (ctx->poll_started && node->io_poll_begin) { + node->io_poll_begin(node->opaque); + } + QLIST_INSERT_HEAD(&ctx->poll_aio_handlers, node, node_poll); + } + + if (!QLIST_IS_INSERTED(node, node_deleted) && + (revents & (G_IO_IN | G_IO_HUP | G_IO_ERR)) && + aio_node_check(ctx, node->is_external) && + node->io_read) { + node->io_read(node->opaque); + + /* aio_notify() does not count as progress */ + if (node->opaque != &ctx->notifier) { + progress = true; + } + } + if (!QLIST_IS_INSERTED(node, node_deleted) && + (revents & (G_IO_OUT | G_IO_ERR)) && + aio_node_check(ctx, node->is_external) && + node->io_write) { + node->io_write(node->opaque); + progress = true; + } + + return progress; +} + +/* + * If we have a list of ready handlers then this is more efficient than + * scanning all handlers with aio_dispatch_handlers(). 
+ */ +static bool aio_dispatch_ready_handlers(AioContext *ctx, + AioHandlerList *ready_list) +{ + bool progress = false; + AioHandler *node; + + while ((node = QLIST_FIRST(ready_list))) { + QLIST_REMOVE(node, node_ready); + progress = aio_dispatch_handler(ctx, node) || progress; + } + + return progress; +} + +/* Slower than aio_dispatch_ready_handlers() but only used via glib */ +static bool aio_dispatch_handlers(AioContext *ctx) +{ + AioHandler *node, *tmp; + bool progress = false; + + QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) { + progress = aio_dispatch_handler(ctx, node) || progress; + } + + return progress; +} + +void aio_dispatch(AioContext *ctx) +{ + qemu_lockcnt_inc(&ctx->list_lock); + aio_bh_poll(ctx); + aio_dispatch_handlers(ctx); + aio_free_deleted_handlers(ctx); + qemu_lockcnt_dec(&ctx->list_lock); + + timerlistgroup_run_timers(&ctx->tlg); +} + +static bool run_poll_handlers_once(AioContext *ctx, + int64_t now, + int64_t *timeout) +{ + bool progress = false; + AioHandler *node; + AioHandler *tmp; + + QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) { + if (aio_node_check(ctx, node->is_external) && + node->io_poll(node->opaque)) { + node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS; + + /* + * Polling was successful, exit try_poll_mode immediately + * to adjust the next polling time. + */ + *timeout = 0; + if (node->opaque != &ctx->notifier) { + progress = true; + } + } + + /* Caller handles freeing deleted nodes. Don't do it here. */ + } + + return progress; +} + +static bool fdmon_supports_polling(AioContext *ctx) +{ + return ctx->fdmon_ops->need_wait != aio_poll_disabled; +} + +static bool remove_idle_poll_handlers(AioContext *ctx, int64_t now) +{ + AioHandler *node; + AioHandler *tmp; + bool progress = false; + + /* + * File descriptor monitoring implementations without userspace polling + * support suffer from starvation when a subset of handlers is polled + * because fds will not be processed in a timely fashion. Don't remove + * idle poll handlers. + */ + if (!fdmon_supports_polling(ctx)) { + return false; + } + + QLIST_FOREACH_SAFE(node, &ctx->poll_aio_handlers, node_poll, tmp) { + if (node->poll_idle_timeout == 0LL) { + node->poll_idle_timeout = now + POLL_IDLE_INTERVAL_NS; + } else if (now >= node->poll_idle_timeout) { + trace_poll_remove(ctx, node, node->pfd.fd); + node->poll_idle_timeout = 0LL; + QLIST_SAFE_REMOVE(node, node_poll); + if (ctx->poll_started && node->io_poll_end) { + node->io_poll_end(node->opaque); + + /* + * Final poll in case ->io_poll_end() races with an event. + * Nevermind about re-adding the handler in the rare case where + * this causes progress. + */ + progress = node->io_poll(node->opaque) || progress; + } + } + } + + return progress; +} + +/* run_poll_handlers: + * @ctx: the AioContext + * @max_ns: maximum time to poll for, in nanoseconds + * + * Polls for a given time. + * + * Note that the caller must have incremented ctx->list_lock. + * + * Returns: true if progress was made, false otherwise + */ +static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout) +{ + bool progress; + int64_t start_time, elapsed_time; + + assert(qemu_lockcnt_count(&ctx->list_lock) > 0); + + trace_run_poll_handlers_begin(ctx, max_ns, *timeout); + + /* + * Optimization: ->io_poll() handlers often contain RCU read critical + * sections and we therefore see many rcu_read_lock() -> rcu_read_unlock() + * -> rcu_read_lock() -> ... sequences with expensive memory + * synchronization primitives. 
Make the entire polling loop an RCU + * critical section because nested rcu_read_lock()/rcu_read_unlock() calls + * are cheap. + */ + RCU_READ_LOCK_GUARD(); + + start_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + do { + progress = run_poll_handlers_once(ctx, start_time, timeout); + elapsed_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start_time; + max_ns = qemu_soonest_timeout(*timeout, max_ns); + assert(!(max_ns && progress)); + } while (elapsed_time < max_ns && !ctx->fdmon_ops->need_wait(ctx)); + + if (remove_idle_poll_handlers(ctx, start_time + elapsed_time)) { + *timeout = 0; + progress = true; + } + + /* If time has passed with no successful polling, adjust *timeout to + * keep the same ending time. + */ + if (*timeout != -1) { + *timeout -= MIN(*timeout, elapsed_time); + } + + trace_run_poll_handlers_end(ctx, progress, *timeout); + return progress; +} + +/* try_poll_mode: + * @ctx: the AioContext + * @timeout: timeout for blocking wait, computed by the caller and updated if + * polling succeeds. + * + * Note that the caller must have incremented ctx->list_lock. + * + * Returns: true if progress was made, false otherwise + */ +static bool try_poll_mode(AioContext *ctx, int64_t *timeout) +{ + int64_t max_ns; + + if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) { + return false; + } + + max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns); + if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) { + poll_set_started(ctx, true); + + if (run_poll_handlers(ctx, max_ns, timeout)) { + return true; + } + } + + if (poll_set_started(ctx, false)) { + *timeout = 0; + return true; + } + + return false; +} + +bool aio_poll(AioContext *ctx, bool blocking) +{ + AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list); + int ret = 0; + bool progress; + bool use_notify_me; + int64_t timeout; + int64_t start = 0; + + /* + * There cannot be two concurrent aio_poll calls for the same AioContext (or + * an aio_poll concurrent with a GSource prepare/check/dispatch callback). + * We rely on this below to avoid slow locked accesses to ctx->notify_me. + * + * aio_poll() may only be called in the AioContext's thread. iohandler_ctx + * is special in that it runs in the main thread, but that thread's context + * is qemu_aio_context. + */ + assert(in_aio_context_home_thread(ctx == iohandler_get_aio_context() ? + qemu_get_aio_context() : ctx)); + + qemu_lockcnt_inc(&ctx->list_lock); + + if (ctx->poll_max_ns) { + start = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + } + + timeout = blocking ? aio_compute_timeout(ctx) : 0; + progress = try_poll_mode(ctx, &timeout); + assert(!(timeout && progress)); + + /* + * aio_notify can avoid the expensive event_notifier_set if + * everything (file descriptors, bottom halves, timers) will + * be re-evaluated before the next blocking poll(). This is + * already true when aio_poll is called with blocking == false; + * if blocking == true, it is only true after poll() returns, + * so disable the optimization now. + */ + use_notify_me = timeout != 0; + if (use_notify_me) { + qatomic_set(&ctx->notify_me, qatomic_read(&ctx->notify_me) + 2); + /* + * Write ctx->notify_me before reading ctx->notified. Pairs with + * smp_mb in aio_notify(). + */ + smp_mb(); + + /* Don't block if aio_notify() was called */ + if (qatomic_read(&ctx->notified)) { + timeout = 0; + } + } + + /* If polling is allowed, non-blocking aio_poll does not need the + * system call---a single round of run_poll_handlers_once suffices. 
+ */ + if (timeout || ctx->fdmon_ops->need_wait(ctx)) { + ret = ctx->fdmon_ops->wait(ctx, &ready_list, timeout); + } + + if (use_notify_me) { + /* Finish the poll before clearing the flag. */ + qatomic_store_release(&ctx->notify_me, + qatomic_read(&ctx->notify_me) - 2); + } + + aio_notify_accept(ctx); + + /* Adjust polling time */ + if (ctx->poll_max_ns) { + int64_t block_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - start; + + if (block_ns <= ctx->poll_ns) { + /* This is the sweet spot, no adjustment needed */ + } else if (block_ns > ctx->poll_max_ns) { + /* We'd have to poll for too long, poll less */ + int64_t old = ctx->poll_ns; + + if (ctx->poll_shrink) { + ctx->poll_ns /= ctx->poll_shrink; + } else { + ctx->poll_ns = 0; + } + + trace_poll_shrink(ctx, old, ctx->poll_ns); + } else if (ctx->poll_ns < ctx->poll_max_ns && + block_ns < ctx->poll_max_ns) { + /* There is room to grow, poll longer */ + int64_t old = ctx->poll_ns; + int64_t grow = ctx->poll_grow; + + if (grow == 0) { + grow = 2; + } + + if (ctx->poll_ns) { + ctx->poll_ns *= grow; + } else { + ctx->poll_ns = 4000; /* start polling at 4 microseconds */ + } + + if (ctx->poll_ns > ctx->poll_max_ns) { + ctx->poll_ns = ctx->poll_max_ns; + } + + trace_poll_grow(ctx, old, ctx->poll_ns); + } + } + + progress |= aio_bh_poll(ctx); + + if (ret > 0) { + progress |= aio_dispatch_ready_handlers(ctx, &ready_list); + } + + aio_free_deleted_handlers(ctx); + + qemu_lockcnt_dec(&ctx->list_lock); + + progress |= timerlistgroup_run_timers(&ctx->tlg); + + return progress; +} + +void aio_context_setup(AioContext *ctx) +{ + ctx->fdmon_ops = &fdmon_poll_ops; + ctx->epollfd = -1; + + /* Use the fastest fd monitoring implementation if available */ + if (fdmon_io_uring_setup(ctx)) { + return; + } + + fdmon_epoll_setup(ctx); +} + +void aio_context_destroy(AioContext *ctx) +{ + fdmon_io_uring_destroy(ctx); + fdmon_epoll_disable(ctx); + aio_free_deleted_handlers(ctx); +} + +void aio_context_use_g_source(AioContext *ctx) +{ + /* + * Disable io_uring when the glib main loop is used because it doesn't + * support mixed glib/aio_poll() usage. It relies on aio_poll() being + * called regularly so that changes to the monitored file descriptors are + * submitted, otherwise a list of pending fd handlers builds up. + */ + fdmon_io_uring_destroy(ctx); + aio_free_deleted_handlers(ctx); +} + +void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, + int64_t grow, int64_t shrink, Error **errp) +{ + /* No thread synchronization here, it doesn't matter if an incorrect value + * is used once. + */ + ctx->poll_max_ns = max_ns; + ctx->poll_ns = 0; + ctx->poll_grow = grow; + ctx->poll_shrink = shrink; + + aio_notify(ctx); +} + +void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch, + Error **errp) +{ + /* + * No thread synchronization here, it doesn't matter if an incorrect value + * is used once. + */ + ctx->aio_max_batch = max_batch; + + aio_notify(ctx); +} diff --git a/util/aio-posix.h b/util/aio-posix.h new file mode 100644 index 000000000..c80c04506 --- /dev/null +++ b/util/aio-posix.h @@ -0,0 +1,81 @@ +/* + * AioContext POSIX event loop implementation internal APIs + * + * Copyright IBM, Corp. 2008 + * Copyright Red Hat, Inc. 2020 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. 
+ */ + +#ifndef AIO_POSIX_H +#define AIO_POSIX_H + +#include "block/aio.h" + +struct AioHandler { + GPollFD pfd; + IOHandler *io_read; + IOHandler *io_write; + AioPollFn *io_poll; + IOHandler *io_poll_begin; + IOHandler *io_poll_end; + void *opaque; + QLIST_ENTRY(AioHandler) node; + QLIST_ENTRY(AioHandler) node_ready; /* only used during aio_poll() */ + QLIST_ENTRY(AioHandler) node_deleted; + QLIST_ENTRY(AioHandler) node_poll; +#ifdef CONFIG_LINUX_IO_URING + QSLIST_ENTRY(AioHandler) node_submitted; + unsigned flags; /* see fdmon-io_uring.c */ +#endif + int64_t poll_idle_timeout; /* when to stop userspace polling */ + bool is_external; +}; + +/* Add a handler to a ready list */ +void aio_add_ready_handler(AioHandlerList *ready_list, AioHandler *node, + int revents); + +extern const FDMonOps fdmon_poll_ops; + +#ifdef CONFIG_EPOLL_CREATE1 +bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd); +void fdmon_epoll_setup(AioContext *ctx); +void fdmon_epoll_disable(AioContext *ctx); +#else +static inline bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd) +{ + return false; +} + +static inline void fdmon_epoll_setup(AioContext *ctx) +{ +} + +static inline void fdmon_epoll_disable(AioContext *ctx) +{ +} +#endif /* !CONFIG_EPOLL_CREATE1 */ + +#ifdef CONFIG_LINUX_IO_URING +bool fdmon_io_uring_setup(AioContext *ctx); +void fdmon_io_uring_destroy(AioContext *ctx); +#else +static inline bool fdmon_io_uring_setup(AioContext *ctx) +{ + return false; +} + +static inline void fdmon_io_uring_destroy(AioContext *ctx) +{ +} +#endif /* !CONFIG_LINUX_IO_URING */ + +#endif /* AIO_POSIX_H */ diff --git a/util/aio-wait.c b/util/aio-wait.c new file mode 100644 index 000000000..bdb3d3af2 --- /dev/null +++ b/util/aio-wait.c @@ -0,0 +1,72 @@ +/* + * AioContext wait support + * + * Copyright (C) 2018 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu/main-loop.h" +#include "block/aio-wait.h" + +AioWait global_aio_wait; + +static void dummy_bh_cb(void *opaque) +{ + /* The point is to make AIO_WAIT_WHILE()'s aio_poll() return */ +} + +void aio_wait_kick(void) +{ + /* The barrier (or an atomic op) is in the caller. 
*/ + if (qatomic_read(&global_aio_wait.num_waiters)) { + aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL); + } +} + +typedef struct { + bool done; + QEMUBHFunc *cb; + void *opaque; +} AioWaitBHData; + +/* Context: BH in IOThread */ +static void aio_wait_bh(void *opaque) +{ + AioWaitBHData *data = opaque; + + data->cb(data->opaque); + + data->done = true; + aio_wait_kick(); +} + +void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque) +{ + AioWaitBHData data = { + .cb = cb, + .opaque = opaque, + }; + + assert(qemu_get_current_aio_context() == qemu_get_aio_context()); + + aio_bh_schedule_oneshot(ctx, aio_wait_bh, &data); + AIO_WAIT_WHILE(ctx, !data.done); +} diff --git a/util/aio-win32.c b/util/aio-win32.c new file mode 100644 index 000000000..d5b09a119 --- /dev/null +++ b/util/aio-win32.c @@ -0,0 +1,447 @@ +/* + * QEMU aio implementation + * + * Copyright IBM Corp., 2008 + * Copyright Red Hat Inc., 2012 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "block/block.h" +#include "qemu/main-loop.h" +#include "qemu/queue.h" +#include "qemu/sockets.h" +#include "qapi/error.h" +#include "qemu/rcu_queue.h" + +struct AioHandler { + EventNotifier *e; + IOHandler *io_read; + IOHandler *io_write; + EventNotifierHandler *io_notify; + GPollFD pfd; + int deleted; + void *opaque; + bool is_external; + QLIST_ENTRY(AioHandler) node; +}; + +static void aio_remove_fd_handler(AioContext *ctx, AioHandler *node) +{ + /* + * If the GSource is in the process of being destroyed then + * g_source_remove_poll() causes an assertion failure. Skip + * removal in that case, because glib cleans up its state during + * destruction anyway. + */ + if (!g_source_is_destroyed(&ctx->source)) { + g_source_remove_poll(&ctx->source, &node->pfd); + } + + /* If aio_poll is in progress, just mark the node as deleted */ + if (qemu_lockcnt_count(&ctx->list_lock)) { + node->deleted = 1; + node->pfd.revents = 0; + } else { + /* Otherwise, delete it for real. We can't just mark it as + * deleted because deleted nodes are only cleaned up after + * releasing the list_lock. 
+ */ + QLIST_REMOVE(node, node); + g_free(node); + } +} + +void aio_set_fd_handler(AioContext *ctx, + int fd, + bool is_external, + IOHandler *io_read, + IOHandler *io_write, + AioPollFn *io_poll, + void *opaque) +{ + /* fd is a SOCKET in our case */ + AioHandler *old_node; + AioHandler *node = NULL; + + qemu_lockcnt_lock(&ctx->list_lock); + QLIST_FOREACH(old_node, &ctx->aio_handlers, node) { + if (old_node->pfd.fd == fd && !old_node->deleted) { + break; + } + } + + if (io_read || io_write) { + HANDLE event; + long bitmask = 0; + + /* Alloc and insert if it's not already there */ + node = g_new0(AioHandler, 1); + node->pfd.fd = fd; + + node->pfd.events = 0; + if (node->io_read) { + node->pfd.events |= G_IO_IN; + } + if (node->io_write) { + node->pfd.events |= G_IO_OUT; + } + + node->e = &ctx->notifier; + + /* Update handler with latest information */ + node->opaque = opaque; + node->io_read = io_read; + node->io_write = io_write; + node->is_external = is_external; + + if (io_read) { + bitmask |= FD_READ | FD_ACCEPT | FD_CLOSE; + } + + if (io_write) { + bitmask |= FD_WRITE | FD_CONNECT; + } + + QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, node, node); + event = event_notifier_get_handle(&ctx->notifier); + WSAEventSelect(node->pfd.fd, event, bitmask); + } + if (old_node) { + aio_remove_fd_handler(ctx, old_node); + } + + qemu_lockcnt_unlock(&ctx->list_lock); + aio_notify(ctx); +} + +void aio_set_fd_poll(AioContext *ctx, int fd, + IOHandler *io_poll_begin, + IOHandler *io_poll_end) +{ + /* Not implemented */ +} + +void aio_set_event_notifier(AioContext *ctx, + EventNotifier *e, + bool is_external, + EventNotifierHandler *io_notify, + AioPollFn *io_poll) +{ + AioHandler *node; + + qemu_lockcnt_lock(&ctx->list_lock); + QLIST_FOREACH(node, &ctx->aio_handlers, node) { + if (node->e == e && !node->deleted) { + break; + } + } + + /* Are we deleting the fd handler? */ + if (!io_notify) { + if (node) { + aio_remove_fd_handler(ctx, node); + } + } else { + if (node == NULL) { + /* Alloc and insert if it's not already there */ + node = g_new0(AioHandler, 1); + node->e = e; + node->pfd.fd = (uintptr_t)event_notifier_get_handle(e); + node->pfd.events = G_IO_IN; + node->is_external = is_external; + QLIST_INSERT_HEAD_RCU(&ctx->aio_handlers, node, node); + + g_source_add_poll(&ctx->source, &node->pfd); + } + /* Update handler with latest information */ + node->io_notify = io_notify; + } + + qemu_lockcnt_unlock(&ctx->list_lock); + aio_notify(ctx); +} + +void aio_set_event_notifier_poll(AioContext *ctx, + EventNotifier *notifier, + EventNotifierHandler *io_poll_begin, + EventNotifierHandler *io_poll_end) +{ + /* Not implemented */ +} + +bool aio_prepare(AioContext *ctx) +{ + static struct timeval tv0; + AioHandler *node; + bool have_select_revents = false; + fd_set rfds, wfds; + + /* + * We have to walk very carefully in case aio_set_fd_handler is + * called while we're walking. 
+ */ + qemu_lockcnt_inc(&ctx->list_lock); + + /* fill fd sets */ + FD_ZERO(&rfds); + FD_ZERO(&wfds); + QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { + if (node->io_read) { + FD_SET ((SOCKET)node->pfd.fd, &rfds); + } + if (node->io_write) { + FD_SET ((SOCKET)node->pfd.fd, &wfds); + } + } + + if (select(0, &rfds, &wfds, NULL, &tv0) > 0) { + QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { + node->pfd.revents = 0; + if (FD_ISSET(node->pfd.fd, &rfds)) { + node->pfd.revents |= G_IO_IN; + have_select_revents = true; + } + + if (FD_ISSET(node->pfd.fd, &wfds)) { + node->pfd.revents |= G_IO_OUT; + have_select_revents = true; + } + } + } + + qemu_lockcnt_dec(&ctx->list_lock); + return have_select_revents; +} + +bool aio_pending(AioContext *ctx) +{ + AioHandler *node; + bool result = false; + + /* + * We have to walk very carefully in case aio_set_fd_handler is + * called while we're walking. + */ + qemu_lockcnt_inc(&ctx->list_lock); + QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { + if (node->pfd.revents && node->io_notify) { + result = true; + break; + } + + if ((node->pfd.revents & G_IO_IN) && node->io_read) { + result = true; + break; + } + if ((node->pfd.revents & G_IO_OUT) && node->io_write) { + result = true; + break; + } + } + + qemu_lockcnt_dec(&ctx->list_lock); + return result; +} + +static bool aio_dispatch_handlers(AioContext *ctx, HANDLE event) +{ + AioHandler *node; + bool progress = false; + AioHandler *tmp; + + /* + * We have to walk very carefully in case aio_set_fd_handler is + * called while we're walking. + */ + QLIST_FOREACH_SAFE_RCU(node, &ctx->aio_handlers, node, tmp) { + int revents = node->pfd.revents; + + if (!node->deleted && + (revents || event_notifier_get_handle(node->e) == event) && + node->io_notify) { + node->pfd.revents = 0; + node->io_notify(node->e); + + /* aio_notify() does not count as progress */ + if (node->e != &ctx->notifier) { + progress = true; + } + } + + if (!node->deleted && + (node->io_read || node->io_write)) { + node->pfd.revents = 0; + if ((revents & G_IO_IN) && node->io_read) { + node->io_read(node->opaque); + progress = true; + } + if ((revents & G_IO_OUT) && node->io_write) { + node->io_write(node->opaque); + progress = true; + } + + /* if the next select() will return an event, we have progressed */ + if (event == event_notifier_get_handle(&ctx->notifier)) { + WSANETWORKEVENTS ev; + WSAEnumNetworkEvents(node->pfd.fd, event, &ev); + if (ev.lNetworkEvents) { + progress = true; + } + } + } + + if (node->deleted) { + if (qemu_lockcnt_dec_if_lock(&ctx->list_lock)) { + QLIST_REMOVE(node, node); + g_free(node); + qemu_lockcnt_inc_and_unlock(&ctx->list_lock); + } + } + } + + return progress; +} + +void aio_dispatch(AioContext *ctx) +{ + qemu_lockcnt_inc(&ctx->list_lock); + aio_bh_poll(ctx); + aio_dispatch_handlers(ctx, INVALID_HANDLE_VALUE); + qemu_lockcnt_dec(&ctx->list_lock); + timerlistgroup_run_timers(&ctx->tlg); +} + +bool aio_poll(AioContext *ctx, bool blocking) +{ + AioHandler *node; + HANDLE events[MAXIMUM_WAIT_OBJECTS + 1]; + bool progress, have_select_revents, first; + int count; + int timeout; + + /* + * There cannot be two concurrent aio_poll calls for the same AioContext (or + * an aio_poll concurrent with a GSource prepare/check/dispatch callback). + * We rely on this below to avoid slow locked accesses to ctx->notify_me. + * + * aio_poll() may only be called in the AioContext's thread. iohandler_ctx + * is special in that it runs in the main thread, but that thread's context + * is qemu_aio_context. 
+ */ + assert(in_aio_context_home_thread(ctx == iohandler_get_aio_context() ? + qemu_get_aio_context() : ctx)); + progress = false; + + /* aio_notify can avoid the expensive event_notifier_set if + * everything (file descriptors, bottom halves, timers) will + * be re-evaluated before the next blocking poll(). This is + * already true when aio_poll is called with blocking == false; + * if blocking == true, it is only true after poll() returns, + * so disable the optimization now. + */ + if (blocking) { + qatomic_set(&ctx->notify_me, qatomic_read(&ctx->notify_me) + 2); + /* + * Write ctx->notify_me before computing the timeout + * (reading bottom half flags, etc.). Pairs with + * smp_mb in aio_notify(). + */ + smp_mb(); + } + + qemu_lockcnt_inc(&ctx->list_lock); + have_select_revents = aio_prepare(ctx); + + /* fill fd sets */ + count = 0; + QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { + if (!node->deleted && node->io_notify + && aio_node_check(ctx, node->is_external)) { + events[count++] = event_notifier_get_handle(node->e); + } + } + + first = true; + + /* ctx->notifier is always registered. */ + assert(count > 0); + + /* Multiple iterations, all of them non-blocking except the first, + * may be necessary to process all pending events. After the first + * WaitForMultipleObjects call ctx->notify_me will be decremented. + */ + do { + HANDLE event; + int ret; + + timeout = blocking && !have_select_revents + ? qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)) : 0; + ret = WaitForMultipleObjects(count, events, FALSE, timeout); + if (blocking) { + assert(first); + qatomic_store_release(&ctx->notify_me, + qatomic_read(&ctx->notify_me) - 2); + aio_notify_accept(ctx); + } + + if (first) { + progress |= aio_bh_poll(ctx); + first = false; + } + + /* if we have any signaled events, dispatch event */ + event = NULL; + if ((DWORD) (ret - WAIT_OBJECT_0) < count) { + event = events[ret - WAIT_OBJECT_0]; + events[ret - WAIT_OBJECT_0] = events[--count]; + } else if (!have_select_revents) { + break; + } + + have_select_revents = false; + blocking = false; + + progress |= aio_dispatch_handlers(ctx, event); + } while (count > 0); + + qemu_lockcnt_dec(&ctx->list_lock); + + progress |= timerlistgroup_run_timers(&ctx->tlg); + return progress; +} + +void aio_context_setup(AioContext *ctx) +{ +} + +void aio_context_destroy(AioContext *ctx) +{ +} + +void aio_context_use_g_source(AioContext *ctx) +{ +} + +void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, + int64_t grow, int64_t shrink, Error **errp) +{ + if (max_ns) { + error_setg(errp, "AioContext polling is not implemented on Windows"); + } +} + +void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch, + Error **errp) +{ +} diff --git a/util/aiocb.c b/util/aiocb.c new file mode 100644 index 000000000..5aef3a069 --- /dev/null +++ b/util/aiocb.c @@ -0,0 +1,55 @@ +/* + * BlockAIOCB allocation + * + * Copyright (c) 2003-2017 Fabrice Bellard and other QEMU contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial 
portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "block/aio.h" + +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + BlockAIOCB *acb; + + acb = g_malloc(aiocb_info->aiocb_size); + acb->aiocb_info = aiocb_info; + acb->bs = bs; + acb->cb = cb; + acb->opaque = opaque; + acb->refcnt = 1; + return acb; +} + +void qemu_aio_ref(void *p) +{ + BlockAIOCB *acb = p; + acb->refcnt++; +} + +void qemu_aio_unref(void *p) +{ + BlockAIOCB *acb = p; + assert(acb->refcnt > 0); + if (--acb->refcnt == 0) { + g_free(acb); + } +} diff --git a/util/async.c b/util/async.c new file mode 100644 index 000000000..6f6717a34 --- /dev/null +++ b/util/async.c @@ -0,0 +1,690 @@ +/* + * Data plane event loop + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (c) 2009-2017 QEMU contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "block/aio.h" +#include "block/thread-pool.h" +#include "qemu/main-loop.h" +#include "qemu/atomic.h" +#include "qemu/rcu_queue.h" +#include "block/raw-aio.h" +#include "qemu/coroutine_int.h" +#include "trace.h" + +/***********************************************************/ +/* bottom halves (can be seen as timers which expire ASAP) */ + +/* QEMUBH::flags values */ +enum { + /* Already enqueued and waiting for aio_bh_poll() */ + BH_PENDING = (1 << 0), + + /* Invoke the callback */ + BH_SCHEDULED = (1 << 1), + + /* Delete without invoking callback */ + BH_DELETED = (1 << 2), + + /* Delete after invoking callback */ + BH_ONESHOT = (1 << 3), + + /* Schedule periodically when the event loop is idle */ + BH_IDLE = (1 << 4), +}; + +struct QEMUBH { + AioContext *ctx; + const char *name; + QEMUBHFunc *cb; + void *opaque; + QSLIST_ENTRY(QEMUBH) next; + unsigned flags; +}; + +/* Called concurrently from any thread */ +static void aio_bh_enqueue(QEMUBH *bh, unsigned new_flags) +{ + AioContext *ctx = bh->ctx; + unsigned old_flags; + + /* + * The memory barrier implicit in qatomic_fetch_or makes sure that: + * 1. idle & any writes needed by the callback are done before the + * locations are read in the aio_bh_poll. + * 2. ctx is loaded before the callback has a chance to execute and bh + * could be freed. + */ + old_flags = qatomic_fetch_or(&bh->flags, BH_PENDING | new_flags); + if (!(old_flags & BH_PENDING)) { + QSLIST_INSERT_HEAD_ATOMIC(&ctx->bh_list, bh, next); + } + + aio_notify(ctx); +} + +/* Only called from aio_bh_poll() and aio_ctx_finalize() */ +static QEMUBH *aio_bh_dequeue(BHList *head, unsigned *flags) +{ + QEMUBH *bh = QSLIST_FIRST_RCU(head); + + if (!bh) { + return NULL; + } + + QSLIST_REMOVE_HEAD(head, next); + + /* + * The qatomic_and is paired with aio_bh_enqueue(). The implicit memory + * barrier ensures that the callback sees all writes done by the scheduling + * thread. It also ensures that the scheduling thread sees the cleared + * flag before bh->cb has run, and thus will call aio_notify again if + * necessary. + */ + *flags = qatomic_fetch_and(&bh->flags, + ~(BH_PENDING | BH_SCHEDULED | BH_IDLE)); + return bh; +} + +void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, + void *opaque, const char *name) +{ + QEMUBH *bh; + bh = g_new(QEMUBH, 1); + *bh = (QEMUBH){ + .ctx = ctx, + .cb = cb, + .opaque = opaque, + .name = name, + }; + aio_bh_enqueue(bh, BH_SCHEDULED | BH_ONESHOT); +} + +QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque, + const char *name) +{ + QEMUBH *bh; + bh = g_new(QEMUBH, 1); + *bh = (QEMUBH){ + .ctx = ctx, + .cb = cb, + .opaque = opaque, + .name = name, + }; + return bh; +} + +void aio_bh_call(QEMUBH *bh) +{ + bh->cb(bh->opaque); +} + +/* Multiple occurrences of aio_bh_poll cannot be called concurrently. 
*/ +int aio_bh_poll(AioContext *ctx) +{ + BHListSlice slice; + BHListSlice *s; + int ret = 0; + + QSLIST_MOVE_ATOMIC(&slice.bh_list, &ctx->bh_list); + QSIMPLEQ_INSERT_TAIL(&ctx->bh_slice_list, &slice, next); + + while ((s = QSIMPLEQ_FIRST(&ctx->bh_slice_list))) { + QEMUBH *bh; + unsigned flags; + + bh = aio_bh_dequeue(&s->bh_list, &flags); + if (!bh) { + QSIMPLEQ_REMOVE_HEAD(&ctx->bh_slice_list, next); + continue; + } + + if ((flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) { + /* Idle BHs don't count as progress */ + if (!(flags & BH_IDLE)) { + ret = 1; + } + aio_bh_call(bh); + } + if (flags & (BH_DELETED | BH_ONESHOT)) { + g_free(bh); + } + } + + return ret; +} + +void qemu_bh_schedule_idle(QEMUBH *bh) +{ + aio_bh_enqueue(bh, BH_SCHEDULED | BH_IDLE); +} + +void qemu_bh_schedule(QEMUBH *bh) +{ + aio_bh_enqueue(bh, BH_SCHEDULED); +} + +/* This func is async. + */ +void qemu_bh_cancel(QEMUBH *bh) +{ + qatomic_and(&bh->flags, ~BH_SCHEDULED); +} + +/* This func is async.The bottom half will do the delete action at the finial + * end. + */ +void qemu_bh_delete(QEMUBH *bh) +{ + aio_bh_enqueue(bh, BH_DELETED); +} + +static int64_t aio_compute_bh_timeout(BHList *head, int timeout) +{ + QEMUBH *bh; + + QSLIST_FOREACH_RCU(bh, head, next) { + if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) { + if (bh->flags & BH_IDLE) { + /* idle bottom halves will be polled at least + * every 10ms */ + timeout = 10000000; + } else { + /* non-idle bottom halves will be executed + * immediately */ + return 0; + } + } + } + + return timeout; +} + +int64_t +aio_compute_timeout(AioContext *ctx) +{ + BHListSlice *s; + int64_t deadline; + int timeout = -1; + + timeout = aio_compute_bh_timeout(&ctx->bh_list, timeout); + if (timeout == 0) { + return 0; + } + + QSIMPLEQ_FOREACH(s, &ctx->bh_slice_list, next) { + timeout = aio_compute_bh_timeout(&s->bh_list, timeout); + if (timeout == 0) { + return 0; + } + } + + deadline = timerlistgroup_deadline_ns(&ctx->tlg); + if (deadline == 0) { + return 0; + } else { + return qemu_soonest_timeout(timeout, deadline); + } +} + +static gboolean +aio_ctx_prepare(GSource *source, gint *timeout) +{ + AioContext *ctx = (AioContext *) source; + + qatomic_set(&ctx->notify_me, qatomic_read(&ctx->notify_me) | 1); + + /* + * Write ctx->notify_me before computing the timeout + * (reading bottom half flags, etc.). Pairs with + * smp_mb in aio_notify(). + */ + smp_mb(); + + /* We assume there is no timeout already supplied */ + *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx)); + + if (aio_prepare(ctx)) { + *timeout = 0; + } + + return *timeout == 0; +} + +static gboolean +aio_ctx_check(GSource *source) +{ + AioContext *ctx = (AioContext *) source; + QEMUBH *bh; + BHListSlice *s; + + /* Finish computing the timeout before clearing the flag. 
*/ + qatomic_store_release(&ctx->notify_me, qatomic_read(&ctx->notify_me) & ~1); + aio_notify_accept(ctx); + + QSLIST_FOREACH_RCU(bh, &ctx->bh_list, next) { + if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) { + return true; + } + } + + QSIMPLEQ_FOREACH(s, &ctx->bh_slice_list, next) { + QSLIST_FOREACH_RCU(bh, &s->bh_list, next) { + if ((bh->flags & (BH_SCHEDULED | BH_DELETED)) == BH_SCHEDULED) { + return true; + } + } + } + return aio_pending(ctx) || (timerlistgroup_deadline_ns(&ctx->tlg) == 0); +} + +static gboolean +aio_ctx_dispatch(GSource *source, + GSourceFunc callback, + gpointer user_data) +{ + AioContext *ctx = (AioContext *) source; + + assert(callback == NULL); + aio_dispatch(ctx); + return true; +} + +static void +aio_ctx_finalize(GSource *source) +{ + AioContext *ctx = (AioContext *) source; + QEMUBH *bh; + unsigned flags; + + thread_pool_free(ctx->thread_pool); + +#ifdef CONFIG_LINUX_AIO + if (ctx->linux_aio) { + laio_detach_aio_context(ctx->linux_aio, ctx); + laio_cleanup(ctx->linux_aio); + ctx->linux_aio = NULL; + } +#endif + +#ifdef CONFIG_LINUX_IO_URING + if (ctx->linux_io_uring) { + luring_detach_aio_context(ctx->linux_io_uring, ctx); + luring_cleanup(ctx->linux_io_uring); + ctx->linux_io_uring = NULL; + } +#endif + + assert(QSLIST_EMPTY(&ctx->scheduled_coroutines)); + qemu_bh_delete(ctx->co_schedule_bh); + + /* There must be no aio_bh_poll() calls going on */ + assert(QSIMPLEQ_EMPTY(&ctx->bh_slice_list)); + + while ((bh = aio_bh_dequeue(&ctx->bh_list, &flags))) { + /* + * qemu_bh_delete() must have been called on BHs in this AioContext. In + * many cases memory leaks, hangs, or inconsistent state occur when a + * BH is leaked because something still expects it to run. + * + * If you hit this, fix the lifecycle of the BH so that + * qemu_bh_delete() and any associated cleanup is called before the + * AioContext is finalized. 
+ */ + if (unlikely(!(flags & BH_DELETED))) { + fprintf(stderr, "%s: BH '%s' leaked, aborting...\n", + __func__, bh->name); + abort(); + } + + g_free(bh); + } + + aio_set_event_notifier(ctx, &ctx->notifier, false, NULL, NULL); + event_notifier_cleanup(&ctx->notifier); + qemu_rec_mutex_destroy(&ctx->lock); + qemu_lockcnt_destroy(&ctx->list_lock); + timerlistgroup_deinit(&ctx->tlg); + aio_context_destroy(ctx); +} + +static GSourceFuncs aio_source_funcs = { + aio_ctx_prepare, + aio_ctx_check, + aio_ctx_dispatch, + aio_ctx_finalize +}; + +GSource *aio_get_g_source(AioContext *ctx) +{ + aio_context_use_g_source(ctx); + g_source_ref(&ctx->source); + return &ctx->source; +} + +ThreadPool *aio_get_thread_pool(AioContext *ctx) +{ + if (!ctx->thread_pool) { + ctx->thread_pool = thread_pool_new(ctx); + } + return ctx->thread_pool; +} + +#ifdef CONFIG_LINUX_AIO +LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp) +{ + if (!ctx->linux_aio) { + ctx->linux_aio = laio_init(errp); + if (ctx->linux_aio) { + laio_attach_aio_context(ctx->linux_aio, ctx); + } + } + return ctx->linux_aio; +} + +LinuxAioState *aio_get_linux_aio(AioContext *ctx) +{ + assert(ctx->linux_aio); + return ctx->linux_aio; +} +#endif + +#ifdef CONFIG_LINUX_IO_URING +LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp) +{ + if (ctx->linux_io_uring) { + return ctx->linux_io_uring; + } + + ctx->linux_io_uring = luring_init(errp); + if (!ctx->linux_io_uring) { + return NULL; + } + + luring_attach_aio_context(ctx->linux_io_uring, ctx); + return ctx->linux_io_uring; +} + +LuringState *aio_get_linux_io_uring(AioContext *ctx) +{ + assert(ctx->linux_io_uring); + return ctx->linux_io_uring; +} +#endif + +void aio_notify(AioContext *ctx) +{ + /* + * Write e.g. bh->flags before writing ctx->notified. Pairs with smp_mb in + * aio_notify_accept. + */ + smp_wmb(); + qatomic_set(&ctx->notified, true); + + /* + * Write ctx->notified before reading ctx->notify_me. Pairs + * with smp_mb in aio_ctx_prepare or aio_poll. + */ + smp_mb(); + if (qatomic_read(&ctx->notify_me)) { + event_notifier_set(&ctx->notifier); + } +} + +void aio_notify_accept(AioContext *ctx) +{ + qatomic_set(&ctx->notified, false); + + /* + * Write ctx->notified before reading e.g. bh->flags. Pairs with smp_wmb + * in aio_notify. + */ + smp_mb(); +} + +static void aio_timerlist_notify(void *opaque, QEMUClockType type) +{ + aio_notify(opaque); +} + +static void aio_context_notifier_cb(EventNotifier *e) +{ + AioContext *ctx = container_of(e, AioContext, notifier); + + event_notifier_test_and_clear(&ctx->notifier); +} + +/* Returns true if aio_notify() was called (e.g. 
a BH was scheduled) */ +static bool aio_context_notifier_poll(void *opaque) +{ + EventNotifier *e = opaque; + AioContext *ctx = container_of(e, AioContext, notifier); + + return qatomic_read(&ctx->notified); +} + +static void co_schedule_bh_cb(void *opaque) +{ + AioContext *ctx = opaque; + QSLIST_HEAD(, Coroutine) straight, reversed; + + QSLIST_MOVE_ATOMIC(&reversed, &ctx->scheduled_coroutines); + QSLIST_INIT(&straight); + + while (!QSLIST_EMPTY(&reversed)) { + Coroutine *co = QSLIST_FIRST(&reversed); + QSLIST_REMOVE_HEAD(&reversed, co_scheduled_next); + QSLIST_INSERT_HEAD(&straight, co, co_scheduled_next); + } + + while (!QSLIST_EMPTY(&straight)) { + Coroutine *co = QSLIST_FIRST(&straight); + QSLIST_REMOVE_HEAD(&straight, co_scheduled_next); + trace_aio_co_schedule_bh_cb(ctx, co); + aio_context_acquire(ctx); + + /* Protected by write barrier in qemu_aio_coroutine_enter */ + qatomic_set(&co->scheduled, NULL); + qemu_aio_coroutine_enter(ctx, co); + aio_context_release(ctx); + } +} + +AioContext *aio_context_new(Error **errp) +{ + int ret; + AioContext *ctx; + + ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext)); + QSLIST_INIT(&ctx->bh_list); + QSIMPLEQ_INIT(&ctx->bh_slice_list); + aio_context_setup(ctx); + + ret = event_notifier_init(&ctx->notifier, false); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to initialize event notifier"); + goto fail; + } + g_source_set_can_recurse(&ctx->source, true); + qemu_lockcnt_init(&ctx->list_lock); + + ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx); + QSLIST_INIT(&ctx->scheduled_coroutines); + + aio_set_event_notifier(ctx, &ctx->notifier, + false, + aio_context_notifier_cb, + aio_context_notifier_poll); +#ifdef CONFIG_LINUX_AIO + ctx->linux_aio = NULL; +#endif + +#ifdef CONFIG_LINUX_IO_URING + ctx->linux_io_uring = NULL; +#endif + + ctx->thread_pool = NULL; + qemu_rec_mutex_init(&ctx->lock); + timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx); + + ctx->poll_ns = 0; + ctx->poll_max_ns = 0; + ctx->poll_grow = 0; + ctx->poll_shrink = 0; + + ctx->aio_max_batch = 0; + + return ctx; +fail: + g_source_destroy(&ctx->source); + return NULL; +} + +void aio_co_schedule(AioContext *ctx, Coroutine *co) +{ + trace_aio_co_schedule(ctx, co); + const char *scheduled = qatomic_cmpxchg(&co->scheduled, NULL, + __func__); + + if (scheduled) { + fprintf(stderr, + "%s: Co-routine was already scheduled in '%s'\n", + __func__, scheduled); + abort(); + } + + /* The coroutine might run and release the last ctx reference before we + * invoke qemu_bh_schedule(). Take a reference to keep ctx alive until + * we're done. + */ + aio_context_ref(ctx); + + QSLIST_INSERT_HEAD_ATOMIC(&ctx->scheduled_coroutines, + co, co_scheduled_next); + qemu_bh_schedule(ctx->co_schedule_bh); + + aio_context_unref(ctx); +} + +typedef struct AioCoRescheduleSelf { + Coroutine *co; + AioContext *new_ctx; +} AioCoRescheduleSelf; + +static void aio_co_reschedule_self_bh(void *opaque) +{ + AioCoRescheduleSelf *data = opaque; + aio_co_schedule(data->new_ctx, data->co); +} + +void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx) +{ + AioContext *old_ctx = qemu_get_current_aio_context(); + + if (old_ctx != new_ctx) { + AioCoRescheduleSelf data = { + .co = qemu_coroutine_self(), + .new_ctx = new_ctx, + }; + /* + * We can't directly schedule the coroutine in the target context + * because this would be racy: The other thread could try to enter the + * coroutine before it has yielded in this one. 
+ */ + aio_bh_schedule_oneshot(old_ctx, aio_co_reschedule_self_bh, &data); + qemu_coroutine_yield(); + } +} + +void aio_co_wake(struct Coroutine *co) +{ + AioContext *ctx; + + /* Read coroutine before co->ctx. Matches smp_wmb in + * qemu_coroutine_enter. + */ + smp_read_barrier_depends(); + ctx = qatomic_read(&co->ctx); + + aio_co_enter(ctx, co); +} + +void aio_co_enter(AioContext *ctx, struct Coroutine *co) +{ + if (ctx != qemu_get_current_aio_context()) { + aio_co_schedule(ctx, co); + return; + } + + if (qemu_in_coroutine()) { + Coroutine *self = qemu_coroutine_self(); + assert(self != co); + QSIMPLEQ_INSERT_TAIL(&self->co_queue_wakeup, co, co_queue_next); + } else { + aio_context_acquire(ctx); + qemu_aio_coroutine_enter(ctx, co); + aio_context_release(ctx); + } +} + +void aio_context_ref(AioContext *ctx) +{ + g_source_ref(&ctx->source); +} + +void aio_context_unref(AioContext *ctx) +{ + g_source_unref(&ctx->source); +} + +void aio_context_acquire(AioContext *ctx) +{ + qemu_rec_mutex_lock(&ctx->lock); +} + +void aio_context_release(AioContext *ctx) +{ + qemu_rec_mutex_unlock(&ctx->lock); +} + +static __thread AioContext *my_aiocontext; + +AioContext *qemu_get_current_aio_context(void) +{ + if (my_aiocontext) { + return my_aiocontext; + } + if (qemu_mutex_iothread_locked()) { + /* Possibly in a vCPU thread. */ + return qemu_get_aio_context(); + } + return NULL; +} + +void qemu_set_current_aio_context(AioContext *ctx) +{ + assert(!my_aiocontext); + my_aiocontext = ctx; +} diff --git a/util/atomic64.c b/util/atomic64.c new file mode 100644 index 000000000..93037d5b1 --- /dev/null +++ b/util/atomic64.c @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2018, Emilio G. Cota <cota@braap.org> + * + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "qemu/atomic.h" +#include "qemu/thread.h" + +#ifdef CONFIG_ATOMIC64 +#error This file must only be compiled if !CONFIG_ATOMIC64 +#endif + +/* + * When !CONFIG_ATOMIC64, we serialize both reads and writes with spinlocks. + * We use an array of spinlocks, with padding computed at run-time based on + * the host's dcache line size. + * We point to the array with a void * to simplify the padding's computation. + * Each spinlock is located every lock_size bytes. + */ +static void *lock_array; +static size_t lock_size; + +/* + * Systems without CONFIG_ATOMIC64 are unlikely to have many cores, so we use a + * small array of locks. 
+ */ +#define NR_LOCKS 16 + +static QemuSpin *addr_to_lock(const void *addr) +{ + uintptr_t a = (uintptr_t)addr; + uintptr_t idx; + + idx = a >> qemu_dcache_linesize_log; + idx ^= (idx >> 8) ^ (idx >> 16); + idx &= NR_LOCKS - 1; + return lock_array + idx * lock_size; +} + +#define GEN_READ(name, type) \ + type name(const type *ptr) \ + { \ + QemuSpin *lock = addr_to_lock(ptr); \ + type ret; \ + \ + qemu_spin_lock(lock); \ + ret = *ptr; \ + qemu_spin_unlock(lock); \ + return ret; \ + } + +GEN_READ(qatomic_read_i64, int64_t) +GEN_READ(qatomic_read_u64, uint64_t) +#undef GEN_READ + +#define GEN_SET(name, type) \ + void name(type *ptr, type val) \ + { \ + QemuSpin *lock = addr_to_lock(ptr); \ + \ + qemu_spin_lock(lock); \ + *ptr = val; \ + qemu_spin_unlock(lock); \ + } + +GEN_SET(qatomic_set_i64, int64_t) +GEN_SET(qatomic_set_u64, uint64_t) +#undef GEN_SET + +void qatomic64_init(void) +{ + int i; + + lock_size = ROUND_UP(sizeof(QemuSpin), qemu_dcache_linesize); + lock_array = qemu_memalign(qemu_dcache_linesize, lock_size * NR_LOCKS); + for (i = 0; i < NR_LOCKS; i++) { + QemuSpin *lock = lock_array + i * lock_size; + + qemu_spin_init(lock); + } +} diff --git a/util/base64.c b/util/base64.c new file mode 100644 index 000000000..811111ac4 --- /dev/null +++ b/util/base64.c @@ -0,0 +1,60 @@ +/* + * QEMU base64 helpers + * + * Copyright (c) 2015 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + * + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/base64.h" + +static const char *base64_valid_chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=\n"; + +uint8_t *qbase64_decode(const char *input, + size_t in_len, + size_t *out_len, + Error **errp) +{ + *out_len = 0; + + if (in_len != -1) { + /* Lack of NUL terminator is an error */ + if (input[in_len] != '\0') { + error_setg(errp, "Base64 data is not NUL terminated"); + return NULL; + } + /* Check there's no NULs embedded since we expect + * this to be valid base64 data */ + if (memchr(input, '\0', in_len) != NULL) { + error_setg(errp, "Base64 data contains embedded NUL characters"); + return NULL; + } + + /* Now we know its a valid nul terminated string + * strspn is safe to use... */ + } else { + in_len = strlen(input); + } + + if (strspn(input, base64_valid_chars) != in_len) { + error_setg(errp, "Base64 data contains invalid characters"); + return NULL; + } + + return g_base64_decode(input, out_len); +} diff --git a/util/bitmap.c b/util/bitmap.c new file mode 100644 index 000000000..1f201393a --- /dev/null +++ b/util/bitmap.c @@ -0,0 +1,489 @@ +/* + * Bitmap Module + * + * Stolen from linux/src/lib/bitmap.c + * + * Copyright (C) 2010 Corentin Chary + * + * This source code is licensed under the GNU General Public License, + * Version 2. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/bitops.h" +#include "qemu/bitmap.h" +#include "qemu/atomic.h" + +/* + * bitmaps provide an array of bits, implemented using an + * array of unsigned longs. The number of valid bits in a + * given bitmap does _not_ need to be an exact multiple of + * BITS_PER_LONG. + * + * The possible unused bits in the last, partially used word + * of a bitmap are 'don't care'. The implementation makes + * no particular effort to keep them zero. It ensures that + * their value will not affect the results of any operation. + * The bitmap operations that return Boolean (bitmap_empty, + * for example) or scalar (bitmap_weight, for example) results + * carefully filter out these unused bits from impacting their + * results. + * + * These operations actually hold to a slightly stronger rule: + * if you don't input any bitmaps to these ops that have some + * unused bits set, then they won't output any set unused bits + * in output bitmaps. + * + * The byte ordering of bitmaps is more natural on little + * endian architectures. + */ + +int slow_bitmap_empty(const unsigned long *bitmap, long bits) +{ + long k, lim = bits/BITS_PER_LONG; + + for (k = 0; k < lim; ++k) { + if (bitmap[k]) { + return 0; + } + } + if (bits % BITS_PER_LONG) { + if (bitmap[k] & BITMAP_LAST_WORD_MASK(bits)) { + return 0; + } + } + + return 1; +} + +int slow_bitmap_full(const unsigned long *bitmap, long bits) +{ + long k, lim = bits/BITS_PER_LONG; + + for (k = 0; k < lim; ++k) { + if (~bitmap[k]) { + return 0; + } + } + + if (bits % BITS_PER_LONG) { + if (~bitmap[k] & BITMAP_LAST_WORD_MASK(bits)) { + return 0; + } + } + + return 1; +} + +int slow_bitmap_equal(const unsigned long *bitmap1, + const unsigned long *bitmap2, long bits) +{ + long k, lim = bits/BITS_PER_LONG; + + for (k = 0; k < lim; ++k) { + if (bitmap1[k] != bitmap2[k]) { + return 0; + } + } + + if (bits % BITS_PER_LONG) { + if ((bitmap1[k] ^ bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) { + return 0; + } + } + + return 1; +} + +void slow_bitmap_complement(unsigned long *dst, const unsigned long *src, + long bits) +{ + long k, lim = bits/BITS_PER_LONG; + + for (k = 0; k < lim; ++k) { + dst[k] = ~src[k]; + } + + if (bits % BITS_PER_LONG) { + dst[k] = ~src[k] & BITMAP_LAST_WORD_MASK(bits); + } +} + +int slow_bitmap_and(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, long bits) +{ + long k; + long nr = BITS_TO_LONGS(bits); + unsigned long result = 0; + + for (k = 0; k < nr; k++) { + result |= (dst[k] = bitmap1[k] & bitmap2[k]); + } + return result != 0; +} + +void slow_bitmap_or(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, long bits) +{ + long k; + long nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; k++) { + dst[k] = bitmap1[k] | bitmap2[k]; + } +} + +void slow_bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, long bits) +{ + long k; + long nr = BITS_TO_LONGS(bits); + + for (k = 0; k < nr; k++) { + dst[k] = bitmap1[k] ^ bitmap2[k]; + } +} + +int slow_bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, + const unsigned long *bitmap2, long bits) +{ + long k; + long nr = BITS_TO_LONGS(bits); + unsigned long result = 0; + + for (k = 0; k < nr; k++) { + result |= (dst[k] = bitmap1[k] & ~bitmap2[k]); + } + return result != 0; +} + +void bitmap_set(unsigned long *map, long start, long nr) +{ + unsigned long *p = map + BIT_WORD(start); + const long size = start + nr; + int bits_to_set = BITS_PER_LONG - (start % 
BITS_PER_LONG); + unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); + + assert(start >= 0 && nr >= 0); + + while (nr - bits_to_set >= 0) { + *p |= mask_to_set; + nr -= bits_to_set; + bits_to_set = BITS_PER_LONG; + mask_to_set = ~0UL; + p++; + } + if (nr) { + mask_to_set &= BITMAP_LAST_WORD_MASK(size); + *p |= mask_to_set; + } +} + +void bitmap_set_atomic(unsigned long *map, long start, long nr) +{ + unsigned long *p = map + BIT_WORD(start); + const long size = start + nr; + int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); + + assert(start >= 0 && nr >= 0); + + /* First word */ + if (nr - bits_to_set > 0) { + qatomic_or(p, mask_to_set); + nr -= bits_to_set; + bits_to_set = BITS_PER_LONG; + mask_to_set = ~0UL; + p++; + } + + /* Full words */ + if (bits_to_set == BITS_PER_LONG) { + while (nr >= BITS_PER_LONG) { + *p = ~0UL; + nr -= BITS_PER_LONG; + p++; + } + } + + /* Last word */ + if (nr) { + mask_to_set &= BITMAP_LAST_WORD_MASK(size); + qatomic_or(p, mask_to_set); + } else { + /* If we avoided the full barrier in qatomic_or(), issue a + * barrier to account for the assignments in the while loop. + */ + smp_mb(); + } +} + +void bitmap_clear(unsigned long *map, long start, long nr) +{ + unsigned long *p = map + BIT_WORD(start); + const long size = start + nr; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + + assert(start >= 0 && nr >= 0); + + while (nr - bits_to_clear >= 0) { + *p &= ~mask_to_clear; + nr -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = ~0UL; + p++; + } + if (nr) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + *p &= ~mask_to_clear; + } +} + +bool bitmap_test_and_clear_atomic(unsigned long *map, long start, long nr) +{ + unsigned long *p = map + BIT_WORD(start); + const long size = start + nr; + int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); + unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); + unsigned long dirty = 0; + unsigned long old_bits; + + assert(start >= 0 && nr >= 0); + + /* First word */ + if (nr - bits_to_clear > 0) { + old_bits = qatomic_fetch_and(p, ~mask_to_clear); + dirty |= old_bits & mask_to_clear; + nr -= bits_to_clear; + bits_to_clear = BITS_PER_LONG; + mask_to_clear = ~0UL; + p++; + } + + /* Full words */ + if (bits_to_clear == BITS_PER_LONG) { + while (nr >= BITS_PER_LONG) { + if (*p) { + old_bits = qatomic_xchg(p, 0); + dirty |= old_bits; + } + nr -= BITS_PER_LONG; + p++; + } + } + + /* Last word */ + if (nr) { + mask_to_clear &= BITMAP_LAST_WORD_MASK(size); + old_bits = qatomic_fetch_and(p, ~mask_to_clear); + dirty |= old_bits & mask_to_clear; + } else { + if (!dirty) { + smp_mb(); + } + } + + return dirty != 0; +} + +void bitmap_copy_and_clear_atomic(unsigned long *dst, unsigned long *src, + long nr) +{ + while (nr > 0) { + *dst = qatomic_xchg(src, 0); + dst++; + src++; + nr -= BITS_PER_LONG; + } +} + +#define ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) + +/** + * bitmap_find_next_zero_area - find a contiguous aligned zero area + * @map: The address to base the search on + * @size: The bitmap size in bits + * @start: The bitnumber to start searching at + * @nr: The number of zeroed bits we're looking for + * @align_mask: Alignment mask for zero area + * + * The @align_mask should be one less than a power of 2; the effect is that + * the bit offset of all zero areas this function finds is multiples of that + * power of 2. 
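+ * For example, an @align_mask of 3 restricts the zero areas found to
+ * bit offsets that are multiples of 4.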
A @align_mask of 0 means no alignment is required. + */ +unsigned long bitmap_find_next_zero_area(unsigned long *map, + unsigned long size, + unsigned long start, + unsigned long nr, + unsigned long align_mask) +{ + unsigned long index, end, i; +again: + index = find_next_zero_bit(map, size, start); + + /* Align allocation */ + index = ALIGN_MASK(index, align_mask); + + end = index + nr; + if (end > size) { + return end; + } + i = find_next_bit(map, end, index); + if (i < end) { + start = i + 1; + goto again; + } + return index; +} + +int slow_bitmap_intersects(const unsigned long *bitmap1, + const unsigned long *bitmap2, long bits) +{ + long k, lim = bits/BITS_PER_LONG; + + for (k = 0; k < lim; ++k) { + if (bitmap1[k] & bitmap2[k]) { + return 1; + } + } + + if (bits % BITS_PER_LONG) { + if ((bitmap1[k] & bitmap2[k]) & BITMAP_LAST_WORD_MASK(bits)) { + return 1; + } + } + return 0; +} + +long slow_bitmap_count_one(const unsigned long *bitmap, long nbits) +{ + long k, lim = nbits / BITS_PER_LONG, result = 0; + + for (k = 0; k < lim; k++) { + result += ctpopl(bitmap[k]); + } + + if (nbits % BITS_PER_LONG) { + result += ctpopl(bitmap[k] & BITMAP_LAST_WORD_MASK(nbits)); + } + + return result; +} + +static void bitmap_to_from_le(unsigned long *dst, + const unsigned long *src, long nbits) +{ + long len = BITS_TO_LONGS(nbits); + +#ifdef HOST_WORDS_BIGENDIAN + long index; + + for (index = 0; index < len; index++) { +# if HOST_LONG_BITS == 64 + dst[index] = bswap64(src[index]); +# else + dst[index] = bswap32(src[index]); +# endif + } +#else + memcpy(dst, src, len * sizeof(unsigned long)); +#endif +} + +void bitmap_from_le(unsigned long *dst, const unsigned long *src, + long nbits) +{ + bitmap_to_from_le(dst, src, nbits); +} + +void bitmap_to_le(unsigned long *dst, const unsigned long *src, + long nbits) +{ + bitmap_to_from_le(dst, src, nbits); +} + +/* + * Copy "src" bitmap with a positive offset and put it into the "dst" + * bitmap. The caller needs to make sure the bitmap size of "src" + * is bigger than (shift + nbits). + */ +void bitmap_copy_with_src_offset(unsigned long *dst, const unsigned long *src, + unsigned long shift, unsigned long nbits) +{ + unsigned long left_mask, right_mask, last_mask; + + /* Proper shift src pointer to the first word to copy from */ + src += BIT_WORD(shift); + shift %= BITS_PER_LONG; + + if (!shift) { + /* Fast path */ + bitmap_copy(dst, src, nbits); + return; + } + + right_mask = (1ul << shift) - 1; + left_mask = ~right_mask; + + while (nbits >= BITS_PER_LONG) { + *dst = (*src & left_mask) >> shift; + *dst |= (src[1] & right_mask) << (BITS_PER_LONG - shift); + dst++; + src++; + nbits -= BITS_PER_LONG; + } + + if (nbits > BITS_PER_LONG - shift) { + *dst = (*src & left_mask) >> shift; + nbits -= BITS_PER_LONG - shift; + last_mask = (1ul << nbits) - 1; + *dst |= (src[1] & last_mask) << (BITS_PER_LONG - shift); + } else if (nbits) { + last_mask = (1ul << nbits) - 1; + *dst = (*src >> shift) & last_mask; + } +} + +/* + * Copy "src" bitmap into the "dst" bitmap with an offset in the + * "dst". The caller needs to make sure the bitmap size of "dst" is + * bigger than (shift + nbits). 
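+ * For example, with shift == 3 and nbits == 8, bits 0..7 of "src"
+ * end up in bits 3..10 of "dst".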
+ */ +void bitmap_copy_with_dst_offset(unsigned long *dst, const unsigned long *src, + unsigned long shift, unsigned long nbits) +{ + unsigned long left_mask, right_mask, last_mask; + + /* Proper shift dst pointer to the first word to copy from */ + dst += BIT_WORD(shift); + shift %= BITS_PER_LONG; + + if (!shift) { + /* Fast path */ + bitmap_copy(dst, src, nbits); + return; + } + + right_mask = (1ul << (BITS_PER_LONG - shift)) - 1; + left_mask = ~right_mask; + + *dst &= (1ul << shift) - 1; + while (nbits >= BITS_PER_LONG) { + *dst |= (*src & right_mask) << shift; + dst[1] = (*src & left_mask) >> (BITS_PER_LONG - shift); + dst++; + src++; + nbits -= BITS_PER_LONG; + } + + if (nbits > BITS_PER_LONG - shift) { + *dst |= (*src & right_mask) << shift; + nbits -= BITS_PER_LONG - shift; + last_mask = ((1ul << nbits) - 1) << (BITS_PER_LONG - shift); + dst[1] = (*src & last_mask) >> (BITS_PER_LONG - shift); + } else if (nbits) { + last_mask = (1ul << nbits) - 1; + *dst |= (*src & last_mask) << shift; + } +} diff --git a/util/bitops.c b/util/bitops.c new file mode 100644 index 000000000..3fe6b1c4f --- /dev/null +++ b/util/bitops.c @@ -0,0 +1,157 @@ +/* + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * Copyright (C) 2008 IBM Corporation + * Written by Rusty Russell <rusty@rustcorp.com.au> + * (Inspired by David Howell's find_next_bit implementation) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include "qemu/osdep.h" +#include "qemu/bitops.h" + +/* + * Find the next set bit in a memory region. + */ +unsigned long find_next_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG-1); + unsigned long tmp; + + if (offset >= size) { + return size; + } + size -= result; + offset %= BITS_PER_LONG; + if (offset) { + tmp = *(p++); + tmp &= (~0UL << offset); + if (size < BITS_PER_LONG) { + goto found_first; + } + if (tmp) { + goto found_middle; + } + size -= BITS_PER_LONG; + result += BITS_PER_LONG; + } + while (size >= 4*BITS_PER_LONG) { + unsigned long d1, d2, d3; + tmp = *p; + d1 = *(p+1); + d2 = *(p+2); + d3 = *(p+3); + if (tmp) { + goto found_middle; + } + if (d1 | d2 | d3) { + break; + } + p += 4; + result += 4*BITS_PER_LONG; + size -= 4*BITS_PER_LONG; + } + while (size >= BITS_PER_LONG) { + if ((tmp = *(p++))) { + goto found_middle; + } + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) { + return result; + } + tmp = *p; + +found_first: + tmp &= (~0UL >> (BITS_PER_LONG - size)); + if (tmp == 0UL) { /* Are any bits set? */ + return result + size; /* Nope. */ + } +found_middle: + return result + ctzl(tmp); +} + +/* + * This implementation of find_{first,next}_zero_bit was stolen from + * Linus' asm-alpha/bitops.h. 
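+ *
+ * Returns the bit number of the first zero bit at or after @offset,
+ * or @size if the range contains no zero bit.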
+ */ +unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size, + unsigned long offset) +{ + const unsigned long *p = addr + BIT_WORD(offset); + unsigned long result = offset & ~(BITS_PER_LONG-1); + unsigned long tmp; + + if (offset >= size) { + return size; + } + size -= result; + offset %= BITS_PER_LONG; + if (offset) { + tmp = *(p++); + tmp |= ~0UL >> (BITS_PER_LONG - offset); + if (size < BITS_PER_LONG) { + goto found_first; + } + if (~tmp) { + goto found_middle; + } + size -= BITS_PER_LONG; + result += BITS_PER_LONG; + } + while (size & ~(BITS_PER_LONG-1)) { + if (~(tmp = *(p++))) { + goto found_middle; + } + result += BITS_PER_LONG; + size -= BITS_PER_LONG; + } + if (!size) { + return result; + } + tmp = *p; + +found_first: + tmp |= ~0UL << size; + if (tmp == ~0UL) { /* Are any bits zero? */ + return result + size; /* Nope. */ + } +found_middle: + return result + ctzl(~tmp); +} + +unsigned long find_last_bit(const unsigned long *addr, unsigned long size) +{ + unsigned long words; + unsigned long tmp; + + /* Start at final word. */ + words = size / BITS_PER_LONG; + + /* Partial final word? */ + if (size & (BITS_PER_LONG-1)) { + tmp = (addr[words] & (~0UL >> (BITS_PER_LONG + - (size & (BITS_PER_LONG-1))))); + if (tmp) { + goto found; + } + } + + while (words) { + tmp = addr[--words]; + if (tmp) { + found: + return words * BITS_PER_LONG + BITS_PER_LONG - 1 - clzl(tmp); + } + } + + /* Not found */ + return size; +} diff --git a/util/block-helpers.c b/util/block-helpers.c new file mode 100644 index 000000000..c4851432f --- /dev/null +++ b/util/block-helpers.c @@ -0,0 +1,46 @@ +/* + * Block utility functions + * + * Copyright IBM, Corp. 2011 + * Copyright (c) 2020 Coiby Xu <coiby.xu@gmail.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qapi/qmp/qerror.h" +#include "block-helpers.h" + +/** + * check_block_size: + * @id: The unique ID of the object + * @name: The name of the property being validated + * @value: The block size in bytes + * @errp: A pointer to an area to store an error + * + * This function checks that the block size meets the following conditions: + * 1. At least MIN_BLOCK_SIZE + * 2. No larger than MAX_BLOCK_SIZE + * 3. 
A power of 2 + */ +void check_block_size(const char *id, const char *name, int64_t value, + Error **errp) +{ + /* value of 0 means "unset" */ + if (value && (value < MIN_BLOCK_SIZE || value > MAX_BLOCK_SIZE)) { + error_setg(errp, QERR_PROPERTY_VALUE_OUT_OF_RANGE, + id, name, value, MIN_BLOCK_SIZE, MAX_BLOCK_SIZE); + return; + } + + /* We rely on power-of-2 blocksizes for bitmasks */ + if ((value & (value - 1)) != 0) { + error_setg(errp, + "Property %s.%s doesn't take value '%" PRId64 + "', it's not a power of 2", + id, name, value); + return; + } +} diff --git a/util/block-helpers.h b/util/block-helpers.h new file mode 100644 index 000000000..b53295a52 --- /dev/null +++ b/util/block-helpers.h @@ -0,0 +1,19 @@ +#ifndef BLOCK_HELPERS_H +#define BLOCK_HELPERS_H + +#include "qemu/units.h" + +/* lower limit is sector size */ +#define MIN_BLOCK_SIZE INT64_C(512) +#define MIN_BLOCK_SIZE_STR "512 B" +/* + * upper limit is arbitrary, 2 MiB looks sufficient for all sensible uses, and + * matches qcow2 cluster size limit + */ +#define MAX_BLOCK_SIZE (2 * MiB) +#define MAX_BLOCK_SIZE_STR "2 MiB" + +void check_block_size(const char *id, const char *name, int64_t value, + Error **errp); + +#endif /* BLOCK_HELPERS_H */ diff --git a/util/buffer.c b/util/buffer.c new file mode 100644 index 000000000..743eaa930 --- /dev/null +++ b/util/buffer.c @@ -0,0 +1,173 @@ +/* + * QEMU generic buffers + * + * Copyright (c) 2015 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + * + */ + +#include "qemu/osdep.h" +#include "qemu/host-utils.h" +#include "qemu/buffer.h" +#include "trace.h" + +#define BUFFER_MIN_INIT_SIZE 4096 +#define BUFFER_MIN_SHRINK_SIZE 65536 + +/* define the factor alpha for the exponential smoothing + * that is used in the average size calculation. a shift + * of 7 results in an alpha of 1/2^7. */ +#define BUFFER_AVG_SIZE_SHIFT 7 + +static size_t buffer_req_size(Buffer *buffer, size_t len) +{ + return MAX(BUFFER_MIN_INIT_SIZE, + pow2ceil(buffer->offset + len)); +} + +static void buffer_adj_size(Buffer *buffer, size_t len) +{ + size_t old = buffer->capacity; + buffer->capacity = buffer_req_size(buffer, len); + buffer->buffer = g_realloc(buffer->buffer, buffer->capacity); + trace_buffer_resize(buffer->name ?: "unnamed", + old, buffer->capacity); + + /* make it even harder for the buffer to shrink, reset average size + * to current capacity if it is larger than the average. */ + buffer->avg_size = MAX(buffer->avg_size, + buffer->capacity << BUFFER_AVG_SIZE_SHIFT); +} + +void buffer_init(Buffer *buffer, const char *name, ...) 
+{ + va_list ap; + + va_start(ap, name); + buffer->name = g_strdup_vprintf(name, ap); + va_end(ap); +} + +static uint64_t buffer_get_avg_size(Buffer *buffer) +{ + return buffer->avg_size >> BUFFER_AVG_SIZE_SHIFT; +} + +void buffer_shrink(Buffer *buffer) +{ + size_t new; + + /* Calculate the average size of the buffer as + * avg_size = avg_size * ( 1 - a ) + required_size * a + * where a is 1 / 2 ^ BUFFER_AVG_SIZE_SHIFT. */ + buffer->avg_size *= (1 << BUFFER_AVG_SIZE_SHIFT) - 1; + buffer->avg_size >>= BUFFER_AVG_SIZE_SHIFT; + buffer->avg_size += buffer_req_size(buffer, 0); + + /* And then only shrink if the average size of the buffer is much + * too big, to avoid bumping up & down the buffers all the time. + * realloc() isn't exactly cheap ... */ + new = buffer_req_size(buffer, buffer_get_avg_size(buffer)); + if (new < buffer->capacity >> 3 && + new >= BUFFER_MIN_SHRINK_SIZE) { + buffer_adj_size(buffer, buffer_get_avg_size(buffer)); + } + + buffer_adj_size(buffer, 0); +} + +void buffer_reserve(Buffer *buffer, size_t len) +{ + if ((buffer->capacity - buffer->offset) < len) { + buffer_adj_size(buffer, len); + } +} + +gboolean buffer_empty(Buffer *buffer) +{ + return buffer->offset == 0; +} + +uint8_t *buffer_end(Buffer *buffer) +{ + return buffer->buffer + buffer->offset; +} + +void buffer_reset(Buffer *buffer) +{ + buffer->offset = 0; + buffer_shrink(buffer); +} + +void buffer_free(Buffer *buffer) +{ + trace_buffer_free(buffer->name ?: "unnamed", buffer->capacity); + g_free(buffer->buffer); + g_free(buffer->name); + buffer->offset = 0; + buffer->capacity = 0; + buffer->buffer = NULL; + buffer->name = NULL; +} + +void buffer_append(Buffer *buffer, const void *data, size_t len) +{ + memcpy(buffer->buffer + buffer->offset, data, len); + buffer->offset += len; +} + +void buffer_advance(Buffer *buffer, size_t len) +{ + memmove(buffer->buffer, buffer->buffer + len, + (buffer->offset - len)); + buffer->offset -= len; + buffer_shrink(buffer); +} + +void buffer_move_empty(Buffer *to, Buffer *from) +{ + trace_buffer_move_empty(to->name ?: "unnamed", + from->offset, + from->name ?: "unnamed"); + assert(to->offset == 0); + + g_free(to->buffer); + to->offset = from->offset; + to->capacity = from->capacity; + to->buffer = from->buffer; + + from->offset = 0; + from->capacity = 0; + from->buffer = NULL; +} + +void buffer_move(Buffer *to, Buffer *from) +{ + if (to->offset == 0) { + buffer_move_empty(to, from); + return; + } + + trace_buffer_move(to->name ?: "unnamed", + from->offset, + from->name ?: "unnamed"); + buffer_reserve(to, from->offset); + buffer_append(to, from->buffer, from->offset); + + g_free(from->buffer); + from->offset = 0; + from->capacity = 0; + from->buffer = NULL; +} diff --git a/util/bufferiszero.c b/util/bufferiszero.c new file mode 100644 index 000000000..695bb4ce2 --- /dev/null +++ b/util/bufferiszero.c @@ -0,0 +1,355 @@ +/* + * Simple C functions to supplement the C library + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial 
portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "qemu/bswap.h" + +static bool +buffer_zero_int(const void *buf, size_t len) +{ + if (unlikely(len < 8)) { + /* For a very small buffer, simply accumulate all the bytes. */ + const unsigned char *p = buf; + const unsigned char *e = buf + len; + unsigned char t = 0; + + do { + t |= *p++; + } while (p < e); + + return t == 0; + } else { + /* Otherwise, use the unaligned memory access functions to + handle the beginning and end of the buffer, with a couple + of loops handling the middle aligned section. */ + uint64_t t = ldq_he_p(buf); + const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8); + const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8); + + for (; p + 8 <= e; p += 8) { + __builtin_prefetch(p + 8); + if (t) { + return false; + } + t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7]; + } + while (p < e) { + t |= *p++; + } + t |= ldq_he_p(buf + len - 8); + + return t == 0; + } +} + +#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__) +/* Do not use push_options pragmas unnecessarily, because clang + * does not support them. + */ +#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) +#pragma GCC push_options +#pragma GCC target("sse2") +#endif +#include <emmintrin.h> + +/* Note that each of these vectorized functions require len >= 64. */ + +static bool +buffer_zero_sse2(const void *buf, size_t len) +{ + __m128i t = _mm_loadu_si128(buf); + __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16); + __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16); + __m128i zero = _mm_setzero_si128(); + + /* Loop over 16-byte aligned blocks of 64. */ + while (likely(p <= e)) { + __builtin_prefetch(p); + t = _mm_cmpeq_epi8(t, zero); + if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) { + return false; + } + t = p[-4] | p[-3] | p[-2] | p[-1]; + p += 4; + } + + /* Finish the aligned tail. */ + t |= e[-3]; + t |= e[-2]; + t |= e[-1]; + + /* Finish the unaligned tail. */ + t |= _mm_loadu_si128(buf + len - 16); + + return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF; +} +#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) +#pragma GCC pop_options +#endif + +#ifdef CONFIG_AVX2_OPT +/* Note that due to restrictions/bugs wrt __builtin functions in gcc <= 4.8, + * the includes have to be within the corresponding push_options region, and + * therefore the regions themselves have to be ordered with increasing ISA. + */ +#pragma GCC push_options +#pragma GCC target("sse4") +#include <smmintrin.h> + +static bool +buffer_zero_sse4(const void *buf, size_t len) +{ + __m128i t = _mm_loadu_si128(buf); + __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16); + __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16); + + /* Loop over 16-byte aligned blocks of 64. */ + while (likely(p <= e)) { + __builtin_prefetch(p); + if (unlikely(!_mm_testz_si128(t, t))) { + return false; + } + t = p[-4] | p[-3] | p[-2] | p[-1]; + p += 4; + } + + /* Finish the aligned tail. 
*/ + t |= e[-3]; + t |= e[-2]; + t |= e[-1]; + + /* Finish the unaligned tail. */ + t |= _mm_loadu_si128(buf + len - 16); + + return _mm_testz_si128(t, t); +} + +#pragma GCC pop_options +#pragma GCC push_options +#pragma GCC target("avx2") +#include <immintrin.h> + +static bool +buffer_zero_avx2(const void *buf, size_t len) +{ + /* Begin with an unaligned head of 32 bytes. */ + __m256i t = _mm256_loadu_si256(buf); + __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32); + __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32); + + /* Loop over 32-byte aligned blocks of 128. */ + while (p <= e) { + __builtin_prefetch(p); + if (unlikely(!_mm256_testz_si256(t, t))) { + return false; + } + t = p[-4] | p[-3] | p[-2] | p[-1]; + p += 4; + } ; + + /* Finish the last block of 128 unaligned. */ + t |= _mm256_loadu_si256(buf + len - 4 * 32); + t |= _mm256_loadu_si256(buf + len - 3 * 32); + t |= _mm256_loadu_si256(buf + len - 2 * 32); + t |= _mm256_loadu_si256(buf + len - 1 * 32); + + return _mm256_testz_si256(t, t); +} +#pragma GCC pop_options +#endif /* CONFIG_AVX2_OPT */ + +#ifdef CONFIG_AVX512F_OPT +#pragma GCC push_options +#pragma GCC target("avx512f") +#include <immintrin.h> + +static bool +buffer_zero_avx512(const void *buf, size_t len) +{ + /* Begin with an unaligned head of 64 bytes. */ + __m512i t = _mm512_loadu_si512(buf); + __m512i *p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64); + __m512i *e = (__m512i *)(((uintptr_t)buf + len) & -64); + + /* Loop over 64-byte aligned blocks of 256. */ + while (p <= e) { + __builtin_prefetch(p); + if (unlikely(_mm512_test_epi64_mask(t, t))) { + return false; + } + t = p[-4] | p[-3] | p[-2] | p[-1]; + p += 4; + } + + t |= _mm512_loadu_si512(buf + len - 4 * 64); + t |= _mm512_loadu_si512(buf + len - 3 * 64); + t |= _mm512_loadu_si512(buf + len - 2 * 64); + t |= _mm512_loadu_si512(buf + len - 1 * 64); + + return !_mm512_test_epi64_mask(t, t); + +} +#pragma GCC pop_options +#endif + + +/* Note that for test_buffer_is_zero_next_accel, the most preferred + * ISA must have the least significant bit. + */ +#define CACHE_AVX512F 1 +#define CACHE_AVX2 2 +#define CACHE_SSE4 4 +#define CACHE_SSE2 8 + +/* Make sure that these variables are appropriately initialized when + * SSE2 is enabled on the compiler command-line, but the compiler is + * too old to support CONFIG_AVX2_OPT. 
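+ * In that case cpuid_cache starts out as CACHE_SSE2 and
+ * buffer_zero_sse2 is installed up front, with no runtime CPUID probe.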
+ */ +#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) +# define INIT_CACHE 0 +# define INIT_ACCEL buffer_zero_int +#else +# ifndef __SSE2__ +# error "ISA selection confusion" +# endif +# define INIT_CACHE CACHE_SSE2 +# define INIT_ACCEL buffer_zero_sse2 +#endif + +static unsigned cpuid_cache = INIT_CACHE; +static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL; +static int length_to_accel = 64; + +static void init_accel(unsigned cache) +{ + bool (*fn)(const void *, size_t) = buffer_zero_int; + if (cache & CACHE_SSE2) { + fn = buffer_zero_sse2; + length_to_accel = 64; + } +#ifdef CONFIG_AVX2_OPT + if (cache & CACHE_SSE4) { + fn = buffer_zero_sse4; + length_to_accel = 64; + } + if (cache & CACHE_AVX2) { + fn = buffer_zero_avx2; + length_to_accel = 128; + } +#endif +#ifdef CONFIG_AVX512F_OPT + if (cache & CACHE_AVX512F) { + fn = buffer_zero_avx512; + length_to_accel = 256; + } +#endif + buffer_accel = fn; +} + +#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) +#include "qemu/cpuid.h" + +static void __attribute__((constructor)) init_cpuid_cache(void) +{ + int max = __get_cpuid_max(0, NULL); + int a, b, c, d; + unsigned cache = 0; + + if (max >= 1) { + __cpuid(1, a, b, c, d); + if (d & bit_SSE2) { + cache |= CACHE_SSE2; + } + if (c & bit_SSE4_1) { + cache |= CACHE_SSE4; + } + + /* We must check that AVX is not just available, but usable. */ + if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) { + int bv; + __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0)); + __cpuid_count(7, 0, a, b, c, d); + if ((bv & 0x6) == 0x6 && (b & bit_AVX2)) { + cache |= CACHE_AVX2; + } + /* 0xe6: + * XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15 + * and ZMM16-ZMM31 state are enabled by OS) + * XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS) + */ + if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512F)) { + cache |= CACHE_AVX512F; + } + } + } + cpuid_cache = cache; + init_accel(cache); +} +#endif /* CONFIG_AVX2_OPT */ + +bool test_buffer_is_zero_next_accel(void) +{ + /* If no bits set, we just tested buffer_zero_int, and there + are no more acceleration options to test. */ + if (cpuid_cache == 0) { + return false; + } + /* Disable the accelerator we used before and select a new one. */ + cpuid_cache &= cpuid_cache - 1; + init_accel(cpuid_cache); + return true; +} + +static bool select_accel_fn(const void *buf, size_t len) +{ + if (likely(len >= length_to_accel)) { + return buffer_accel(buf, len); + } + return buffer_zero_int(buf, len); +} + +#else +#define select_accel_fn buffer_zero_int +bool test_buffer_is_zero_next_accel(void) +{ + return false; +} +#endif + +/* + * Checks if a buffer is all zeroes + */ +bool buffer_is_zero(const void *buf, size_t len) +{ + if (unlikely(len == 0)) { + return true; + } + + /* Fetch the beginning of the buffer while we select the accelerator. */ + __builtin_prefetch(buf); + + /* Use an optimized zero check if possible. Note that this also + includes a check for an unrolled loop over 64-bit integers. */ + return select_accel_fn(buf, len); +} diff --git a/util/cacheflush.c b/util/cacheflush.c new file mode 100644 index 000000000..933355b0c --- /dev/null +++ b/util/cacheflush.c @@ -0,0 +1,146 @@ +/* + * Flush the host cpu caches. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/cacheflush.h" +#include "qemu/bitops.h" + + +#if defined(__i386__) || defined(__x86_64__) || defined(__s390__) + +/* Caches are coherent and do not require flushing; symbol inline. */ + +#elif defined(__aarch64__) + +#ifdef CONFIG_DARWIN +/* Apple does not expose CTR_EL0, so we must use system interfaces. */ +extern void sys_icache_invalidate(void *start, size_t len); +extern void sys_dcache_flush(void *start, size_t len); +void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len) +{ + sys_dcache_flush((void *)rw, len); + sys_icache_invalidate((void *)rx, len); +} +#else + +/* + * TODO: unify this with cacheinfo.c. + * We want to save the whole contents of CTR_EL0, so that we + * have more than the linesize, but also IDC and DIC. + */ +static uint64_t save_ctr_el0; +static void __attribute__((constructor)) init_ctr_el0(void) +{ + asm volatile("mrs\t%0, ctr_el0" : "=r"(save_ctr_el0)); +} + +/* + * This is a copy of gcc's __aarch64_sync_cache_range, modified + * to fit this three-operand interface. + */ +void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len) +{ + const unsigned CTR_IDC = 1u << 28; + const unsigned CTR_DIC = 1u << 29; + const uint64_t ctr_el0 = save_ctr_el0; + const uintptr_t icache_lsize = 4 << extract64(ctr_el0, 0, 4); + const uintptr_t dcache_lsize = 4 << extract64(ctr_el0, 16, 4); + uintptr_t p; + + /* + * If CTR_EL0.IDC is enabled, Data cache clean to the Point of Unification + * is not required for instruction to data coherence. + */ + if (!(ctr_el0 & CTR_IDC)) { + /* + * Loop over the address range, clearing one cache line at once. + * Data cache must be flushed to unification first to make sure + * the instruction cache fetches the updated data. + */ + for (p = rw & -dcache_lsize; p < rw + len; p += dcache_lsize) { + asm volatile("dc\tcvau, %0" : : "r" (p) : "memory"); + } + asm volatile("dsb\tish" : : : "memory"); + } + + /* + * If CTR_EL0.DIC is enabled, Instruction cache cleaning to the Point + * of Unification is not required for instruction to data coherence. + */ + if (!(ctr_el0 & CTR_DIC)) { + for (p = rx & -icache_lsize; p < rx + len; p += icache_lsize) { + asm volatile("ic\tivau, %0" : : "r"(p) : "memory"); + } + asm volatile ("dsb\tish" : : : "memory"); + } + + asm volatile("isb" : : : "memory"); +} +#endif /* CONFIG_DARWIN */ + +#elif defined(__mips__) + +#ifdef __OpenBSD__ +#include <machine/sysarch.h> +#else +#include <sys/cachectl.h> +#endif + +void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len) +{ + if (rx != rw) { + cacheflush((void *)rw, len, DCACHE); + } + cacheflush((void *)rx, len, ICACHE); +} + +#elif defined(__powerpc__) + +void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len) +{ + uintptr_t p, b, e; + size_t dsize = qemu_dcache_linesize; + size_t isize = qemu_icache_linesize; + + b = rw & ~(dsize - 1); + e = (rw + len + dsize - 1) & ~(dsize - 1); + for (p = b; p < e; p += dsize) { + asm volatile ("dcbst 0,%0" : : "r"(p) : "memory"); + } + asm volatile ("sync" : : : "memory"); + + b = rx & ~(isize - 1); + e = (rx + len + isize - 1) & ~(isize - 1); + for (p = b; p < e; p += isize) { + asm volatile ("icbi 0,%0" : : "r"(p) : "memory"); + } + asm volatile ("sync" : : : "memory"); + asm volatile ("isync" : : : "memory"); +} + +#elif defined(__sparc__) + +void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len) +{ + /* No additional data flush to the RW virtual address required. 
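+       The flush instruction operates on aligned 8-byte doublewords,
+       hence the loop below rounds the range to 8-byte boundaries.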
*/ + uintptr_t p, end = (rx + len + 7) & -8; + for (p = rx & -8; p < end; p += 8) { + __asm__ __volatile__("flush\t%0" : : "r" (p)); + } +} + +#else + +void flush_idcache_range(uintptr_t rx, uintptr_t rw, size_t len) +{ + if (rw != rx) { + __builtin___clear_cache((char *)rw, (char *)rw + len); + } + __builtin___clear_cache((char *)rx, (char *)rx + len); +} + +#endif diff --git a/util/cacheinfo.c b/util/cacheinfo.c new file mode 100644 index 000000000..b182f0b69 --- /dev/null +++ b/util/cacheinfo.c @@ -0,0 +1,199 @@ +/* + * cacheinfo.c - helpers to query the host about its caches + * + * Copyright (C) 2017, Emilio G. Cota <cota@braap.org> + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/host-utils.h" +#include "qemu/atomic.h" + +int qemu_icache_linesize = 0; +int qemu_icache_linesize_log; +int qemu_dcache_linesize = 0; +int qemu_dcache_linesize_log; + +/* + * Operating system specific detection mechanisms. + */ + +#if defined(_WIN32) + +static void sys_cache_info(int *isize, int *dsize) +{ + SYSTEM_LOGICAL_PROCESSOR_INFORMATION *buf; + DWORD size = 0; + BOOL success; + size_t i, n; + + /* Check for the required buffer size first. Note that if the zero + size we use for the probe results in success, then there is no + data available; fail in that case. */ + success = GetLogicalProcessorInformation(0, &size); + if (success || GetLastError() != ERROR_INSUFFICIENT_BUFFER) { + return; + } + + n = size / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + size = n * sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + buf = g_new0(SYSTEM_LOGICAL_PROCESSOR_INFORMATION, n); + if (!GetLogicalProcessorInformation(buf, &size)) { + goto fail; + } + + for (i = 0; i < n; i++) { + if (buf[i].Relationship == RelationCache + && buf[i].Cache.Level == 1) { + switch (buf[i].Cache.Type) { + case CacheUnified: + *isize = *dsize = buf[i].Cache.LineSize; + break; + case CacheInstruction: + *isize = buf[i].Cache.LineSize; + break; + case CacheData: + *dsize = buf[i].Cache.LineSize; + break; + default: + break; + } + } + } + fail: + g_free(buf); +} + +#elif defined(__APPLE__) +# include <sys/sysctl.h> +static void sys_cache_info(int *isize, int *dsize) +{ + /* There's only a single sysctl for both I/D cache line sizes. */ + long size; + size_t len = sizeof(size); + if (!sysctlbyname("hw.cachelinesize", &size, &len, NULL, 0)) { + *isize = *dsize = size; + } +} +#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +# include <sys/sysctl.h> +static void sys_cache_info(int *isize, int *dsize) +{ + /* There's only a single sysctl for both I/D cache line sizes. */ + int size; + size_t len = sizeof(size); + if (!sysctlbyname("machdep.cacheline_size", &size, &len, NULL, 0)) { + *isize = *dsize = size; + } +} +#else +/* POSIX */ + +static void sys_cache_info(int *isize, int *dsize) +{ +# ifdef _SC_LEVEL1_ICACHE_LINESIZE + int tmp_isize = (int) sysconf(_SC_LEVEL1_ICACHE_LINESIZE); + if (tmp_isize > 0) { + *isize = tmp_isize; + } +# endif +# ifdef _SC_LEVEL1_DCACHE_LINESIZE + int tmp_dsize = (int) sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + if (tmp_dsize > 0) { + *dsize = tmp_dsize; + } +# endif +} +#endif /* sys_cache_info */ + +/* + * Architecture (+ OS) specific detection mechanisms. 
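+ * These only refine values that sys_cache_info() above left at zero.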
+ */ + +#if defined(__aarch64__) + +static void arch_cache_info(int *isize, int *dsize) +{ + if (*isize == 0 || *dsize == 0) { + uint64_t ctr; + + /* The real cache geometry is in CCSIDR_EL1/CLIDR_EL1/CSSELR_EL1, + but (at least under Linux) these are marked protected by the + kernel. However, CTR_EL0 contains the minimum linesize in the + entire hierarchy, and is used by userspace cache flushing. */ + asm volatile("mrs\t%0, ctr_el0" : "=r"(ctr)); + if (*isize == 0) { + *isize = 4 << (ctr & 0xf); + } + if (*dsize == 0) { + *dsize = 4 << ((ctr >> 16) & 0xf); + } + } +} + +#elif defined(_ARCH_PPC) && defined(__linux__) +# include "elf.h" + +static void arch_cache_info(int *isize, int *dsize) +{ + if (*isize == 0) { + *isize = qemu_getauxval(AT_ICACHEBSIZE); + } + if (*dsize == 0) { + *dsize = qemu_getauxval(AT_DCACHEBSIZE); + } +} + +#else +static void arch_cache_info(int *isize, int *dsize) { } +#endif /* arch_cache_info */ + +/* + * ... and if all else fails ... + */ + +static void fallback_cache_info(int *isize, int *dsize) +{ + /* If we can only find one of the two, assume they're the same. */ + if (*isize) { + if (*dsize) { + /* Success! */ + } else { + *dsize = *isize; + } + } else if (*dsize) { + *isize = *dsize; + } else { +#if defined(_ARCH_PPC) + /* + * For PPC, we're going to use the cache sizes computed for + * flush_idcache_range. Which means that we must use the + * architecture minimum. + */ + *isize = *dsize = 16; +#else + /* Otherwise, 64 bytes is not uncommon. */ + *isize = *dsize = 64; +#endif + } +} + +static void __attribute__((constructor)) init_cache_info(void) +{ + int isize = 0, dsize = 0; + + sys_cache_info(&isize, &dsize); + arch_cache_info(&isize, &dsize); + fallback_cache_info(&isize, &dsize); + + assert((isize & (isize - 1)) == 0); + assert((dsize & (dsize - 1)) == 0); + + qemu_icache_linesize = isize; + qemu_icache_linesize_log = ctz32(isize); + qemu_dcache_linesize = dsize; + qemu_dcache_linesize_log = ctz32(dsize); + + qatomic64_init(); +} diff --git a/util/compatfd.c b/util/compatfd.c new file mode 100644 index 000000000..ab810c42a --- /dev/null +++ b/util/compatfd.c @@ -0,0 +1,106 @@ +/* + * signalfd/eventfd compatibility + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/thread.h" + +#if defined(CONFIG_SIGNALFD) +#include <sys/signalfd.h> +#endif + +struct sigfd_compat_info { + sigset_t mask; + int fd; +}; + +static void *sigwait_compat(void *opaque) +{ + struct sigfd_compat_info *info = opaque; + + while (1) { + int sig; + int err; + + err = sigwait(&info->mask, &sig); + if (err != 0) { + if (errno == EINTR) { + continue; + } else { + return NULL; + } + } else { + struct qemu_signalfd_siginfo buffer; + size_t offset = 0; + + memset(&buffer, 0, sizeof(buffer)); + buffer.ssi_signo = sig; + + while (offset < sizeof(buffer)) { + ssize_t len; + + len = write(info->fd, (char *)&buffer + offset, + sizeof(buffer) - offset); + if (len == -1 && errno == EINTR) { + continue; + } + + if (len <= 0) { + return NULL; + } + + offset += len; + } + } + } +} + +static int qemu_signalfd_compat(const sigset_t *mask) +{ + struct sigfd_compat_info *info; + QemuThread thread; + int fds[2]; + + info = g_malloc(sizeof(*info)); + + if (pipe(fds) == -1) { + g_free(info); + return -1; + } + + qemu_set_cloexec(fds[0]); + qemu_set_cloexec(fds[1]); + + memcpy(&info->mask, mask, sizeof(*mask)); + info->fd = fds[1]; + + qemu_thread_create(&thread, "signalfd_compat", sigwait_compat, info, + QEMU_THREAD_DETACHED); + + return fds[0]; +} + +int qemu_signalfd(const sigset_t *mask) +{ +#if defined(CONFIG_SIGNALFD) + int ret; + + ret = signalfd(-1, mask, SFD_CLOEXEC); + if (ret != -1) { + return ret; + } +#endif + + return qemu_signalfd_compat(mask); +} diff --git a/util/coroutine-sigaltstack.c b/util/coroutine-sigaltstack.c new file mode 100644 index 000000000..e99b8a4f9 --- /dev/null +++ b/util/coroutine-sigaltstack.c @@ -0,0 +1,304 @@ +/* + * sigaltstack coroutine initialization code + * + * Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws> + * Copyright (C) 2011 Kevin Wolf <kwolf@redhat.com> + * Copyright (C) 2012 Alex Barcelo <abarcelo@ac.upc.edu> +** This file is partly based on pth_mctx.c, from the GNU Portable Threads +** Copyright (c) 1999-2006 Ralf S. Engelschall <rse@engelschall.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* XXX Is there a nicer way to disable glibc's stack check for longjmp? 
*/ +#ifdef _FORTIFY_SOURCE +#undef _FORTIFY_SOURCE +#endif +#include "qemu/osdep.h" +#include <pthread.h> +#include "qemu-common.h" +#include "qemu/coroutine_int.h" + +#ifdef CONFIG_SAFESTACK +#error "SafeStack is not compatible with code run in alternate signal stacks" +#endif + +typedef struct { + Coroutine base; + void *stack; + size_t stack_size; + sigjmp_buf env; +} CoroutineSigAltStack; + +/** + * Per-thread coroutine bookkeeping + */ +typedef struct { + /** Currently executing coroutine */ + Coroutine *current; + + /** The default coroutine */ + CoroutineSigAltStack leader; + + /** Information for the signal handler (trampoline) */ + sigjmp_buf tr_reenter; + volatile sig_atomic_t tr_called; + void *tr_handler; +} CoroutineThreadState; + +static pthread_key_t thread_state_key; + +static CoroutineThreadState *coroutine_get_thread_state(void) +{ + CoroutineThreadState *s = pthread_getspecific(thread_state_key); + + if (!s) { + s = g_malloc0(sizeof(*s)); + s->current = &s->leader.base; + pthread_setspecific(thread_state_key, s); + } + return s; +} + +static void qemu_coroutine_thread_cleanup(void *opaque) +{ + CoroutineThreadState *s = opaque; + + g_free(s); +} + +static void __attribute__((constructor)) coroutine_init(void) +{ + int ret; + + ret = pthread_key_create(&thread_state_key, qemu_coroutine_thread_cleanup); + if (ret != 0) { + fprintf(stderr, "unable to create leader key: %s\n", strerror(errno)); + abort(); + } +} + +/* "boot" function + * This is what starts the coroutine, is called from the trampoline + * (from the signal handler when it is not signal handling, read ahead + * for more information). + */ +static void coroutine_bootstrap(CoroutineSigAltStack *self, Coroutine *co) +{ + /* Initialize longjmp environment and switch back the caller */ + if (!sigsetjmp(self->env, 0)) { + siglongjmp(*(sigjmp_buf *)co->entry_arg, 1); + } + + while (true) { + co->entry(co->entry_arg); + qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); + } +} + +/* + * This is used as the signal handler. This is called with the brand new stack + * (thanks to sigaltstack). We have to return, given that this is a signal + * handler and the sigmask and some other things are changed. + */ +static void coroutine_trampoline(int signal) +{ + CoroutineSigAltStack *self; + Coroutine *co; + CoroutineThreadState *coTS; + + /* Get the thread specific information */ + coTS = coroutine_get_thread_state(); + self = coTS->tr_handler; + coTS->tr_called = 1; + co = &self->base; + + /* + * Here we have to do a bit of a ping pong between the caller, given that + * this is a signal handler and we have to do a return "soon". Then the + * caller can reestablish everything and do a siglongjmp here again. + */ + if (!sigsetjmp(coTS->tr_reenter, 0)) { + return; + } + + /* + * Ok, the caller has siglongjmp'ed back to us, so now prepare + * us for the real machine state switching. We have to jump + * into another function here to get a new stack context for + * the auto variables (which have to be auto-variables + * because the start of the thread happens later). Else with + * PIC (i.e. Position Independent Code which is used when PTH + * is built as a shared library) most platforms would + * horrible core dump as experience showed. 
+ */ + coroutine_bootstrap(self, co); +} + +Coroutine *qemu_coroutine_new(void) +{ + CoroutineSigAltStack *co; + CoroutineThreadState *coTS; + struct sigaction sa; + struct sigaction osa; + stack_t ss; + stack_t oss; + sigset_t sigs; + sigset_t osigs; + sigjmp_buf old_env; + static pthread_mutex_t sigusr2_mutex = PTHREAD_MUTEX_INITIALIZER; + + /* The way to manipulate stack is with the sigaltstack function. We + * prepare a stack, with it delivering a signal to ourselves and then + * put sigsetjmp/siglongjmp where needed. + * This has been done keeping coroutine-ucontext as a model and with the + * pth ideas (GNU Portable Threads). See coroutine-ucontext for the basics + * of the coroutines and see pth_mctx.c (from the pth project) for the + * sigaltstack way of manipulating stacks. + */ + + co = g_malloc0(sizeof(*co)); + co->stack_size = COROUTINE_STACK_SIZE; + co->stack = qemu_alloc_stack(&co->stack_size); + co->base.entry_arg = &old_env; /* stash away our jmp_buf */ + + coTS = coroutine_get_thread_state(); + coTS->tr_handler = co; + + /* + * Preserve the SIGUSR2 signal state, block SIGUSR2, + * and establish our signal handler. The signal will + * later transfer control onto the signal stack. + */ + sigemptyset(&sigs); + sigaddset(&sigs, SIGUSR2); + pthread_sigmask(SIG_BLOCK, &sigs, &osigs); + sa.sa_handler = coroutine_trampoline; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_ONSTACK; + + /* + * sigaction() is a process-global operation. We must not run + * this code in multiple threads at once. + */ + pthread_mutex_lock(&sigusr2_mutex); + if (sigaction(SIGUSR2, &sa, &osa) != 0) { + abort(); + } + + /* + * Set the new stack. + */ + ss.ss_sp = co->stack; + ss.ss_size = co->stack_size; + ss.ss_flags = 0; + if (sigaltstack(&ss, &oss) < 0) { + abort(); + } + + /* + * Now transfer control onto the signal stack and set it up. + * It will return immediately via "return" after the sigsetjmp() + * was performed. Be careful here with race conditions. The + * signal can be delivered the first time sigsuspend() is + * called. + */ + coTS->tr_called = 0; + pthread_kill(pthread_self(), SIGUSR2); + sigfillset(&sigs); + sigdelset(&sigs, SIGUSR2); + while (!coTS->tr_called) { + sigsuspend(&sigs); + } + + /* + * Inform the system that we are back off the signal stack by + * removing the alternative signal stack. Be careful here: It + * first has to be disabled, before it can be removed. + */ + sigaltstack(NULL, &ss); + ss.ss_flags = SS_DISABLE; + if (sigaltstack(&ss, NULL) < 0) { + abort(); + } + sigaltstack(NULL, &ss); + if (!(oss.ss_flags & SS_DISABLE)) { + sigaltstack(&oss, NULL); + } + + /* + * Restore the old SIGUSR2 signal handler and mask + */ + sigaction(SIGUSR2, &osa, NULL); + pthread_mutex_unlock(&sigusr2_mutex); + + pthread_sigmask(SIG_SETMASK, &osigs, NULL); + + /* + * Now enter the trampoline again, but this time not as a signal + * handler. Instead we jump into it directly. The functionally + * redundant ping-pong pointer arithmetic is necessary to avoid + * type-conversion warnings related to the `volatile' qualifier and + * the fact that `jmp_buf' usually is an array type. 
+ */ + if (!sigsetjmp(old_env, 0)) { + siglongjmp(coTS->tr_reenter, 1); + } + + /* + * Ok, we returned again, so now we're finished + */ + + return &co->base; +} + +void qemu_coroutine_delete(Coroutine *co_) +{ + CoroutineSigAltStack *co = DO_UPCAST(CoroutineSigAltStack, base, co_); + + qemu_free_stack(co->stack, co->stack_size); + g_free(co); +} + +CoroutineAction qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, + CoroutineAction action) +{ + CoroutineSigAltStack *from = DO_UPCAST(CoroutineSigAltStack, base, from_); + CoroutineSigAltStack *to = DO_UPCAST(CoroutineSigAltStack, base, to_); + CoroutineThreadState *s = coroutine_get_thread_state(); + int ret; + + s->current = to_; + + ret = sigsetjmp(from->env, 0); + if (ret == 0) { + siglongjmp(to->env, action); + } + return ret; +} + +Coroutine *qemu_coroutine_self(void) +{ + CoroutineThreadState *s = coroutine_get_thread_state(); + + return s->current; +} + +bool qemu_in_coroutine(void) +{ + CoroutineThreadState *s = pthread_getspecific(thread_state_key); + + return s && s->current->caller; +} + diff --git a/util/coroutine-ucontext.c b/util/coroutine-ucontext.c new file mode 100644 index 000000000..904b37519 --- /dev/null +++ b/util/coroutine-ucontext.c @@ -0,0 +1,332 @@ +/* + * ucontext coroutine initialization code + * + * Copyright (C) 2006 Anthony Liguori <anthony@codemonkey.ws> + * Copyright (C) 2011 Kevin Wolf <kwolf@redhat.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.0 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +/* XXX Is there a nicer way to disable glibc's stack check for longjmp? */ +#ifdef _FORTIFY_SOURCE +#undef _FORTIFY_SOURCE +#endif +#include "qemu/osdep.h" +#include <ucontext.h> +#include "qemu/coroutine_int.h" + +#ifdef CONFIG_VALGRIND_H +#include <valgrind/valgrind.h> +#endif + +#if defined(__SANITIZE_ADDRESS__) || __has_feature(address_sanitizer) +#ifdef CONFIG_ASAN_IFACE_FIBER +#define CONFIG_ASAN 1 +#include <sanitizer/asan_interface.h> +#endif +#endif + +#ifdef CONFIG_TSAN +#include <sanitizer/tsan_interface.h> +#endif + +typedef struct { + Coroutine base; + void *stack; + size_t stack_size; +#ifdef CONFIG_SAFESTACK + /* Need an unsafe stack for each coroutine */ + void *unsafe_stack; + size_t unsafe_stack_size; +#endif + sigjmp_buf env; + +#ifdef CONFIG_TSAN + void *tsan_co_fiber; + void *tsan_caller_fiber; +#endif + +#ifdef CONFIG_VALGRIND_H + unsigned int valgrind_stack_id; +#endif + +} CoroutineUContext; + +/** + * Per-thread coroutine bookkeeping + */ +static __thread CoroutineUContext leader; +static __thread Coroutine *current; + +/* + * va_args to makecontext() must be type 'int', so passing + * the pointer we need may require several int args. This + * union is a quick hack to let us do that + */ +union cc_arg { + void *p; + int i[2]; +}; + +/* + * QEMU_ALWAYS_INLINE only does so if __OPTIMIZE__, so we cannot use it. + * always_inline is required to avoid TSan runtime fatal errors. 
+ */ +static inline __attribute__((always_inline)) +void on_new_fiber(CoroutineUContext *co) +{ +#ifdef CONFIG_TSAN + co->tsan_co_fiber = __tsan_create_fiber(0); /* flags: sync on switch */ + co->tsan_caller_fiber = __tsan_get_current_fiber(); +#endif +} + +/* always_inline is required to avoid TSan runtime fatal errors. */ +static inline __attribute__((always_inline)) +void finish_switch_fiber(void *fake_stack_save) +{ +#ifdef CONFIG_ASAN + const void *bottom_old; + size_t size_old; + + __sanitizer_finish_switch_fiber(fake_stack_save, &bottom_old, &size_old); + + if (!leader.stack) { + leader.stack = (void *)bottom_old; + leader.stack_size = size_old; + } +#endif +#ifdef CONFIG_TSAN + if (fake_stack_save) { + __tsan_release(fake_stack_save); + __tsan_switch_to_fiber(fake_stack_save, 0); /* 0=synchronize */ + } +#endif +} + +/* always_inline is required to avoid TSan runtime fatal errors. */ +static inline __attribute__((always_inline)) +void start_switch_fiber_asan(CoroutineAction action, void **fake_stack_save, + const void *bottom, size_t size) +{ +#ifdef CONFIG_ASAN + __sanitizer_start_switch_fiber( + action == COROUTINE_TERMINATE ? NULL : fake_stack_save, + bottom, size); +#endif +} + +/* always_inline is required to avoid TSan runtime fatal errors. */ +static inline __attribute__((always_inline)) +void start_switch_fiber_tsan(void **fake_stack_save, + CoroutineUContext *co, + bool caller) +{ +#ifdef CONFIG_TSAN + void *new_fiber = caller ? + co->tsan_caller_fiber : + co->tsan_co_fiber; + void *curr_fiber = __tsan_get_current_fiber(); + __tsan_acquire(curr_fiber); + + *fake_stack_save = curr_fiber; + __tsan_switch_to_fiber(new_fiber, 0); /* 0=synchronize */ +#endif +} + +static void coroutine_trampoline(int i0, int i1) +{ + union cc_arg arg; + CoroutineUContext *self; + Coroutine *co; + void *fake_stack_save = NULL; + + finish_switch_fiber(NULL); + + arg.i[0] = i0; + arg.i[1] = i1; + self = arg.p; + co = &self->base; + + /* Initialize longjmp environment and switch back the caller */ + if (!sigsetjmp(self->env, 0)) { + start_switch_fiber_asan(COROUTINE_YIELD, &fake_stack_save, leader.stack, + leader.stack_size); + start_switch_fiber_tsan(&fake_stack_save, self, true); /* true=caller */ + siglongjmp(*(sigjmp_buf *)co->entry_arg, 1); + } + + finish_switch_fiber(fake_stack_save); + + while (true) { + co->entry(co->entry_arg); + qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); + } +} + +Coroutine *qemu_coroutine_new(void) +{ + CoroutineUContext *co; + ucontext_t old_uc, uc; + sigjmp_buf old_env; + union cc_arg arg = {0}; + void *fake_stack_save = NULL; + + /* The ucontext functions preserve signal masks which incurs a + * system call overhead. sigsetjmp(buf, 0)/siglongjmp() does not + * preserve signal masks but only works on the current stack. + * Since we need a way to create and switch to a new stack, use + * the ucontext functions for that but sigsetjmp()/siglongjmp() for + * everything else. 
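+ *
+ * As a sketch of the resulting cost model: the makecontext() and
+ * swapcontext() calls below run once per coroutine creation, while
+ * every later switch takes the cheaper sigsetjmp()/siglongjmp() path
+ * in qemu_coroutine_switch().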
+ */ + + if (getcontext(&uc) == -1) { + abort(); + } + + co = g_malloc0(sizeof(*co)); + co->stack_size = COROUTINE_STACK_SIZE; + co->stack = qemu_alloc_stack(&co->stack_size); +#ifdef CONFIG_SAFESTACK + co->unsafe_stack_size = COROUTINE_STACK_SIZE; + co->unsafe_stack = qemu_alloc_stack(&co->unsafe_stack_size); +#endif + co->base.entry_arg = &old_env; /* stash away our jmp_buf */ + + uc.uc_link = &old_uc; + uc.uc_stack.ss_sp = co->stack; + uc.uc_stack.ss_size = co->stack_size; + uc.uc_stack.ss_flags = 0; + +#ifdef CONFIG_VALGRIND_H + co->valgrind_stack_id = + VALGRIND_STACK_REGISTER(co->stack, co->stack + co->stack_size); +#endif + + arg.p = co; + + on_new_fiber(co); + makecontext(&uc, (void (*)(void))coroutine_trampoline, + 2, arg.i[0], arg.i[1]); + + /* swapcontext() in, siglongjmp() back out */ + if (!sigsetjmp(old_env, 0)) { + start_switch_fiber_asan(COROUTINE_YIELD, &fake_stack_save, co->stack, + co->stack_size); + start_switch_fiber_tsan(&fake_stack_save, + co, false); /* false=not caller */ + +#ifdef CONFIG_SAFESTACK + /* + * Before we swap the context, set the new unsafe stack + * The unsafe stack grows just like the normal stack, so start from + * the last usable location of the memory area. + * NOTE: we don't have to re-set the usp afterwards because we are + * coming back to this context through a siglongjmp. + * The compiler already wrapped the corresponding sigsetjmp call with + * code that saves the usp on the (safe) stack before the call, and + * restores it right after (which is where we return with siglongjmp). + */ + void *usp = co->unsafe_stack + co->unsafe_stack_size; + __safestack_unsafe_stack_ptr = usp; +#endif + + swapcontext(&old_uc, &uc); + } + + finish_switch_fiber(fake_stack_save); + + return &co->base; +} + +#ifdef CONFIG_VALGRIND_H +/* Work around an unused variable in the valgrind.h macro... */ +#if !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-but-set-variable" +#endif +static inline void valgrind_stack_deregister(CoroutineUContext *co) +{ + VALGRIND_STACK_DEREGISTER(co->valgrind_stack_id); +} +#if !defined(__clang__) +#pragma GCC diagnostic pop +#endif +#endif + +void qemu_coroutine_delete(Coroutine *co_) +{ + CoroutineUContext *co = DO_UPCAST(CoroutineUContext, base, co_); + +#ifdef CONFIG_VALGRIND_H + valgrind_stack_deregister(co); +#endif + + qemu_free_stack(co->stack, co->stack_size); +#ifdef CONFIG_SAFESTACK + qemu_free_stack(co->unsafe_stack, co->unsafe_stack_size); +#endif + g_free(co); +} + +/* This function is marked noinline to prevent GCC from inlining it + * into coroutine_trampoline(). If we allow it to do that then it + * hoists the code to get the address of the TLS variable "current" + * out of the while() loop. This is an invalid transformation because + * the sigsetjmp() call may be called when running thread A but + * return in thread B, and so we might be in a different thread + * context each time round the loop. 
+ */ +CoroutineAction __attribute__((noinline)) +qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, + CoroutineAction action) +{ + CoroutineUContext *from = DO_UPCAST(CoroutineUContext, base, from_); + CoroutineUContext *to = DO_UPCAST(CoroutineUContext, base, to_); + int ret; + void *fake_stack_save = NULL; + + current = to_; + + ret = sigsetjmp(from->env, 0); + if (ret == 0) { + start_switch_fiber_asan(action, &fake_stack_save, to->stack, + to->stack_size); + start_switch_fiber_tsan(&fake_stack_save, + to, false); /* false=not caller */ + siglongjmp(to->env, action); + } + + finish_switch_fiber(fake_stack_save); + + return ret; +} + +Coroutine *qemu_coroutine_self(void) +{ + if (!current) { + current = &leader.base; + } +#ifdef CONFIG_TSAN + if (!leader.tsan_co_fiber) { + leader.tsan_co_fiber = __tsan_get_current_fiber(); + } +#endif + return current; +} + +bool qemu_in_coroutine(void) +{ + return current && current->caller; +} diff --git a/util/coroutine-win32.c b/util/coroutine-win32.c new file mode 100644 index 000000000..de6bd4fd3 --- /dev/null +++ b/util/coroutine-win32.c @@ -0,0 +1,102 @@ +/* + * Win32 coroutine initialization code + * + * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qemu/coroutine_int.h" + +typedef struct +{ + Coroutine base; + + LPVOID fiber; + CoroutineAction action; +} CoroutineWin32; + +static __thread CoroutineWin32 leader; +static __thread Coroutine *current; + +/* This function is marked noinline to prevent GCC from inlining it + * into coroutine_trampoline(). If we allow it to do that then it + * hoists the code to get the address of the TLS variable "current" + * out of the while() loop. This is an invalid transformation because + * the SwitchToFiber() call may be called when running thread A but + * return in thread B, and so we might be in a different thread + * context each time round the loop. 
+ */ +CoroutineAction __attribute__((noinline)) +qemu_coroutine_switch(Coroutine *from_, Coroutine *to_, + CoroutineAction action) +{ + CoroutineWin32 *from = DO_UPCAST(CoroutineWin32, base, from_); + CoroutineWin32 *to = DO_UPCAST(CoroutineWin32, base, to_); + + current = to_; + + to->action = action; + SwitchToFiber(to->fiber); + return from->action; +} + +static void CALLBACK coroutine_trampoline(void *co_) +{ + Coroutine *co = co_; + + while (true) { + co->entry(co->entry_arg); + qemu_coroutine_switch(co, co->caller, COROUTINE_TERMINATE); + } +} + +Coroutine *qemu_coroutine_new(void) +{ + const size_t stack_size = COROUTINE_STACK_SIZE; + CoroutineWin32 *co; + + co = g_malloc0(sizeof(*co)); + co->fiber = CreateFiber(stack_size, coroutine_trampoline, &co->base); + return &co->base; +} + +void qemu_coroutine_delete(Coroutine *co_) +{ + CoroutineWin32 *co = DO_UPCAST(CoroutineWin32, base, co_); + + DeleteFiber(co->fiber); + g_free(co); +} + +Coroutine *qemu_coroutine_self(void) +{ + if (!current) { + current = &leader.base; + leader.fiber = ConvertThreadToFiber(NULL); + } + return current; +} + +bool qemu_in_coroutine(void) +{ + return current && current->caller; +} diff --git a/util/crc-ccitt.c b/util/crc-ccitt.c new file mode 100644 index 000000000..b981d8ac5 --- /dev/null +++ b/util/crc-ccitt.c @@ -0,0 +1,127 @@ +/* + * CRC16 (CCITT) Checksum Algorithm + * + * Copyright (c) 2021 Wind River Systems, Inc. + * + * Author: + * Bin Meng <bin.meng@windriver.com> + * + * From Linux kernel v5.10 lib/crc-ccitt.c + * + * SPDX-License-Identifier: GPL-2.0-only + */ + +#include "qemu/osdep.h" +#include "qemu/crc-ccitt.h" + +/* + * This mysterious table is just the CRC of each possible byte. It can be + * computed using the standard bit-at-a-time methods. The polynomial can + * be seen in entry 128, 0x8408. This corresponds to x^0 + x^5 + x^12. + * Add the implicit x^16, and you have the standard CRC-CCITT. 
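+ *
+ * As a sketch (illustrative only, not code used by this file), one
+ * table entry can be derived bit-at-a-time from the reflected
+ * polynomial:
+ *
+ *     uint16_t entry = byte;
+ *     for (int i = 0; i < 8; i++) {
+ *         entry = (entry & 1) ? (entry >> 1) ^ 0x8408 : entry >> 1;
+ *     }
+ *
+ * e.g. byte 0x01 yields 0x1189, the second value in the table below.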
+ */
+uint16_t const crc_ccitt_table[256] = {
+    0x0000, 0x1189, 0x2312, 0x329b, 0x4624, 0x57ad, 0x6536, 0x74bf,
+    0x8c48, 0x9dc1, 0xaf5a, 0xbed3, 0xca6c, 0xdbe5, 0xe97e, 0xf8f7,
+    0x1081, 0x0108, 0x3393, 0x221a, 0x56a5, 0x472c, 0x75b7, 0x643e,
+    0x9cc9, 0x8d40, 0xbfdb, 0xae52, 0xdaed, 0xcb64, 0xf9ff, 0xe876,
+    0x2102, 0x308b, 0x0210, 0x1399, 0x6726, 0x76af, 0x4434, 0x55bd,
+    0xad4a, 0xbcc3, 0x8e58, 0x9fd1, 0xeb6e, 0xfae7, 0xc87c, 0xd9f5,
+    0x3183, 0x200a, 0x1291, 0x0318, 0x77a7, 0x662e, 0x54b5, 0x453c,
+    0xbdcb, 0xac42, 0x9ed9, 0x8f50, 0xfbef, 0xea66, 0xd8fd, 0xc974,
+    0x4204, 0x538d, 0x6116, 0x709f, 0x0420, 0x15a9, 0x2732, 0x36bb,
+    0xce4c, 0xdfc5, 0xed5e, 0xfcd7, 0x8868, 0x99e1, 0xab7a, 0xbaf3,
+    0x5285, 0x430c, 0x7197, 0x601e, 0x14a1, 0x0528, 0x37b3, 0x263a,
+    0xdecd, 0xcf44, 0xfddf, 0xec56, 0x98e9, 0x8960, 0xbbfb, 0xaa72,
+    0x6306, 0x728f, 0x4014, 0x519d, 0x2522, 0x34ab, 0x0630, 0x17b9,
+    0xef4e, 0xfec7, 0xcc5c, 0xddd5, 0xa96a, 0xb8e3, 0x8a78, 0x9bf1,
+    0x7387, 0x620e, 0x5095, 0x411c, 0x35a3, 0x242a, 0x16b1, 0x0738,
+    0xffcf, 0xee46, 0xdcdd, 0xcd54, 0xb9eb, 0xa862, 0x9af9, 0x8b70,
+    0x8408, 0x9581, 0xa71a, 0xb693, 0xc22c, 0xd3a5, 0xe13e, 0xf0b7,
+    0x0840, 0x19c9, 0x2b52, 0x3adb, 0x4e64, 0x5fed, 0x6d76, 0x7cff,
+    0x9489, 0x8500, 0xb79b, 0xa612, 0xd2ad, 0xc324, 0xf1bf, 0xe036,
+    0x18c1, 0x0948, 0x3bd3, 0x2a5a, 0x5ee5, 0x4f6c, 0x7df7, 0x6c7e,
+    0xa50a, 0xb483, 0x8618, 0x9791, 0xe32e, 0xf2a7, 0xc03c, 0xd1b5,
+    0x2942, 0x38cb, 0x0a50, 0x1bd9, 0x6f66, 0x7eef, 0x4c74, 0x5dfd,
+    0xb58b, 0xa402, 0x9699, 0x8710, 0xf3af, 0xe226, 0xd0bd, 0xc134,
+    0x39c3, 0x284a, 0x1ad1, 0x0b58, 0x7fe7, 0x6e6e, 0x5cf5, 0x4d7c,
+    0xc60c, 0xd785, 0xe51e, 0xf497, 0x8028, 0x91a1, 0xa33a, 0xb2b3,
+    0x4a44, 0x5bcd, 0x6956, 0x78df, 0x0c60, 0x1de9, 0x2f72, 0x3efb,
+    0xd68d, 0xc704, 0xf59f, 0xe416, 0x90a9, 0x8120, 0xb3bb, 0xa232,
+    0x5ac5, 0x4b4c, 0x79d7, 0x685e, 0x1ce1, 0x0d68, 0x3ff3, 0x2e7a,
+    0xe70e, 0xf687, 0xc41c, 0xd595, 0xa12a, 0xb0a3, 0x8238, 0x93b1,
+    0x6b46, 0x7acf, 0x4854, 0x59dd, 0x2d62, 0x3ceb, 0x0e70, 0x1ff9,
+    0xf78f, 0xe606, 0xd49d, 0xc514, 0xb1ab, 0xa022, 0x92b9, 0x8330,
+    0x7bc7, 0x6a4e, 0x58d5, 0x495c, 0x3de3, 0x2c6a, 0x1ef1, 0x0f78
+};
+
+/*
+ * Similar table to calculate the CRC16 variant known as CRC-CCITT-FALSE.
+ * Unlike the table above, bits are processed MSB-first (not reflected),
+ * and the final value is not augmented.
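+ *
+ * A matching sketch for this table (illustrative only) shifts each
+ * byte in from the top with the plain polynomial 0x1021:
+ *
+ *     uint16_t entry = byte << 8;
+ *     for (int i = 0; i < 8; i++) {
+ *         entry = (entry & 0x8000) ? (entry << 1) ^ 0x1021 : entry << 1;
+ *     }
+ *
+ * e.g. byte 0x01 yields 0x1021, the second value in the table below.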
+ */ +uint16_t const crc_ccitt_false_table[256] = { + 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7, + 0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF, + 0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6, + 0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE, + 0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485, + 0xA56A, 0xB54B, 0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D, + 0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4, + 0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC, + 0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823, + 0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B, + 0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12, + 0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A, + 0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41, + 0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49, + 0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 0x3E13, 0x2E32, 0x1E51, 0x0E70, + 0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78, + 0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F, + 0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067, + 0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E, + 0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256, + 0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D, + 0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405, + 0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C, + 0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634, + 0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB, + 0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3, + 0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A, + 0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92, + 0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9, + 0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1, + 0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8, + 0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0 +}; + +/** + * crc_ccitt - recompute the CRC (CRC-CCITT variant) + * for the data buffer + * + * @crc: previous CRC value + * @buffer: data pointer + * @len: number of bytes in the buffer + */ +uint16_t crc_ccitt(uint16_t crc, uint8_t const *buffer, size_t len) +{ + while (len--) { + crc = crc_ccitt_byte(crc, *buffer++); + } + return crc; +} + +/** + * crc_ccitt_false - recompute the CRC (CRC-CCITT-FALSE variant) + * for the data buffer + * + * @crc: previous CRC value + * @buffer: data pointer + * @len: number of bytes in the buffer + */ +uint16_t crc_ccitt_false(uint16_t crc, uint8_t const *buffer, size_t len) +{ + while (len--) { + crc = crc_ccitt_false_byte(crc, *buffer++); + } + return crc; +} diff --git a/util/crc32c.c b/util/crc32c.c new file mode 100644 index 000000000..762657d85 --- /dev/null +++ b/util/crc32c.c @@ -0,0 +1,115 @@ +/* + * Castagnoli CRC32C Checksum Algorithm + * + * Polynomial: 0x11EDC6F41 + * + * Castagnoli93: Guy Castagnoli and Stefan Braeuer and Martin Herrman + * "Optimization of Cyclic Redundancy-Check Codes with 24 + * and 32 Parity Bits",IEEE Transactions on Communication, + * Volume 41, Number 6, June 1993 + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + * Jeff Cody <jcody@redhat.com> + * + * Based on the Linux kernel cryptographic crc32c module, + * + * Copyright (c) 2004 Cisco Systems, Inc. 
+ * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include "qemu/osdep.h" +#include "qemu/crc32c.h" + +/* + * This is the CRC-32C table + * Generated with: + * width = 32 bits + * poly = 0x1EDC6F41 + * reflect input bytes = true + * reflect output bytes = true + */ + +static const uint32_t crc32c_table[256] = { + 0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, + 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, + 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, + 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, + 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, + 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, + 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, + 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, + 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL, + 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, + 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, + 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, + 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, + 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL, + 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, + 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L, + 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, + 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L, + 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, + 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, + 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, + 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, + 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L, + 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, + 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L, + 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, + 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, + 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, + 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, + 0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L, + 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, + 0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L, + 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, + 0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L, + 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L, + 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, + 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, + 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, + 0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL, + 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, + 0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L, + 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, + 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL, + 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L, + 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL, + 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, + 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, + 0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, + 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L, + 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL, + 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, + 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, + 0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL, + 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L, + 0x590AB964L, 0xAB613A67L, 
0xB831C993L, 0x4A5A4A90L, + 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, + 0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, + 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L, + 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L, + 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL, + 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L, + 0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL, + 0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL, + 0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L +}; + + +uint32_t crc32c(uint32_t crc, const uint8_t *data, unsigned int length) +{ + while (length--) { + crc = crc32c_table[(crc ^ *data++) & 0xFFL] ^ (crc >> 8); + } + return crc^0xffffffff; +} + diff --git a/util/cutils.c b/util/cutils.c new file mode 100644 index 000000000..c9b91e753 --- /dev/null +++ b/util/cutils.c @@ -0,0 +1,1059 @@ +/* + * Simple C functions to supplement the C library + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu/host-utils.h" +#include <math.h> + +#include "qemu-common.h" +#include "qemu/sockets.h" +#include "qemu/iov.h" +#include "net/net.h" +#include "qemu/ctype.h" +#include "qemu/cutils.h" +#include "qemu/error-report.h" + +void strpadcpy(char *buf, int buf_size, const char *str, char pad) +{ + int len = qemu_strnlen(str, buf_size); + memcpy(buf, str, len); + memset(buf + len, pad, buf_size - len); +} + +void pstrcpy(char *buf, int buf_size, const char *str) +{ + int c; + char *q = buf; + + if (buf_size <= 0) + return; + + for(;;) { + c = *str++; + if (c == 0 || q >= buf + buf_size - 1) + break; + *q++ = c; + } + *q = '\0'; +} + +/* strcat and truncate. */ +char *pstrcat(char *buf, int buf_size, const char *s) +{ + int len; + len = strlen(buf); + if (len < buf_size) + pstrcpy(buf + len, buf_size - len, s); + return buf; +} + +int strstart(const char *str, const char *val, const char **ptr) +{ + const char *p, *q; + p = str; + q = val; + while (*q != '\0') { + if (*p != *q) + return 0; + p++; + q++; + } + if (ptr) + *ptr = p; + return 1; +} + +int stristart(const char *str, const char *val, const char **ptr) +{ + const char *p, *q; + p = str; + q = val; + while (*q != '\0') { + if (qemu_toupper(*p) != qemu_toupper(*q)) + return 0; + p++; + q++; + } + if (ptr) + *ptr = p; + return 1; +} + +/* XXX: use host strnlen if available ? 
*/
+int qemu_strnlen(const char *s, int max_len)
+{
+    int i;
+
+    for (i = 0; i < max_len; i++) {
+        if (s[i] == '\0') {
+            break;
+        }
+    }
+    return i;
+}
+
+char *qemu_strsep(char **input, const char *delim)
+{
+    char *result = *input;
+    if (result != NULL) {
+        char *p;
+
+        for (p = result; *p != '\0'; p++) {
+            if (strchr(delim, *p)) {
+                break;
+            }
+        }
+        if (*p == '\0') {
+            *input = NULL;
+        } else {
+            *p = '\0';
+            *input = p + 1;
+        }
+    }
+    return result;
+}
+
+time_t mktimegm(struct tm *tm)
+{
+    time_t t;
+    int y = tm->tm_year + 1900, m = tm->tm_mon + 1, d = tm->tm_mday;
+    if (m < 3) {
+        m += 12;
+        y--;
+    }
+    t = 86400ULL * (d + (153 * m - 457) / 5 + 365 * y + y / 4 - y / 100 +
+                    y / 400 - 719469);
+    t += 3600 * tm->tm_hour + 60 * tm->tm_min + tm->tm_sec;
+    return t;
+}
+
+/*
+ * Make sure data goes on disk, but if possible do not bother to
+ * write out the inode just for timestamp updates.
+ *
+ * Unfortunately even in 2009 many operating systems do not support
+ * fdatasync and have to fall back to fsync.
+ */
+int qemu_fdatasync(int fd)
+{
+#ifdef CONFIG_FDATASYNC
+    return fdatasync(fd);
+#else
+    return fsync(fd);
+#endif
+}
+
+/**
+ * Sync changes made to the memory mapped file back to the backing
+ * storage. On POSIX compliant systems this falls back to a regular
+ * msync() call. Otherwise it triggers a whole-file sync (including
+ * the metadata, as there is no way to skip that in this case).
+ *
+ * @addr - start of the memory area to be synced
+ * @length - length of the area to be synced
+ * @fd - file descriptor for the file to be synced
+ *       (mandatory only for POSIX non-compliant systems)
+ */
+int qemu_msync(void *addr, size_t length, int fd)
+{
+#ifdef CONFIG_POSIX
+    size_t align_mask = ~(qemu_real_host_page_size - 1);
+
+    /**
+     * There are no strict requirements on the length of the mapping
+     * to be synced, but the length still needs to absorb the address
+     * alignment adjustment below. Additionally, round the size up to
+     * a multiple of the page size.
+     */
+    length += ((uintptr_t)addr & (qemu_real_host_page_size - 1));
+    length = (length + ~align_mask) & align_mask;
+
+    addr = (void *)((uintptr_t)addr & align_mask);
+
+    return msync(addr, length, MS_SYNC);
+#else /* CONFIG_POSIX */
+    /**
+     * Perform the sync based on the file descriptor.
+     * The sync range will most probably be wider than the one
+     * requested - but it will still get the job done.
+     */
+    return qemu_fdatasync(fd);
+#endif /* CONFIG_POSIX */
+}
+
+#ifndef _WIN32
+/* Sets a specific flag */
+int fcntl_setfl(int fd, int flag)
+{
+    int flags;
+
+    flags = fcntl(fd, F_GETFL);
+    if (flags == -1) {
+        return -errno;
+    }
+
+    if (fcntl(fd, F_SETFL, flags | flag) == -1) {
+        return -errno;
+    }
+
+    return 0;
+}
+#endif
+
+static int64_t suffix_mul(char suffix, int64_t unit)
+{
+    switch (qemu_toupper(suffix)) {
+    case 'B':
+        return 1;
+    case 'K':
+        return unit;
+    case 'M':
+        return unit * unit;
+    case 'G':
+        return unit * unit * unit;
+    case 'T':
+        return unit * unit * unit * unit;
+    case 'P':
+        return unit * unit * unit * unit * unit;
+    case 'E':
+        return unit * unit * unit * unit * unit * unit;
+    }
+    return -1;
+}
+
+/*
+ * Convert size string to bytes.
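+ * (For instance, "1.5M" with the default binary unit parses to 1572864.)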
+ * + * The size parsing supports the following syntaxes + * - 12345 - decimal, scale determined by @default_suffix and @unit + * - 12345{bBkKmMgGtTpPeE} - decimal, scale determined by suffix and @unit + * - 12345.678{kKmMgGtTpPeE} - decimal, scale determined by suffix, and + * fractional portion is truncated to byte + * - 0x7fEE - hexadecimal, unit determined by @default_suffix + * + * The following cause a deprecation warning, and may be removed in the future + * - 0xabc{kKmMgGtTpP} - hex with scaling suffix + * + * The following are intentionally not supported + * - octal, such as 08 + * - fractional hex, such as 0x1.8 + * - floating point exponents, such as 1e3 + * + * The end pointer will be returned in *end, if not NULL. If there is + * no fraction, the input can be decimal or hexadecimal; if there is a + * fraction, then the input must be decimal and there must be a suffix + * (possibly by @default_suffix) larger than Byte, and the fractional + * portion may suffer from precision loss or rounding. The input must + * be positive. + * + * Return -ERANGE on overflow (with *@end advanced), and -EINVAL on + * other error (with *@end left unchanged). + */ +static int do_strtosz(const char *nptr, const char **end, + const char default_suffix, int64_t unit, + uint64_t *result) +{ + int retval; + const char *endptr, *f; + unsigned char c; + bool hex = false; + uint64_t val, valf = 0; + int64_t mul; + + /* Parse integral portion as decimal. */ + retval = qemu_strtou64(nptr, &endptr, 10, &val); + if (retval) { + goto out; + } + if (memchr(nptr, '-', endptr - nptr) != NULL) { + endptr = nptr; + retval = -EINVAL; + goto out; + } + if (val == 0 && (*endptr == 'x' || *endptr == 'X')) { + /* Input looks like hex, reparse, and insist on no fraction. */ + retval = qemu_strtou64(nptr, &endptr, 16, &val); + if (retval) { + goto out; + } + if (*endptr == '.') { + endptr = nptr; + retval = -EINVAL; + goto out; + } + hex = true; + } else if (*endptr == '.') { + /* + * Input looks like a fraction. Make sure even 1.k works + * without fractional digits. If we see an exponent, treat + * the entire input as invalid instead. + */ + double fraction; + + f = endptr; + retval = qemu_strtod_finite(f, &endptr, &fraction); + if (retval) { + endptr++; + } else if (memchr(f, 'e', endptr - f) || memchr(f, 'E', endptr - f)) { + endptr = nptr; + retval = -EINVAL; + goto out; + } else { + /* Extract into a 64-bit fixed-point fraction. */ + valf = (uint64_t)(fraction * 0x1p64); + } + } + c = *endptr; + mul = suffix_mul(c, unit); + if (mul > 0) { + if (hex) { + warn_report("Using a multiplier suffix on hex numbers " + "is deprecated: %s", nptr); + } + endptr++; + } else { + mul = suffix_mul(default_suffix, unit); + assert(mul > 0); + } + if (mul == 1) { + /* When a fraction is present, a scale is required. */ + if (valf != 0) { + endptr = nptr; + retval = -EINVAL; + goto out; + } + } else { + uint64_t valh, tmp; + + /* Compute exact result: 64.64 x 64.0 -> 128.64 fixed point */ + mulu64(&val, &valh, val, mul); + mulu64(&valf, &tmp, valf, mul); + val += tmp; + valh += val < tmp; + + /* Round 0.5 upward. */ + tmp = valf >> 63; + val += tmp; + valh += val < tmp; + + /* Report overflow. 
*/ + if (valh != 0) { + retval = -ERANGE; + goto out; + } + } + + retval = 0; + +out: + if (end) { + *end = endptr; + } else if (*endptr) { + retval = -EINVAL; + } + if (retval == 0) { + *result = val; + } + + return retval; +} + +int qemu_strtosz(const char *nptr, const char **end, uint64_t *result) +{ + return do_strtosz(nptr, end, 'B', 1024, result); +} + +int qemu_strtosz_MiB(const char *nptr, const char **end, uint64_t *result) +{ + return do_strtosz(nptr, end, 'M', 1024, result); +} + +int qemu_strtosz_metric(const char *nptr, const char **end, uint64_t *result) +{ + return do_strtosz(nptr, end, 'B', 1000, result); +} + +/** + * Helper function for error checking after strtol() and the like + */ +static int check_strtox_error(const char *nptr, char *ep, + const char **endptr, bool check_zero, + int libc_errno) +{ + assert(ep >= nptr); + + /* Windows has a bug in that it fails to parse 0 from "0x" in base 16 */ + if (check_zero && ep == nptr && libc_errno == 0) { + char *tmp; + + errno = 0; + if (strtol(nptr, &tmp, 10) == 0 && errno == 0 && + (*tmp == 'x' || *tmp == 'X')) { + ep = tmp; + } + } + + if (endptr) { + *endptr = ep; + } + + /* Turn "no conversion" into an error */ + if (libc_errno == 0 && ep == nptr) { + return -EINVAL; + } + + /* Fail when we're expected to consume the string, but didn't */ + if (!endptr && *ep) { + return -EINVAL; + } + + return -libc_errno; +} + +/** + * Convert string @nptr to an integer, and store it in @result. + * + * This is a wrapper around strtol() that is harder to misuse. + * Semantics of @nptr, @endptr, @base match strtol() with differences + * noted below. + * + * @nptr may be null, and no conversion is performed then. + * + * If no conversion is performed, store @nptr in *@endptr and return + * -EINVAL. + * + * If @endptr is null, and the string isn't fully converted, return + * -EINVAL. This is the case when the pointer that would be stored in + * a non-null @endptr points to a character other than '\0'. + * + * If the conversion overflows @result, store INT_MAX in @result, + * and return -ERANGE. + * + * If the conversion underflows @result, store INT_MIN in @result, + * and return -ERANGE. + * + * Else store the converted value in @result, and return zero. + */ +int qemu_strtoi(const char *nptr, const char **endptr, int base, + int *result) +{ + char *ep; + long long lresult; + + assert((unsigned) base <= 36 && base != 1); + if (!nptr) { + if (endptr) { + *endptr = nptr; + } + return -EINVAL; + } + + errno = 0; + lresult = strtoll(nptr, &ep, base); + if (lresult < INT_MIN) { + *result = INT_MIN; + errno = ERANGE; + } else if (lresult > INT_MAX) { + *result = INT_MAX; + errno = ERANGE; + } else { + *result = lresult; + } + return check_strtox_error(nptr, ep, endptr, lresult == 0, errno); +} + +/** + * Convert string @nptr to an unsigned integer, and store it in @result. + * + * This is a wrapper around strtoul() that is harder to misuse. + * Semantics of @nptr, @endptr, @base match strtoul() with differences + * noted below. + * + * @nptr may be null, and no conversion is performed then. + * + * If no conversion is performed, store @nptr in *@endptr and return + * -EINVAL. + * + * If @endptr is null, and the string isn't fully converted, return + * -EINVAL. This is the case when the pointer that would be stored in + * a non-null @endptr points to a character other than '\0'. + * + * If the conversion overflows @result, store UINT_MAX in @result, + * and return -ERANGE. + * + * Else store the converted value in @result, and return zero. 
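+ * (Illustrative: qemu_strtoui("0x2a", NULL, 0, &val) stores 42 in val
+ * and returns 0.)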
+ * + * Note that a number with a leading minus sign gets converted without + * the minus sign, checked for overflow (see above), then negated (in + * @result's type). This is exactly how strtoul() works. + */ +int qemu_strtoui(const char *nptr, const char **endptr, int base, + unsigned int *result) +{ + char *ep; + long long lresult; + + assert((unsigned) base <= 36 && base != 1); + if (!nptr) { + if (endptr) { + *endptr = nptr; + } + return -EINVAL; + } + + errno = 0; + lresult = strtoull(nptr, &ep, base); + + /* Windows returns 1 for negative out-of-range values. */ + if (errno == ERANGE) { + *result = -1; + } else { + if (lresult > UINT_MAX) { + *result = UINT_MAX; + errno = ERANGE; + } else if (lresult < INT_MIN) { + *result = UINT_MAX; + errno = ERANGE; + } else { + *result = lresult; + } + } + return check_strtox_error(nptr, ep, endptr, lresult == 0, errno); +} + +/** + * Convert string @nptr to a long integer, and store it in @result. + * + * This is a wrapper around strtol() that is harder to misuse. + * Semantics of @nptr, @endptr, @base match strtol() with differences + * noted below. + * + * @nptr may be null, and no conversion is performed then. + * + * If no conversion is performed, store @nptr in *@endptr and return + * -EINVAL. + * + * If @endptr is null, and the string isn't fully converted, return + * -EINVAL. This is the case when the pointer that would be stored in + * a non-null @endptr points to a character other than '\0'. + * + * If the conversion overflows @result, store LONG_MAX in @result, + * and return -ERANGE. + * + * If the conversion underflows @result, store LONG_MIN in @result, + * and return -ERANGE. + * + * Else store the converted value in @result, and return zero. + */ +int qemu_strtol(const char *nptr, const char **endptr, int base, + long *result) +{ + char *ep; + + assert((unsigned) base <= 36 && base != 1); + if (!nptr) { + if (endptr) { + *endptr = nptr; + } + return -EINVAL; + } + + errno = 0; + *result = strtol(nptr, &ep, base); + return check_strtox_error(nptr, ep, endptr, *result == 0, errno); +} + +/** + * Convert string @nptr to an unsigned long, and store it in @result. + * + * This is a wrapper around strtoul() that is harder to misuse. + * Semantics of @nptr, @endptr, @base match strtoul() with differences + * noted below. + * + * @nptr may be null, and no conversion is performed then. + * + * If no conversion is performed, store @nptr in *@endptr and return + * -EINVAL. + * + * If @endptr is null, and the string isn't fully converted, return + * -EINVAL. This is the case when the pointer that would be stored in + * a non-null @endptr points to a character other than '\0'. + * + * If the conversion overflows @result, store ULONG_MAX in @result, + * and return -ERANGE. + * + * Else store the converted value in @result, and return zero. + * + * Note that a number with a leading minus sign gets converted without + * the minus sign, checked for overflow (see above), then negated (in + * @result's type). This is exactly how strtoul() works. + */ +int qemu_strtoul(const char *nptr, const char **endptr, int base, + unsigned long *result) +{ + char *ep; + + assert((unsigned) base <= 36 && base != 1); + if (!nptr) { + if (endptr) { + *endptr = nptr; + } + return -EINVAL; + } + + errno = 0; + *result = strtoul(nptr, &ep, base); + /* Windows returns 1 for negative out-of-range values. */ + if (errno == ERANGE) { + *result = -1; + } + return check_strtox_error(nptr, ep, endptr, *result == 0, errno); +} + +/** + * Convert string @nptr to an int64_t. 
+ * + * Works like qemu_strtol(), except it stores INT64_MAX on overflow, + * and INT64_MIN on underflow. + */ +int qemu_strtoi64(const char *nptr, const char **endptr, int base, + int64_t *result) +{ + char *ep; + + assert((unsigned) base <= 36 && base != 1); + if (!nptr) { + if (endptr) { + *endptr = nptr; + } + return -EINVAL; + } + + /* This assumes int64_t is long long TODO relax */ + QEMU_BUILD_BUG_ON(sizeof(int64_t) != sizeof(long long)); + errno = 0; + *result = strtoll(nptr, &ep, base); + return check_strtox_error(nptr, ep, endptr, *result == 0, errno); +} + +/** + * Convert string @nptr to an uint64_t. + * + * Works like qemu_strtoul(), except it stores UINT64_MAX on overflow. + */ +int qemu_strtou64(const char *nptr, const char **endptr, int base, + uint64_t *result) +{ + char *ep; + + assert((unsigned) base <= 36 && base != 1); + if (!nptr) { + if (endptr) { + *endptr = nptr; + } + return -EINVAL; + } + + /* This assumes uint64_t is unsigned long long TODO relax */ + QEMU_BUILD_BUG_ON(sizeof(uint64_t) != sizeof(unsigned long long)); + errno = 0; + *result = strtoull(nptr, &ep, base); + /* Windows returns 1 for negative out-of-range values. */ + if (errno == ERANGE) { + *result = -1; + } + return check_strtox_error(nptr, ep, endptr, *result == 0, errno); +} + +/** + * Convert string @nptr to a double. + * + * This is a wrapper around strtod() that is harder to misuse. + * Semantics of @nptr and @endptr match strtod() with differences + * noted below. + * + * @nptr may be null, and no conversion is performed then. + * + * If no conversion is performed, store @nptr in *@endptr and return + * -EINVAL. + * + * If @endptr is null, and the string isn't fully converted, return + * -EINVAL. This is the case when the pointer that would be stored in + * a non-null @endptr points to a character other than '\0'. + * + * If the conversion overflows, store +/-HUGE_VAL in @result, depending + * on the sign, and return -ERANGE. + * + * If the conversion underflows, store +/-0.0 in @result, depending on the + * sign, and return -ERANGE. + * + * Else store the converted value in @result, and return zero. + */ +int qemu_strtod(const char *nptr, const char **endptr, double *result) +{ + char *ep; + + if (!nptr) { + if (endptr) { + *endptr = nptr; + } + return -EINVAL; + } + + errno = 0; + *result = strtod(nptr, &ep); + return check_strtox_error(nptr, ep, endptr, false, errno); +} + +/** + * Convert string @nptr to a finite double. + * + * Works like qemu_strtod(), except that "NaN" and "inf" are rejected + * with -EINVAL and no conversion is performed. + */ +int qemu_strtod_finite(const char *nptr, const char **endptr, double *result) +{ + double tmp; + int ret; + + ret = qemu_strtod(nptr, endptr, &tmp); + if (!ret && !isfinite(tmp)) { + if (endptr) { + *endptr = nptr; + } + ret = -EINVAL; + } + + if (ret != -EINVAL) { + *result = tmp; + } + return ret; +} + +/** + * Searches for the first occurrence of 'c' in 's', and returns a pointer + * to the trailing null byte if none was found. 
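+ * E.g. qemu_strchrnul("a=b", '=') points at the '='; with no match the
+ * result points at the terminating NUL, never NULL.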
+ */
+#ifndef HAVE_STRCHRNUL
+const char *qemu_strchrnul(const char *s, int c)
+{
+    const char *e = strchr(s, c);
+    if (!e) {
+        e = s + strlen(s);
+    }
+    return e;
+}
+#endif
+
+/**
+ * parse_uint:
+ *
+ * @s: String to parse
+ * @value: Destination for parsed integer value
+ * @endptr: Destination for pointer to first character not consumed
+ * @base: integer base, between 2 and 36 inclusive, or 0
+ *
+ * Parse unsigned integer
+ *
+ * Parsed syntax is like strtoull()'s: arbitrary whitespace, a single optional
+ * '+' or '-', an optional "0x" if @base is 0 or 16, one or more digits.
+ *
+ * If @s is null, or @base is invalid, or @s doesn't start with an
+ * integer in the syntax above, set *@value to 0, *@endptr to @s, and
+ * return -EINVAL.
+ *
+ * Set *@endptr to point right beyond the parsed integer (even if the integer
+ * overflows or is negative, all digits will be parsed and *@endptr will
+ * point right beyond them).
+ *
+ * If the integer is negative, set *@value to 0, and return -ERANGE.
+ *
+ * If the integer overflows unsigned long long, set *@value to
+ * ULLONG_MAX, and return -ERANGE.
+ *
+ * Else, set *@value to the parsed integer, and return 0.
+ */
+int parse_uint(const char *s, unsigned long long *value, char **endptr,
+               int base)
+{
+    int r = 0;
+    char *endp = (char *)s;
+    unsigned long long val = 0;
+
+    assert((unsigned) base <= 36 && base != 1);
+    if (!s) {
+        r = -EINVAL;
+        goto out;
+    }
+
+    errno = 0;
+    val = strtoull(s, &endp, base);
+    if (errno) {
+        r = -errno;
+        goto out;
+    }
+
+    if (endp == s) {
+        r = -EINVAL;
+        goto out;
+    }
+
+    /* make sure we reject negative numbers: */
+    while (qemu_isspace(*s)) {
+        s++;
+    }
+    if (*s == '-') {
+        val = 0;
+        r = -ERANGE;
+        goto out;
+    }
+
+out:
+    *value = val;
+    *endptr = endp;
+    return r;
+}
+
+/**
+ * parse_uint_full:
+ *
+ * @s: String to parse
+ * @value: Destination for parsed integer value
+ * @base: integer base, between 2 and 36 inclusive, or 0
+ *
+ * Parse unsigned integer from entire string
+ *
+ * Behaves like parse_uint(), but additionally checks that nothing
+ * follows the parsed number. If extra characters are present after
+ * the parsed number, the function returns -EINVAL, and *@value is
+ * set to 0.
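+ *
+ * Illustrative: parse_uint_full("123", &v, 10) returns 0 and sets v to
+ * 123, while parse_uint_full("123abc", &v, 10) returns -EINVAL and sets
+ * v to 0 (plain parse_uint() would accept it, consuming just "123").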
+ */
+int parse_uint_full(const char *s, unsigned long long *value, int base)
+{
+    char *endp;
+    int r;
+
+    r = parse_uint(s, value, &endp, base);
+    if (r < 0) {
+        return r;
+    }
+    if (*endp) {
+        *value = 0;
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+int qemu_parse_fd(const char *param)
+{
+    long fd;
+    char *endptr;
+
+    errno = 0;
+    fd = strtol(param, &endptr, 10);
+    if (param == endptr            /* no conversion performed */ ||
+        errno != 0                 /* not representable as long; possibly others */ ||
+        *endptr != '\0'            /* final string not empty */ ||
+        fd < 0                     /* invalid as file descriptor */ ||
+        fd > INT_MAX               /* not representable as int */) {
+        return -1;
+    }
+    return fd;
+}
+
+/*
+ * Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128)
+ * Input is limited to 14-bit numbers
+ */
+int uleb128_encode_small(uint8_t *out, uint32_t n)
+{
+    g_assert(n <= 0x3fff);
+    if (n < 0x80) {
+        *out = n;
+        return 1;
+    } else {
+        *out++ = (n & 0x7f) | 0x80;
+        *out = n >> 7;
+        return 2;
+    }
+}
+
+int uleb128_decode_small(const uint8_t *in, uint32_t *n)
+{
+    if (!(*in & 0x80)) {
+        *n = *in;
+        return 1;
+    } else {
+        *n = *in++ & 0x7f;
+        /* the number exceeds 14 bits if the continuation bit is still set */
+        if (*in & 0x80) {
+            return -1;
+        }
+        *n |= *in << 7;
+        return 2;
+    }
+}
+
+/*
+ * helper to parse debug environment variables
+ */
+int parse_debug_env(const char *name, int max, int initial)
+{
+    char *debug_env = getenv(name);
+    char *inv = NULL;
+    long debug;
+
+    if (!debug_env) {
+        return initial;
+    }
+    errno = 0;
+    debug = strtol(debug_env, &inv, 10);
+    if (inv == debug_env) {
+        return initial;
+    }
+    if (debug < 0 || debug > max || errno != 0) {
+        warn_report("%s not in [0, %d]", name, max);
+        return initial;
+    }
+    return debug;
+}
+
+/*
+ * Helper to print ethernet mac address
+ */
+const char *qemu_ether_ntoa(const MACAddr *mac)
+{
+    static char ret[18];
+
+    snprintf(ret, sizeof(ret), "%02x:%02x:%02x:%02x:%02x:%02x",
+             mac->a[0], mac->a[1], mac->a[2], mac->a[3], mac->a[4], mac->a[5]);
+
+    return ret;
+}
+
+/*
+ * Return human readable string for size @val.
+ * @val can be anything that uint64_t allows (no more than "16 EiB").
+ * Use IEC binary units like KiB, MiB, and so forth.
+ * Caller is responsible for freeing the result with g_free().
+ */
+char *size_to_str(uint64_t val)
+{
+    static const char *suffixes[] = { "", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei" };
+    uint64_t div;
+    int i;
+
+    /*
+     * The exponent (returned in i) minus one gives us
+     * floor(log2(val * 1024 / 1000)). The correction makes us
+     * switch to the higher power when the integer part is >= 1000.
+     * (see e41b509d68afb1f for more info)
+     */
+    frexp(val / (1000.0 / 1024.0), &i);
+    i = (i - 1) / 10;
+    div = 1ULL << (i * 10);
+
+    return g_strdup_printf("%0.3g %sB", (double)val / div, suffixes[i]);
+}
+
+char *freq_to_str(uint64_t freq_hz)
+{
+    static const char *const suffixes[] = { "", "K", "M", "G", "T", "P", "E" };
+    double freq = freq_hz;
+    size_t idx = 0;
+
+    while (freq >= 1000.0) {
+        freq /= 1000.0;
+        idx++;
+    }
+    assert(idx < ARRAY_SIZE(suffixes));
+
+    return g_strdup_printf("%0.3g %sHz", freq, suffixes[idx]);
+}
+
+int qemu_pstrcmp0(const char **str1, const char **str2)
+{
+    return g_strcmp0(*str1, *str2);
+}
+
+static inline bool starts_with_prefix(const char *dir)
+{
+    size_t prefix_len = strlen(CONFIG_PREFIX);
+    return !memcmp(dir, CONFIG_PREFIX, prefix_len) &&
+           (!dir[prefix_len] || G_IS_DIR_SEPARATOR(dir[prefix_len]));
+}
+
+/* Return the next path component in dir, and store its length in *p_len.
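+ * E.g. given "/usr/./bin" it skips the leading separator and "." and
+ * returns a pointer to "usr" with *p_len set to 3.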
*/ +static inline const char *next_component(const char *dir, int *p_len) +{ + int len; + while ((*dir && G_IS_DIR_SEPARATOR(*dir)) || + (*dir == '.' && (G_IS_DIR_SEPARATOR(dir[1]) || dir[1] == '\0'))) { + dir++; + } + len = 0; + while (dir[len] && !G_IS_DIR_SEPARATOR(dir[len])) { + len++; + } + *p_len = len; + return dir; +} + +char *get_relocated_path(const char *dir) +{ + size_t prefix_len = strlen(CONFIG_PREFIX); + const char *bindir = CONFIG_BINDIR; + const char *exec_dir = qemu_get_exec_dir(); + GString *result; + int len_dir, len_bindir; + + /* Fail if qemu_init_exec_dir was not called. */ + assert(exec_dir[0]); + if (!starts_with_prefix(dir) || !starts_with_prefix(bindir)) { + return g_strdup(dir); + } + + result = g_string_new(exec_dir); + + /* Advance over common components. */ + len_dir = len_bindir = prefix_len; + do { + dir += len_dir; + bindir += len_bindir; + dir = next_component(dir, &len_dir); + bindir = next_component(bindir, &len_bindir); + } while (len_dir && len_dir == len_bindir && !memcmp(dir, bindir, len_dir)); + + /* Ascend from bindir to the common prefix with dir. */ + while (len_bindir) { + bindir += len_bindir; + g_string_append(result, "/.."); + bindir = next_component(bindir, &len_bindir); + } + + if (*dir) { + assert(G_IS_DIR_SEPARATOR(dir[-1])); + g_string_append(result, dir - 1); + } + return g_string_free(result, false); +} diff --git a/util/dbus.c b/util/dbus.c new file mode 100644 index 000000000..9099dc5b4 --- /dev/null +++ b/util/dbus.c @@ -0,0 +1,57 @@ +/* + * Helpers for using D-Bus + * + * Copyright (C) 2019 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/dbus.h" +#include "qemu/error-report.h" +#include "qapi/error.h" + +/* + * qemu_dbus_get_queued_owners() - return the list of queued unique names + * @connection: A GDBusConnection + * @name: a service name + * + * Return: a GStrv of unique names, or NULL on failure. + */ +GStrv +qemu_dbus_get_queued_owners(GDBusConnection *connection, const char *name, + Error **errp) +{ + g_autoptr(GDBusProxy) proxy = NULL; + g_autoptr(GVariant) result = NULL; + g_autoptr(GVariant) child = NULL; + g_autoptr(GError) err = NULL; + + proxy = g_dbus_proxy_new_sync(connection, G_DBUS_PROXY_FLAGS_NONE, NULL, + "org.freedesktop.DBus", + "/org/freedesktop/DBus", + "org.freedesktop.DBus", + NULL, &err); + if (!proxy) { + error_setg(errp, "Failed to create DBus proxy: %s", err->message); + return NULL; + } + + result = g_dbus_proxy_call_sync(proxy, "ListQueuedOwners", + g_variant_new("(s)", name), + G_DBUS_CALL_FLAGS_NO_AUTO_START, + -1, NULL, &err); + if (!result) { + if (g_error_matches(err, + G_DBUS_ERROR, + G_DBUS_ERROR_NAME_HAS_NO_OWNER)) { + return g_new0(char *, 1); + } + error_setg(errp, "Failed to call ListQueuedOwners: %s", err->message); + return NULL; + } + + child = g_variant_get_child_value(result, 0); + return g_variant_dup_strv(child, NULL); +} diff --git a/util/drm.c b/util/drm.c new file mode 100644 index 000000000..dae8ffebc --- /dev/null +++ b/util/drm.c @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2015-2016 Gerd Hoffmann <kraxel@redhat.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see <http://www.gnu.org/licenses/>.
+ */
+#include "qemu/osdep.h"
+#include "qemu/drm.h"
+
+#include <glob.h>
+#include <dirent.h>
+
+int qemu_drm_rendernode_open(const char *rendernode)
+{
+    DIR *dir;
+    struct dirent *e;
+    struct stat st;
+    int r, fd, ret;
+    char *p;
+
+    if (rendernode) {
+        return open(rendernode, O_RDWR | O_CLOEXEC | O_NOCTTY | O_NONBLOCK);
+    }
+
+    dir = opendir("/dev/dri");
+    if (!dir) {
+        return -1;
+    }
+
+    fd = -1;
+    while ((e = readdir(dir))) {
+        if (strncmp(e->d_name, "renderD", 7)) {
+            continue;
+        }
+
+        p = g_strdup_printf("/dev/dri/%s", e->d_name);
+
+        r = open(p, O_RDWR | O_CLOEXEC | O_NOCTTY | O_NONBLOCK);
+        if (r < 0) {
+            g_free(p);
+            continue;
+        }
+
+        /*
+         * prefer fstat() over checking e->d_type == DT_CHR for
+         * portability reasons
+         */
+        ret = fstat(r, &st);
+        if (ret < 0 || (st.st_mode & S_IFMT) != S_IFCHR) {
+            close(r);
+            g_free(p);
+            continue;
+        }
+
+        fd = r;
+        g_free(p);
+        break;
+    }
+
+    closedir(dir);
+    if (fd < 0) {
+        return -1;
+    }
+    return fd;
+}
diff --git a/util/envlist.c b/util/envlist.c
new file mode 100644
index 000000000..2bcc13f09
--- /dev/null
+++ b/util/envlist.c
@@ -0,0 +1,232 @@
+#include "qemu/osdep.h"
+#include "qemu/queue.h"
+#include "qemu/envlist.h"
+
+struct envlist_entry {
+    const char *ev_var;                 /* actual env value */
+    QLIST_ENTRY(envlist_entry) ev_link;
+};
+
+struct envlist {
+    QLIST_HEAD(, envlist_entry) el_entries; /* actual entries */
+    size_t el_count;                        /* number of entries */
+};
+
+static int envlist_parse(envlist_t *envlist,
+    const char *env, int (*)(envlist_t *, const char *));
+
+/*
+ * Allocates a new envlist and returns a pointer to it.
+ */
+envlist_t *
+envlist_create(void)
+{
+    envlist_t *envlist;
+
+    envlist = g_malloc(sizeof(*envlist));
+
+    QLIST_INIT(&envlist->el_entries);
+    envlist->el_count = 0;
+
+    return (envlist);
+}
+
+/*
+ * Releases the given envlist and its entries.
+ */
+void
+envlist_free(envlist_t *envlist)
+{
+    struct envlist_entry *entry;
+
+    assert(envlist != NULL);
+
+    while (envlist->el_entries.lh_first != NULL) {
+        entry = envlist->el_entries.lh_first;
+        QLIST_REMOVE(entry, ev_link);
+
+        g_free((char *)entry->ev_var);
+        g_free(entry);
+    }
+    g_free(envlist);
+}
+
+/*
+ * Parses a comma separated list of set/modify environment
+ * variable entries and updates the given envlist accordingly.
+ *
+ * For example:
+ *     envlist_parse_set(el, "HOME=foo,SHELL=/bin/sh");
+ *
+ * inserts/sets environment variables HOME and SHELL.
+ *
+ * Returns 0 on success, errno otherwise.
+ */
+int
+envlist_parse_set(envlist_t *envlist, const char *env)
+{
+    return (envlist_parse(envlist, env, &envlist_setenv));
+}
+
+/*
+ * Parses a comma separated list of unset environment variable
+ * entries and removes the given variables from the given envlist.
+ *
+ * Returns 0 on success, errno otherwise.
+ */
+int
+envlist_parse_unset(envlist_t *envlist, const char *env)
+{
+    return (envlist_parse(envlist, env, &envlist_unsetenv));
+}
+
+/*
+ * Parses a comma separated list of set, modify or unset entries
+ * and calls the given callback for each entry.
+ *
+ * Returns 0 in case of success, errno otherwise.
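+ *
+ * Illustrative: envlist_parse(el, "FOO=1,BAR", cb) invokes the callback
+ * once with "FOO=1" and once with "BAR".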
+ */
+static int
+envlist_parse(envlist_t *envlist, const char *env,
+    int (*callback)(envlist_t *, const char *))
+{
+    char *tmpenv, *envvar;
+    char *envsave = NULL;
+    int ret = 0;
+    assert(callback != NULL);
+
+    if ((envlist == NULL) || (env == NULL))
+        return (EINVAL);
+
+    tmpenv = g_strdup(env);
+    envsave = tmpenv;
+
+    do {
+        envvar = strchr(tmpenv, ',');
+        if (envvar != NULL) {
+            *envvar = '\0';
+        }
+        if ((*callback)(envlist, tmpenv) != 0) {
+            ret = errno;
+            break;
+        }
+        /* advance past the comma only if there was one */
+        if (envvar != NULL) {
+            tmpenv = envvar + 1;
+        }
+    } while (envvar != NULL);
+
+    g_free(envsave);
+    return ret;
+}
+
+/*
+ * Sets an environment value in the envlist, in a manner similar
+ * to putenv(3).
+ *
+ * Returns 0 on success, errno otherwise.
+ */
+int
+envlist_setenv(envlist_t *envlist, const char *env)
+{
+    struct envlist_entry *entry = NULL;
+    const char *eq_sign;
+    size_t envname_len;
+
+    if ((envlist == NULL) || (env == NULL))
+        return (EINVAL);
+
+    /* find the first equals sign in the given env */
+    if ((eq_sign = strchr(env, '=')) == NULL)
+        return (EINVAL);
+    envname_len = eq_sign - env + 1;
+
+    /*
+     * If a variable with the given name already exists,
+     * we remove and release it before allocating a whole
+     * new entry.
+     */
+    for (entry = envlist->el_entries.lh_first; entry != NULL;
+        entry = entry->ev_link.le_next) {
+        if (strncmp(entry->ev_var, env, envname_len) == 0)
+            break;
+    }
+
+    if (entry != NULL) {
+        QLIST_REMOVE(entry, ev_link);
+        g_free((char *)entry->ev_var);
+        g_free(entry);
+    } else {
+        envlist->el_count++;
+    }
+
+    entry = g_malloc(sizeof(*entry));
+    entry->ev_var = g_strdup(env);
+    QLIST_INSERT_HEAD(&envlist->el_entries, entry, ev_link);
+
+    return (0);
+}
+
+/*
+ * Removes the given variable from the envlist, in a manner similar
+ * to unsetenv(3). Returns 0 on success, errno otherwise.
+ */
+int
+envlist_unsetenv(envlist_t *envlist, const char *env)
+{
+    struct envlist_entry *entry;
+    size_t envname_len;
+
+    if ((envlist == NULL) || (env == NULL))
+        return (EINVAL);
+
+    /* env is not allowed to contain '=' */
+    if (strchr(env, '=') != NULL)
+        return (EINVAL);
+
+    /*
+     * Find the requested entry and remove
+     * it from the list.
+     */
+    envname_len = strlen(env);
+    for (entry = envlist->el_entries.lh_first; entry != NULL;
+        entry = entry->ev_link.le_next) {
+        if (strncmp(entry->ev_var, env, envname_len) == 0)
+            break;
+    }
+    if (entry != NULL) {
+        QLIST_REMOVE(entry, ev_link);
+        g_free((char *)entry->ev_var);
+        g_free(entry);
+
+        envlist->el_count--;
+    }
+    return (0);
+}
+
+/*
+ * Returns the given envlist as an array of strings (in the same form
+ * as the global variable environ). The caller must free the returned
+ * memory by calling g_free for each element and for the array itself.
+ * The returned array and the given envlist are not related (no common
+ * references).
+ *
+ * If the caller provides a count pointer, the number of items in the
+ * array is stored there.
+ */
+char **
+envlist_to_environ(const envlist_t *envlist, size_t *count)
+{
+    struct envlist_entry *entry;
+    char **env, **penv;
+
+    penv = env = g_malloc((envlist->el_count + 1) * sizeof(char *));
+
+    for (entry = envlist->el_entries.lh_first; entry != NULL;
+        entry = entry->ev_link.le_next) {
+        *(penv++) = g_strdup(entry->ev_var);
+    }
+    *penv = NULL; /* NULL terminate the list */
+
+    if (count != NULL)
+        *count = envlist->el_count;

+    return (env);
+}
diff --git a/util/error.c b/util/error.c
new file mode 100644
index 000000000..b6c89d141
--- /dev/null
+++ b/util/error.c
@@ -0,0 +1,305 @@
+/*
+ * QEMU Error Objects
+ *
+ * Copyright IBM, Corp. 2011
+ * Copyright (C) 2011-2015 Red Hat, Inc.
+ * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU LGPL, version 2. See + * the COPYING.LIB file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/error-report.h" + +struct Error +{ + char *msg; + ErrorClass err_class; + const char *src, *func; + int line; + GString *hint; +}; + +Error *error_abort; +Error *error_fatal; + +static void error_handle_fatal(Error **errp, Error *err) +{ + if (errp == &error_abort) { + fprintf(stderr, "Unexpected error in %s() at %s:%d:\n", + err->func, err->src, err->line); + error_report("%s", error_get_pretty(err)); + if (err->hint) { + error_printf("%s", err->hint->str); + } + abort(); + } + if (errp == &error_fatal) { + error_report_err(err); + exit(1); + } +} + +static void error_setv(Error **errp, + const char *src, int line, const char *func, + ErrorClass err_class, const char *fmt, va_list ap, + const char *suffix) +{ + Error *err; + int saved_errno = errno; + + if (errp == NULL) { + return; + } + assert(*errp == NULL); + + err = g_malloc0(sizeof(*err)); + err->msg = g_strdup_vprintf(fmt, ap); + if (suffix) { + char *msg = err->msg; + err->msg = g_strdup_printf("%s: %s", msg, suffix); + g_free(msg); + } + err->err_class = err_class; + err->src = src; + err->line = line; + err->func = func; + + error_handle_fatal(errp, err); + *errp = err; + + errno = saved_errno; +} + +void error_set_internal(Error **errp, + const char *src, int line, const char *func, + ErrorClass err_class, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + error_setv(errp, src, line, func, err_class, fmt, ap, NULL); + va_end(ap); +} + +void error_setg_internal(Error **errp, + const char *src, int line, const char *func, + const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + error_setv(errp, src, line, func, ERROR_CLASS_GENERIC_ERROR, fmt, ap, NULL); + va_end(ap); +} + +void error_setg_errno_internal(Error **errp, + const char *src, int line, const char *func, + int os_errno, const char *fmt, ...) +{ + va_list ap; + int saved_errno = errno; + + va_start(ap, fmt); + error_setv(errp, src, line, func, ERROR_CLASS_GENERIC_ERROR, fmt, ap, + os_errno != 0 ? strerror(os_errno) : NULL); + va_end(ap); + + errno = saved_errno; +} + +void error_setg_file_open_internal(Error **errp, + const char *src, int line, const char *func, + int os_errno, const char *filename) +{ + error_setg_errno_internal(errp, src, line, func, os_errno, + "Could not open '%s'", filename); +} + +void error_vprepend(Error *const *errp, const char *fmt, va_list ap) +{ + GString *newmsg; + + if (!errp) { + return; + } + + newmsg = g_string_new(NULL); + g_string_vprintf(newmsg, fmt, ap); + g_string_append(newmsg, (*errp)->msg); + g_free((*errp)->msg); + (*errp)->msg = g_string_free(newmsg, 0); +} + +void error_prepend(Error *const *errp, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + error_vprepend(errp, fmt, ap); + va_end(ap); +} + +void error_append_hint(Error *const *errp, const char *fmt, ...) 
+{ + va_list ap; + int saved_errno = errno; + Error *err; + + if (!errp) { + return; + } + err = *errp; + assert(err && errp != &error_abort && errp != &error_fatal); + + if (!err->hint) { + err->hint = g_string_new(NULL); + } + va_start(ap, fmt); + g_string_append_vprintf(err->hint, fmt, ap); + va_end(ap); + + errno = saved_errno; +} + +#ifdef _WIN32 + +void error_setg_win32_internal(Error **errp, + const char *src, int line, const char *func, + int win32_err, const char *fmt, ...) +{ + va_list ap; + char *suffix = NULL; + + if (errp == NULL) { + return; + } + + if (win32_err != 0) { + suffix = g_win32_error_message(win32_err); + } + + va_start(ap, fmt); + error_setv(errp, src, line, func, ERROR_CLASS_GENERIC_ERROR, + fmt, ap, suffix); + va_end(ap); + + g_free(suffix); +} + +#endif + +Error *error_copy(const Error *err) +{ + Error *err_new; + + err_new = g_malloc0(sizeof(*err)); + err_new->msg = g_strdup(err->msg); + err_new->err_class = err->err_class; + err_new->src = err->src; + err_new->line = err->line; + err_new->func = err->func; + if (err->hint) { + err_new->hint = g_string_new(err->hint->str); + } + + return err_new; +} + +ErrorClass error_get_class(const Error *err) +{ + return err->err_class; +} + +const char *error_get_pretty(const Error *err) +{ + return err->msg; +} + +void error_report_err(Error *err) +{ + error_report("%s", error_get_pretty(err)); + if (err->hint) { + error_printf("%s", err->hint->str); + } + error_free(err); +} + +void warn_report_err(Error *err) +{ + warn_report("%s", error_get_pretty(err)); + if (err->hint) { + error_printf("%s", err->hint->str); + } + error_free(err); +} + +void error_reportf_err(Error *err, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + error_vprepend(&err, fmt, ap); + va_end(ap); + error_report_err(err); +} + + +void warn_reportf_err(Error *err, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + error_vprepend(&err, fmt, ap); + va_end(ap); + warn_report_err(err); +} + +void error_free(Error *err) +{ + if (err) { + g_free(err->msg); + if (err->hint) { + g_string_free(err->hint, true); + } + g_free(err); + } +} + +void error_free_or_abort(Error **errp) +{ + assert(errp && *errp); + error_free(*errp); + *errp = NULL; +} + +void error_propagate(Error **dst_errp, Error *local_err) +{ + if (!local_err) { + return; + } + error_handle_fatal(dst_errp, local_err); + if (dst_errp && !*dst_errp) { + *dst_errp = local_err; + } else { + error_free(local_err); + } +} + +void error_propagate_prepend(Error **dst_errp, Error *err, + const char *fmt, ...) +{ + va_list ap; + + if (dst_errp && !*dst_errp) { + va_start(ap, fmt); + error_vprepend(&err, fmt, ap); + va_end(ap); + } /* else error is being ignored, don't bother with prepending */ + error_propagate(dst_errp, err); +} diff --git a/util/event_notifier-posix.c b/util/event_notifier-posix.c new file mode 100644 index 000000000..8307013c5 --- /dev/null +++ b/util/event_notifier-posix.c @@ -0,0 +1,140 @@ +/* + * event notifier support + * + * Copyright Red Hat, Inc. 2010 + * + * Authors: + * Michael S. Tsirkin <mst@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qemu/cutils.h" +#include "qemu/event_notifier.h" +#include "qemu/main-loop.h" + +#ifdef CONFIG_EVENTFD +#include <sys/eventfd.h> +#endif + +#ifdef CONFIG_EVENTFD +/* + * Initialize @e with existing file descriptor @fd. 
+ * @fd must be a genuine eventfd object, emulation with pipe won't do. + */ +void event_notifier_init_fd(EventNotifier *e, int fd) +{ + e->rfd = fd; + e->wfd = fd; + e->initialized = true; +} +#endif + +int event_notifier_init(EventNotifier *e, int active) +{ + int fds[2]; + int ret; + +#ifdef CONFIG_EVENTFD + ret = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC); +#else + ret = -1; + errno = ENOSYS; +#endif + if (ret >= 0) { + e->rfd = e->wfd = ret; + } else { + if (errno != ENOSYS) { + return -errno; + } + if (qemu_pipe(fds) < 0) { + return -errno; + } + ret = fcntl_setfl(fds[0], O_NONBLOCK); + if (ret < 0) { + ret = -errno; + goto fail; + } + ret = fcntl_setfl(fds[1], O_NONBLOCK); + if (ret < 0) { + ret = -errno; + goto fail; + } + e->rfd = fds[0]; + e->wfd = fds[1]; + } + e->initialized = true; + if (active) { + event_notifier_set(e); + } + return 0; + +fail: + close(fds[0]); + close(fds[1]); + return ret; +} + +void event_notifier_cleanup(EventNotifier *e) +{ + if (!e->initialized) { + return; + } + + if (e->rfd != e->wfd) { + close(e->rfd); + } + + e->rfd = -1; + close(e->wfd); + e->wfd = -1; + e->initialized = false; +} + +int event_notifier_get_fd(const EventNotifier *e) +{ + return e->rfd; +} + +int event_notifier_set(EventNotifier *e) +{ + static const uint64_t value = 1; + ssize_t ret; + + if (!e->initialized) { + return -1; + } + + do { + ret = write(e->wfd, &value, sizeof(value)); + } while (ret < 0 && errno == EINTR); + + /* EAGAIN is fine, a read must be pending. */ + if (ret < 0 && errno != EAGAIN) { + return -errno; + } + return 0; +} + +int event_notifier_test_and_clear(EventNotifier *e) +{ + int value; + ssize_t len; + char buffer[512]; + + if (!e->initialized) { + return 0; + } + + /* Drain the notify pipe. For eventfd, only 8 bytes will be read. */ + value = 0; + do { + len = read(e->rfd, buffer, sizeof(buffer)); + value |= (len > 0); + } while ((len == -1 && errno == EINTR) || len == sizeof(buffer)); + + return value; +} diff --git a/util/event_notifier-win32.c b/util/event_notifier-win32.c new file mode 100644 index 000000000..62c53b0a9 --- /dev/null +++ b/util/event_notifier-win32.c @@ -0,0 +1,50 @@ +/* + * event notifier support + * + * Copyright Red Hat, Inc. 2010 + * + * Authors: + * Michael S. Tsirkin <mst@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qemu/event_notifier.h" +#include "qemu/main-loop.h" + +int event_notifier_init(EventNotifier *e, int active) +{ + e->event = CreateEvent(NULL, TRUE, FALSE, NULL); + assert(e->event); + return 0; +} + +void event_notifier_cleanup(EventNotifier *e) +{ + CloseHandle(e->event); + e->event = NULL; +} + +HANDLE event_notifier_get_handle(EventNotifier *e) +{ + return e->event; +} + +int event_notifier_set(EventNotifier *e) +{ + SetEvent(e->event); + return 0; +} + +int event_notifier_test_and_clear(EventNotifier *e) +{ + int ret = WaitForSingleObject(e->event, 0); + if (ret == WAIT_OBJECT_0) { + ResetEvent(e->event); + return true; + } + return false; +} diff --git a/util/fdmon-epoll.c b/util/fdmon-epoll.c new file mode 100644 index 000000000..e11a8a022 --- /dev/null +++ b/util/fdmon-epoll.c @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * epoll(7) file descriptor monitoring + */ + +#include "qemu/osdep.h" +#include <sys/epoll.h> +#include "qemu/rcu_queue.h" +#include "aio-posix.h" + +/* The fd number threshold to switch to epoll */ +#define EPOLL_ENABLE_THRESHOLD 64 + +void fdmon_epoll_disable(AioContext *ctx) +{ + if (ctx->epollfd >= 0) { + close(ctx->epollfd); + ctx->epollfd = -1; + } + + /* Switch back */ + ctx->fdmon_ops = &fdmon_poll_ops; +} + +static inline int epoll_events_from_pfd(int pfd_events) +{ + return (pfd_events & G_IO_IN ? EPOLLIN : 0) | + (pfd_events & G_IO_OUT ? EPOLLOUT : 0) | + (pfd_events & G_IO_HUP ? EPOLLHUP : 0) | + (pfd_events & G_IO_ERR ? EPOLLERR : 0); +} + +static void fdmon_epoll_update(AioContext *ctx, + AioHandler *old_node, + AioHandler *new_node) +{ + struct epoll_event event = { + .data.ptr = new_node, + .events = new_node ? epoll_events_from_pfd(new_node->pfd.events) : 0, + }; + int r; + + if (!new_node) { + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, old_node->pfd.fd, &event); + } else if (!old_node) { + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, new_node->pfd.fd, &event); + } else { + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, new_node->pfd.fd, &event); + } + + if (r) { + fdmon_epoll_disable(ctx); + } +} + +static int fdmon_epoll_wait(AioContext *ctx, AioHandlerList *ready_list, + int64_t timeout) +{ + GPollFD pfd = { + .fd = ctx->epollfd, + .events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR, + }; + AioHandler *node; + int i, ret = 0; + struct epoll_event events[128]; + + /* Fall back while external clients are disabled */ + if (qatomic_read(&ctx->external_disable_cnt)) { + return fdmon_poll_ops.wait(ctx, ready_list, timeout); + } + + if (timeout > 0) { + ret = qemu_poll_ns(&pfd, 1, timeout); + if (ret > 0) { + timeout = 0; + } + } + if (timeout <= 0 || ret > 0) { + ret = epoll_wait(ctx->epollfd, events, + ARRAY_SIZE(events), + timeout); + if (ret <= 0) { + goto out; + } + for (i = 0; i < ret; i++) { + int ev = events[i].events; + int revents = (ev & EPOLLIN ? G_IO_IN : 0) | + (ev & EPOLLOUT ? G_IO_OUT : 0) | + (ev & EPOLLHUP ? G_IO_HUP : 0) | + (ev & EPOLLERR ? 
G_IO_ERR : 0); + + node = events[i].data.ptr; + aio_add_ready_handler(ready_list, node, revents); + } + } +out: + return ret; +} + +static const FDMonOps fdmon_epoll_ops = { + .update = fdmon_epoll_update, + .wait = fdmon_epoll_wait, + .need_wait = aio_poll_disabled, +}; + +static bool fdmon_epoll_try_enable(AioContext *ctx) +{ + AioHandler *node; + struct epoll_event event; + + QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { + int r; + if (QLIST_IS_INSERTED(node, node_deleted) || !node->pfd.events) { + continue; + } + event.events = epoll_events_from_pfd(node->pfd.events); + event.data.ptr = node; + r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event); + if (r) { + return false; + } + } + + ctx->fdmon_ops = &fdmon_epoll_ops; + return true; +} + +bool fdmon_epoll_try_upgrade(AioContext *ctx, unsigned npfd) +{ + if (ctx->epollfd < 0) { + return false; + } + + /* Do not upgrade while external clients are disabled */ + if (qatomic_read(&ctx->external_disable_cnt)) { + return false; + } + + if (npfd >= EPOLL_ENABLE_THRESHOLD) { + if (fdmon_epoll_try_enable(ctx)) { + return true; + } else { + fdmon_epoll_disable(ctx); + } + } + return false; +} + +void fdmon_epoll_setup(AioContext *ctx) +{ + ctx->epollfd = epoll_create1(EPOLL_CLOEXEC); + if (ctx->epollfd == -1) { + fprintf(stderr, "Failed to create epoll instance: %s", strerror(errno)); + } +} diff --git a/util/fdmon-io_uring.c b/util/fdmon-io_uring.c new file mode 100644 index 000000000..1461dfa40 --- /dev/null +++ b/util/fdmon-io_uring.c @@ -0,0 +1,361 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Linux io_uring file descriptor monitoring + * + * The Linux io_uring API supports file descriptor monitoring with a few + * advantages over existing APIs like poll(2) and epoll(7): + * + * 1. Userspace polling of events is possible because the completion queue (cq + * ring) is shared between the kernel and userspace. This allows + * applications that rely on userspace polling to also monitor file + * descriptors in the same userspace polling loop. + * + * 2. Submission and completion is batched and done together in a single system + * call. This minimizes the number of system calls. + * + * 3. File descriptor monitoring is O(1) like epoll(7) so it scales better than + * poll(2). + * + * 4. Nanosecond timeouts are supported so it requires fewer syscalls than + * epoll(7). + * + * This code only monitors file descriptors and does not do asynchronous disk + * I/O. Implementing disk I/O efficiently has other requirements and should + * use a separate io_uring so it does not make sense to unify the code. + * + * File descriptor monitoring is implemented using the following operations: + * + * 1. IORING_OP_POLL_ADD - adds a file descriptor to be monitored. + * 2. IORING_OP_POLL_REMOVE - removes a file descriptor being monitored. When + * the poll mask changes for a file descriptor it is first removed and then + * re-added with the new poll mask, so this operation is also used as part + * of modifying an existing monitored file descriptor. + * 3. IORING_OP_TIMEOUT - added every time a blocking syscall is made to wait + * for events. This operation self-cancels if another event completes + * before the timeout. + * + * io_uring calls the submission queue the "sq ring" and the completion queue + * the "cq ring". Ring entries are called "sqe" and "cqe", respectively. + * + * The code is structured so that sq/cq rings are only modified within + * fdmon_io_uring_wait(). 
Changes to AioHandlers are made by enqueuing them on + * ctx->submit_list so that fdmon_io_uring_wait() can submit IORING_OP_POLL_ADD + * and/or IORING_OP_POLL_REMOVE sqes for them. + */ + +#include "qemu/osdep.h" +#include <poll.h> +#include "qemu/rcu_queue.h" +#include "aio-posix.h" + +enum { + FDMON_IO_URING_ENTRIES = 128, /* sq/cq ring size */ + + /* AioHandler::flags */ + FDMON_IO_URING_PENDING = (1 << 0), + FDMON_IO_URING_ADD = (1 << 1), + FDMON_IO_URING_REMOVE = (1 << 2), +}; + +static inline int poll_events_from_pfd(int pfd_events) +{ + return (pfd_events & G_IO_IN ? POLLIN : 0) | + (pfd_events & G_IO_OUT ? POLLOUT : 0) | + (pfd_events & G_IO_HUP ? POLLHUP : 0) | + (pfd_events & G_IO_ERR ? POLLERR : 0); +} + +static inline int pfd_events_from_poll(int poll_events) +{ + return (poll_events & POLLIN ? G_IO_IN : 0) | + (poll_events & POLLOUT ? G_IO_OUT : 0) | + (poll_events & POLLHUP ? G_IO_HUP : 0) | + (poll_events & POLLERR ? G_IO_ERR : 0); +} + +/* + * Returns an sqe for submitting a request. Only be called within + * fdmon_io_uring_wait(). + */ +static struct io_uring_sqe *get_sqe(AioContext *ctx) +{ + struct io_uring *ring = &ctx->fdmon_io_uring; + struct io_uring_sqe *sqe = io_uring_get_sqe(ring); + int ret; + + if (likely(sqe)) { + return sqe; + } + + /* No free sqes left, submit pending sqes first */ + do { + ret = io_uring_submit(ring); + } while (ret == -EINTR); + + assert(ret > 1); + sqe = io_uring_get_sqe(ring); + assert(sqe); + return sqe; +} + +/* Atomically enqueue an AioHandler for sq ring submission */ +static void enqueue(AioHandlerSList *head, AioHandler *node, unsigned flags) +{ + unsigned old_flags; + + old_flags = qatomic_fetch_or(&node->flags, FDMON_IO_URING_PENDING | flags); + if (!(old_flags & FDMON_IO_URING_PENDING)) { + QSLIST_INSERT_HEAD_ATOMIC(head, node, node_submitted); + } +} + +/* Dequeue an AioHandler for sq ring submission. Called by fill_sq_ring(). */ +static AioHandler *dequeue(AioHandlerSList *head, unsigned *flags) +{ + AioHandler *node = QSLIST_FIRST(head); + + if (!node) { + return NULL; + } + + /* Doesn't need to be atomic since fill_sq_ring() moves the list */ + QSLIST_REMOVE_HEAD(head, node_submitted); + + /* + * Don't clear FDMON_IO_URING_REMOVE. It's sticky so it can serve two + * purposes: telling fill_sq_ring() to submit IORING_OP_POLL_REMOVE and + * telling process_cqe() to delete the AioHandler when its + * IORING_OP_POLL_ADD completes. + */ + *flags = qatomic_fetch_and(&node->flags, ~(FDMON_IO_URING_PENDING | + FDMON_IO_URING_ADD)); + return node; +} + +static void fdmon_io_uring_update(AioContext *ctx, + AioHandler *old_node, + AioHandler *new_node) +{ + if (new_node) { + enqueue(&ctx->submit_list, new_node, FDMON_IO_URING_ADD); + } + + if (old_node) { + /* + * Deletion is tricky because IORING_OP_POLL_ADD and + * IORING_OP_POLL_REMOVE are async. We need to wait for the original + * IORING_OP_POLL_ADD to complete before this handler can be freed + * safely. + * + * It's possible that the file descriptor becomes ready and the + * IORING_OP_POLL_ADD cqe is enqueued before IORING_OP_POLL_REMOVE is + * submitted, too. + * + * Mark this handler deleted right now but don't place it on + * ctx->deleted_aio_handlers yet. Instead, manually fudge the list + * entry to make QLIST_IS_INSERTED() think this handler has been + * inserted and other code recognizes this AioHandler as deleted. + * + * Once the original IORING_OP_POLL_ADD completes we enqueue the + * handler on the real ctx->deleted_aio_handlers list to be freed. 
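+ *
+ * Roughly, using the names defined in this file:
+ *
+ *   fdmon_io_uring_update()  sets FDMON_IO_URING_REMOVE, fudges node_deleted
+ *   fill_sq_ring()           submits IORING_OP_POLL_REMOVE
+ *   process_cqe()            the original IORING_OP_POLL_ADD completes with
+ *                            REMOVE still set, so the node finally moves to
+ *                            ctx->deleted_aio_handlers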
+ */ + assert(!QLIST_IS_INSERTED(old_node, node_deleted)); + old_node->node_deleted.le_prev = &old_node->node_deleted.le_next; + + enqueue(&ctx->submit_list, old_node, FDMON_IO_URING_REMOVE); + } +} + +static void add_poll_add_sqe(AioContext *ctx, AioHandler *node) +{ + struct io_uring_sqe *sqe = get_sqe(ctx); + int events = poll_events_from_pfd(node->pfd.events); + + io_uring_prep_poll_add(sqe, node->pfd.fd, events); + io_uring_sqe_set_data(sqe, node); +} + +static void add_poll_remove_sqe(AioContext *ctx, AioHandler *node) +{ + struct io_uring_sqe *sqe = get_sqe(ctx); + + io_uring_prep_poll_remove(sqe, node); +} + +/* Add a timeout that self-cancels when another cqe becomes ready */ +static void add_timeout_sqe(AioContext *ctx, int64_t ns) +{ + struct io_uring_sqe *sqe; + struct __kernel_timespec ts = { + .tv_sec = ns / NANOSECONDS_PER_SECOND, + .tv_nsec = ns % NANOSECONDS_PER_SECOND, + }; + + sqe = get_sqe(ctx); + io_uring_prep_timeout(sqe, &ts, 1, 0); +} + +/* Add sqes from ctx->submit_list for submission */ +static void fill_sq_ring(AioContext *ctx) +{ + AioHandlerSList submit_list; + AioHandler *node; + unsigned flags; + + QSLIST_MOVE_ATOMIC(&submit_list, &ctx->submit_list); + + while ((node = dequeue(&submit_list, &flags))) { + /* Order matters, just in case both flags were set */ + if (flags & FDMON_IO_URING_ADD) { + add_poll_add_sqe(ctx, node); + } + if (flags & FDMON_IO_URING_REMOVE) { + add_poll_remove_sqe(ctx, node); + } + } +} + +/* Returns true if a handler became ready */ +static bool process_cqe(AioContext *ctx, + AioHandlerList *ready_list, + struct io_uring_cqe *cqe) +{ + AioHandler *node = io_uring_cqe_get_data(cqe); + unsigned flags; + + /* poll_timeout and poll_remove have a zero user_data field */ + if (!node) { + return false; + } + + /* + * Deletion can only happen when IORING_OP_POLL_ADD completes. If we race + * with enqueue() here then we can safely clear the FDMON_IO_URING_REMOVE + * bit before IORING_OP_POLL_REMOVE is submitted. 
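+ *
+ * If FDMON_IO_URING_REMOVE is still set below, the node is placed on
+ * ctx->deleted_aio_handlers and no re-arming IORING_OP_POLL_ADD is added.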
+ */ + flags = qatomic_fetch_and(&node->flags, ~FDMON_IO_URING_REMOVE); + if (flags & FDMON_IO_URING_REMOVE) { + QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted); + return false; + } + + aio_add_ready_handler(ready_list, node, pfd_events_from_poll(cqe->res)); + + /* IORING_OP_POLL_ADD is one-shot so we must re-arm it */ + add_poll_add_sqe(ctx, node); + return true; +} + +static int process_cq_ring(AioContext *ctx, AioHandlerList *ready_list) +{ + struct io_uring *ring = &ctx->fdmon_io_uring; + struct io_uring_cqe *cqe; + unsigned num_cqes = 0; + unsigned num_ready = 0; + unsigned head; + + io_uring_for_each_cqe(ring, head, cqe) { + if (process_cqe(ctx, ready_list, cqe)) { + num_ready++; + } + + num_cqes++; + } + + io_uring_cq_advance(ring, num_cqes); + return num_ready; +} + +static int fdmon_io_uring_wait(AioContext *ctx, AioHandlerList *ready_list, + int64_t timeout) +{ + unsigned wait_nr = 1; /* block until at least one cqe is ready */ + int ret; + + /* Fall back while external clients are disabled */ + if (qatomic_read(&ctx->external_disable_cnt)) { + return fdmon_poll_ops.wait(ctx, ready_list, timeout); + } + + if (timeout == 0) { + wait_nr = 0; /* non-blocking */ + } else if (timeout > 0) { + add_timeout_sqe(ctx, timeout); + } + + fill_sq_ring(ctx); + + do { + ret = io_uring_submit_and_wait(&ctx->fdmon_io_uring, wait_nr); + } while (ret == -EINTR); + + assert(ret >= 0); + + return process_cq_ring(ctx, ready_list); +} + +static bool fdmon_io_uring_need_wait(AioContext *ctx) +{ + /* Have io_uring events completed? */ + if (io_uring_cq_ready(&ctx->fdmon_io_uring)) { + return true; + } + + /* Are there pending sqes to submit? */ + if (io_uring_sq_ready(&ctx->fdmon_io_uring)) { + return true; + } + + /* Do we need to process AioHandlers for io_uring changes? */ + if (!QSLIST_EMPTY_RCU(&ctx->submit_list)) { + return true; + } + + /* Are we falling back to fdmon-poll? */ + return qatomic_read(&ctx->external_disable_cnt); +} + +static const FDMonOps fdmon_io_uring_ops = { + .update = fdmon_io_uring_update, + .wait = fdmon_io_uring_wait, + .need_wait = fdmon_io_uring_need_wait, +}; + +bool fdmon_io_uring_setup(AioContext *ctx) +{ + int ret; + + ret = io_uring_queue_init(FDMON_IO_URING_ENTRIES, &ctx->fdmon_io_uring, 0); + if (ret != 0) { + return false; + } + + QSLIST_INIT(&ctx->submit_list); + ctx->fdmon_ops = &fdmon_io_uring_ops; + return true; +} + +void fdmon_io_uring_destroy(AioContext *ctx) +{ + if (ctx->fdmon_ops == &fdmon_io_uring_ops) { + AioHandler *node; + + io_uring_queue_exit(&ctx->fdmon_io_uring); + + /* Move handlers due to be removed onto the deleted list */ + while ((node = QSLIST_FIRST_RCU(&ctx->submit_list))) { + unsigned flags = qatomic_fetch_and(&node->flags, + ~(FDMON_IO_URING_PENDING | + FDMON_IO_URING_ADD | + FDMON_IO_URING_REMOVE)); + + if (flags & FDMON_IO_URING_REMOVE) { + QLIST_INSERT_HEAD_RCU(&ctx->deleted_aio_handlers, node, node_deleted); + } + + QSLIST_REMOVE_HEAD_RCU(&ctx->submit_list, node_submitted); + } + + ctx->fdmon_ops = &fdmon_poll_ops; + } +} diff --git a/util/fdmon-poll.c b/util/fdmon-poll.c new file mode 100644 index 000000000..5fe3b4786 --- /dev/null +++ b/util/fdmon-poll.c @@ -0,0 +1,108 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * poll(2) file descriptor monitoring + * + * Uses ppoll(2) when available, g_poll() otherwise. 
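+ *
+ * This also acts as the fallback for the epoll and io_uring monitors;
+ * fdmon_poll_wait() itself upgrades to epoll(7) via
+ * fdmon_epoll_try_upgrade() once the fd count reaches
+ * EPOLL_ENABLE_THRESHOLD.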
+ */ + +#include "qemu/osdep.h" +#include "aio-posix.h" +#include "qemu/rcu_queue.h" + +/* + * These thread-local variables are used only in fdmon_poll_wait() around the + * call to the poll() system call. In particular they are not used while + * aio_poll is performing callbacks, which makes it much easier to think about + * reentrancy! + * + * Stack-allocated arrays would be perfect but they have size limitations; + * heap allocation is expensive enough that we want to reuse arrays across + * calls to aio_poll(). And because poll() has to be called without holding + * any lock, the arrays cannot be stored in AioContext. Thread-local data + * has none of the disadvantages of these three options. + */ +static __thread GPollFD *pollfds; +static __thread AioHandler **nodes; +static __thread unsigned npfd, nalloc; +static __thread Notifier pollfds_cleanup_notifier; + +static void pollfds_cleanup(Notifier *n, void *unused) +{ + g_assert(npfd == 0); + g_free(pollfds); + g_free(nodes); + nalloc = 0; +} + +static void add_pollfd(AioHandler *node) +{ + if (npfd == nalloc) { + if (nalloc == 0) { + pollfds_cleanup_notifier.notify = pollfds_cleanup; + qemu_thread_atexit_add(&pollfds_cleanup_notifier); + nalloc = 8; + } else { + g_assert(nalloc <= INT_MAX); + nalloc *= 2; + } + pollfds = g_renew(GPollFD, pollfds, nalloc); + nodes = g_renew(AioHandler *, nodes, nalloc); + } + nodes[npfd] = node; + pollfds[npfd] = (GPollFD) { + .fd = node->pfd.fd, + .events = node->pfd.events, + }; + npfd++; +} + +static int fdmon_poll_wait(AioContext *ctx, AioHandlerList *ready_list, + int64_t timeout) +{ + AioHandler *node; + int ret; + + assert(npfd == 0); + + QLIST_FOREACH_RCU(node, &ctx->aio_handlers, node) { + if (!QLIST_IS_INSERTED(node, node_deleted) && node->pfd.events + && aio_node_check(ctx, node->is_external)) { + add_pollfd(node); + } + } + + /* epoll(7) is faster above a certain number of fds */ + if (fdmon_epoll_try_upgrade(ctx, npfd)) { + npfd = 0; /* we won't need pollfds[], reset npfd */ + return ctx->fdmon_ops->wait(ctx, ready_list, timeout); + } + + ret = qemu_poll_ns(pollfds, npfd, timeout); + if (ret > 0) { + int i; + + for (i = 0; i < npfd; i++) { + int revents = pollfds[i].revents; + + if (revents) { + aio_add_ready_handler(ready_list, nodes[i], revents); + } + } + } + + npfd = 0; + return ret; +} + +static void fdmon_poll_update(AioContext *ctx, + AioHandler *old_node, + AioHandler *new_node) +{ + /* Do nothing, AioHandler already contains the state we'll need */ +} + +const FDMonOps fdmon_poll_ops = { + .update = fdmon_poll_update, + .wait = fdmon_poll_wait, + .need_wait = aio_poll_disabled, +}; diff --git a/util/fifo8.c b/util/fifo8.c new file mode 100644 index 000000000..d4d1c135e --- /dev/null +++ b/util/fifo8.c @@ -0,0 +1,118 @@ +/* + * Generic FIFO component, implemented as a circular buffer. + * + * Copyright (c) 2012 Peter A. G. Crosthwaite + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see <http://www.gnu.org/licenses/>. 
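+ *
+ * Illustrative usage sketch (not part of the original header):
+ *
+ *   Fifo8 f;
+ *   fifo8_create(&f, 16);
+ *   fifo8_push(&f, 0xab);
+ *   uint8_t v = fifo8_pop(&f);
+ *   fifo8_destroy(&f);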
+ */ + +#include "qemu/osdep.h" +#include "migration/vmstate.h" +#include "qemu/fifo8.h" + +void fifo8_create(Fifo8 *fifo, uint32_t capacity) +{ + fifo->data = g_new(uint8_t, capacity); + fifo->capacity = capacity; + fifo->head = 0; + fifo->num = 0; +} + +void fifo8_destroy(Fifo8 *fifo) +{ + g_free(fifo->data); +} + +void fifo8_push(Fifo8 *fifo, uint8_t data) +{ + assert(fifo->num < fifo->capacity); + fifo->data[(fifo->head + fifo->num) % fifo->capacity] = data; + fifo->num++; +} + +void fifo8_push_all(Fifo8 *fifo, const uint8_t *data, uint32_t num) +{ + uint32_t start, avail; + + assert(fifo->num + num <= fifo->capacity); + + start = (fifo->head + fifo->num) % fifo->capacity; + + if (start + num <= fifo->capacity) { + memcpy(&fifo->data[start], data, num); + } else { + avail = fifo->capacity - start; + memcpy(&fifo->data[start], data, avail); + memcpy(&fifo->data[0], &data[avail], num - avail); + } + + fifo->num += num; +} + +uint8_t fifo8_pop(Fifo8 *fifo) +{ + uint8_t ret; + + assert(fifo->num > 0); + ret = fifo->data[fifo->head++]; + fifo->head %= fifo->capacity; + fifo->num--; + return ret; +} + +const uint8_t *fifo8_pop_buf(Fifo8 *fifo, uint32_t max, uint32_t *num) +{ + uint8_t *ret; + + assert(max > 0 && max <= fifo->num); + *num = MIN(fifo->capacity - fifo->head, max); + ret = &fifo->data[fifo->head]; + fifo->head += *num; + fifo->head %= fifo->capacity; + fifo->num -= *num; + return ret; +} + +void fifo8_reset(Fifo8 *fifo) +{ + fifo->num = 0; + fifo->head = 0; +} + +bool fifo8_is_empty(Fifo8 *fifo) +{ + return (fifo->num == 0); +} + +bool fifo8_is_full(Fifo8 *fifo) +{ + return (fifo->num == fifo->capacity); +} + +uint32_t fifo8_num_free(Fifo8 *fifo) +{ + return fifo->capacity - fifo->num; +} + +uint32_t fifo8_num_used(Fifo8 *fifo) +{ + return fifo->num; +} + +const VMStateDescription vmstate_fifo8 = { + .name = "Fifo8", + .version_id = 1, + .minimum_version_id = 1, + .fields = (VMStateField[]) { + VMSTATE_VBUFFER_UINT32(data, Fifo8, 1, NULL, capacity), + VMSTATE_UINT32(head, Fifo8), + VMSTATE_UINT32(num, Fifo8), + VMSTATE_END_OF_LIST() + } +}; diff --git a/util/filemonitor-inotify.c b/util/filemonitor-inotify.c new file mode 100644 index 000000000..2c45f7f17 --- /dev/null +++ b/util/filemonitor-inotify.c @@ -0,0 +1,339 @@ +/* + * QEMU file monitor Linux inotify impl + * + * Copyright (c) 2018 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ * + */ + +#include "qemu/osdep.h" +#include "qemu/filemonitor.h" +#include "qemu/main-loop.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "trace.h" + +#include <sys/inotify.h> + +struct QFileMonitor { + int fd; + QemuMutex lock; /* protects dirs & idmap */ + GHashTable *dirs; /* dirname => QFileMonitorDir */ + GHashTable *idmap; /* inotify ID => dirname */ +}; + + +typedef struct { + int64_t id; /* watch ID */ + char *filename; /* optional filter */ + QFileMonitorHandler cb; + void *opaque; +} QFileMonitorWatch; + + +typedef struct { + char *path; + int inotify_id; /* inotify ID */ + int next_file_id; /* file ID counter */ + GArray *watches; /* QFileMonitorWatch elements */ +} QFileMonitorDir; + + +static void qemu_file_monitor_watch(void *arg) +{ + QFileMonitor *mon = arg; + char buf[4096] + __attribute__ ((aligned(__alignof__(struct inotify_event)))); + int used = 0; + int len; + + qemu_mutex_lock(&mon->lock); + + if (mon->fd == -1) { + qemu_mutex_unlock(&mon->lock); + return; + } + + len = read(mon->fd, buf, sizeof(buf)); + + if (len < 0) { + if (errno != EAGAIN) { + error_report("Failure monitoring inotify FD '%s'," + "disabling events", strerror(errno)); + goto cleanup; + } + + /* no more events right now */ + goto cleanup; + } + + /* Loop over all events in the buffer */ + while (used < len) { + struct inotify_event *ev = + (struct inotify_event *)(buf + used); + const char *name = ev->len ? ev->name : ""; + QFileMonitorDir *dir = g_hash_table_lookup(mon->idmap, + GINT_TO_POINTER(ev->wd)); + uint32_t iev = ev->mask & + (IN_CREATE | IN_MODIFY | IN_DELETE | IN_IGNORED | + IN_MOVED_TO | IN_MOVED_FROM | IN_ATTRIB); + int qev; + gsize i; + + used += sizeof(struct inotify_event) + ev->len; + + if (!dir) { + continue; + } + + /* + * During a rename operation, the old name gets + * IN_MOVED_FROM and the new name gets IN_MOVED_TO. 
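+ * For example, renaming "a" to "b" inside a watched directory emits
+ * IN_MOVED_FROM for "a" followed by IN_MOVED_TO for "b".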
+ * To simplify life for callers, we turn these into + * DELETED and CREATED events + */ + switch (iev) { + case IN_CREATE: + case IN_MOVED_TO: + qev = QFILE_MONITOR_EVENT_CREATED; + break; + case IN_MODIFY: + qev = QFILE_MONITOR_EVENT_MODIFIED; + break; + case IN_DELETE: + case IN_MOVED_FROM: + qev = QFILE_MONITOR_EVENT_DELETED; + break; + case IN_ATTRIB: + qev = QFILE_MONITOR_EVENT_ATTRIBUTES; + break; + case IN_IGNORED: + qev = QFILE_MONITOR_EVENT_IGNORED; + break; + default: + g_assert_not_reached(); + } + + trace_qemu_file_monitor_event(mon, dir->path, name, ev->mask, + dir->inotify_id); + for (i = 0; i < dir->watches->len; i++) { + QFileMonitorWatch *watch = &g_array_index(dir->watches, + QFileMonitorWatch, + i); + + if (watch->filename == NULL || + (name && g_str_equal(watch->filename, name))) { + trace_qemu_file_monitor_dispatch(mon, dir->path, name, + qev, watch->cb, + watch->opaque, watch->id); + watch->cb(watch->id, qev, name, watch->opaque); + } + } + } + + cleanup: + qemu_mutex_unlock(&mon->lock); +} + + +static void +qemu_file_monitor_dir_free(void *data) +{ + QFileMonitorDir *dir = data; + gsize i; + + for (i = 0; i < dir->watches->len; i++) { + QFileMonitorWatch *watch = &g_array_index(dir->watches, + QFileMonitorWatch, i); + g_free(watch->filename); + } + g_array_unref(dir->watches); + g_free(dir->path); + g_free(dir); +} + + +QFileMonitor * +qemu_file_monitor_new(Error **errp) +{ + int fd; + QFileMonitor *mon; + + fd = inotify_init1(IN_NONBLOCK); + if (fd < 0) { + error_setg_errno(errp, errno, + "Unable to initialize inotify"); + return NULL; + } + + mon = g_new0(QFileMonitor, 1); + qemu_mutex_init(&mon->lock); + mon->fd = fd; + + mon->dirs = g_hash_table_new_full(g_str_hash, g_str_equal, NULL, + qemu_file_monitor_dir_free); + mon->idmap = g_hash_table_new(g_direct_hash, g_direct_equal); + + trace_qemu_file_monitor_new(mon, mon->fd); + + return mon; +} + +static gboolean +qemu_file_monitor_free_idle(void *opaque) +{ + QFileMonitor *mon = opaque; + + if (!mon) { + return G_SOURCE_REMOVE; + } + + qemu_mutex_lock(&mon->lock); + + g_hash_table_unref(mon->idmap); + g_hash_table_unref(mon->dirs); + + qemu_mutex_unlock(&mon->lock); + + qemu_mutex_destroy(&mon->lock); + g_free(mon); + + return G_SOURCE_REMOVE; +} + +void +qemu_file_monitor_free(QFileMonitor *mon) +{ + if (!mon) { + return; + } + + qemu_mutex_lock(&mon->lock); + if (mon->fd != -1) { + qemu_set_fd_handler(mon->fd, NULL, NULL, NULL); + close(mon->fd); + mon->fd = -1; + } + qemu_mutex_unlock(&mon->lock); + + /* + * Can't free it yet, because another thread + * may be running event loop, so the inotify + * callback might be pending. 
Using an idle + * source ensures we'll only free after the + * pending callback is done + */ + g_idle_add((GSourceFunc)qemu_file_monitor_free_idle, mon); +} + +int64_t +qemu_file_monitor_add_watch(QFileMonitor *mon, + const char *dirpath, + const char *filename, + QFileMonitorHandler cb, + void *opaque, + Error **errp) +{ + QFileMonitorDir *dir; + QFileMonitorWatch watch; + int64_t ret = -1; + + qemu_mutex_lock(&mon->lock); + dir = g_hash_table_lookup(mon->dirs, dirpath); + if (!dir) { + int rv = inotify_add_watch(mon->fd, dirpath, + IN_CREATE | IN_DELETE | IN_MODIFY | + IN_MOVED_TO | IN_MOVED_FROM | IN_ATTRIB); + + if (rv < 0) { + error_setg_errno(errp, errno, "Unable to watch '%s'", dirpath); + goto cleanup; + } + + trace_qemu_file_monitor_enable_watch(mon, dirpath, rv); + + dir = g_new0(QFileMonitorDir, 1); + dir->path = g_strdup(dirpath); + dir->inotify_id = rv; + dir->watches = g_array_new(FALSE, TRUE, sizeof(QFileMonitorWatch)); + + g_hash_table_insert(mon->dirs, dir->path, dir); + g_hash_table_insert(mon->idmap, GINT_TO_POINTER(rv), dir); + + if (g_hash_table_size(mon->dirs) == 1) { + qemu_set_fd_handler(mon->fd, qemu_file_monitor_watch, NULL, mon); + } + } + + watch.id = (((int64_t)dir->inotify_id) << 32) | dir->next_file_id++; + watch.filename = g_strdup(filename); + watch.cb = cb; + watch.opaque = opaque; + + g_array_append_val(dir->watches, watch); + + trace_qemu_file_monitor_add_watch(mon, dirpath, + filename ? filename : "<none>", + cb, opaque, watch.id); + + ret = watch.id; + + cleanup: + qemu_mutex_unlock(&mon->lock); + return ret; +} + + +void qemu_file_monitor_remove_watch(QFileMonitor *mon, + const char *dirpath, + int64_t id) +{ + QFileMonitorDir *dir; + gsize i; + + qemu_mutex_lock(&mon->lock); + + trace_qemu_file_monitor_remove_watch(mon, dirpath, id); + + dir = g_hash_table_lookup(mon->dirs, dirpath); + if (!dir) { + goto cleanup; + } + + for (i = 0; i < dir->watches->len; i++) { + QFileMonitorWatch *watch = &g_array_index(dir->watches, + QFileMonitorWatch, i); + if (watch->id == id) { + g_free(watch->filename); + g_array_remove_index(dir->watches, i); + break; + } + } + + if (dir->watches->len == 0) { + inotify_rm_watch(mon->fd, dir->inotify_id); + trace_qemu_file_monitor_disable_watch(mon, dir->path, dir->inotify_id); + + g_hash_table_remove(mon->idmap, GINT_TO_POINTER(dir->inotify_id)); + g_hash_table_remove(mon->dirs, dir->path); + + if (g_hash_table_size(mon->dirs) == 0) { + qemu_set_fd_handler(mon->fd, NULL, NULL, NULL); + } + } + + cleanup: + qemu_mutex_unlock(&mon->lock); +} diff --git a/util/filemonitor-stub.c b/util/filemonitor-stub.c new file mode 100644 index 000000000..93fef6534 --- /dev/null +++ b/util/filemonitor-stub.c @@ -0,0 +1,59 @@ +/* + * QEMU file monitor stub impl + * + * Copyright (c) 2018 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. 
+ * + */ + +#include "qemu/osdep.h" +#include "qemu/filemonitor.h" +#include "qemu/error-report.h" +#include "qapi/error.h" + + +QFileMonitor * +qemu_file_monitor_new(Error **errp) +{ + error_setg(errp, "File monitoring not available on this platform"); + return NULL; +} + + +void +qemu_file_monitor_free(QFileMonitor *mon G_GNUC_UNUSED) +{ +} + + +int64_t +qemu_file_monitor_add_watch(QFileMonitor *mon G_GNUC_UNUSED, + const char *dirpath G_GNUC_UNUSED, + const char *filename G_GNUC_UNUSED, + QFileMonitorHandler cb G_GNUC_UNUSED, + void *opaque G_GNUC_UNUSED, + Error **errp) +{ + error_setg(errp, "File monitoring not available on this platform"); + return -1; +} + + +void +qemu_file_monitor_remove_watch(QFileMonitor *mon G_GNUC_UNUSED, + const char *dirpath G_GNUC_UNUSED, + int64_t id G_GNUC_UNUSED) +{ +} diff --git a/util/getauxval.c b/util/getauxval.c new file mode 100644 index 000000000..b124107d6 --- /dev/null +++ b/util/getauxval.c @@ -0,0 +1,118 @@ +/* + * QEMU access to the auxiliary vector + * + * Copyright (C) 2013 Red Hat, Inc + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" + +#ifdef CONFIG_GETAUXVAL +/* Don't inline this in qemu/osdep.h, because pulling in <sys/auxv.h> for + the system declaration of getauxval pulls in the system <elf.h>, which + conflicts with qemu's version. */ + +#include <sys/auxv.h> + +unsigned long qemu_getauxval(unsigned long key) +{ + return getauxval(key); +} +#elif defined(__linux__) +#include "elf.h" + +/* Our elf.h doesn't contain Elf32_auxv_t and Elf64_auxv_t, which is ok because + that just makes it easier to define it properly for the host here. */ +typedef struct { + unsigned long a_type; + unsigned long a_val; +} ElfW_auxv_t; + +static const ElfW_auxv_t *auxv; + +static const ElfW_auxv_t *qemu_init_auxval(void) +{ + ElfW_auxv_t *a; + ssize_t size = 512, r, ofs; + int fd; + + /* Allocate some initial storage. Make sure the first entry is set + to end-of-list, so that we've got a valid list in case of error. */ + auxv = a = g_malloc(size); + a[0].a_type = 0; + a[0].a_val = 0; + + fd = open("/proc/self/auxv", O_RDONLY); + if (fd < 0) { + return a; + } + + /* Read the first SIZE bytes. Hopefully, this covers everything. */ + r = read(fd, a, size); + + if (r == size) { + /* Continue to expand until we do get a partial read. 
*/ + do { + ofs = size; + size *= 2; + auxv = a = g_realloc(a, size); + r = read(fd, (char *)a + ofs, ofs); + } while (r == ofs); + } + + close(fd); + return a; +} + +unsigned long qemu_getauxval(unsigned long type) +{ + const ElfW_auxv_t *a = auxv; + + if (unlikely(a == NULL)) { + a = qemu_init_auxval(); + } + + for (; a->a_type != 0; a++) { + if (a->a_type == type) { + return a->a_val; + } + } + + return 0; +} + +#elif defined(__FreeBSD__) +#include <sys/auxv.h> + +unsigned long qemu_getauxval(unsigned long type) +{ + unsigned long aux = 0; + elf_aux_info(type, &aux, sizeof(aux)); + return aux; +} + +#else + +unsigned long qemu_getauxval(unsigned long type) +{ + return 0; +} + +#endif diff --git a/util/guest-random.c b/util/guest-random.c new file mode 100644 index 000000000..23643f86c --- /dev/null +++ b/util/guest-random.c @@ -0,0 +1,101 @@ +/* + * QEMU guest-visible random functions + * + * Copyright 2019 Linaro, Ltd. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "qapi/error.h" +#include "qemu/guest-random.h" +#include "crypto/random.h" +#include "sysemu/replay.h" + + +static __thread GRand *thread_rand; +static bool deterministic; + + +static int glib_random_bytes(void *buf, size_t len) +{ + GRand *rand = thread_rand; + size_t i; + uint32_t x; + + if (unlikely(rand == NULL)) { + /* Thread not initialized for a cpu, or main w/o -seed. */ + thread_rand = rand = g_rand_new(); + } + + for (i = 0; i + 4 <= len; i += 4) { + x = g_rand_int(rand); + __builtin_memcpy(buf + i, &x, 4); + } + if (i < len) { + x = g_rand_int(rand); + __builtin_memcpy(buf + i, &x, len - i); + } + return 0; +} + +int qemu_guest_getrandom(void *buf, size_t len, Error **errp) +{ + int ret; + if (replay_mode == REPLAY_MODE_PLAY) { + return replay_read_random(buf, len); + } + if (unlikely(deterministic)) { + /* Deterministic implementation using Glib's Mersenne Twister. */ + ret = glib_random_bytes(buf, len); + } else { + /* Non-deterministic implementation using crypto routines. 
*/ + ret = qcrypto_random_bytes(buf, len, errp); + } + if (replay_mode == REPLAY_MODE_RECORD) { + replay_save_random(ret, buf, len); + } + return ret; +} + +void qemu_guest_getrandom_nofail(void *buf, size_t len) +{ + (void)qemu_guest_getrandom(buf, len, &error_fatal); +} + +uint64_t qemu_guest_random_seed_thread_part1(void) +{ + if (deterministic) { + uint64_t ret; + glib_random_bytes(&ret, sizeof(ret)); + return ret; + } + return 0; +} + +void qemu_guest_random_seed_thread_part2(uint64_t seed) +{ + g_assert(thread_rand == NULL); + if (deterministic) { + thread_rand = + g_rand_new_with_seed_array((const guint32 *)&seed, + sizeof(seed) / sizeof(guint32)); + } +} + +int qemu_guest_random_seed_main(const char *optarg, Error **errp) +{ + unsigned long long seed; + if (parse_uint_full(optarg, &seed, 0)) { + error_setg(errp, "Invalid seed number: %s", optarg); + return -1; + } else { + deterministic = true; + qemu_guest_random_seed_thread_part2(seed); + return 0; + } +} diff --git a/util/hbitmap.c b/util/hbitmap.c new file mode 100644 index 000000000..305b894a6 --- /dev/null +++ b/util/hbitmap.c @@ -0,0 +1,933 @@ +/* + * Hierarchical Bitmap Data Type + * + * Copyright Red Hat, Inc., 2012 + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/hbitmap.h" +#include "qemu/host-utils.h" +#include "trace.h" +#include "crypto/hash.h" + +/* HBitmaps provides an array of bits. The bits are stored as usual in an + * array of unsigned longs, but HBitmap is also optimized to provide fast + * iteration over set bits; going from one bit to the next is O(logB n) + * worst case, with B = sizeof(long) * CHAR_BIT: the result is low enough + * that the number of levels is in fact fixed. + * + * In order to do this, it stacks multiple bitmaps with progressively coarser + * granularity; in all levels except the last, bit N is set iff the N-th + * unsigned long is nonzero in the immediately next level. When iteration + * completes on the last level it can examine the 2nd-last level to quickly + * skip entire words, and even do so recursively to skip blocks of 64 words or + * powers thereof (32 on 32-bit machines). + * + * Given an index in the bitmap, it can be split in group of bits like + * this (for the 64-bit case): + * + * bits 0-57 => word in the last bitmap | bits 58-63 => bit in the word + * bits 0-51 => word in the 2nd-last bitmap | bits 52-57 => bit in the word + * bits 0-45 => word in the 3rd-last bitmap | bits 46-51 => bit in the word + * + * So it is easy to move up simply by shifting the index right by + * log2(BITS_PER_LONG) bits. To move down, you shift the index left + * similarly, and add the word index within the group. Iteration uses + * ffs (find first set bit) to find the next word to examine; this + * operation can be done in constant time in most current architectures. + * + * Setting or clearing a range of m bits on all levels, the work to perform + * is O(m + m/W + m/W^2 + ...), which is O(m) like on a regular bitmap. + * + * When iterating on a bitmap, each bit (on any level) is only visited + * once. Hence, The total cost of visiting a bitmap with m bits in it is + * the number of bits that are set in all bitmaps. 
Unless the bitmap is + * extremely sparse, this is also O(m + m/W + m/W^2 + ...), so the amortized + * cost of advancing from one bit to the next is usually constant (worst case + * O(logB n) as in the non-amortized complexity). + */ + +struct HBitmap { + /* + * Size of the bitmap, as requested in hbitmap_alloc or in hbitmap_truncate. + */ + uint64_t orig_size; + + /* Number of total bits in the bottom level. */ + uint64_t size; + + /* Number of set bits in the bottom level. */ + uint64_t count; + + /* A scaling factor. Given a granularity of G, each bit in the bitmap will + * will actually represent a group of 2^G elements. Each operation on a + * range of bits first rounds the bits to determine which group they land + * in, and then affect the entire page; iteration will only visit the first + * bit of each group. Here is an example of operations in a size-16, + * granularity-1 HBitmap: + * + * initial state 00000000 + * set(start=0, count=9) 11111000 (iter: 0, 2, 4, 6, 8) + * reset(start=1, count=3) 00111000 (iter: 4, 6, 8) + * set(start=9, count=2) 00111100 (iter: 4, 6, 8, 10) + * reset(start=5, count=5) 00000000 + * + * From an implementation point of view, when setting or resetting bits, + * the bitmap will scale bit numbers right by this amount of bits. When + * iterating, the bitmap will scale bit numbers left by this amount of + * bits. + */ + int granularity; + + /* A meta dirty bitmap to track the dirtiness of bits in this HBitmap. */ + HBitmap *meta; + + /* A number of progressively less coarse bitmaps (i.e. level 0 is the + * coarsest). Each bit in level N represents a word in level N+1 that + * has a set bit, except the last level where each bit represents the + * actual bitmap. + * + * Note that all bitmaps have the same number of levels. Even a 1-bit + * bitmap will still allocate HBITMAP_LEVELS arrays. + */ + unsigned long *levels[HBITMAP_LEVELS]; + + /* The length of each levels[] array. */ + uint64_t sizes[HBITMAP_LEVELS]; +}; + +/* Advance hbi to the next nonzero word and return it. hbi->pos + * is updated. Returns zero if we reach the end of the bitmap. + */ +static unsigned long hbitmap_iter_skip_words(HBitmapIter *hbi) +{ + size_t pos = hbi->pos; + const HBitmap *hb = hbi->hb; + unsigned i = HBITMAP_LEVELS - 1; + + unsigned long cur; + do { + i--; + pos >>= BITS_PER_LEVEL; + cur = hbi->cur[i] & hb->levels[i][pos]; + } while (cur == 0); + + /* Check for end of iteration. We always use fewer than BITS_PER_LONG + * bits in the level 0 bitmap; thus we can repurpose the most significant + * bit as a sentinel. The sentinel is set in hbitmap_alloc and ensures + * that the above loop ends even without an explicit check on i. + */ + + if (i == 0 && cur == (1UL << (BITS_PER_LONG - 1))) { + return 0; + } + for (; i < HBITMAP_LEVELS - 1; i++) { + /* Shift back pos to the left, matching the right shifts above. + * The index of this word's least significant set bit provides + * the low-order bits. + */ + assert(cur); + pos = (pos << BITS_PER_LEVEL) + ctzl(cur); + hbi->cur[i] = cur & (cur - 1); + + /* Set up next level for iteration. 
*/ + cur = hb->levels[i + 1][pos]; + } + + hbi->pos = pos; + trace_hbitmap_iter_skip_words(hbi->hb, hbi, pos, cur); + + assert(cur); + return cur; +} + +int64_t hbitmap_iter_next(HBitmapIter *hbi) +{ + unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1] & + hbi->hb->levels[HBITMAP_LEVELS - 1][hbi->pos]; + int64_t item; + + if (cur == 0) { + cur = hbitmap_iter_skip_words(hbi); + if (cur == 0) { + return -1; + } + } + + /* The next call will resume work from the next bit. */ + hbi->cur[HBITMAP_LEVELS - 1] = cur & (cur - 1); + item = ((uint64_t)hbi->pos << BITS_PER_LEVEL) + ctzl(cur); + + return item << hbi->granularity; +} + +void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first) +{ + unsigned i, bit; + uint64_t pos; + + hbi->hb = hb; + pos = first >> hb->granularity; + assert(pos < hb->size); + hbi->pos = pos >> BITS_PER_LEVEL; + hbi->granularity = hb->granularity; + + for (i = HBITMAP_LEVELS; i-- > 0; ) { + bit = pos & (BITS_PER_LONG - 1); + pos >>= BITS_PER_LEVEL; + + /* Drop bits representing items before first. */ + hbi->cur[i] = hb->levels[i][pos] & ~((1UL << bit) - 1); + + /* We have already added level i+1, so the lowest set bit has + * been processed. Clear it. + */ + if (i != HBITMAP_LEVELS - 1) { + hbi->cur[i] &= ~(1UL << bit); + } + } +} + +int64_t hbitmap_next_dirty(const HBitmap *hb, int64_t start, int64_t count) +{ + HBitmapIter hbi; + int64_t first_dirty_off; + uint64_t end; + + assert(start >= 0 && count >= 0); + + if (start >= hb->orig_size || count == 0) { + return -1; + } + + end = count > hb->orig_size - start ? hb->orig_size : start + count; + + hbitmap_iter_init(&hbi, hb, start); + first_dirty_off = hbitmap_iter_next(&hbi); + + if (first_dirty_off < 0 || first_dirty_off >= end) { + return -1; + } + + return MAX(start, first_dirty_off); +} + +int64_t hbitmap_next_zero(const HBitmap *hb, int64_t start, int64_t count) +{ + size_t pos = (start >> hb->granularity) >> BITS_PER_LEVEL; + unsigned long *last_lev = hb->levels[HBITMAP_LEVELS - 1]; + unsigned long cur = last_lev[pos]; + unsigned start_bit_offset; + uint64_t end_bit, sz; + int64_t res; + + assert(start >= 0 && count >= 0); + + if (start >= hb->orig_size || count == 0) { + return -1; + } + + end_bit = count > hb->orig_size - start ? + hb->size : + ((start + count - 1) >> hb->granularity) + 1; + sz = (end_bit + BITS_PER_LONG - 1) >> BITS_PER_LEVEL; + + /* There may be some zero bits in @cur before @start. We are not interested + * in them, let's set them. 
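+ * E.g. when @start falls on bit offset 3 of @cur we OR in
+ * (1UL << 3) - 1 == 0b111, so bits 0..2 can never be returned.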
+ */
+    start_bit_offset = (start >> hb->granularity) & (BITS_PER_LONG - 1);
+    cur |= (1UL << start_bit_offset) - 1;
+    assert((start >> hb->granularity) < hb->size);
+
+    if (cur == (unsigned long)-1) {
+        do {
+            pos++;
+        } while (pos < sz && last_lev[pos] == (unsigned long)-1);
+
+        if (pos >= sz) {
+            return -1;
+        }
+
+        cur = last_lev[pos];
+    }
+
+    res = (pos << BITS_PER_LEVEL) + ctzl(cur);
+    if (res >= end_bit) {
+        return -1;
+    }
+
+    res = res << hb->granularity;
+    if (res < start) {
+        assert(((start - res) >> hb->granularity) == 0);
+        return start;
+    }
+
+    return res;
+}
+
+bool hbitmap_next_dirty_area(const HBitmap *hb, int64_t start, int64_t end,
+                             int64_t max_dirty_count,
+                             int64_t *dirty_start, int64_t *dirty_count)
+{
+    int64_t next_zero;
+
+    assert(start >= 0 && end >= 0 && max_dirty_count > 0);
+
+    end = MIN(end, hb->orig_size);
+    if (start >= end) {
+        return false;
+    }
+
+    start = hbitmap_next_dirty(hb, start, end - start);
+    if (start < 0) {
+        return false;
+    }
+
+    end = start + MIN(end - start, max_dirty_count);
+
+    next_zero = hbitmap_next_zero(hb, start, end - start);
+    if (next_zero >= 0) {
+        end = next_zero;
+    }
+
+    *dirty_start = start;
+    *dirty_count = end - start;
+
+    return true;
+}
+
+bool hbitmap_empty(const HBitmap *hb)
+{
+    return hb->count == 0;
+}
+
+int hbitmap_granularity(const HBitmap *hb)
+{
+    return hb->granularity;
+}
+
+uint64_t hbitmap_count(const HBitmap *hb)
+{
+    return hb->count << hb->granularity;
+}
+
+/**
+ * hbitmap_iter_next_word:
+ * @hbi: HBitmapIter to operate on.
+ * @p_cur: Location where to store the next non-zero word.
+ *
+ * Return the index of the next nonzero word that is set in @hbi's
+ * associated HBitmap, and set *p_cur to the content of that word
+ * (bits before the index that was passed to hbitmap_iter_init are
+ * trimmed on the first call).  Return -1, and set *p_cur to zero,
+ * if all remaining words are zero.
+ */
+static size_t hbitmap_iter_next_word(HBitmapIter *hbi, unsigned long *p_cur)
+{
+    unsigned long cur = hbi->cur[HBITMAP_LEVELS - 1];
+
+    if (cur == 0) {
+        cur = hbitmap_iter_skip_words(hbi);
+        if (cur == 0) {
+            *p_cur = 0;
+            return -1;
+        }
+    }
+
+    /* The next call will resume work from the next word. */
+    hbi->cur[HBITMAP_LEVELS - 1] = 0;
+    *p_cur = cur;
+    return hbi->pos;
+}
+
+/* Count the number of set bits between start and end, not accounting for
+ * the granularity.  Also an example of how to use hbitmap_iter_next_word.
+ */
+static uint64_t hb_count_between(HBitmap *hb, uint64_t start, uint64_t last)
+{
+    HBitmapIter hbi;
+    uint64_t count = 0;
+    uint64_t end = last + 1;
+    unsigned long cur;
+    size_t pos;
+
+    hbitmap_iter_init(&hbi, hb, start << hb->granularity);
+    for (;;) {
+        pos = hbitmap_iter_next_word(&hbi, &cur);
+        if (pos >= (end >> BITS_PER_LEVEL)) {
+            break;
+        }
+        count += ctpopl(cur);
+    }
+
+    if (pos == (end >> BITS_PER_LEVEL)) {
+        /* Drop bits representing the END-th and subsequent items. */
+        int bit = end & (BITS_PER_LONG - 1);
+        cur &= (1UL << bit) - 1;
+        count += ctpopl(cur);
+    }
+
+    return count;
+}
+
+/* Setting starts at the last layer and propagates up if an element
+ * changes.
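+ *
+ * For example, setting bits 3..5 within a single word builds the mask
+ * (2UL << 5) - (1UL << 3) == 0x38 == 0b111000 in hb_set_elem().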
+ */ +static inline bool hb_set_elem(unsigned long *elem, uint64_t start, uint64_t last) +{ + unsigned long mask; + unsigned long old; + + assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL)); + assert(start <= last); + + mask = 2UL << (last & (BITS_PER_LONG - 1)); + mask -= 1UL << (start & (BITS_PER_LONG - 1)); + old = *elem; + *elem |= mask; + return old != *elem; +} + +/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... + * Returns true if at least one bit is changed. */ +static bool hb_set_between(HBitmap *hb, int level, uint64_t start, + uint64_t last) +{ + size_t pos = start >> BITS_PER_LEVEL; + size_t lastpos = last >> BITS_PER_LEVEL; + bool changed = false; + size_t i; + + i = pos; + if (i < lastpos) { + uint64_t next = (start | (BITS_PER_LONG - 1)) + 1; + changed |= hb_set_elem(&hb->levels[level][i], start, next - 1); + for (;;) { + start = next; + next += BITS_PER_LONG; + if (++i == lastpos) { + break; + } + changed |= (hb->levels[level][i] == 0); + hb->levels[level][i] = ~0UL; + } + } + changed |= hb_set_elem(&hb->levels[level][i], start, last); + + /* If there was any change in this layer, we may have to update + * the one above. + */ + if (level > 0 && changed) { + hb_set_between(hb, level - 1, pos, lastpos); + } + return changed; +} + +void hbitmap_set(HBitmap *hb, uint64_t start, uint64_t count) +{ + /* Compute range in the last layer. */ + uint64_t first, n; + uint64_t last = start + count - 1; + + if (count == 0) { + return; + } + + trace_hbitmap_set(hb, start, count, + start >> hb->granularity, last >> hb->granularity); + + first = start >> hb->granularity; + last >>= hb->granularity; + assert(last < hb->size); + n = last - first + 1; + + hb->count += n - hb_count_between(hb, first, last); + if (hb_set_between(hb, HBITMAP_LEVELS - 1, first, last) && + hb->meta) { + hbitmap_set(hb->meta, start, count); + } +} + +/* Resetting works the other way round: propagate up if the new + * value is zero. + */ +static inline bool hb_reset_elem(unsigned long *elem, uint64_t start, uint64_t last) +{ + unsigned long mask; + bool blanked; + + assert((last >> BITS_PER_LEVEL) == (start >> BITS_PER_LEVEL)); + assert(start <= last); + + mask = 2UL << (last & (BITS_PER_LONG - 1)); + mask -= 1UL << (start & (BITS_PER_LONG - 1)); + blanked = *elem != 0 && ((*elem & ~mask) == 0); + *elem &= ~mask; + return blanked; +} + +/* The recursive workhorse (the depth is limited to HBITMAP_LEVELS)... + * Returns true if at least one bit is changed. */ +static bool hb_reset_between(HBitmap *hb, int level, uint64_t start, + uint64_t last) +{ + size_t pos = start >> BITS_PER_LEVEL; + size_t lastpos = last >> BITS_PER_LEVEL; + bool changed = false; + size_t i; + + i = pos; + if (i < lastpos) { + uint64_t next = (start | (BITS_PER_LONG - 1)) + 1; + + /* Here we need a more complex test than when setting bits. Even if + * something was changed, we must not blank bits in the upper level + * unless the lower-level word became entirely zero. So, remove pos + * from the upper-level range if bits remain set. + */ + if (hb_reset_elem(&hb->levels[level][i], start, next - 1)) { + changed = true; + } else { + pos++; + } + + for (;;) { + start = next; + next += BITS_PER_LONG; + if (++i == lastpos) { + break; + } + changed |= (hb->levels[level][i] != 0); + hb->levels[level][i] = 0UL; + } + } + + /* Same as above, this time for lastpos. 
*/ + if (hb_reset_elem(&hb->levels[level][i], start, last)) { + changed = true; + } else { + lastpos--; + } + + if (level > 0 && changed) { + hb_reset_between(hb, level - 1, pos, lastpos); + } + + return changed; + +} + +void hbitmap_reset(HBitmap *hb, uint64_t start, uint64_t count) +{ + /* Compute range in the last layer. */ + uint64_t first; + uint64_t last = start + count - 1; + uint64_t gran = 1ULL << hb->granularity; + + if (count == 0) { + return; + } + + assert(QEMU_IS_ALIGNED(start, gran)); + assert(QEMU_IS_ALIGNED(count, gran) || (start + count == hb->orig_size)); + + trace_hbitmap_reset(hb, start, count, + start >> hb->granularity, last >> hb->granularity); + + first = start >> hb->granularity; + last >>= hb->granularity; + assert(last < hb->size); + + hb->count -= hb_count_between(hb, first, last); + if (hb_reset_between(hb, HBITMAP_LEVELS - 1, first, last) && + hb->meta) { + hbitmap_set(hb->meta, start, count); + } +} + +void hbitmap_reset_all(HBitmap *hb) +{ + unsigned int i; + + /* Same as hbitmap_alloc() except for memset() instead of malloc() */ + for (i = HBITMAP_LEVELS; --i >= 1; ) { + memset(hb->levels[i], 0, hb->sizes[i] * sizeof(unsigned long)); + } + + hb->levels[0][0] = 1UL << (BITS_PER_LONG - 1); + hb->count = 0; +} + +bool hbitmap_is_serializable(const HBitmap *hb) +{ + /* Every serialized chunk must be aligned to 64 bits so that endianness + * requirements can be fulfilled on both 64 bit and 32 bit hosts. + * We have hbitmap_serialization_align() which converts this + * alignment requirement from bitmap bits to items covered (e.g. sectors). + * That value is: + * 64 << hb->granularity + * Since this value must not exceed UINT64_MAX, hb->granularity must be + * less than 58 (== 64 - 6, where 6 is ld(64), i.e. 1 << 6 == 64). + * + * In order for hbitmap_serialization_align() to always return a + * meaningful value, bitmaps that are to be serialized must have a + * granularity of less than 58. */ + + return hb->granularity < 58; +} + +bool hbitmap_get(const HBitmap *hb, uint64_t item) +{ + /* Compute position and bit in the last layer. */ + uint64_t pos = item >> hb->granularity; + unsigned long bit = 1UL << (pos & (BITS_PER_LONG - 1)); + assert(pos < hb->size); + + return (hb->levels[HBITMAP_LEVELS - 1][pos >> BITS_PER_LEVEL] & bit) != 0; +} + +uint64_t hbitmap_serialization_align(const HBitmap *hb) +{ + assert(hbitmap_is_serializable(hb)); + + /* Require at least 64 bit granularity to be safe on both 64 bit and 32 bit + * hosts. */ + return UINT64_C(64) << hb->granularity; +} + +/* Start should be aligned to serialization granularity, chunk size should be + * aligned to serialization granularity too, except for last chunk. 
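+ *
+ * E.g. at granularity 0 on a 64-bit host, hbitmap_serialization_align()
+ * yields 64 items, i.e. exactly one unsigned long of the last level per
+ * aligned chunk unit.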
+ */ +static void serialization_chunk(const HBitmap *hb, + uint64_t start, uint64_t count, + unsigned long **first_el, uint64_t *el_count) +{ + uint64_t last = start + count - 1; + uint64_t gran = hbitmap_serialization_align(hb); + + assert((start & (gran - 1)) == 0); + assert((last >> hb->granularity) < hb->size); + if ((last >> hb->granularity) != hb->size - 1) { + assert((count & (gran - 1)) == 0); + } + + start = (start >> hb->granularity) >> BITS_PER_LEVEL; + last = (last >> hb->granularity) >> BITS_PER_LEVEL; + + *first_el = &hb->levels[HBITMAP_LEVELS - 1][start]; + *el_count = last - start + 1; +} + +uint64_t hbitmap_serialization_size(const HBitmap *hb, + uint64_t start, uint64_t count) +{ + uint64_t el_count; + unsigned long *cur; + + if (!count) { + return 0; + } + serialization_chunk(hb, start, count, &cur, &el_count); + + return el_count * sizeof(unsigned long); +} + +void hbitmap_serialize_part(const HBitmap *hb, uint8_t *buf, + uint64_t start, uint64_t count) +{ + uint64_t el_count; + unsigned long *cur, *end; + + if (!count) { + return; + } + serialization_chunk(hb, start, count, &cur, &el_count); + end = cur + el_count; + + while (cur != end) { + unsigned long el = + (BITS_PER_LONG == 32 ? cpu_to_le32(*cur) : cpu_to_le64(*cur)); + + memcpy(buf, &el, sizeof(el)); + buf += sizeof(el); + cur++; + } +} + +void hbitmap_deserialize_part(HBitmap *hb, uint8_t *buf, + uint64_t start, uint64_t count, + bool finish) +{ + uint64_t el_count; + unsigned long *cur, *end; + + if (!count) { + return; + } + serialization_chunk(hb, start, count, &cur, &el_count); + end = cur + el_count; + + while (cur != end) { + memcpy(cur, buf, sizeof(*cur)); + + if (BITS_PER_LONG == 32) { + le32_to_cpus((uint32_t *)cur); + } else { + le64_to_cpus((uint64_t *)cur); + } + + buf += sizeof(unsigned long); + cur++; + } + if (finish) { + hbitmap_deserialize_finish(hb); + } +} + +void hbitmap_deserialize_zeroes(HBitmap *hb, uint64_t start, uint64_t count, + bool finish) +{ + uint64_t el_count; + unsigned long *first; + + if (!count) { + return; + } + serialization_chunk(hb, start, count, &first, &el_count); + + memset(first, 0, el_count * sizeof(unsigned long)); + if (finish) { + hbitmap_deserialize_finish(hb); + } +} + +void hbitmap_deserialize_ones(HBitmap *hb, uint64_t start, uint64_t count, + bool finish) +{ + uint64_t el_count; + unsigned long *first; + + if (!count) { + return; + } + serialization_chunk(hb, start, count, &first, &el_count); + + memset(first, 0xff, el_count * sizeof(unsigned long)); + if (finish) { + hbitmap_deserialize_finish(hb); + } +} + +void hbitmap_deserialize_finish(HBitmap *bitmap) +{ + int64_t i, size, prev_size; + int lev; + + /* restore levels starting from penultimate to zero level, assuming + * that the last level is ok */ + size = MAX((bitmap->size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1); + for (lev = HBITMAP_LEVELS - 1; lev-- > 0; ) { + prev_size = size; + size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1); + memset(bitmap->levels[lev], 0, size * sizeof(unsigned long)); + + for (i = 0; i < prev_size; ++i) { + if (bitmap->levels[lev + 1][i]) { + bitmap->levels[lev][i >> BITS_PER_LEVEL] |= + 1UL << (i & (BITS_PER_LONG - 1)); + } + } + } + + bitmap->levels[0][0] |= 1UL << (BITS_PER_LONG - 1); + bitmap->count = hb_count_between(bitmap, 0, bitmap->size - 1); +} + +void hbitmap_free(HBitmap *hb) +{ + unsigned i; + assert(!hb->meta); + for (i = HBITMAP_LEVELS; i-- > 0; ) { + g_free(hb->levels[i]); + } + g_free(hb); +} + +HBitmap *hbitmap_alloc(uint64_t size, int 
granularity) +{ + HBitmap *hb = g_new0(struct HBitmap, 1); + unsigned i; + + assert(size <= INT64_MAX); + hb->orig_size = size; + + assert(granularity >= 0 && granularity < 64); + size = (size + (1ULL << granularity) - 1) >> granularity; + assert(size <= ((uint64_t)1 << HBITMAP_LOG_MAX_SIZE)); + + hb->size = size; + hb->granularity = granularity; + for (i = HBITMAP_LEVELS; i-- > 0; ) { + size = MAX((size + BITS_PER_LONG - 1) >> BITS_PER_LEVEL, 1); + hb->sizes[i] = size; + hb->levels[i] = g_new0(unsigned long, size); + } + + /* We necessarily have free bits in level 0 due to the definition + * of HBITMAP_LEVELS, so use one for a sentinel. This speeds up + * hbitmap_iter_skip_words. + */ + assert(size == 1); + hb->levels[0][0] |= 1UL << (BITS_PER_LONG - 1); + return hb; +} + +void hbitmap_truncate(HBitmap *hb, uint64_t size) +{ + bool shrink; + unsigned i; + uint64_t num_elements = size; + uint64_t old; + + assert(size <= INT64_MAX); + hb->orig_size = size; + + /* Size comes in as logical elements, adjust for granularity. */ + size = (size + (1ULL << hb->granularity) - 1) >> hb->granularity; + assert(size <= ((uint64_t)1 << HBITMAP_LOG_MAX_SIZE)); + shrink = size < hb->size; + + /* bit sizes are identical; nothing to do. */ + if (size == hb->size) { + return; + } + + /* If we're losing bits, let's clear those bits before we invalidate all of + * our invariants. This helps keep the bitcount consistent, and will prevent + * us from carrying around garbage bits beyond the end of the map. + */ + if (shrink) { + /* Don't clear partial granularity groups; + * start at the first full one. */ + uint64_t start = ROUND_UP(num_elements, UINT64_C(1) << hb->granularity); + uint64_t fix_count = (hb->size << hb->granularity) - start; + + assert(fix_count); + hbitmap_reset(hb, start, fix_count); + } + + hb->size = size; + for (i = HBITMAP_LEVELS; i-- > 0; ) { + size = MAX(BITS_TO_LONGS(size), 1); + if (hb->sizes[i] == size) { + break; + } + old = hb->sizes[i]; + hb->sizes[i] = size; + hb->levels[i] = g_realloc(hb->levels[i], size * sizeof(unsigned long)); + if (!shrink) { + memset(&hb->levels[i][old], 0x00, + (size - old) * sizeof(*hb->levels[i])); + } + } + if (hb->meta) { + hbitmap_truncate(hb->meta, hb->size << hb->granularity); + } +} + +bool hbitmap_can_merge(const HBitmap *a, const HBitmap *b) +{ + return (a->orig_size == b->orig_size); +} + +/** + * hbitmap_sparse_merge: performs dst = dst | src + * works with differing granularities. + * best used when src is sparsely populated. + */ +static void hbitmap_sparse_merge(HBitmap *dst, const HBitmap *src) +{ + int64_t offset; + int64_t count; + + for (offset = 0; + hbitmap_next_dirty_area(src, offset, src->orig_size, INT64_MAX, + &offset, &count); + offset += count) + { + hbitmap_set(dst, offset, count); + } +} + +/** + * Given HBitmaps A and B, let R := A (BITOR) B. + * Bitmaps A and B will not be modified, + * except when bitmap R is an alias of A or B. + * + * @return true if the merge was successful, + * false if it was not attempted. 
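+ *
+ * A minimal usage sketch (hypothetical bitmaps with equal orig_size,
+ * merging b into a in place):
+ *
+ *     if (!hbitmap_merge(a, b, a)) {
+ *         error_report("bitmaps not mergeable");
+ *     }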
+ */ +bool hbitmap_merge(const HBitmap *a, const HBitmap *b, HBitmap *result) +{ + int i; + uint64_t j; + + if (!hbitmap_can_merge(a, b) || !hbitmap_can_merge(a, result)) { + return false; + } + assert(hbitmap_can_merge(b, result)); + + if ((!hbitmap_count(a) && result == b) || + (!hbitmap_count(b) && result == a)) { + return true; + } + + if (!hbitmap_count(a) && !hbitmap_count(b)) { + hbitmap_reset_all(result); + return true; + } + + if (a->granularity != b->granularity) { + if ((a != result) && (b != result)) { + hbitmap_reset_all(result); + } + if (a != result) { + hbitmap_sparse_merge(result, a); + } + if (b != result) { + hbitmap_sparse_merge(result, b); + } + return true; + } + + /* This merge is O(size), as BITS_PER_LONG and HBITMAP_LEVELS are constant. + * It may be possible to improve running times for sparsely populated maps + * by using hbitmap_iter_next, but this is suboptimal for dense maps. + */ + assert(a->size == b->size); + for (i = HBITMAP_LEVELS - 1; i >= 0; i--) { + for (j = 0; j < a->sizes[i]; j++) { + result->levels[i][j] = a->levels[i][j] | b->levels[i][j]; + } + } + + /* Recompute the dirty count */ + result->count = hb_count_between(result, 0, result->size - 1); + + return true; +} + +char *hbitmap_sha256(const HBitmap *bitmap, Error **errp) +{ + size_t size = bitmap->sizes[HBITMAP_LEVELS - 1] * sizeof(unsigned long); + char *data = (char *)bitmap->levels[HBITMAP_LEVELS - 1]; + char *hash = NULL; + qcrypto_hash_digest(QCRYPTO_HASH_ALG_SHA256, data, size, &hash, errp); + + return hash; +} diff --git a/util/hexdump.c b/util/hexdump.c new file mode 100644 index 000000000..2c105a884 --- /dev/null +++ b/util/hexdump.c @@ -0,0 +1,65 @@ +/* + * Helper to hexdump a buffer + * + * Copyright (c) 2013 Red Hat, Inc. + * Copyright (c) 2013 Gerd Hoffmann <kraxel@redhat.com> + * Copyright (c) 2013 Peter Crosthwaite <peter.crosthwaite@xilinx.com> + * Copyright (c) 2013 Xilinx, Inc + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +void qemu_hexdump_line(char *line, unsigned int b, const void *bufptr, + unsigned int len, bool ascii) +{ + const char *buf = bufptr; + int i, c; + + if (len > QEMU_HEXDUMP_LINE_BYTES) { + len = QEMU_HEXDUMP_LINE_BYTES; + } + + line += snprintf(line, 6, "%04x:", b); + for (i = 0; i < QEMU_HEXDUMP_LINE_BYTES; i++) { + if ((i % 4) == 0) { + *line++ = ' '; + } + if (i < len) { + line += sprintf(line, " %02x", (unsigned char)buf[b + i]); + } else { + line += sprintf(line, " "); + } + } + if (ascii) { + *line++ = ' '; + for (i = 0; i < len; i++) { + c = buf[b + i]; + if (c < ' ' || c > '~') { + c = '.'; + } + *line++ = c; + } + } + *line = '\0'; +} + +void qemu_hexdump(FILE *fp, const char *prefix, + const void *bufptr, size_t size) +{ + unsigned int b, len; + char line[QEMU_HEXDUMP_LINE_LEN]; + + for (b = 0; b < size; b += QEMU_HEXDUMP_LINE_BYTES) { + len = size - b; + qemu_hexdump_line(line, b, bufptr, len, true); + fprintf(fp, "%s: %s\n", prefix, line); + } + +} diff --git a/util/host-utils.c b/util/host-utils.c new file mode 100644 index 000000000..bcc772b8e --- /dev/null +++ b/util/host-utils.c @@ -0,0 +1,268 @@ +/* + * Utility compute operations used by translated code. 
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ * Copyright (c) 2007 Aurelien Jarno
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/host-utils.h"
+
+#ifndef CONFIG_INT128
+/* Long integer helpers */
+static inline void mul64(uint64_t *plow, uint64_t *phigh,
+                         uint64_t a, uint64_t b)
+{
+    typedef union {
+        uint64_t ll;
+        struct {
+#ifdef HOST_WORDS_BIGENDIAN
+            uint32_t high, low;
+#else
+            uint32_t low, high;
+#endif
+        } l;
+    } LL;
+    LL rl, rm, rn, rh, a0, b0;
+    uint64_t c;
+
+    a0.ll = a;
+    b0.ll = b;
+
+    rl.ll = (uint64_t)a0.l.low * b0.l.low;
+    rm.ll = (uint64_t)a0.l.low * b0.l.high;
+    rn.ll = (uint64_t)a0.l.high * b0.l.low;
+    rh.ll = (uint64_t)a0.l.high * b0.l.high;
+
+    c = (uint64_t)rl.l.high + rm.l.low + rn.l.low;
+    rl.l.high = c;
+    c >>= 32;
+    c = c + rm.l.high + rn.l.high + rh.l.low;
+    rh.l.low = c;
+    rh.l.high += (uint32_t)(c >> 32);
+
+    *plow = rl.ll;
+    *phigh = rh.ll;
+}
+
+/* Unsigned 64x64 -> 128 multiplication */
+void mulu64 (uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b)
+{
+    mul64(plow, phigh, a, b);
+}
+
+/* Signed 64x64 -> 128 multiplication */
+void muls64 (uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b)
+{
+    uint64_t rh;
+
+    mul64(plow, &rh, a, b);
+
+    /* Adjust for signs. */
+    if (b < 0) {
+        rh -= a;
+    }
+    if (a < 0) {
+        rh -= b;
+    }
+    *phigh = rh;
+}
+
+/*
+ * Unsigned 128-by-64 division.
+ * Returns the quotient via plow and phigh,
+ * and the remainder via the function return value.
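+ *
+ * For instance (hypothetical values), dividing 2^64 by 3:
+ *
+ *     uint64_t lo = 0, hi = 1;             // dividend = 2^64
+ *     uint64_t rem = divu128(&lo, &hi, 3); // lo = 0x5555555555555555,
+ *                                          // hi = 0, rem = 1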
+ */
+uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor)
+{
+    uint64_t dhi = *phigh;
+    uint64_t dlo = *plow;
+    uint64_t rem, dhighest;
+    int sh;
+
+    if (divisor == 0 || dhi == 0) {
+        *plow = dlo / divisor;
+        *phigh = 0;
+        return dlo % divisor;
+    } else {
+        sh = clz64(divisor);
+
+        if (dhi < divisor) {
+            if (sh != 0) {
+                /* normalize the divisor, shifting the dividend accordingly */
+                divisor <<= sh;
+                dhi = (dhi << sh) | (dlo >> (64 - sh));
+                dlo <<= sh;
+            }
+
+            *phigh = 0;
+            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
+        } else {
+            if (sh != 0) {
+                /* normalize the divisor, shifting the dividend accordingly */
+                divisor <<= sh;
+                dhighest = dhi >> (64 - sh);
+                dhi = (dhi << sh) | (dlo >> (64 - sh));
+                dlo <<= sh;
+
+                *phigh = udiv_qrnnd(&dhi, dhighest, dhi, divisor);
+            } else {
+                /**
+                 * dhi >= divisor
+                 * Since the MSB of divisor is set (sh == 0),
+                 * (dhi - divisor) < divisor
+                 *
+                 * Thus, the high part of the quotient is 1, and we can
+                 * calculate the low part with a single call to udiv_qrnnd
+                 * after subtracting divisor from dhi
+                 */
+                dhi -= divisor;
+                *phigh = 1;
+            }
+
+            *plow = udiv_qrnnd(&rem, dhi, dlo, divisor);
+        }
+
+        /*
+         * since the dividend/divisor might have been normalized,
+         * the remainder might also have to be shifted back
+         */
+        return rem >> sh;
+    }
+}
+
+/*
+ * Signed 128-by-64 division.
+ * Returns the quotient via plow and phigh,
+ * and the remainder via the function return value.
+ */
+int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor)
+{
+    bool neg_quotient = false, neg_remainder = false;
+    uint64_t unsig_hi = *phigh, unsig_lo = *plow;
+    uint64_t rem;
+
+    if (*phigh < 0) {
+        neg_quotient = !neg_quotient;
+        neg_remainder = !neg_remainder;
+
+        if (unsig_lo == 0) {
+            unsig_hi = -unsig_hi;
+        } else {
+            unsig_hi = ~unsig_hi;
+            unsig_lo = -unsig_lo;
+        }
+    }
+
+    if (divisor < 0) {
+        neg_quotient = !neg_quotient;
+
+        divisor = -divisor;
+    }
+
+    rem = divu128(&unsig_lo, &unsig_hi, (uint64_t)divisor);
+
+    if (neg_quotient) {
+        if (unsig_lo == 0) {
+            *phigh = -unsig_hi;
+            *plow = 0;
+        } else {
+            *phigh = ~unsig_hi;
+            *plow = -unsig_lo;
+        }
+    } else {
+        *phigh = unsig_hi;
+        *plow = unsig_lo;
+    }
+
+    if (neg_remainder) {
+        return -rem;
+    } else {
+        return rem;
+    }
+}
+#endif
+
+/**
+ * urshift - 128-bit Unsigned Right Shift.
+ * @plow: in/out - lower 64-bit integer.
+ * @phigh: in/out - higher 64-bit integer.
+ * @shift: in - bits to shift, between 0 and 127.
+ *
+ * Result is zero-extended and stored in plow/phigh, which are
+ * input/output variables. Shift values outside the range are
+ * reduced modulo 128. In other words, the caller is responsible
+ * for validating the shift range and the plow/phigh pointers.
+ */
+void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift)
+{
+    shift &= 127;
+    if (shift == 0) {
+        return;
+    }
+
+    uint64_t h = *phigh >> (shift & 63);
+    if (shift >= 64) {
+        *plow = h;
+        *phigh = 0;
+    } else {
+        *plow = (*plow >> (shift & 63)) | (*phigh << (64 - (shift & 63)));
+        *phigh = h;
+    }
+}
+
+/**
+ * ulshift - 128-bit Unsigned Left Shift.
+ * @plow: in/out - lower 64-bit integer.
+ * @phigh: in/out - higher 64-bit integer.
+ * @shift: in - bits to shift, between 0 and 127.
+ * @overflow: out - true if any 1-bit is shifted out.
+ *
+ * Result is zero-extended and stored in plow/phigh, which are
+ * input/output variables. Shift values outside the range are
+ * reduced modulo 128. In other words, the caller is responsible
+ * for validating the shift range and the plow/phigh pointers.
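+ *
+ * For instance (hypothetical values), shifting the 128-bit value 1 left
+ * by 100 bits moves the set bit into the high word without overflow:
+ *
+ *     uint64_t lo = 1, hi = 0;
+ *     bool ovf = false;
+ *     ulshift(&lo, &hi, 100, &ovf);  // lo = 0, hi = 1ULL << 36,
+ *                                    // ovf stays false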
+ */ +void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow) +{ + uint64_t low = *plow; + uint64_t high = *phigh; + + shift &= 127; + if (shift == 0) { + return; + } + + /* check if any bit will be shifted out */ + urshift(&low, &high, 128 - shift); + if (low | high) { + *overflow = true; + } + + if (shift >= 64) { + *phigh = *plow << (shift & 63); + *plow = 0; + } else { + *phigh = (*plow >> (64 - (shift & 63))) | (*phigh << (shift & 63)); + *plow = *plow << shift; + } +} diff --git a/util/id.c b/util/id.c new file mode 100644 index 000000000..ded41c502 --- /dev/null +++ b/util/id.c @@ -0,0 +1,69 @@ +/* + * Dealing with identifiers + * + * Copyright (C) 2014 Red Hat, Inc. + * + * Authors: + * Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU LGPL, version 2.1 + * or later. See the COPYING.LIB file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/ctype.h" +#include "qemu/id.h" + +bool id_wellformed(const char *id) +{ + int i; + + if (!qemu_isalpha(id[0])) { + return false; + } + for (i = 1; id[i]; i++) { + if (!qemu_isalnum(id[i]) && !strchr("-._", id[i])) { + return false; + } + } + return true; +} + +#define ID_SPECIAL_CHAR '#' + +static const char *const id_subsys_str[ID_MAX] = { + [ID_QDEV] = "qdev", + [ID_BLOCK] = "block", + [ID_CHR] = "chr", + [ID_NET] = "net", +}; + +/* + * Generates an ID of the form PREFIX SUBSYSTEM NUMBER + * where: + * + * - PREFIX is the reserved character '#' + * - SUBSYSTEM identifies the subsystem creating the ID + * - NUMBER is a decimal number unique within SUBSYSTEM. + * + * Example: "#block146" + * + * Note that these IDs do not satisfy id_wellformed(). + * + * The caller is responsible for freeing the returned string with g_free() + */ +char *id_generate(IdSubSystems id) +{ + static uint64_t id_counters[ID_MAX]; + uint32_t rnd; + + assert(id < ARRAY_SIZE(id_subsys_str)); + assert(id_subsys_str[id]); + + rnd = g_random_int_range(0, 100); + + return g_strdup_printf("%c%s%" PRIu64 "%02" PRId32, ID_SPECIAL_CHAR, + id_subsys_str[id], + id_counters[id]++, + rnd); +} diff --git a/util/iov.c b/util/iov.c new file mode 100644 index 000000000..58c7b3eee --- /dev/null +++ b/util/iov.c @@ -0,0 +1,764 @@ +/* + * Helpers for getting linearized buffers from iov / filling buffers into iovs + * + * Copyright IBM, Corp. 2007, 2008 + * Copyright (C) 2010 Red Hat, Inc. + * + * Author(s): + * Anthony Liguori <aliguori@us.ibm.com> + * Amit Shah <amit.shah@redhat.com> + * Michael Tokarev <mjt@tls.msk.ru> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. 
+ */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qemu/iov.h" +#include "qemu/sockets.h" +#include "qemu/cutils.h" + +size_t iov_from_buf_full(const struct iovec *iov, unsigned int iov_cnt, + size_t offset, const void *buf, size_t bytes) +{ + size_t done; + unsigned int i; + for (i = 0, done = 0; (offset || done < bytes) && i < iov_cnt; i++) { + if (offset < iov[i].iov_len) { + size_t len = MIN(iov[i].iov_len - offset, bytes - done); + memcpy(iov[i].iov_base + offset, buf + done, len); + done += len; + offset = 0; + } else { + offset -= iov[i].iov_len; + } + } + assert(offset == 0); + return done; +} + +size_t iov_to_buf_full(const struct iovec *iov, const unsigned int iov_cnt, + size_t offset, void *buf, size_t bytes) +{ + size_t done; + unsigned int i; + for (i = 0, done = 0; (offset || done < bytes) && i < iov_cnt; i++) { + if (offset < iov[i].iov_len) { + size_t len = MIN(iov[i].iov_len - offset, bytes - done); + memcpy(buf + done, iov[i].iov_base + offset, len); + done += len; + offset = 0; + } else { + offset -= iov[i].iov_len; + } + } + assert(offset == 0); + return done; +} + +size_t iov_memset(const struct iovec *iov, const unsigned int iov_cnt, + size_t offset, int fillc, size_t bytes) +{ + size_t done; + unsigned int i; + for (i = 0, done = 0; (offset || done < bytes) && i < iov_cnt; i++) { + if (offset < iov[i].iov_len) { + size_t len = MIN(iov[i].iov_len - offset, bytes - done); + memset(iov[i].iov_base + offset, fillc, len); + done += len; + offset = 0; + } else { + offset -= iov[i].iov_len; + } + } + assert(offset == 0); + return done; +} + +size_t iov_size(const struct iovec *iov, const unsigned int iov_cnt) +{ + size_t len; + unsigned int i; + + len = 0; + for (i = 0; i < iov_cnt; i++) { + len += iov[i].iov_len; + } + return len; +} + +/* helper function for iov_send_recv() */ +static ssize_t +do_send_recv(int sockfd, struct iovec *iov, unsigned iov_cnt, bool do_send) +{ +#ifdef CONFIG_POSIX + ssize_t ret; + struct msghdr msg; + memset(&msg, 0, sizeof(msg)); + msg.msg_iov = iov; + msg.msg_iovlen = iov_cnt; + do { + ret = do_send + ? sendmsg(sockfd, &msg, 0) + : recvmsg(sockfd, &msg, 0); + } while (ret < 0 && errno == EINTR); + return ret; +#else + /* else send piece-by-piece */ + /*XXX Note: windows has WSASend() and WSARecv() */ + unsigned i = 0; + ssize_t ret = 0; + while (i < iov_cnt) { + ssize_t r = do_send + ? send(sockfd, iov[i].iov_base, iov[i].iov_len, 0) + : recv(sockfd, iov[i].iov_base, iov[i].iov_len, 0); + if (r > 0) { + ret += r; + } else if (!r) { + break; + } else if (errno == EINTR) { + continue; + } else { + /* else it is some "other" error, + * only return if there was no data processed. */ + if (ret == 0) { + ret = -1; + } + break; + } + i++; + } + return ret; +#endif +} + +ssize_t iov_send_recv(int sockfd, const struct iovec *_iov, unsigned iov_cnt, + size_t offset, size_t bytes, + bool do_send) +{ + ssize_t total = 0; + ssize_t ret; + size_t orig_len, tail; + unsigned niov; + struct iovec *local_iov, *iov; + + if (bytes <= 0) { + return 0; + } + + local_iov = g_new0(struct iovec, iov_cnt); + iov_copy(local_iov, iov_cnt, _iov, iov_cnt, offset, bytes); + offset = 0; + iov = local_iov; + + while (bytes > 0) { + /* Find the start position, skipping `offset' bytes: + * first, skip all full-sized vector elements, */ + for (niov = 0; niov < iov_cnt && offset >= iov[niov].iov_len; ++niov) { + offset -= iov[niov].iov_len; + } + + /* niov == iov_cnt would only be valid if bytes == 0, which + * we already ruled out in the loop condition. 
*/ + assert(niov < iov_cnt); + iov += niov; + iov_cnt -= niov; + + if (offset) { + /* second, skip `offset' bytes from the (now) first element, + * undo it on exit */ + iov[0].iov_base += offset; + iov[0].iov_len -= offset; + } + /* Find the end position skipping `bytes' bytes: */ + /* first, skip all full-sized elements */ + tail = bytes; + for (niov = 0; niov < iov_cnt && iov[niov].iov_len <= tail; ++niov) { + tail -= iov[niov].iov_len; + } + if (tail) { + /* second, fixup the last element, and remember the original + * length */ + assert(niov < iov_cnt); + assert(iov[niov].iov_len > tail); + orig_len = iov[niov].iov_len; + iov[niov++].iov_len = tail; + ret = do_send_recv(sockfd, iov, niov, do_send); + /* Undo the changes above before checking for errors */ + iov[niov-1].iov_len = orig_len; + } else { + ret = do_send_recv(sockfd, iov, niov, do_send); + } + if (offset) { + iov[0].iov_base -= offset; + iov[0].iov_len += offset; + } + + if (ret < 0) { + assert(errno != EINTR); + g_free(local_iov); + if (errno == EAGAIN && total > 0) { + return total; + } + return -1; + } + + if (ret == 0 && !do_send) { + /* recv returns 0 when the peer has performed an orderly + * shutdown. */ + break; + } + + /* Prepare for the next iteration */ + offset += ret; + total += ret; + bytes -= ret; + } + + g_free(local_iov); + return total; +} + + +void iov_hexdump(const struct iovec *iov, const unsigned int iov_cnt, + FILE *fp, const char *prefix, size_t limit) +{ + int v; + size_t size = 0; + char *buf; + + for (v = 0; v < iov_cnt; v++) { + size += iov[v].iov_len; + } + size = size > limit ? limit : size; + buf = g_malloc(size); + iov_to_buf(iov, iov_cnt, 0, buf, size); + qemu_hexdump(fp, prefix, buf, size); + g_free(buf); +} + +unsigned iov_copy(struct iovec *dst_iov, unsigned int dst_iov_cnt, + const struct iovec *iov, unsigned int iov_cnt, + size_t offset, size_t bytes) +{ + size_t len; + unsigned int i, j; + for (i = 0, j = 0; + i < iov_cnt && j < dst_iov_cnt && (offset || bytes); i++) { + if (offset >= iov[i].iov_len) { + offset -= iov[i].iov_len; + continue; + } + len = MIN(bytes, iov[i].iov_len - offset); + + dst_iov[j].iov_base = iov[i].iov_base + offset; + dst_iov[j].iov_len = len; + j++; + bytes -= len; + offset = 0; + } + assert(offset == 0); + return j; +} + +/* io vectors */ + +void qemu_iovec_init(QEMUIOVector *qiov, int alloc_hint) +{ + qiov->iov = g_new(struct iovec, alloc_hint); + qiov->niov = 0; + qiov->nalloc = alloc_hint; + qiov->size = 0; +} + +void qemu_iovec_init_external(QEMUIOVector *qiov, struct iovec *iov, int niov) +{ + int i; + + qiov->iov = iov; + qiov->niov = niov; + qiov->nalloc = -1; + qiov->size = 0; + for (i = 0; i < niov; i++) + qiov->size += iov[i].iov_len; +} + +void qemu_iovec_add(QEMUIOVector *qiov, void *base, size_t len) +{ + assert(qiov->nalloc != -1); + + if (qiov->niov == qiov->nalloc) { + qiov->nalloc = 2 * qiov->nalloc + 1; + qiov->iov = g_renew(struct iovec, qiov->iov, qiov->nalloc); + } + qiov->iov[qiov->niov].iov_base = base; + qiov->iov[qiov->niov].iov_len = len; + qiov->size += len; + ++qiov->niov; +} + +/* + * Concatenates (partial) iovecs from src_iov to the end of dst. + * It starts copying after skipping `soffset' bytes at the + * beginning of src and adds individual vectors from src to + * dst copies up to `sbytes' bytes total, or up to the end + * of src_iov if it comes first. This way, it is okay to specify + * very large value for `sbytes' to indicate "up to the end + * of src". + * Only vector pointers are processed, not the actual data buffers. 
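+ *
+ * A usage sketch (hypothetical sizes): reference 512 bytes of src,
+ * skipping its first 64 bytes, without copying any data:
+ *
+ *     size_t done = qemu_iovec_concat_iov(dst, src->iov, src->niov,
+ *                                         64, 512);
+ *     // done < 512 if src ended first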
+ */ +size_t qemu_iovec_concat_iov(QEMUIOVector *dst, + struct iovec *src_iov, unsigned int src_cnt, + size_t soffset, size_t sbytes) +{ + int i; + size_t done; + + if (!sbytes) { + return 0; + } + assert(dst->nalloc != -1); + for (i = 0, done = 0; done < sbytes && i < src_cnt; i++) { + if (soffset < src_iov[i].iov_len) { + size_t len = MIN(src_iov[i].iov_len - soffset, sbytes - done); + qemu_iovec_add(dst, src_iov[i].iov_base + soffset, len); + done += len; + soffset = 0; + } else { + soffset -= src_iov[i].iov_len; + } + } + assert(soffset == 0); /* offset beyond end of src */ + + return done; +} + +/* + * Concatenates (partial) iovecs from src to the end of dst. + * It starts copying after skipping `soffset' bytes at the + * beginning of src and adds individual vectors from src to + * dst copies up to `sbytes' bytes total, or up to the end + * of src if it comes first. This way, it is okay to specify + * very large value for `sbytes' to indicate "up to the end + * of src". + * Only vector pointers are processed, not the actual data buffers. + */ +void qemu_iovec_concat(QEMUIOVector *dst, + QEMUIOVector *src, size_t soffset, size_t sbytes) +{ + qemu_iovec_concat_iov(dst, src->iov, src->niov, soffset, sbytes); +} + +/* + * qiov_find_iov + * + * Return pointer to iovec structure, where byte at @offset in original vector + * @iov exactly is. + * Set @remaining_offset to be offset inside that iovec to the same byte. + */ +static struct iovec *iov_skip_offset(struct iovec *iov, size_t offset, + size_t *remaining_offset) +{ + while (offset > 0 && offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + } + *remaining_offset = offset; + + return iov; +} + +/* + * qiov_slice + * + * Find subarray of iovec's, containing requested range. @head would + * be offset in first iov (returned by the function), @tail would be + * count of extra bytes in last iovec (returned iov + @niov - 1). + */ +static struct iovec *qiov_slice(QEMUIOVector *qiov, + size_t offset, size_t len, + size_t *head, size_t *tail, int *niov) +{ + struct iovec *iov, *end_iov; + + assert(offset + len <= qiov->size); + + iov = iov_skip_offset(qiov->iov, offset, head); + end_iov = iov_skip_offset(iov, *head + len, tail); + + if (*tail > 0) { + assert(*tail < end_iov->iov_len); + *tail = end_iov->iov_len - *tail; + end_iov++; + } + + *niov = end_iov - iov; + + return iov; +} + +int qemu_iovec_subvec_niov(QEMUIOVector *qiov, size_t offset, size_t len) +{ + size_t head, tail; + int niov; + + qiov_slice(qiov, offset, len, &head, &tail, &niov); + + return niov; +} + +/* + * Compile new iovec, combining @head_buf buffer, sub-qiov of @mid_qiov, + * and @tail_buf buffer into new qiov. 
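+ *
+ * A sketch of a typical call (hypothetical bounce buffers padding an
+ * unaligned request):
+ *
+ *     if (qemu_iovec_init_extended(&padded, head, head_len,
+ *                                  qiov, off, len, tail, tail_len) < 0) {
+ *         // length overflow or more than IOV_MAX elements
+ *     }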
+ */
+int qemu_iovec_init_extended(
+        QEMUIOVector *qiov,
+        void *head_buf, size_t head_len,
+        QEMUIOVector *mid_qiov, size_t mid_offset, size_t mid_len,
+        void *tail_buf, size_t tail_len)
+{
+    size_t mid_head, mid_tail;
+    int total_niov, mid_niov = 0;
+    struct iovec *p, *mid_iov = NULL;
+
+    assert(mid_qiov->niov <= IOV_MAX);
+
+    if (SIZE_MAX - head_len < mid_len ||
+        SIZE_MAX - head_len - mid_len < tail_len)
+    {
+        return -EINVAL;
+    }
+
+    if (mid_len) {
+        mid_iov = qiov_slice(mid_qiov, mid_offset, mid_len,
+                             &mid_head, &mid_tail, &mid_niov);
+    }
+
+    total_niov = !!head_len + mid_niov + !!tail_len;
+    if (total_niov > IOV_MAX) {
+        return -EINVAL;
+    }
+
+    if (total_niov == 1) {
+        qemu_iovec_init_buf(qiov, NULL, 0);
+        p = &qiov->local_iov;
+    } else {
+        qiov->niov = qiov->nalloc = total_niov;
+        qiov->size = head_len + mid_len + tail_len;
+        p = qiov->iov = g_new(struct iovec, qiov->niov);
+    }
+
+    if (head_len) {
+        p->iov_base = head_buf;
+        p->iov_len = head_len;
+        p++;
+    }
+
+    assert(!mid_niov == !mid_len);
+    if (mid_niov) {
+        memcpy(p, mid_iov, mid_niov * sizeof(*p));
+        p[0].iov_base = (uint8_t *)p[0].iov_base + mid_head;
+        p[0].iov_len -= mid_head;
+        p[mid_niov - 1].iov_len -= mid_tail;
+        p += mid_niov;
+    }
+
+    if (tail_len) {
+        p->iov_base = tail_buf;
+        p->iov_len = tail_len;
+    }
+
+    return 0;
+}
+
+/*
+ * Check if the contents of a subrange of qiov data are all zeroes.
+ */
+bool qemu_iovec_is_zero(QEMUIOVector *qiov, size_t offset, size_t bytes)
+{
+    struct iovec *iov;
+    size_t current_offset;
+
+    assert(offset + bytes <= qiov->size);
+
+    iov = iov_skip_offset(qiov->iov, offset, &current_offset);
+
+    while (bytes) {
+        uint8_t *base = (uint8_t *)iov->iov_base + current_offset;
+        size_t len = MIN(iov->iov_len - current_offset, bytes);
+
+        if (!buffer_is_zero(base, len)) {
+            return false;
+        }
+
+        current_offset = 0;
+        bytes -= len;
+        iov++;
+    }
+
+    return true;
+}
+
+void qemu_iovec_init_slice(QEMUIOVector *qiov, QEMUIOVector *source,
+                           size_t offset, size_t len)
+{
+    int ret;
+
+    assert(source->size >= len);
+    assert(source->size - len >= offset);
+
+    /* We shrink the request, so neither size_t nor IOV_MAX can overflow */
+    ret = qemu_iovec_init_extended(qiov, NULL, 0, source, offset, len, NULL, 0);
+    assert(ret == 0);
+}
+
+void qemu_iovec_destroy(QEMUIOVector *qiov)
+{
+    if (qiov->nalloc != -1) {
+        g_free(qiov->iov);
+    }
+
+    memset(qiov, 0, sizeof(*qiov));
+}
+
+void qemu_iovec_reset(QEMUIOVector *qiov)
+{
+    assert(qiov->nalloc != -1);
+
+    qiov->niov = 0;
+    qiov->size = 0;
+}
+
+size_t qemu_iovec_to_buf(QEMUIOVector *qiov, size_t offset,
+                         void *buf, size_t bytes)
+{
+    return iov_to_buf(qiov->iov, qiov->niov, offset, buf, bytes);
+}
+
+size_t qemu_iovec_from_buf(QEMUIOVector *qiov, size_t offset,
+                           const void *buf, size_t bytes)
+{
+    return iov_from_buf(qiov->iov, qiov->niov, offset, buf, bytes);
+}
+
+size_t qemu_iovec_memset(QEMUIOVector *qiov, size_t offset,
+                         int fillc, size_t bytes)
+{
+    return iov_memset(qiov->iov, qiov->niov, offset, fillc, bytes);
+}
+
+/**
+ * Check that I/O vector contents are identical
+ *
+ * The IO vectors must have the same structure (same length of all parts).
+ * A typical usage is to compare vectors created with qemu_iovec_clone().
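+ * For example (hypothetical vectors):
+ *
+ *     ssize_t off = qemu_iovec_compare(&a, &b);
+ *     if (off != -1) {
+ *         // buffers differ, first mismatch at byte offset off
+ *     }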
+ * + * @a: I/O vector + * @b: I/O vector + * @ret: Offset to first mismatching byte or -1 if match + */ +ssize_t qemu_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) +{ + int i; + ssize_t offset = 0; + + assert(a->niov == b->niov); + for (i = 0; i < a->niov; i++) { + size_t len = 0; + uint8_t *p = (uint8_t *)a->iov[i].iov_base; + uint8_t *q = (uint8_t *)b->iov[i].iov_base; + + assert(a->iov[i].iov_len == b->iov[i].iov_len); + while (len < a->iov[i].iov_len && *p++ == *q++) { + len++; + } + + offset += len; + + if (len != a->iov[i].iov_len) { + return offset; + } + } + return -1; +} + +typedef struct { + int src_index; + struct iovec *src_iov; + void *dest_base; +} IOVectorSortElem; + +static int sortelem_cmp_src_base(const void *a, const void *b) +{ + const IOVectorSortElem *elem_a = a; + const IOVectorSortElem *elem_b = b; + + /* Don't overflow */ + if (elem_a->src_iov->iov_base < elem_b->src_iov->iov_base) { + return -1; + } else if (elem_a->src_iov->iov_base > elem_b->src_iov->iov_base) { + return 1; + } else { + return 0; + } +} + +static int sortelem_cmp_src_index(const void *a, const void *b) +{ + const IOVectorSortElem *elem_a = a; + const IOVectorSortElem *elem_b = b; + + return elem_a->src_index - elem_b->src_index; +} + +/** + * Copy contents of I/O vector + * + * The relative relationships of overlapping iovecs are preserved. This is + * necessary to ensure identical semantics in the cloned I/O vector. + */ +void qemu_iovec_clone(QEMUIOVector *dest, const QEMUIOVector *src, void *buf) +{ + IOVectorSortElem sortelems[src->niov]; + void *last_end; + int i; + + /* Sort by source iovecs by base address */ + for (i = 0; i < src->niov; i++) { + sortelems[i].src_index = i; + sortelems[i].src_iov = &src->iov[i]; + } + qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_base); + + /* Allocate buffer space taking into account overlapping iovecs */ + last_end = NULL; + for (i = 0; i < src->niov; i++) { + struct iovec *cur = sortelems[i].src_iov; + ptrdiff_t rewind = 0; + + /* Detect overlap */ + if (last_end && last_end > cur->iov_base) { + rewind = last_end - cur->iov_base; + } + + sortelems[i].dest_base = buf - rewind; + buf += cur->iov_len - MIN(rewind, cur->iov_len); + last_end = MAX(cur->iov_base + cur->iov_len, last_end); + } + + /* Sort by source iovec index and build destination iovec */ + qsort(sortelems, src->niov, sizeof(sortelems[0]), sortelem_cmp_src_index); + for (i = 0; i < src->niov; i++) { + qemu_iovec_add(dest, sortelems[i].dest_base, src->iov[i].iov_len); + } +} + +void iov_discard_undo(IOVDiscardUndo *undo) +{ + /* Restore original iovec if it was modified */ + if (undo->modified_iov) { + *undo->modified_iov = undo->orig; + } +} + +size_t iov_discard_front_undoable(struct iovec **iov, + unsigned int *iov_cnt, + size_t bytes, + IOVDiscardUndo *undo) +{ + size_t total = 0; + struct iovec *cur; + + if (undo) { + undo->modified_iov = NULL; + } + + for (cur = *iov; *iov_cnt > 0; cur++) { + if (cur->iov_len > bytes) { + if (undo) { + undo->modified_iov = cur; + undo->orig = *cur; + } + + cur->iov_base += bytes; + cur->iov_len -= bytes; + total += bytes; + break; + } + + bytes -= cur->iov_len; + total += cur->iov_len; + *iov_cnt -= 1; + } + + *iov = cur; + return total; +} + +size_t iov_discard_front(struct iovec **iov, unsigned int *iov_cnt, + size_t bytes) +{ + return iov_discard_front_undoable(iov, iov_cnt, bytes, NULL); +} + +size_t iov_discard_back_undoable(struct iovec *iov, + unsigned int *iov_cnt, + size_t bytes, + IOVDiscardUndo *undo) +{ + size_t 
total = 0; + struct iovec *cur; + + if (undo) { + undo->modified_iov = NULL; + } + + if (*iov_cnt == 0) { + return 0; + } + + cur = iov + (*iov_cnt - 1); + + while (*iov_cnt > 0) { + if (cur->iov_len > bytes) { + if (undo) { + undo->modified_iov = cur; + undo->orig = *cur; + } + + cur->iov_len -= bytes; + total += bytes; + break; + } + + bytes -= cur->iov_len; + total += cur->iov_len; + cur--; + *iov_cnt -= 1; + } + + return total; +} + +size_t iov_discard_back(struct iovec *iov, unsigned int *iov_cnt, + size_t bytes) +{ + return iov_discard_back_undoable(iov, iov_cnt, bytes, NULL); +} + +void qemu_iovec_discard_back(QEMUIOVector *qiov, size_t bytes) +{ + size_t total; + unsigned int niov = qiov->niov; + + assert(qiov->size >= bytes); + total = iov_discard_back(qiov->iov, &niov, bytes); + assert(total == bytes); + + qiov->niov = niov; + qiov->size -= bytes; +} diff --git a/util/iova-tree.c b/util/iova-tree.c new file mode 100644 index 000000000..23ea35b7a --- /dev/null +++ b/util/iova-tree.c @@ -0,0 +1,114 @@ +/* + * IOVA tree implementation based on GTree. + * + * Copyright 2018 Red Hat, Inc. + * + * Authors: + * Peter Xu <peterx@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + */ + +#include "qemu/osdep.h" +#include "qemu/iova-tree.h" + +struct IOVATree { + GTree *tree; +}; + +static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data) +{ + const DMAMap *m1 = a, *m2 = b; + + if (m1->iova > m2->iova + m2->size) { + return 1; + } + + if (m1->iova + m1->size < m2->iova) { + return -1; + } + + /* Overlapped */ + return 0; +} + +IOVATree *iova_tree_new(void) +{ + IOVATree *iova_tree = g_new0(IOVATree, 1); + + /* We don't have values actually, no need to free */ + iova_tree->tree = g_tree_new_full(iova_tree_compare, NULL, g_free, NULL); + + return iova_tree; +} + +const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map) +{ + return g_tree_lookup(tree->tree, map); +} + +const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova) +{ + const DMAMap map = { .iova = iova, .size = 0 }; + + return iova_tree_find(tree, &map); +} + +static inline void iova_tree_insert_internal(GTree *gtree, DMAMap *range) +{ + /* Key and value are sharing the same range data */ + g_tree_insert(gtree, range, range); +} + +int iova_tree_insert(IOVATree *tree, const DMAMap *map) +{ + DMAMap *new; + + if (map->iova + map->size < map->iova || map->perm == IOMMU_NONE) { + return IOVA_ERR_INVALID; + } + + /* We don't allow to insert range that overlaps with existings */ + if (iova_tree_find(tree, map)) { + return IOVA_ERR_OVERLAP; + } + + new = g_new0(DMAMap, 1); + memcpy(new, map, sizeof(*new)); + iova_tree_insert_internal(tree->tree, new); + + return IOVA_OK; +} + +static gboolean iova_tree_traverse(gpointer key, gpointer value, + gpointer data) +{ + iova_tree_iterator iterator = data; + DMAMap *map = key; + + g_assert(key == value); + + return iterator(map); +} + +void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator) +{ + g_tree_foreach(tree->tree, iova_tree_traverse, iterator); +} + +int iova_tree_remove(IOVATree *tree, const DMAMap *map) +{ + const DMAMap *overlap; + + while ((overlap = iova_tree_find(tree, map))) { + g_tree_remove(tree->tree, overlap); + } + + return IOVA_OK; +} + +void iova_tree_destroy(IOVATree *tree) +{ + g_tree_destroy(tree->tree); + g_free(tree); +} diff --git a/util/keyval.c b/util/keyval.c new file mode 100644 index 000000000..904337c8a --- /dev/null +++ b/util/keyval.c @@ -0,0 +1,577 @@ +/* + 
* Parsing KEY=VALUE,... strings + * + * Copyright (C) 2017 Red Hat Inc. + * + * Authors: + * Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +/* + * KEY=VALUE,... syntax: + * + * key-vals = [ key-val { ',' key-val } [ ',' ] ] + * key-val = key '=' val | help + * key = key-fragment { '.' key-fragment } + * key-fragment = / [^=,.]+ / + * val = { / [^,]+ / | ',,' } + * help = 'help' | '?' + * + * Semantics defined by reduction to JSON: + * + * key-vals specifies a JSON object, i.e. a tree whose root is an + * object, inner nodes other than the root are objects or arrays, + * and leaves are strings. + * + * Each key-val = key-fragment '.' ... '=' val specifies a path from + * root to a leaf (left of '='), and the leaf's value (right of + * '='). + * + * A path from the root is defined recursively: + * L '.' key-fragment is a child of the node denoted by path L + * key-fragment is a child of the tree root + * If key-fragment is numeric, the parent is an array and the child + * is its key-fragment-th member, counting from zero. + * Else, the parent is an object, and the child is its member named + * key-fragment. + * + * This constrains inner nodes to be either array or object. The + * constraints must be satisfiable. Counter-example: a.b=1,a=2 is + * not, because root.a must be an object to satisfy a.b=1 and a + * string to satisfy a=2. + * + * Array subscripts can occur in any order, but the set of + * subscripts must not have gaps. For instance, a.1=v is not okay, + * because root.a[0] is missing. + * + * If multiple key-val denote the same leaf, the last one determines + * the value. + * + * Key-fragments must be valid QAPI names or consist only of decimal + * digits. + * + * The length of any key-fragment must be between 1 and 127. + * + * If any key-val is help, the object is to be treated as a help + * request. + * + * Design flaw: there is no way to denote an empty array or non-root + * object. While interpreting "key absent" as empty seems natural + * (removing a key-val from the input string removes the member when + * there are more, so why not when it's the last), it doesn't work: + * "key absent" already means "optional object/array absent", which + * isn't the same as "empty object/array present". + * + * Design flaw: scalar values can only be strings; there is no way to + * denote numbers, true, false or null. The special QObject input + * visitor returned by qobject_input_visitor_new_keyval() mostly hides + * this by automatically converting strings to the type the visitor + * expects. Breaks down for type 'any', where the visitor's + * expectation isn't clear. Code visiting 'any' needs to do the + * conversion itself, but only when using this keyval visitor. + * Awkward. Note that we carefully restrict alternate types to avoid + * similar ambiguity. + * + * Alternative syntax for use with an implied key: + * + * key-vals = [ key-val-1st { ',' key-val } [ ',' ] ] + * key-val-1st = val-no-key | key-val + * val-no-key = / [^=,]+ / - help + * + * where val-no-key is syntactic sugar for implied-key=val-no-key. + * + * Note that you can't use the sugared form when the value contains + * '=' or ','. 
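+ *
+ * For illustration (hypothetical keys), the string
+ *
+ *     size=1G,drive.file=disk.img,drive.cache.direct=on
+ *
+ * specifies the JSON object
+ *
+ *     {"size": "1G",
+ *      "drive": {"file": "disk.img", "cache": {"direct": "on"}}}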
+ */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qlist.h" +#include "qapi/qmp/qstring.h" +#include "qemu/cutils.h" +#include "qemu/help_option.h" +#include "qemu/option.h" + +/* + * Convert @key to a list index. + * Convert all leading decimal digits to a (non-negative) number, + * capped at INT_MAX. + * If @end is non-null, assign a pointer to the first character after + * the number to *@end. + * Else, fail if any characters follow. + * On success, return the converted number. + * On failure, return a negative value. + * Note: since only digits are converted, no two keys can map to the + * same number, except by overflow to INT_MAX. + */ +static int key_to_index(const char *key, const char **end) +{ + int ret; + unsigned long index; + + if (*key < '0' || *key > '9') { + return -EINVAL; + } + ret = qemu_strtoul(key, end, 10, &index); + if (ret) { + return ret == -ERANGE ? INT_MAX : ret; + } + return index <= INT_MAX ? index : INT_MAX; +} + +/* + * Ensure @cur maps @key_in_cur the right way. + * If @value is null, it needs to map to a QDict, else to this + * QString. + * If @cur doesn't have @key_in_cur, put an empty QDict or @value, + * respectively. + * Else, if it needs to map to a QDict, and already does, do nothing. + * Else, if it needs to map to this QString, and already maps to a + * QString, replace it by @value. + * Else, fail because we have conflicting needs on how to map + * @key_in_cur. + * In any case, take over the reference to @value, i.e. if the caller + * wants to hold on to a reference, it needs to qobject_ref(). + * Use @key up to @key_cursor to identify the key in error messages. + * On success, return the mapped value. + * On failure, store an error through @errp and return NULL. + */ +static QObject *keyval_parse_put(QDict *cur, + const char *key_in_cur, QString *value, + const char *key, const char *key_cursor, + Error **errp) +{ + QObject *old, *new; + + old = qdict_get(cur, key_in_cur); + if (old) { + if (qobject_type(old) != (value ? QTYPE_QSTRING : QTYPE_QDICT)) { + error_setg(errp, "Parameters '%.*s.*' used inconsistently", + (int)(key_cursor - key), key); + qobject_unref(value); + return NULL; + } + if (!value) { + return old; /* already QDict, do nothing */ + } + new = QOBJECT(value); /* replacement */ + } else { + new = value ? QOBJECT(value) : QOBJECT(qdict_new()); + } + qdict_put_obj(cur, key_in_cur, new); + return new; +} + +/* + * Parse one parameter from @params. + * + * If we're looking at KEY=VALUE, store result in @qdict. + * The first fragment of KEY applies to @qdict. Subsequent fragments + * apply to nested QDicts, which are created on demand. @implied_key + * is as in keyval_parse(). + * + * If we're looking at "help" or "?", set *help to true. + * + * On success, return a pointer to the next parameter, or else to '\0'. + * On failure, return NULL. 
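+ *
+ * For example (hypothetical input), parsing "a.b=1,c=2" stores
+ * {"a": {"b": "1"}} into @qdict and returns a pointer to "c=2".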
+ */ +static const char *keyval_parse_one(QDict *qdict, const char *params, + const char *implied_key, bool *help, + Error **errp) +{ + const char *key, *key_end, *val_end, *s, *end; + size_t len; + char key_in_cur[128]; + QDict *cur; + int ret; + QObject *next; + GString *val; + + key = params; + val_end = NULL; + len = strcspn(params, "=,"); + if (len && key[len] != '=') { + if (starts_with_help_option(key) == len) { + *help = true; + s = key + len; + if (*s == ',') { + s++; + } + return s; + } + if (implied_key) { + /* Desugar implied key */ + key = implied_key; + val_end = params + len; + len = strlen(implied_key); + } + } + key_end = key + len; + + /* + * Loop over key fragments: @s points to current fragment, it + * applies to @cur. @key_in_cur[] holds the previous fragment. + */ + cur = qdict; + s = key; + for (;;) { + /* Want a key index (unless it's first) or a QAPI name */ + if (s != key && key_to_index(s, &end) >= 0) { + len = end - s; + } else { + ret = parse_qapi_name(s, false); + len = ret < 0 ? 0 : ret; + } + assert(s + len <= key_end); + if (!len || (s + len < key_end && s[len] != '.')) { + assert(key != implied_key); + error_setg(errp, "Invalid parameter '%.*s'", + (int)(key_end - key), key); + return NULL; + } + if (len >= sizeof(key_in_cur)) { + assert(key != implied_key); + error_setg(errp, "Parameter%s '%.*s' is too long", + s != key || s + len != key_end ? " fragment" : "", + (int)len, s); + return NULL; + } + + if (s != key) { + next = keyval_parse_put(cur, key_in_cur, NULL, + key, s - 1, errp); + if (!next) { + return NULL; + } + cur = qobject_to(QDict, next); + assert(cur); + } + + memcpy(key_in_cur, s, len); + key_in_cur[len] = 0; + s += len; + + if (*s != '.') { + break; + } + s++; + } + + if (key == implied_key) { + assert(!*s); + val = g_string_new_len(params, val_end - params); + s = val_end; + if (*s == ',') { + s++; + } + } else { + if (*s != '=') { + error_setg(errp, "Expected '=' after parameter '%.*s'", + (int)(s - key), key); + return NULL; + } + s++; + + val = g_string_new(NULL); + for (;;) { + if (!*s) { + break; + } else if (*s == ',') { + s++; + if (*s != ',') { + break; + } + } + g_string_append_c(val, *s++); + } + } + + if (!keyval_parse_put(cur, key_in_cur, qstring_from_gstring(val), + key, key_end, errp)) { + return NULL; + } + return s; +} + +static char *reassemble_key(GSList *key) +{ + GString *s = g_string_new(""); + GSList *p; + + for (p = key; p; p = p->next) { + g_string_prepend_c(s, '.'); + g_string_prepend(s, (char *)p->data); + } + + return g_string_free(s, FALSE); +} + +/* + * Recursive worker for keyval_merge. + * + * @str is the path that led to the * current dictionary (to be used for + * error messages). It is modified internally but restored before the + * function returns. + */ +static void keyval_do_merge(QDict *dest, const QDict *merged, GString *str, Error **errp) +{ + size_t save_len = str->len; + const QDictEntry *ent; + QObject *old_value; + + for (ent = qdict_first(merged); ent; ent = qdict_next(merged, ent)) { + old_value = qdict_get(dest, ent->key); + if (old_value) { + if (qobject_type(old_value) != qobject_type(ent->value)) { + error_setg(errp, "Parameter '%s%s' used inconsistently", + str->str, ent->key); + return; + } else if (qobject_type(ent->value) == QTYPE_QDICT) { + /* Merge sub-dictionaries. 
*/ + g_string_append(str, ent->key); + g_string_append_c(str, '.'); + keyval_do_merge(qobject_to(QDict, old_value), + qobject_to(QDict, ent->value), + str, errp); + g_string_truncate(str, save_len); + continue; + } else if (qobject_type(ent->value) == QTYPE_QLIST) { + /* Append to old list. */ + QList *old = qobject_to(QList, old_value); + QList *new = qobject_to(QList, ent->value); + const QListEntry *item; + QLIST_FOREACH_ENTRY(new, item) { + qobject_ref(item->value); + qlist_append_obj(old, item->value); + } + continue; + } else { + assert(qobject_type(ent->value) == QTYPE_QSTRING); + } + } + + qobject_ref(ent->value); + qdict_put_obj(dest, ent->key, ent->value); + } +} + +/* Merge the @merged dictionary into @dest. + * + * The dictionaries are expected to be returned by the keyval parser, and + * therefore the only expected scalar type is the string. In case the same + * path is present in both @dest and @merged, the semantics are as follows: + * + * - lists are concatenated + * + * - dictionaries are merged recursively + * + * - for scalar values, @merged wins + * + * In case an error is reported, @dest may already have been modified. + * + * This function can be used to implement semantics analogous to QemuOpts's + * .merge_lists = true case, or to implement -set for options backed by QDicts. + * + * Note: while QemuOpts is commonly used so that repeated keys overwrite + * ("last one wins"), it can also be used so that repeated keys build up + * a list. keyval_merge() can only be used when the options' semantics are + * the former, not the latter. + */ +void keyval_merge(QDict *dest, const QDict *merged, Error **errp) +{ + GString *str; + + str = g_string_new(""); + keyval_do_merge(dest, merged, str, errp); + g_string_free(str, TRUE); +} + +/* + * Listify @cur recursively. + * Replace QDicts whose keys are all valid list indexes by QLists. + * @key_of_cur is the list of key fragments leading up to @cur. + * On success, return either @cur or its replacement. + * On failure, store an error through @errp and return NULL. + */ +static QObject *keyval_listify(QDict *cur, GSList *key_of_cur, Error **errp) +{ + GSList key_node; + bool has_index, has_member; + const QDictEntry *ent; + QDict *qdict; + QObject *val; + char *key; + size_t nelt; + QObject **elt; + int index, max_index, i; + QList *list; + + key_node.next = key_of_cur; + + /* + * Recursively listify @cur's members, and figure out whether @cur + * itself is to be listified. + */ + has_index = false; + has_member = false; + for (ent = qdict_first(cur); ent; ent = qdict_next(cur, ent)) { + if (key_to_index(ent->key, NULL) >= 0) { + has_index = true; + } else { + has_member = true; + } + + qdict = qobject_to(QDict, ent->value); + if (!qdict) { + continue; + } + + key_node.data = ent->key; + val = keyval_listify(qdict, &key_node, errp); + if (!val) { + return NULL; + } + if (val != ent->value) { + qdict_put_obj(cur, ent->key, val); + } + } + + if (has_index && has_member) { + key = reassemble_key(key_of_cur); + error_setg(errp, "Parameters '%s*' used inconsistently", key); + g_free(key); + return NULL; + } + if (!has_index) { + return QOBJECT(cur); + } + + /* Copy @cur's values to @elt[] */ + nelt = qdict_size(cur) + 1; /* one extra, for use as sentinel */ + elt = g_new0(QObject *, nelt); + max_index = -1; + for (ent = qdict_first(cur); ent; ent = qdict_next(cur, ent)) { + index = key_to_index(ent->key, NULL); + assert(index >= 0); + if (index > max_index) { + max_index = index; + } + /* + * We iterate @nelt times. 
If we get one exceeding @nelt + * here, we will put less than @nelt values into @elt[], + * triggering the error in the next loop. + */ + if ((size_t)index >= nelt - 1) { + continue; + } + /* Even though dict keys are distinct, indexes need not be */ + elt[index] = ent->value; + } + + /* + * Make a list from @elt[], reporting the first missing element, + * if any. + * If we dropped an index >= nelt in the previous loop, this loop + * will run into the sentinel and report index @nelt missing. + */ + list = qlist_new(); + assert(!elt[nelt-1]); /* need the sentinel to be null */ + for (i = 0; i < MIN(nelt, max_index + 1); i++) { + if (!elt[i]) { + key = reassemble_key(key_of_cur); + error_setg(errp, "Parameter '%s%d' missing", key, i); + g_free(key); + g_free(elt); + qobject_unref(list); + return NULL; + } + qobject_ref(elt[i]); + qlist_append_obj(list, elt[i]); + } + + g_free(elt); + return QOBJECT(list); +} + +/* + * Parse @params in QEMU's traditional KEY=VALUE,... syntax. + * + * If @implied_key, the first KEY= can be omitted. @implied_key is + * implied then, and VALUE can't be empty or contain ',' or '='. + * + * A parameter "help" or "?" without a value isn't added to the + * resulting dictionary, but instead is interpreted as help request. + * All other options are parsed and returned normally so that context + * specific help can be printed. + * + * If @p_help is not NULL, store whether help is requested there. + * If @p_help is NULL and help is requested, fail. + * + * On success, return @dict, now filled with the parsed keys and values. + * + * On failure, store an error through @errp and return NULL. Any keys + * and values parsed so far will be in @dict nevertheless. + */ +QDict *keyval_parse_into(QDict *qdict, const char *params, const char *implied_key, + bool *p_help, Error **errp) +{ + QObject *listified; + const char *s; + bool help = false; + + s = params; + while (*s) { + s = keyval_parse_one(qdict, s, implied_key, &help, errp); + if (!s) { + return NULL; + } + implied_key = NULL; + } + + if (p_help) { + *p_help = help; + } else if (help) { + error_setg(errp, "Help is not available for this option"); + return NULL; + } + + listified = keyval_listify(qdict, NULL, errp); + if (!listified) { + return NULL; + } + assert(listified == QOBJECT(qdict)); + return qdict; +} + +/* + * Parse @params in QEMU's traditional KEY=VALUE,... syntax. + * + * If @implied_key, the first KEY= can be omitted. @implied_key is + * implied then, and VALUE can't be empty or contain ',' or '='. + * + * A parameter "help" or "?" without a value isn't added to the + * resulting dictionary, but instead is interpreted as help request. + * All other options are parsed and returned normally so that context + * specific help can be printed. + * + * If @p_help is not NULL, store whether help is requested there. + * If @p_help is NULL and help is requested, fail. + * + * On success, return a dictionary of the parsed keys and values. + * On failure, store an error through @errp and return NULL. + */ +QDict *keyval_parse(const char *params, const char *implied_key, + bool *p_help, Error **errp) +{ + QDict *qdict = qdict_new(); + QDict *ret = keyval_parse_into(qdict, params, implied_key, p_help, errp); + + if (!ret) { + qobject_unref(qdict); + } + return ret; +} diff --git a/util/lockcnt.c b/util/lockcnt.c new file mode 100644 index 000000000..5da36946b --- /dev/null +++ b/util/lockcnt.c @@ -0,0 +1,399 @@ +/* + * QemuLockCnt implementation + * + * Copyright Red Hat, Inc. 
2017 + * + * Author: + * Paolo Bonzini <pbonzini@redhat.com> + */ +#include "qemu/osdep.h" +#include "qemu/thread.h" +#include "qemu/atomic.h" +#include "trace.h" + +#ifdef CONFIG_LINUX +#include "qemu/futex.h" + +/* On Linux, bits 0-1 are a futex-based lock, bits 2-31 are the counter. + * For the mutex algorithm see Ulrich Drepper's "Futexes Are Tricky" (ok, + * this is not the most relaxing citation I could make...). It is similar + * to mutex2 in the paper. + */ + +#define QEMU_LOCKCNT_STATE_MASK 3 +#define QEMU_LOCKCNT_STATE_FREE 0 /* free, uncontended */ +#define QEMU_LOCKCNT_STATE_LOCKED 1 /* locked, uncontended */ +#define QEMU_LOCKCNT_STATE_WAITING 2 /* locked, contended */ + +#define QEMU_LOCKCNT_COUNT_STEP 4 +#define QEMU_LOCKCNT_COUNT_SHIFT 2 + +void qemu_lockcnt_init(QemuLockCnt *lockcnt) +{ + lockcnt->count = 0; +} + +void qemu_lockcnt_destroy(QemuLockCnt *lockcnt) +{ +} + +/* *val is the current value of lockcnt->count. + * + * If the lock is free, try a cmpxchg from *val to new_if_free; return + * true and set *val to the old value found by the cmpxchg in + * lockcnt->count. + * + * If the lock is taken, wait for it to be released and return false + * *without trying again to take the lock*. Again, set *val to the + * new value of lockcnt->count. + * + * If *waited is true on return, new_if_free's bottom two bits must not + * be QEMU_LOCKCNT_STATE_LOCKED on subsequent calls, because the caller + * does not know if there are other waiters. Furthermore, after *waited + * is set the caller has effectively acquired the lock. If it returns + * with the lock not taken, it must wake another futex waiter. + */ +static bool qemu_lockcnt_cmpxchg_or_wait(QemuLockCnt *lockcnt, int *val, + int new_if_free, bool *waited) +{ + /* Fast path for when the lock is free. */ + if ((*val & QEMU_LOCKCNT_STATE_MASK) == QEMU_LOCKCNT_STATE_FREE) { + int expected = *val; + + trace_lockcnt_fast_path_attempt(lockcnt, expected, new_if_free); + *val = qatomic_cmpxchg(&lockcnt->count, expected, new_if_free); + if (*val == expected) { + trace_lockcnt_fast_path_success(lockcnt, expected, new_if_free); + *val = new_if_free; + return true; + } + } + + /* The slow path moves from locked to waiting if necessary, then + * does a futex wait. Both steps can be repeated ad nauseam, + * only getting out of the loop if we can have another shot at the + * fast path. Once we can, get out to compute the new destination + * value for the fast path. 
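+ *
+ * A hypothetical contended acquisition therefore cycles through:
+ *
+ *     LOCKED  -> cmpxchg to WAITING, then loop again
+ *     WAITING -> qemu_futex_wait(), reload the counter, loop again
+ *     FREE    -> leave the loop and retry the fast path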
+ */ + while ((*val & QEMU_LOCKCNT_STATE_MASK) != QEMU_LOCKCNT_STATE_FREE) { + if ((*val & QEMU_LOCKCNT_STATE_MASK) == QEMU_LOCKCNT_STATE_LOCKED) { + int expected = *val; + int new = expected - QEMU_LOCKCNT_STATE_LOCKED + QEMU_LOCKCNT_STATE_WAITING; + + trace_lockcnt_futex_wait_prepare(lockcnt, expected, new); + *val = qatomic_cmpxchg(&lockcnt->count, expected, new); + if (*val == expected) { + *val = new; + } + continue; + } + + if ((*val & QEMU_LOCKCNT_STATE_MASK) == QEMU_LOCKCNT_STATE_WAITING) { + *waited = true; + trace_lockcnt_futex_wait(lockcnt, *val); + qemu_futex_wait(&lockcnt->count, *val); + *val = qatomic_read(&lockcnt->count); + trace_lockcnt_futex_wait_resume(lockcnt, *val); + continue; + } + + abort(); + } + return false; +} + +static void lockcnt_wake(QemuLockCnt *lockcnt) +{ + trace_lockcnt_futex_wake(lockcnt); + qemu_futex_wake(&lockcnt->count, 1); +} + +void qemu_lockcnt_inc(QemuLockCnt *lockcnt) +{ + int val = qatomic_read(&lockcnt->count); + bool waited = false; + + for (;;) { + if (val >= QEMU_LOCKCNT_COUNT_STEP) { + int expected = val; + val = qatomic_cmpxchg(&lockcnt->count, val, + val + QEMU_LOCKCNT_COUNT_STEP); + if (val == expected) { + break; + } + } else { + /* The fast path is (0, unlocked)->(1, unlocked). */ + if (qemu_lockcnt_cmpxchg_or_wait(lockcnt, &val, QEMU_LOCKCNT_COUNT_STEP, + &waited)) { + break; + } + } + } + + /* If we were woken by another thread, we should also wake one because + * we are effectively releasing the lock that was given to us. This is + * the case where qemu_lockcnt_lock would leave QEMU_LOCKCNT_STATE_WAITING + * in the low bits, and qemu_lockcnt_inc_and_unlock would find it and + * wake someone. + */ + if (waited) { + lockcnt_wake(lockcnt); + } +} + +void qemu_lockcnt_dec(QemuLockCnt *lockcnt) +{ + qatomic_sub(&lockcnt->count, QEMU_LOCKCNT_COUNT_STEP); +} + +/* Decrement a counter, and return locked if it is decremented to zero. + * If the function returns true, it is impossible for the counter to + * become nonzero until the next qemu_lockcnt_unlock. + */ +bool qemu_lockcnt_dec_and_lock(QemuLockCnt *lockcnt) +{ + int val = qatomic_read(&lockcnt->count); + int locked_state = QEMU_LOCKCNT_STATE_LOCKED; + bool waited = false; + + for (;;) { + if (val >= 2 * QEMU_LOCKCNT_COUNT_STEP) { + int expected = val; + val = qatomic_cmpxchg(&lockcnt->count, val, + val - QEMU_LOCKCNT_COUNT_STEP); + if (val == expected) { + break; + } + } else { + /* If count is going 1->0, take the lock. The fast path is + * (1, unlocked)->(0, locked) or (1, unlocked)->(0, waiting). + */ + if (qemu_lockcnt_cmpxchg_or_wait(lockcnt, &val, locked_state, &waited)) { + return true; + } + + if (waited) { + /* At this point we do not know if there are more waiters. Assume + * there are. + */ + locked_state = QEMU_LOCKCNT_STATE_WAITING; + } + } + } + + /* If we were woken by another thread, but we're returning in unlocked + * state, we should also wake a thread because we are effectively + * releasing the lock that was given to us. This is the case where + * qemu_lockcnt_lock would leave QEMU_LOCKCNT_STATE_WAITING in the low + * bits, and qemu_lockcnt_unlock would find it and wake someone. + */ + if (waited) { + lockcnt_wake(lockcnt); + } + return false; +} + +/* If the counter is one, decrement it and return locked. Otherwise do + * nothing. + * + * If the function returns true, it is impossible for the counter to + * become nonzero until the next qemu_lockcnt_unlock. 
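+ *
+ * A usage sketch (hypothetical "drop reference, reclaim if last" pattern):
+ *
+ *     if (qemu_lockcnt_dec_if_lock(&lockcnt)) {
+ *         // count reached zero and the lock is held: reclaim safely
+ *         qemu_lockcnt_unlock(&lockcnt);
+ *     } else {
+ *         qemu_lockcnt_dec(&lockcnt);
+ *     }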
+ */ +bool qemu_lockcnt_dec_if_lock(QemuLockCnt *lockcnt) +{ + int val = qatomic_read(&lockcnt->count); + int locked_state = QEMU_LOCKCNT_STATE_LOCKED; + bool waited = false; + + while (val < 2 * QEMU_LOCKCNT_COUNT_STEP) { + /* If count is going 1->0, take the lock. The fast path is + * (1, unlocked)->(0, locked) or (1, unlocked)->(0, waiting). + */ + if (qemu_lockcnt_cmpxchg_or_wait(lockcnt, &val, locked_state, &waited)) { + return true; + } + + if (waited) { + /* At this point we do not know if there are more waiters. Assume + * there are. + */ + locked_state = QEMU_LOCKCNT_STATE_WAITING; + } + } + + /* If we were woken by another thread, but we're returning in unlocked + * state, we should also wake a thread because we are effectively + * releasing the lock that was given to us. This is the case where + * qemu_lockcnt_lock would leave QEMU_LOCKCNT_STATE_WAITING in the low + * bits, and qemu_lockcnt_inc_and_unlock would find it and wake someone. + */ + if (waited) { + lockcnt_wake(lockcnt); + } + return false; +} + +void qemu_lockcnt_lock(QemuLockCnt *lockcnt) +{ + int val = qatomic_read(&lockcnt->count); + int step = QEMU_LOCKCNT_STATE_LOCKED; + bool waited = false; + + /* The third argument is only used if the low bits of val are 0 + * (QEMU_LOCKCNT_STATE_FREE), so just blindly mix in the desired + * state. + */ + while (!qemu_lockcnt_cmpxchg_or_wait(lockcnt, &val, val + step, &waited)) { + if (waited) { + /* At this point we do not know if there are more waiters. Assume + * there are. + */ + step = QEMU_LOCKCNT_STATE_WAITING; + } + } +} + +void qemu_lockcnt_inc_and_unlock(QemuLockCnt *lockcnt) +{ + int expected, new, val; + + val = qatomic_read(&lockcnt->count); + do { + expected = val; + new = (val + QEMU_LOCKCNT_COUNT_STEP) & ~QEMU_LOCKCNT_STATE_MASK; + trace_lockcnt_unlock_attempt(lockcnt, val, new); + val = qatomic_cmpxchg(&lockcnt->count, val, new); + } while (val != expected); + + trace_lockcnt_unlock_success(lockcnt, val, new); + if (val & QEMU_LOCKCNT_STATE_WAITING) { + lockcnt_wake(lockcnt); + } +} + +void qemu_lockcnt_unlock(QemuLockCnt *lockcnt) +{ + int expected, new, val; + + val = qatomic_read(&lockcnt->count); + do { + expected = val; + new = val & ~QEMU_LOCKCNT_STATE_MASK; + trace_lockcnt_unlock_attempt(lockcnt, val, new); + val = qatomic_cmpxchg(&lockcnt->count, val, new); + } while (val != expected); + + trace_lockcnt_unlock_success(lockcnt, val, new); + if (val & QEMU_LOCKCNT_STATE_WAITING) { + lockcnt_wake(lockcnt); + } +} + +unsigned qemu_lockcnt_count(QemuLockCnt *lockcnt) +{ + return qatomic_read(&lockcnt->count) >> QEMU_LOCKCNT_COUNT_SHIFT; +} +#else +void qemu_lockcnt_init(QemuLockCnt *lockcnt) +{ + qemu_mutex_init(&lockcnt->mutex); + lockcnt->count = 0; +} + +void qemu_lockcnt_destroy(QemuLockCnt *lockcnt) +{ + qemu_mutex_destroy(&lockcnt->mutex); +} + +void qemu_lockcnt_inc(QemuLockCnt *lockcnt) +{ + int old; + for (;;) { + old = qatomic_read(&lockcnt->count); + if (old == 0) { + qemu_lockcnt_lock(lockcnt); + qemu_lockcnt_inc_and_unlock(lockcnt); + return; + } else { + if (qatomic_cmpxchg(&lockcnt->count, old, old + 1) == old) { + return; + } + } + } +} + +void qemu_lockcnt_dec(QemuLockCnt *lockcnt) +{ + qatomic_dec(&lockcnt->count); +} + +/* Decrement a counter, and return locked if it is decremented to zero. + * It is impossible for the counter to become nonzero while the mutex + * is taken. 
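+ *
+ * Illustrative caller pattern (a sketch, not part of this patch; the
+ * hypothetical free_resources() stands for whatever teardown should run
+ * once the count reaches zero):
+ *
+ *     if (qemu_lockcnt_dec_and_lock(&lockcnt)) {
+ *         free_resources();
+ *         qemu_lockcnt_unlock(&lockcnt);
+ *     }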
+ */ +bool qemu_lockcnt_dec_and_lock(QemuLockCnt *lockcnt) +{ + int val = qatomic_read(&lockcnt->count); + while (val > 1) { + int old = qatomic_cmpxchg(&lockcnt->count, val, val - 1); + if (old != val) { + val = old; + continue; + } + + return false; + } + + qemu_lockcnt_lock(lockcnt); + if (qatomic_fetch_dec(&lockcnt->count) == 1) { + return true; + } + + qemu_lockcnt_unlock(lockcnt); + return false; +} + +/* Decrement a counter and return locked if it is decremented to zero. + * Otherwise do nothing. + * + * It is impossible for the counter to become nonzero while the mutex + * is taken. + */ +bool qemu_lockcnt_dec_if_lock(QemuLockCnt *lockcnt) +{ + /* No need for acquire semantics if we return false. */ + int val = qatomic_read(&lockcnt->count); + if (val > 1) { + return false; + } + + qemu_lockcnt_lock(lockcnt); + if (qatomic_fetch_dec(&lockcnt->count) == 1) { + return true; + } + + qemu_lockcnt_inc_and_unlock(lockcnt); + return false; +} + +void qemu_lockcnt_lock(QemuLockCnt *lockcnt) +{ + qemu_mutex_lock(&lockcnt->mutex); +} + +void qemu_lockcnt_inc_and_unlock(QemuLockCnt *lockcnt) +{ + qatomic_inc(&lockcnt->count); + qemu_mutex_unlock(&lockcnt->mutex); +} + +void qemu_lockcnt_unlock(QemuLockCnt *lockcnt) +{ + qemu_mutex_unlock(&lockcnt->mutex); +} + +unsigned qemu_lockcnt_count(QemuLockCnt *lockcnt) +{ + return qatomic_read(&lockcnt->count); +} +#endif diff --git a/util/log.c b/util/log.c new file mode 100644 index 000000000..2ee1500be --- /dev/null +++ b/util/log.c @@ -0,0 +1,389 @@ +/* + * Logging support + * + * Copyright (c) 2003 Fabrice Bellard + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "qemu/osdep.h" +#include "qemu/log.h" +#include "qemu/range.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "qemu/cutils.h" +#include "trace/control.h" +#include "qemu/thread.h" +#include "qemu/lockable.h" + +static char *logfilename; +static QemuMutex qemu_logfile_mutex; +QemuLogFile *qemu_logfile; +int qemu_loglevel; +static int log_append = 0; +static GArray *debug_regions; + +/* Return the number of characters emitted. */ +int qemu_log(const char *fmt, ...) +{ + int ret = 0; + QemuLogFile *logfile; + + rcu_read_lock(); + logfile = qatomic_rcu_read(&qemu_logfile); + if (logfile) { + va_list ap; + va_start(ap, fmt); + ret = vfprintf(logfile->fd, fmt, ap); + va_end(ap); + + /* Don't pass back error results. 
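+         * A failed vfprintf() returns a negative value, and callers of
+         * qemu_log() may sum the returned character counts, so errors
+         * are clamped to 0 below. (Illustrative note, not in the
+         * original comment.)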
*/ + if (ret < 0) { + ret = 0; + } + } + rcu_read_unlock(); + return ret; +} + +static void __attribute__((__constructor__)) qemu_logfile_init(void) +{ + qemu_mutex_init(&qemu_logfile_mutex); +} + +static void qemu_logfile_free(QemuLogFile *logfile) +{ + g_assert(logfile); + + if (logfile->fd != stderr) { + fclose(logfile->fd); + } + g_free(logfile); +} + +static bool log_uses_own_buffers; + +/* enable or disable low levels log */ +void qemu_set_log(int log_flags) +{ + bool need_to_open_file = false; + QemuLogFile *logfile; + + qemu_loglevel = log_flags; +#ifdef CONFIG_TRACE_LOG + qemu_loglevel |= LOG_TRACE; +#endif + /* + * In all cases we only log if qemu_loglevel is set. + * Also: + * If not daemonized we will always log either to stderr + * or to a file (if there is a logfilename). + * If we are daemonized, + * we will only log if there is a logfilename. + */ + if (qemu_loglevel && (!is_daemonized() || logfilename)) { + need_to_open_file = true; + } + QEMU_LOCK_GUARD(&qemu_logfile_mutex); + if (qemu_logfile && !need_to_open_file) { + logfile = qemu_logfile; + qatomic_rcu_set(&qemu_logfile, NULL); + call_rcu(logfile, qemu_logfile_free, rcu); + } else if (!qemu_logfile && need_to_open_file) { + logfile = g_new0(QemuLogFile, 1); + if (logfilename) { + logfile->fd = fopen(logfilename, log_append ? "a" : "w"); + if (!logfile->fd) { + g_free(logfile); + perror(logfilename); + _exit(1); + } + /* In case we are a daemon redirect stderr to logfile */ + if (is_daemonized()) { + dup2(fileno(logfile->fd), STDERR_FILENO); + fclose(logfile->fd); + /* This will skip closing logfile in qemu_log_close() */ + logfile->fd = stderr; + } + } else { + /* Default to stderr if no log file specified */ + assert(!is_daemonized()); + logfile->fd = stderr; + } + /* must avoid mmap() usage of glibc by setting a buffer "by hand" */ + if (log_uses_own_buffers) { + static char logfile_buf[4096]; + + setvbuf(logfile->fd, logfile_buf, _IOLBF, sizeof(logfile_buf)); + } else { +#if defined(_WIN32) + /* Win32 doesn't support line-buffering, so use unbuffered output. */ + setvbuf(logfile->fd, NULL, _IONBF, 0); +#else + setvbuf(logfile->fd, NULL, _IOLBF, 0); +#endif + log_append = 1; + } + qatomic_rcu_set(&qemu_logfile, logfile); + } +} + +void qemu_log_needs_buffers(void) +{ + log_uses_own_buffers = true; +} + +/* + * Allow the user to include %d in their logfile which will be + * substituted with the current PID. This is useful for debugging many + * nested linux-user tasks but will result in lots of logs. + * + * filename may be NULL. 
In that case, log output is sent to stderr + */ +void qemu_set_log_filename(const char *filename, Error **errp) +{ + g_free(logfilename); + logfilename = NULL; + + if (filename) { + char *pidstr = strstr(filename, "%"); + if (pidstr) { + /* We only accept one %d, no other format strings */ + if (pidstr[1] != 'd' || strchr(pidstr + 2, '%')) { + error_setg(errp, "Bad logfile format: %s", filename); + return; + } else { + logfilename = g_strdup_printf(filename, getpid()); + } + } else { + logfilename = g_strdup(filename); + } + } + + qemu_log_close(); + qemu_set_log(qemu_loglevel); +} + +/* Returns true if addr is in our debug filter or no filter defined + */ +bool qemu_log_in_addr_range(uint64_t addr) +{ + if (debug_regions) { + int i = 0; + for (i = 0; i < debug_regions->len; i++) { + Range *range = &g_array_index(debug_regions, Range, i); + if (range_contains(range, addr)) { + return true; + } + } + return false; + } else { + return true; + } +} + + +void qemu_set_dfilter_ranges(const char *filter_spec, Error **errp) +{ + gchar **ranges = g_strsplit(filter_spec, ",", 0); + int i; + + if (debug_regions) { + g_array_unref(debug_regions); + debug_regions = NULL; + } + + debug_regions = g_array_sized_new(FALSE, FALSE, + sizeof(Range), g_strv_length(ranges)); + for (i = 0; ranges[i]; i++) { + const char *r = ranges[i]; + const char *range_op, *r2, *e; + uint64_t r1val, r2val, lob, upb; + struct Range range; + + range_op = strstr(r, "-"); + r2 = range_op ? range_op + 1 : NULL; + if (!range_op) { + range_op = strstr(r, "+"); + r2 = range_op ? range_op + 1 : NULL; + } + if (!range_op) { + range_op = strstr(r, ".."); + r2 = range_op ? range_op + 2 : NULL; + } + if (!range_op) { + error_setg(errp, "Bad range specifier"); + goto out; + } + + if (qemu_strtou64(r, &e, 0, &r1val) + || e != range_op) { + error_setg(errp, "Invalid number to the left of %.*s", + (int)(r2 - range_op), range_op); + goto out; + } + if (qemu_strtou64(r2, NULL, 0, &r2val)) { + error_setg(errp, "Invalid number to the right of %.*s", + (int)(r2 - range_op), range_op); + goto out; + } + + switch (*range_op) { + case '+': + lob = r1val; + upb = r1val + r2val - 1; + break; + case '-': + upb = r1val; + lob = r1val - (r2val - 1); + break; + case '.': + lob = r1val; + upb = r2val; + break; + default: + g_assert_not_reached(); + } + if (lob > upb) { + error_setg(errp, "Invalid range"); + goto out; + } + range_set_bounds(&range, lob, upb); + g_array_append_val(debug_regions, range); + } +out: + g_strfreev(ranges); +} + +/* fflush() the log file */ +void qemu_log_flush(void) +{ + QemuLogFile *logfile; + + rcu_read_lock(); + logfile = qatomic_rcu_read(&qemu_logfile); + if (logfile) { + fflush(logfile->fd); + } + rcu_read_unlock(); +} + +/* Close the log file */ +void qemu_log_close(void) +{ + QemuLogFile *logfile; + + qemu_mutex_lock(&qemu_logfile_mutex); + logfile = qemu_logfile; + + if (logfile) { + qatomic_rcu_set(&qemu_logfile, NULL); + call_rcu(logfile, qemu_logfile_free, rcu); + } + qemu_mutex_unlock(&qemu_logfile_mutex); +} + +const QEMULogItem qemu_log_items[] = { + { CPU_LOG_TB_OUT_ASM, "out_asm", + "show generated host assembly code for each compiled TB" }, + { CPU_LOG_TB_IN_ASM, "in_asm", + "show target assembly code for each compiled TB" }, + { CPU_LOG_TB_OP, "op", + "show micro ops for each compiled TB" }, + { CPU_LOG_TB_OP_OPT, "op_opt", + "show micro ops after optimization" }, + { CPU_LOG_TB_OP_IND, "op_ind", + "show micro ops before indirect lowering" }, + { CPU_LOG_INT, "int", + "show interrupts/exceptions in short format" 
}, + { CPU_LOG_EXEC, "exec", + "show trace before each executed TB (lots of logs)" }, + { CPU_LOG_TB_CPU, "cpu", + "show CPU registers before entering a TB (lots of logs)" }, + { CPU_LOG_TB_FPU, "fpu", + "include FPU registers in the 'cpu' logging" }, + { CPU_LOG_MMU, "mmu", + "log MMU-related activities" }, + { CPU_LOG_PCALL, "pcall", + "x86 only: show protected mode far calls/returns/exceptions" }, + { CPU_LOG_RESET, "cpu_reset", + "show CPU state before CPU resets" }, + { LOG_UNIMP, "unimp", + "log unimplemented functionality" }, + { LOG_GUEST_ERROR, "guest_errors", + "log when the guest OS does something invalid (eg accessing a\n" + "non-existent register)" }, + { CPU_LOG_PAGE, "page", + "dump pages at beginning of user mode emulation" }, + { CPU_LOG_TB_NOCHAIN, "nochain", + "do not chain compiled TBs so that \"exec\" and \"cpu\" show\n" + "complete traces" }, +#ifdef CONFIG_PLUGIN + { CPU_LOG_PLUGIN, "plugin", "output from TCG plugins\n"}, +#endif + { LOG_STRACE, "strace", + "log every user-mode syscall, its input, and its result" }, + { 0, NULL, NULL }, +}; + +/* takes a comma separated list of log masks. Return 0 if error. */ +int qemu_str_to_log_mask(const char *str) +{ + const QEMULogItem *item; + int mask = 0; + char **parts = g_strsplit(str, ",", 0); + char **tmp; + + for (tmp = parts; tmp && *tmp; tmp++) { + if (g_str_equal(*tmp, "all")) { + for (item = qemu_log_items; item->mask != 0; item++) { + mask |= item->mask; + } +#ifdef CONFIG_TRACE_LOG + } else if (g_str_has_prefix(*tmp, "trace:") && (*tmp)[6] != '\0') { + trace_enable_events((*tmp) + 6); + mask |= LOG_TRACE; +#endif + } else { + for (item = qemu_log_items; item->mask != 0; item++) { + if (g_str_equal(*tmp, item->name)) { + goto found; + } + } + goto error; + found: + mask |= item->mask; + } + } + + g_strfreev(parts); + return mask; + + error: + g_strfreev(parts); + return 0; +} + +void qemu_print_log_usage(FILE *f) +{ + const QEMULogItem *item; + fprintf(f, "Log items (comma separated):\n"); + for (item = qemu_log_items; item->mask != 0; item++) { + fprintf(f, "%-15s %s\n", item->name, item->help); + } +#ifdef CONFIG_TRACE_LOG + fprintf(f, "trace:PATTERN enable trace events\n"); + fprintf(f, "\nUse \"-d trace:help\" to get a list of trace events.\n\n"); +#endif +} diff --git a/util/main-loop.c b/util/main-loop.c new file mode 100644 index 000000000..06b18b195 --- /dev/null +++ b/util/main-loop.c @@ -0,0 +1,594 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/cutils.h" +#include "qemu/timer.h" +#include "sysemu/cpu-timers.h" +#include "sysemu/replay.h" +#include "qemu/main-loop.h" +#include "block/aio.h" +#include "qemu/error-report.h" +#include "qemu/queue.h" +#include "qemu/compiler.h" + +#ifndef _WIN32 +#include <sys/wait.h> +#endif + +#ifndef _WIN32 + +/* If we have signalfd, we mask out the signals we want to handle and then + * use signalfd to listen for them. We rely on whatever the current signal + * handler is to dispatch the signals when we receive them. + */ +/* + * Disable CFI checks. + * We are going to call a signal hander directly. Such handler may or may not + * have been defined in our binary, so there's no guarantee that the pointer + * used to set the handler is a cfi-valid pointer. Since the handlers are + * stored in kernel memory, changing the handler to an attacker-defined + * function requires being able to call a sigaction() syscall, + * which is not as easy as overwriting a pointer in memory. + */ +QEMU_DISABLE_CFI +static void sigfd_handler(void *opaque) +{ + int fd = (intptr_t)opaque; + struct qemu_signalfd_siginfo info; + struct sigaction action; + ssize_t len; + + while (1) { + do { + len = read(fd, &info, sizeof(info)); + } while (len == -1 && errno == EINTR); + + if (len == -1 && errno == EAGAIN) { + break; + } + + if (len != sizeof(info)) { + error_report("read from sigfd returned %zd: %s", len, + g_strerror(errno)); + return; + } + + sigaction(info.ssi_signo, NULL, &action); + if ((action.sa_flags & SA_SIGINFO) && action.sa_sigaction) { + sigaction_invoke(&action, &info); + } else if (action.sa_handler) { + action.sa_handler(info.ssi_signo); + } + } +} + +static int qemu_signal_init(Error **errp) +{ + int sigfd; + sigset_t set; + + /* + * SIG_IPI must be blocked in the main thread and must not be caught + * by sigwait() in the signal thread. Otherwise, the cpu thread will + * not catch it reliably. + */ + sigemptyset(&set); + sigaddset(&set, SIG_IPI); + sigaddset(&set, SIGIO); + sigaddset(&set, SIGALRM); + sigaddset(&set, SIGBUS); + /* SIGINT cannot be handled via signalfd, so that ^C can be used + * to interrupt QEMU when it is being run under gdb. SIGHUP and + * SIGTERM are also handled asynchronously, even though it is not + * strictly necessary, because they use the same handler as SIGINT. + */ + pthread_sigmask(SIG_BLOCK, &set, NULL); + + sigdelset(&set, SIG_IPI); + sigfd = qemu_signalfd(&set); + if (sigfd == -1) { + error_setg_errno(errp, errno, "failed to create signalfd"); + return -errno; + } + + fcntl_setfl(sigfd, O_NONBLOCK); + + qemu_set_fd_handler(sigfd, sigfd_handler, NULL, (void *)(intptr_t)sigfd); + + return 0; +} + +#else /* _WIN32 */ + +static int qemu_signal_init(Error **errp) +{ + return 0; +} +#endif + +static AioContext *qemu_aio_context; +static QEMUBH *qemu_notify_bh; + +static void notify_event_cb(void *opaque) +{ + /* No need to do anything; this bottom half is only used to + * kick the kernel out of ppoll/poll/WaitForMultipleObjects. 
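+     *
+     * (Scheduling any bottom half ends up in aio_notify(), which kicks
+     * the AioContext's EventNotifier; that is what makes the blocking
+     * ppoll/poll/WaitForMultipleObjects call return, so an empty
+     * callback is enough. Illustrative note, not part of the original
+     * patch.)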
+ */ +} + +AioContext *qemu_get_aio_context(void) +{ + return qemu_aio_context; +} + +void qemu_notify_event(void) +{ + if (!qemu_aio_context) { + return; + } + qemu_bh_schedule(qemu_notify_bh); +} + +static GArray *gpollfds; + +int qemu_init_main_loop(Error **errp) +{ + int ret; + GSource *src; + + init_clocks(qemu_timer_notify_cb); + + ret = qemu_signal_init(errp); + if (ret) { + return ret; + } + + qemu_aio_context = aio_context_new(errp); + if (!qemu_aio_context) { + return -EMFILE; + } + qemu_set_current_aio_context(qemu_aio_context); + qemu_notify_bh = qemu_bh_new(notify_event_cb, NULL); + gpollfds = g_array_new(FALSE, FALSE, sizeof(GPollFD)); + src = aio_get_g_source(qemu_aio_context); + g_source_set_name(src, "aio-context"); + g_source_attach(src, NULL); + g_source_unref(src); + src = iohandler_get_g_source(); + g_source_set_name(src, "io-handler"); + g_source_attach(src, NULL); + g_source_unref(src); + return 0; +} + +static int max_priority; + +#ifndef _WIN32 +static int glib_pollfds_idx; +static int glib_n_poll_fds; + +void qemu_fd_register(int fd) +{ +} + +static void glib_pollfds_fill(int64_t *cur_timeout) +{ + GMainContext *context = g_main_context_default(); + int timeout = 0; + int64_t timeout_ns; + int n; + + g_main_context_prepare(context, &max_priority); + + glib_pollfds_idx = gpollfds->len; + n = glib_n_poll_fds; + do { + GPollFD *pfds; + glib_n_poll_fds = n; + g_array_set_size(gpollfds, glib_pollfds_idx + glib_n_poll_fds); + pfds = &g_array_index(gpollfds, GPollFD, glib_pollfds_idx); + n = g_main_context_query(context, max_priority, &timeout, pfds, + glib_n_poll_fds); + } while (n != glib_n_poll_fds); + + if (timeout < 0) { + timeout_ns = -1; + } else { + timeout_ns = (int64_t)timeout * (int64_t)SCALE_MS; + } + + *cur_timeout = qemu_soonest_timeout(timeout_ns, *cur_timeout); +} + +static void glib_pollfds_poll(void) +{ + GMainContext *context = g_main_context_default(); + GPollFD *pfds = &g_array_index(gpollfds, GPollFD, glib_pollfds_idx); + + if (g_main_context_check(context, max_priority, pfds, glib_n_poll_fds)) { + g_main_context_dispatch(context); + } +} + +#define MAX_MAIN_LOOP_SPIN (1000) + +static int os_host_main_loop_wait(int64_t timeout) +{ + GMainContext *context = g_main_context_default(); + int ret; + + g_main_context_acquire(context); + + glib_pollfds_fill(&timeout); + + qemu_mutex_unlock_iothread(); + replay_mutex_unlock(); + + ret = qemu_poll_ns((GPollFD *)gpollfds->data, gpollfds->len, timeout); + + replay_mutex_lock(); + qemu_mutex_lock_iothread(); + + glib_pollfds_poll(); + + g_main_context_release(context); + + return ret; +} +#else +/***********************************************************/ +/* Polling handling */ + +typedef struct PollingEntry { + PollingFunc *func; + void *opaque; + struct PollingEntry *next; +} PollingEntry; + +static PollingEntry *first_polling_entry; + +int qemu_add_polling_cb(PollingFunc *func, void *opaque) +{ + PollingEntry **ppe, *pe; + pe = g_malloc0(sizeof(PollingEntry)); + pe->func = func; + pe->opaque = opaque; + for(ppe = &first_polling_entry; *ppe != NULL; ppe = &(*ppe)->next); + *ppe = pe; + return 0; +} + +void qemu_del_polling_cb(PollingFunc *func, void *opaque) +{ + PollingEntry **ppe, *pe; + for(ppe = &first_polling_entry; *ppe != NULL; ppe = &(*ppe)->next) { + pe = *ppe; + if (pe->func == func && pe->opaque == opaque) { + *ppe = pe->next; + g_free(pe); + break; + } + } +} + +/***********************************************************/ +/* Wait objects support */ +typedef struct WaitObjects { + int num; + 
int revents[MAXIMUM_WAIT_OBJECTS + 1]; + HANDLE events[MAXIMUM_WAIT_OBJECTS + 1]; + WaitObjectFunc *func[MAXIMUM_WAIT_OBJECTS + 1]; + void *opaque[MAXIMUM_WAIT_OBJECTS + 1]; +} WaitObjects; + +static WaitObjects wait_objects = {0}; + +int qemu_add_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque) +{ + WaitObjects *w = &wait_objects; + if (w->num >= MAXIMUM_WAIT_OBJECTS) { + return -1; + } + w->events[w->num] = handle; + w->func[w->num] = func; + w->opaque[w->num] = opaque; + w->revents[w->num] = 0; + w->num++; + return 0; +} + +void qemu_del_wait_object(HANDLE handle, WaitObjectFunc *func, void *opaque) +{ + int i, found; + WaitObjects *w = &wait_objects; + + found = 0; + for (i = 0; i < w->num; i++) { + if (w->events[i] == handle) { + found = 1; + } + if (found) { + w->events[i] = w->events[i + 1]; + w->func[i] = w->func[i + 1]; + w->opaque[i] = w->opaque[i + 1]; + w->revents[i] = w->revents[i + 1]; + } + } + if (found) { + w->num--; + } +} + +void qemu_fd_register(int fd) +{ + WSAEventSelect(fd, event_notifier_get_handle(&qemu_aio_context->notifier), + FD_READ | FD_ACCEPT | FD_CLOSE | + FD_CONNECT | FD_WRITE | FD_OOB); +} + +static int pollfds_fill(GArray *pollfds, fd_set *rfds, fd_set *wfds, + fd_set *xfds) +{ + int nfds = -1; + int i; + + for (i = 0; i < pollfds->len; i++) { + GPollFD *pfd = &g_array_index(pollfds, GPollFD, i); + int fd = pfd->fd; + int events = pfd->events; + if (events & G_IO_IN) { + FD_SET(fd, rfds); + nfds = MAX(nfds, fd); + } + if (events & G_IO_OUT) { + FD_SET(fd, wfds); + nfds = MAX(nfds, fd); + } + if (events & G_IO_PRI) { + FD_SET(fd, xfds); + nfds = MAX(nfds, fd); + } + } + return nfds; +} + +static void pollfds_poll(GArray *pollfds, int nfds, fd_set *rfds, + fd_set *wfds, fd_set *xfds) +{ + int i; + + for (i = 0; i < pollfds->len; i++) { + GPollFD *pfd = &g_array_index(pollfds, GPollFD, i); + int fd = pfd->fd; + int revents = 0; + + if (FD_ISSET(fd, rfds)) { + revents |= G_IO_IN; + } + if (FD_ISSET(fd, wfds)) { + revents |= G_IO_OUT; + } + if (FD_ISSET(fd, xfds)) { + revents |= G_IO_PRI; + } + pfd->revents = revents & pfd->events; + } +} + +static int os_host_main_loop_wait(int64_t timeout) +{ + GMainContext *context = g_main_context_default(); + GPollFD poll_fds[1024 * 2]; /* this is probably overkill */ + int select_ret = 0; + int g_poll_ret, ret, i, n_poll_fds; + PollingEntry *pe; + WaitObjects *w = &wait_objects; + gint poll_timeout; + int64_t poll_timeout_ns; + static struct timeval tv0; + fd_set rfds, wfds, xfds; + int nfds; + + g_main_context_acquire(context); + + /* XXX: need to suppress polling by better using win32 events */ + ret = 0; + for (pe = first_polling_entry; pe != NULL; pe = pe->next) { + ret |= pe->func(pe->opaque); + } + if (ret != 0) { + g_main_context_release(context); + return ret; + } + + FD_ZERO(&rfds); + FD_ZERO(&wfds); + FD_ZERO(&xfds); + nfds = pollfds_fill(gpollfds, &rfds, &wfds, &xfds); + if (nfds >= 0) { + select_ret = select(nfds + 1, &rfds, &wfds, &xfds, &tv0); + if (select_ret != 0) { + timeout = 0; + } + if (select_ret > 0) { + pollfds_poll(gpollfds, nfds, &rfds, &wfds, &xfds); + } + } + + g_main_context_prepare(context, &max_priority); + n_poll_fds = g_main_context_query(context, max_priority, &poll_timeout, + poll_fds, ARRAY_SIZE(poll_fds)); + g_assert(n_poll_fds + w->num <= ARRAY_SIZE(poll_fds)); + + for (i = 0; i < w->num; i++) { + poll_fds[n_poll_fds + i].fd = (DWORD_PTR)w->events[i]; + poll_fds[n_poll_fds + i].events = G_IO_IN; + } + + if (poll_timeout < 0) { + poll_timeout_ns = -1; + } else { + 
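+        /*
+         * glib reports its timeout in milliseconds; convert to the
+         * nanosecond scale used by qemu_poll_ns() (SCALE_MS == 1000000).
+         * (Illustrative comment, not part of the original patch.)
+         */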
poll_timeout_ns = (int64_t)poll_timeout * (int64_t)SCALE_MS; + } + + poll_timeout_ns = qemu_soonest_timeout(poll_timeout_ns, timeout); + + qemu_mutex_unlock_iothread(); + + replay_mutex_unlock(); + + g_poll_ret = qemu_poll_ns(poll_fds, n_poll_fds + w->num, poll_timeout_ns); + + replay_mutex_lock(); + + qemu_mutex_lock_iothread(); + if (g_poll_ret > 0) { + for (i = 0; i < w->num; i++) { + w->revents[i] = poll_fds[n_poll_fds + i].revents; + } + for (i = 0; i < w->num; i++) { + if (w->revents[i] && w->func[i]) { + w->func[i](w->opaque[i]); + } + } + } + + if (g_main_context_check(context, max_priority, poll_fds, n_poll_fds)) { + g_main_context_dispatch(context); + } + + g_main_context_release(context); + + return select_ret || g_poll_ret; +} +#endif + +static NotifierList main_loop_poll_notifiers = + NOTIFIER_LIST_INITIALIZER(main_loop_poll_notifiers); + +void main_loop_poll_add_notifier(Notifier *notify) +{ + notifier_list_add(&main_loop_poll_notifiers, notify); +} + +void main_loop_poll_remove_notifier(Notifier *notify) +{ + notifier_remove(notify); +} + +void main_loop_wait(int nonblocking) +{ + MainLoopPoll mlpoll = { + .state = MAIN_LOOP_POLL_FILL, + .timeout = UINT32_MAX, + .pollfds = gpollfds, + }; + int ret; + int64_t timeout_ns; + + if (nonblocking) { + mlpoll.timeout = 0; + } + + /* poll any events */ + g_array_set_size(gpollfds, 0); /* reset for new iteration */ + /* XXX: separate device handlers from system ones */ + notifier_list_notify(&main_loop_poll_notifiers, &mlpoll); + + if (mlpoll.timeout == UINT32_MAX) { + timeout_ns = -1; + } else { + timeout_ns = (uint64_t)mlpoll.timeout * (int64_t)(SCALE_MS); + } + + timeout_ns = qemu_soonest_timeout(timeout_ns, + timerlistgroup_deadline_ns( + &main_loop_tlg)); + + ret = os_host_main_loop_wait(timeout_ns); + mlpoll.state = ret < 0 ? MAIN_LOOP_POLL_ERR : MAIN_LOOP_POLL_OK; + notifier_list_notify(&main_loop_poll_notifiers, &mlpoll); + + if (icount_enabled()) { + /* + * CPU thread can infinitely wait for event after + * missing the warp + */ + icount_start_warp_timer(); + } + qemu_clock_run_all_timers(); +} + +/* Functions to operate on the main QEMU AioContext. */ + +QEMUBH *qemu_bh_new_full(QEMUBHFunc *cb, void *opaque, const char *name) +{ + return aio_bh_new_full(qemu_aio_context, cb, opaque, name); +} + +/* + * Functions to operate on the I/O handler AioContext. + * This context runs on top of main loop. We can't reuse qemu_aio_context + * because iohandlers mustn't be polled by aio_poll(qemu_aio_context). + */ +static AioContext *iohandler_ctx; + +static void iohandler_init(void) +{ + if (!iohandler_ctx) { + iohandler_ctx = aio_context_new(&error_abort); + } +} + +AioContext *iohandler_get_aio_context(void) +{ + iohandler_init(); + return iohandler_ctx; +} + +GSource *iohandler_get_g_source(void) +{ + iohandler_init(); + return aio_get_g_source(iohandler_ctx); +} + +void qemu_set_fd_handler(int fd, + IOHandler *fd_read, + IOHandler *fd_write, + void *opaque) +{ + iohandler_init(); + aio_set_fd_handler(iohandler_ctx, fd, false, + fd_read, fd_write, NULL, opaque); +} + +void event_notifier_set_handler(EventNotifier *e, + EventNotifierHandler *handler) +{ + iohandler_init(); + aio_set_event_notifier(iohandler_ctx, e, false, + handler, NULL); +} diff --git a/util/memfd.c b/util/memfd.c new file mode 100644 index 000000000..4a3c07e0b --- /dev/null +++ b/util/memfd.c @@ -0,0 +1,206 @@ +/* + * memfd.c + * + * Copyright (c) 2015 Red Hat, Inc. + * + * QEMU library functions on POSIX which are shared between QEMU and + * the QEMU tools. 
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qapi/error.h"
+#include "qemu/memfd.h"
+#include "qemu/host-utils.h"
+
+#if defined CONFIG_LINUX && !defined CONFIG_MEMFD
+#include <sys/syscall.h>
+#include <asm/unistd.h>
+
+int memfd_create(const char *name, unsigned int flags)
+{
+#ifdef __NR_memfd_create
+    return syscall(__NR_memfd_create, name, flags);
+#else
+    errno = ENOSYS;
+    return -1;
+#endif
+}
+#endif
+
+int qemu_memfd_create(const char *name, size_t size, bool hugetlb,
+                      uint64_t hugetlbsize, unsigned int seals, Error **errp)
+{
+    int htsize = hugetlbsize ? ctz64(hugetlbsize) : 0;
+
+    if (htsize && 1ULL << htsize != hugetlbsize) {
+        error_setg(errp, "Hugepage size must be a power of 2");
+        return -1;
+    }
+
+    htsize = htsize << MFD_HUGE_SHIFT;
+
+#ifdef CONFIG_LINUX
+    int mfd = -1;
+    unsigned int flags = MFD_CLOEXEC;
+
+    if (seals) {
+        flags |= MFD_ALLOW_SEALING;
+    }
+    if (hugetlb) {
+        flags |= MFD_HUGETLB;
+        flags |= htsize;
+    }
+    mfd = memfd_create(name, flags);
+    if (mfd < 0) {
+        error_setg_errno(errp, errno,
+                         "failed to create memfd with flags 0x%x", flags);
+        goto err;
+    }
+
+    if (ftruncate(mfd, size) == -1) {
+        error_setg_errno(errp, errno, "failed to resize memfd to %zu", size);
+        goto err;
+    }
+
+    if (seals && fcntl(mfd, F_ADD_SEALS, seals) == -1) {
+        error_setg_errno(errp, errno, "failed to add seals 0x%x", seals);
+        goto err;
+    }
+
+    return mfd;
+
+err:
+    if (mfd >= 0) {
+        close(mfd);
+    }
+#else
+    error_setg_errno(errp, ENOSYS, "failed to create memfd");
+#endif
+    return -1;
+}
+
+/*
+ * This is a best-effort helper for shared memory allocation, with
+ * optional sealing. The helper will do its best to allocate using
+ * memfd with sealing, but may fall back on other methods without
+ * sealing.
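+ *
+ * Illustrative use (a sketch, not part of this patch; the name and
+ * seals chosen here are arbitrary):
+ *
+ *     Error *err = NULL;
+ *     int fd = -1;
+ *     void *ptr = qemu_memfd_alloc("vram", 4096,
+ *                                  F_SEAL_GROW | F_SEAL_SHRINK,
+ *                                  &fd, &err);
+ *     ...
+ *     qemu_memfd_free(ptr, 4096, fd);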
+ */ +void *qemu_memfd_alloc(const char *name, size_t size, unsigned int seals, + int *fd, Error **errp) +{ + void *ptr; + int mfd = qemu_memfd_create(name, size, false, 0, seals, NULL); + + /* some systems have memfd without sealing */ + if (mfd == -1) { + mfd = qemu_memfd_create(name, size, false, 0, 0, NULL); + } + + if (mfd == -1) { + const char *tmpdir = g_get_tmp_dir(); + gchar *fname; + + fname = g_strdup_printf("%s/memfd-XXXXXX", tmpdir); + mfd = mkstemp(fname); + unlink(fname); + g_free(fname); + + if (mfd == -1 || + ftruncate(mfd, size) == -1) { + goto err; + } + } + + ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0); + if (ptr == MAP_FAILED) { + goto err; + } + + *fd = mfd; + return ptr; + +err: + error_setg_errno(errp, errno, "failed to allocate shared memory"); + if (mfd >= 0) { + close(mfd); + } + return NULL; +} + +void qemu_memfd_free(void *ptr, size_t size, int fd) +{ + if (ptr) { + munmap(ptr, size); + } + + if (fd != -1) { + close(fd); + } +} + +enum { + MEMFD_KO, + MEMFD_OK, + MEMFD_TODO +}; + +/** + * qemu_memfd_alloc_check(): + * + * Check if qemu_memfd_alloc() can allocate, including using a + * fallback implementation when host doesn't support memfd. + */ +bool qemu_memfd_alloc_check(void) +{ + static int memfd_check = MEMFD_TODO; + + if (memfd_check == MEMFD_TODO) { + int fd; + void *ptr; + + fd = -1; + ptr = qemu_memfd_alloc("test", 4096, 0, &fd, NULL); + memfd_check = ptr ? MEMFD_OK : MEMFD_KO; + qemu_memfd_free(ptr, 4096, fd); + } + + return memfd_check == MEMFD_OK; +} + +/** + * qemu_memfd_check(): + * + * Check if host supports memfd. + */ +bool qemu_memfd_check(unsigned int flags) +{ +#ifdef CONFIG_LINUX + int mfd = memfd_create("test", flags | MFD_CLOEXEC); + + if (mfd >= 0) { + close(mfd); + return true; + } +#endif + + return false; +} diff --git a/util/meson.build b/util/meson.build new file mode 100644 index 000000000..05b593055 --- /dev/null +++ b/util/meson.build @@ -0,0 +1,89 @@ +util_ss.add(files('osdep.c', 'cutils.c', 'unicode.c', 'qemu-timer-common.c')) +if not config_host_data.get('CONFIG_ATOMIC64') + util_ss.add(files('atomic64.c')) +endif +util_ss.add(when: 'CONFIG_POSIX', if_true: files('aio-posix.c')) +util_ss.add(when: 'CONFIG_POSIX', if_true: files('fdmon-poll.c')) +if config_host_data.get('CONFIG_EPOLL_CREATE1') + util_ss.add(files('fdmon-epoll.c')) +endif +util_ss.add(when: linux_io_uring, if_true: files('fdmon-io_uring.c')) +util_ss.add(when: 'CONFIG_POSIX', if_true: files('compatfd.c')) +util_ss.add(when: 'CONFIG_POSIX', if_true: files('event_notifier-posix.c')) +util_ss.add(when: 'CONFIG_POSIX', if_true: files('mmap-alloc.c')) +util_ss.add(when: 'CONFIG_POSIX', if_true: files('oslib-posix.c')) +util_ss.add(when: 'CONFIG_POSIX', if_true: [files('qemu-openpty.c'), util]) +util_ss.add(when: 'CONFIG_POSIX', if_true: files('qemu-thread-posix.c')) +util_ss.add(when: 'CONFIG_POSIX', if_true: files('memfd.c')) +util_ss.add(when: 'CONFIG_WIN32', if_true: files('aio-win32.c')) +util_ss.add(when: 'CONFIG_WIN32', if_true: files('event_notifier-win32.c')) +util_ss.add(when: 'CONFIG_WIN32', if_true: files('oslib-win32.c')) +util_ss.add(when: 'CONFIG_WIN32', if_true: files('qemu-thread-win32.c')) +util_ss.add(when: 'CONFIG_WIN32', if_true: winmm) +util_ss.add(files('envlist.c', 'path.c', 'module.c')) +util_ss.add(files('host-utils.c')) +util_ss.add(files('bitmap.c', 'bitops.c')) +util_ss.add(files('fifo8.c')) +util_ss.add(files('cacheinfo.c', 'cacheflush.c')) +util_ss.add(files('error.c', 'qemu-error.c')) 
+util_ss.add(files('qemu-print.c')) +util_ss.add(files('id.c')) +util_ss.add(files('qemu-config.c', 'notify.c')) +util_ss.add(files('qemu-option.c', 'qemu-progress.c')) +util_ss.add(files('keyval.c')) +util_ss.add(files('crc32c.c')) +util_ss.add(files('uuid.c')) +util_ss.add(files('getauxval.c')) +util_ss.add(files('rcu.c')) +util_ss.add(when: 'CONFIG_MEMBARRIER', if_true: files('sys_membarrier.c')) +util_ss.add(files('log.c')) +util_ss.add(files('pagesize.c')) +util_ss.add(files('qdist.c')) +util_ss.add(files('qht.c')) +util_ss.add(files('qsp.c')) +util_ss.add(files('range.c')) +util_ss.add(files('stats64.c')) +util_ss.add(files('systemd.c')) +util_ss.add(files('transactions.c')) +util_ss.add(when: 'CONFIG_POSIX', if_true: files('drm.c')) +util_ss.add(files('guest-random.c')) +util_ss.add(files('yank.c')) + +if have_user + util_ss.add(files('selfmap.c')) +endif + +if have_system + util_ss.add(files('crc-ccitt.c')) + util_ss.add(when: 'CONFIG_GIO', if_true: [files('dbus.c'), gio]) + util_ss.add(when: 'CONFIG_LINUX', if_true: files('userfaultfd.c')) +endif + +if have_block + util_ss.add(files('aiocb.c', 'async.c', 'aio-wait.c')) + util_ss.add(files('base64.c')) + util_ss.add(files('buffer.c')) + util_ss.add(files('bufferiszero.c')) + util_ss.add(files('coroutine-@0@.c'.format(config_host['CONFIG_COROUTINE_BACKEND']))) + util_ss.add(files('hbitmap.c')) + util_ss.add(files('hexdump.c')) + util_ss.add(files('iova-tree.c')) + util_ss.add(files('iov.c', 'qemu-sockets.c', 'uri.c')) + util_ss.add(files('lockcnt.c')) + util_ss.add(files('main-loop.c')) + util_ss.add(files('nvdimm-utils.c')) + util_ss.add(files('qemu-coroutine.c', 'qemu-coroutine-lock.c', 'qemu-coroutine-io.c')) + util_ss.add(when: 'CONFIG_LINUX', if_true: [ + files('vhost-user-server.c'), vhost_user + ]) + util_ss.add(files('block-helpers.c')) + util_ss.add(files('qemu-coroutine-sleep.c')) + util_ss.add(files('qemu-co-shared-resource.c')) + util_ss.add(files('thread-pool.c', 'qemu-timer.c')) + util_ss.add(files('readline.c')) + util_ss.add(files('throttle.c')) + util_ss.add(files('timed-average.c')) + util_ss.add(when: 'CONFIG_INOTIFY1', if_true: files('filemonitor-inotify.c'), + if_false: files('filemonitor-stub.c')) + util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c')) +endif diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c new file mode 100644 index 000000000..893d86435 --- /dev/null +++ b/util/mmap-alloc.c @@ -0,0 +1,306 @@ +/* + * Support for RAM backed by mmaped host memory. + * + * Copyright (c) 2015 Red Hat, Inc. + * + * Authors: + * Michael S. Tsirkin <mst@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. 
+ */
+
+#ifdef CONFIG_LINUX
+#include <linux/mman.h>
+#else /* !CONFIG_LINUX */
+#define MAP_SYNC             0x0
+#define MAP_SHARED_VALIDATE  0x0
+#endif /* CONFIG_LINUX */
+
+#include "qemu/osdep.h"
+#include "qemu/mmap-alloc.h"
+#include "qemu/host-utils.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+
+#define HUGETLBFS_MAGIC 0x958458f6
+
+#ifdef CONFIG_LINUX
+#include <sys/vfs.h>
+#endif
+
+size_t qemu_fd_getpagesize(int fd)
+{
+#ifdef CONFIG_LINUX
+    struct statfs fs;
+    int ret;
+
+    if (fd != -1) {
+        do {
+            ret = fstatfs(fd, &fs);
+        } while (ret != 0 && errno == EINTR);
+
+        if (ret == 0 && fs.f_type == HUGETLBFS_MAGIC) {
+            return fs.f_bsize;
+        }
+    }
+#ifdef __sparc__
+    /* SPARC Linux needs greater alignment than the pagesize */
+    return QEMU_VMALLOC_ALIGN;
+#endif
+#endif
+
+    return qemu_real_host_page_size;
+}
+
+size_t qemu_mempath_getpagesize(const char *mem_path)
+{
+#ifdef CONFIG_LINUX
+    struct statfs fs;
+    int ret;
+
+    if (mem_path) {
+        do {
+            ret = statfs(mem_path, &fs);
+        } while (ret != 0 && errno == EINTR);
+
+        if (ret != 0) {
+            fprintf(stderr, "Couldn't statfs() memory path: %s\n",
+                    strerror(errno));
+            exit(1);
+        }
+
+        if (fs.f_type == HUGETLBFS_MAGIC) {
+            /* It's a hugepage; return the huge page size */
+            return fs.f_bsize;
+        }
+    }
+#ifdef __sparc__
+    /* SPARC Linux needs greater alignment than the pagesize */
+    return QEMU_VMALLOC_ALIGN;
+#endif
+#endif
+
+    return qemu_real_host_page_size;
+}
+
+#define OVERCOMMIT_MEMORY_PATH "/proc/sys/vm/overcommit_memory"
+static bool map_noreserve_effective(int fd, uint32_t qemu_map_flags)
+{
+#if defined(__linux__)
+    const bool readonly = qemu_map_flags & QEMU_MAP_READONLY;
+    const bool shared = qemu_map_flags & QEMU_MAP_SHARED;
+    gchar *content = NULL;
+    const char *endptr;
+    unsigned int tmp;
+
+    /*
+     * hugetlb accounting is different from ordinary swap reservation:
+     * a) Hugetlb pages from the pool are reserved for both private and
+     *    shared mappings. For shared mappings, all mappers have to specify
+     *    MAP_NORESERVE.
+     * b) MAP_NORESERVE is not affected by /proc/sys/vm/overcommit_memory.
+     */
+    if (qemu_fd_getpagesize(fd) != qemu_real_host_page_size) {
+        return true;
+    }
+
+    /*
+     * Accountable mappings in the kernel that can be affected by
+     * MAP_NORESERVE are private writable mappings (see
+     * mm/mmap.c:accountable_mapping() in Linux). For all shared or readonly
+     * mappings, MAP_NORESERVE is always implicitly active -- no reservation;
+     * this includes shmem. The only exception is shared anonymous memory,
+     * which is accounted like private anonymous memory.
+     */
+    if (readonly || (shared && fd >= 0)) {
+        return true;
+    }
+
+    /*
+     * MAP_NORESERVE is globally ignored for applicable !hugetlb mappings when
+     * memory overcommit is set to "never". Sparse memory regions aren't really
+     * possible in this system configuration.
+     *
+     * Bail out now instead of silently committing way more memory than
+     * currently desired by the user.
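+     *
+     * (For reference -- an illustrative note, not in the original
+     * comment: /proc/sys/vm/overcommit_memory holds 0 for heuristic
+     * overcommit, 1 for "always overcommit" and 2 for "never
+     * overcommit"; only the value 2 defeats MAP_NORESERVE, which is
+     * what the check below detects.)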
+ */ + if (g_file_get_contents(OVERCOMMIT_MEMORY_PATH, &content, NULL, NULL) && + !qemu_strtoui(content, &endptr, 0, &tmp) && + (!endptr || *endptr == '\n')) { + if (tmp == 2) { + error_report("Skipping reservation of swap space is not supported:" + " \"" OVERCOMMIT_MEMORY_PATH "\" is \"2\""); + return false; + } + return true; + } + /* this interface has been around since Linux 2.6 */ + error_report("Skipping reservation of swap space is not supported:" + " Could not read: \"" OVERCOMMIT_MEMORY_PATH "\""); + return false; +#endif + /* + * E.g., FreeBSD used to define MAP_NORESERVE, never implemented it, + * and removed it a while ago. + */ + error_report("Skipping reservation of swap space is not supported"); + return false; +} + +/* + * Reserve a new memory region of the requested size to be used for mapping + * from the given fd (if any). + */ +static void *mmap_reserve(size_t size, int fd) +{ + int flags = MAP_PRIVATE; + +#if defined(__powerpc64__) && defined(__linux__) + /* + * On ppc64 mappings in the same segment (aka slice) must share the same + * page size. Since we will be re-allocating part of this segment + * from the supplied fd, we should make sure to use the same page size, to + * this end we mmap the supplied fd. In this case, set MAP_NORESERVE to + * avoid allocating backing store memory. + * We do this unless we are using the system page size, in which case + * anonymous memory is OK. + */ + if (fd == -1 || qemu_fd_getpagesize(fd) == qemu_real_host_page_size) { + fd = -1; + flags |= MAP_ANONYMOUS; + } else { + flags |= MAP_NORESERVE; + } +#else + fd = -1; + flags |= MAP_ANONYMOUS; +#endif + + return mmap(0, size, PROT_NONE, flags, fd, 0); +} + +/* + * Activate memory in a reserved region from the given fd (if any), to make + * it accessible. + */ +static void *mmap_activate(void *ptr, size_t size, int fd, + uint32_t qemu_map_flags, off_t map_offset) +{ + const bool noreserve = qemu_map_flags & QEMU_MAP_NORESERVE; + const bool readonly = qemu_map_flags & QEMU_MAP_READONLY; + const bool shared = qemu_map_flags & QEMU_MAP_SHARED; + const bool sync = qemu_map_flags & QEMU_MAP_SYNC; + const int prot = PROT_READ | (readonly ? 0 : PROT_WRITE); + int map_sync_flags = 0; + int flags = MAP_FIXED; + void *activated_ptr; + + if (noreserve && !map_noreserve_effective(fd, qemu_map_flags)) { + return MAP_FAILED; + } + + flags |= fd == -1 ? MAP_ANONYMOUS : 0; + flags |= shared ? MAP_SHARED : MAP_PRIVATE; + flags |= noreserve ? MAP_NORESERVE : 0; + if (shared && sync) { + map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE; + } + + activated_ptr = mmap(ptr, size, prot, flags | map_sync_flags, fd, + map_offset); + if (activated_ptr == MAP_FAILED && map_sync_flags) { + if (errno == ENOTSUP) { + char *proc_link = g_strdup_printf("/proc/self/fd/%d", fd); + char *file_name = g_malloc0(PATH_MAX); + int len = readlink(proc_link, file_name, PATH_MAX - 1); + + if (len < 0) { + len = 0; + } + file_name[len] = '\0'; + fprintf(stderr, "Warning: requesting persistence across crashes " + "for backend file %s failed. Proceeding without " + "persistence, data might become corrupted in case of host " + "crash.\n", file_name); + g_free(proc_link); + g_free(file_name); + warn_report("Using non DAX backing file with 'pmem=on' option" + " is deprecated"); + } + /* + * If mmap failed with MAP_SHARED_VALIDATE | MAP_SYNC, we will try + * again without these flags to handle backwards compatibility. 
+ */ + activated_ptr = mmap(ptr, size, prot, flags, fd, map_offset); + } + return activated_ptr; +} + +static inline size_t mmap_guard_pagesize(int fd) +{ +#if defined(__powerpc64__) && defined(__linux__) + /* Mappings in the same segment must share the same page size */ + return qemu_fd_getpagesize(fd); +#else + return qemu_real_host_page_size; +#endif +} + +void *qemu_ram_mmap(int fd, + size_t size, + size_t align, + uint32_t qemu_map_flags, + off_t map_offset) +{ + const size_t guard_pagesize = mmap_guard_pagesize(fd); + size_t offset, total; + void *ptr, *guardptr; + + /* + * Note: this always allocates at least one extra page of virtual address + * space, even if size is already aligned. + */ + total = size + align; + + guardptr = mmap_reserve(total, fd); + if (guardptr == MAP_FAILED) { + return MAP_FAILED; + } + + assert(is_power_of_2(align)); + /* Always align to host page size */ + assert(align >= guard_pagesize); + + offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr; + + ptr = mmap_activate(guardptr + offset, size, fd, qemu_map_flags, + map_offset); + if (ptr == MAP_FAILED) { + munmap(guardptr, total); + return MAP_FAILED; + } + + if (offset > 0) { + munmap(guardptr, offset); + } + + /* + * Leave a single PROT_NONE page allocated after the RAM block, to serve as + * a guard page guarding against potential buffer overflows. + */ + total -= offset; + if (total > size + guard_pagesize) { + munmap(ptr + size + guard_pagesize, total - size - guard_pagesize); + } + + return ptr; +} + +void qemu_ram_munmap(int fd, void *ptr, size_t size) +{ + if (ptr) { + /* Unmap both the RAM block and the guard page */ + munmap(ptr, size + mmap_guard_pagesize(fd)); + } +} diff --git a/util/module.c b/util/module.c new file mode 100644 index 000000000..6bb4ad915 --- /dev/null +++ b/util/module.c @@ -0,0 +1,387 @@ +/* + * QEMU Module Infrastructure + * + * Copyright IBM, Corp. 2009 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. 
+ */ + +#include "qemu/osdep.h" +#ifdef CONFIG_MODULES +#include <gmodule.h> +#endif +#include "qemu/queue.h" +#include "qemu/module.h" +#include "qemu/cutils.h" +#include "qemu/config-file.h" +#ifdef CONFIG_MODULE_UPGRADES +#include "qemu-version.h" +#endif +#include "trace.h" + +typedef struct ModuleEntry +{ + void (*init)(void); + QTAILQ_ENTRY(ModuleEntry) node; + module_init_type type; +} ModuleEntry; + +typedef QTAILQ_HEAD(, ModuleEntry) ModuleTypeList; + +static ModuleTypeList init_type_list[MODULE_INIT_MAX]; +static bool modules_init_done[MODULE_INIT_MAX]; + +static ModuleTypeList dso_init_list; + +static void init_lists(void) +{ + static int inited; + int i; + + if (inited) { + return; + } + + for (i = 0; i < MODULE_INIT_MAX; i++) { + QTAILQ_INIT(&init_type_list[i]); + } + + QTAILQ_INIT(&dso_init_list); + + inited = 1; +} + + +static ModuleTypeList *find_type(module_init_type type) +{ + init_lists(); + + return &init_type_list[type]; +} + +void register_module_init(void (*fn)(void), module_init_type type) +{ + ModuleEntry *e; + ModuleTypeList *l; + + e = g_malloc0(sizeof(*e)); + e->init = fn; + e->type = type; + + l = find_type(type); + + QTAILQ_INSERT_TAIL(l, e, node); +} + +void register_dso_module_init(void (*fn)(void), module_init_type type) +{ + ModuleEntry *e; + + init_lists(); + + e = g_malloc0(sizeof(*e)); + e->init = fn; + e->type = type; + + QTAILQ_INSERT_TAIL(&dso_init_list, e, node); +} + +void module_call_init(module_init_type type) +{ + ModuleTypeList *l; + ModuleEntry *e; + + if (modules_init_done[type]) { + return; + } + + l = find_type(type); + + QTAILQ_FOREACH(e, l, node) { + e->init(); + } + + modules_init_done[type] = true; +} + +#ifdef CONFIG_MODULES + +static const QemuModinfo module_info_stub[] = { { + /* end of list */ +} }; +static const QemuModinfo *module_info = module_info_stub; +static const char *module_arch; + +void module_init_info(const QemuModinfo *info) +{ + module_info = info; +} + +void module_allow_arch(const char *arch) +{ + module_arch = arch; +} + +static bool module_check_arch(const QemuModinfo *modinfo) +{ + if (modinfo->arch) { + if (!module_arch) { + /* no arch set -> ignore all */ + return false; + } + if (strcmp(module_arch, modinfo->arch) != 0) { + /* mismatch */ + return false; + } + } + return true; +} + +static int module_load_file(const char *fname, bool mayfail, bool export_symbols) +{ + GModule *g_module; + void (*sym)(void); + const char *dsosuf = CONFIG_HOST_DSOSUF; + int len = strlen(fname); + int suf_len = strlen(dsosuf); + ModuleEntry *e, *next; + int ret, flags; + + if (len <= suf_len || strcmp(&fname[len - suf_len], dsosuf)) { + /* wrong suffix */ + ret = -EINVAL; + goto out; + } + if (access(fname, F_OK)) { + ret = -ENOENT; + goto out; + } + + assert(QTAILQ_EMPTY(&dso_init_list)); + + flags = 0; + if (!export_symbols) { + flags |= G_MODULE_BIND_LOCAL; + } + g_module = g_module_open(fname, flags); + if (!g_module) { + if (!mayfail) { + fprintf(stderr, "Failed to open module: %s\n", + g_module_error()); + } + ret = -EINVAL; + goto out; + } + if (!g_module_symbol(g_module, DSO_STAMP_FUN_STR, (gpointer *)&sym)) { + fprintf(stderr, "Failed to initialize module: %s\n", + fname); + /* Print some info if this is a QEMU module (but from different build), + * this will make debugging user problems easier. 
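+         * (DSO_STAMP_FUN_STR expands to a symbol name derived from this
+         * build's configuration stamp, so a module built from a different
+         * QEMU tree lacks this build's stamp symbol while still exporting
+         * qemu_module_dummy. Illustrative note, not in the original.)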
*/ + if (g_module_symbol(g_module, "qemu_module_dummy", (gpointer *)&sym)) { + fprintf(stderr, + "Note: only modules from the same build can be loaded.\n"); + } + g_module_close(g_module); + ret = -EINVAL; + } else { + QTAILQ_FOREACH(e, &dso_init_list, node) { + e->init(); + register_module_init(e->init, e->type); + } + ret = 0; + } + + trace_module_load_module(fname); + QTAILQ_FOREACH_SAFE(e, &dso_init_list, node, next) { + QTAILQ_REMOVE(&dso_init_list, e, node); + g_free(e); + } +out: + return ret; +} +#endif + +bool module_load_one(const char *prefix, const char *lib_name, bool mayfail) +{ + bool success = false; + +#ifdef CONFIG_MODULES + char *fname = NULL; +#ifdef CONFIG_MODULE_UPGRADES + char *version_dir; +#endif + const char *search_dir; + char *dirs[5]; + char *module_name; + int i = 0, n_dirs = 0; + int ret; + bool export_symbols = false; + static GHashTable *loaded_modules; + const QemuModinfo *modinfo; + const char **sl; + + if (!g_module_supported()) { + fprintf(stderr, "Module is not supported by system.\n"); + return false; + } + + if (!loaded_modules) { + loaded_modules = g_hash_table_new(g_str_hash, g_str_equal); + } + + module_name = g_strdup_printf("%s%s", prefix, lib_name); + + if (g_hash_table_contains(loaded_modules, module_name)) { + g_free(module_name); + return true; + } + g_hash_table_add(loaded_modules, module_name); + + for (modinfo = module_info; modinfo->name != NULL; modinfo++) { + if (modinfo->arch) { + if (strcmp(modinfo->name, module_name) == 0) { + if (!module_check_arch(modinfo)) { + return false; + } + } + } + if (modinfo->deps) { + if (strcmp(modinfo->name, module_name) == 0) { + /* we depend on other module(s) */ + for (sl = modinfo->deps; *sl != NULL; sl++) { + module_load_one("", *sl, false); + } + } else { + for (sl = modinfo->deps; *sl != NULL; sl++) { + if (strcmp(module_name, *sl) == 0) { + /* another module depends on us */ + export_symbols = true; + } + } + } + } + } + + search_dir = getenv("QEMU_MODULE_DIR"); + if (search_dir != NULL) { + dirs[n_dirs++] = g_strdup_printf("%s", search_dir); + } + dirs[n_dirs++] = get_relocated_path(CONFIG_QEMU_MODDIR); + dirs[n_dirs++] = g_strdup(qemu_get_exec_dir()); + +#ifdef CONFIG_MODULE_UPGRADES + version_dir = g_strcanon(g_strdup(QEMU_PKGVERSION), + G_CSET_A_2_Z G_CSET_a_2_z G_CSET_DIGITS "+-.~", + '_'); + dirs[n_dirs++] = g_strdup_printf("/var/run/qemu/%s", version_dir); +#endif + + assert(n_dirs <= ARRAY_SIZE(dirs)); + + for (i = 0; i < n_dirs; i++) { + fname = g_strdup_printf("%s/%s%s", + dirs[i], module_name, CONFIG_HOST_DSOSUF); + ret = module_load_file(fname, mayfail, export_symbols); + g_free(fname); + fname = NULL; + /* Try loading until loaded a module file */ + if (!ret) { + success = true; + break; + } + } + + if (!success) { + g_hash_table_remove(loaded_modules, module_name); + g_free(module_name); + } + + for (i = 0; i < n_dirs; i++) { + g_free(dirs[i]); + } + +#endif + return success; +} + +#ifdef CONFIG_MODULES + +static bool module_loaded_qom_all; + +void module_load_qom_one(const char *type) +{ + const QemuModinfo *modinfo; + const char **sl; + + if (!type) { + return; + } + + trace_module_lookup_object_type(type); + for (modinfo = module_info; modinfo->name != NULL; modinfo++) { + if (!modinfo->objs) { + continue; + } + if (!module_check_arch(modinfo)) { + continue; + } + for (sl = modinfo->objs; *sl != NULL; sl++) { + if (strcmp(type, *sl) == 0) { + module_load_one("", modinfo->name, false); + } + } + } +} + +void module_load_qom_all(void) +{ + const QemuModinfo *modinfo; + + if 
(module_loaded_qom_all) { + return; + } + + for (modinfo = module_info; modinfo->name != NULL; modinfo++) { + if (!modinfo->objs) { + continue; + } + if (!module_check_arch(modinfo)) { + continue; + } + module_load_one("", modinfo->name, false); + } + module_loaded_qom_all = true; +} + +void qemu_load_module_for_opts(const char *group) +{ + const QemuModinfo *modinfo; + const char **sl; + + for (modinfo = module_info; modinfo->name != NULL; modinfo++) { + if (!modinfo->opts) { + continue; + } + for (sl = modinfo->opts; *sl != NULL; sl++) { + if (strcmp(group, *sl) == 0) { + module_load_one("", modinfo->name, false); + } + } + } +} + +#else + +void module_allow_arch(const char *arch) {} +void qemu_load_module_for_opts(const char *group) {} +void module_load_qom_one(const char *type) {} +void module_load_qom_all(void) {} + +#endif diff --git a/util/notify.c b/util/notify.c new file mode 100644 index 000000000..76bab212a --- /dev/null +++ b/util/notify.c @@ -0,0 +1,76 @@ +/* + * Notifier lists + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu/osdep.h" +#include "qemu/notify.h" + +void notifier_list_init(NotifierList *list) +{ + QLIST_INIT(&list->notifiers); +} + +void notifier_list_add(NotifierList *list, Notifier *notifier) +{ + QLIST_INSERT_HEAD(&list->notifiers, notifier, node); +} + +void notifier_remove(Notifier *notifier) +{ + QLIST_REMOVE(notifier, node); +} + +void notifier_list_notify(NotifierList *list, void *data) +{ + Notifier *notifier, *next; + + QLIST_FOREACH_SAFE(notifier, &list->notifiers, node, next) { + notifier->notify(notifier, data); + } +} + +bool notifier_list_empty(NotifierList *list) +{ + return QLIST_EMPTY(&list->notifiers); +} + +void notifier_with_return_list_init(NotifierWithReturnList *list) +{ + QLIST_INIT(&list->notifiers); +} + +void notifier_with_return_list_add(NotifierWithReturnList *list, + NotifierWithReturn *notifier) +{ + QLIST_INSERT_HEAD(&list->notifiers, notifier, node); +} + +void notifier_with_return_remove(NotifierWithReturn *notifier) +{ + QLIST_REMOVE(notifier, node); +} + +int notifier_with_return_list_notify(NotifierWithReturnList *list, void *data) +{ + NotifierWithReturn *notifier, *next; + int ret = 0; + + QLIST_FOREACH_SAFE(notifier, &list->notifiers, node, next) { + ret = notifier->notify(notifier, data); + if (ret != 0) { + break; + } + } + return ret; +} diff --git a/util/nvdimm-utils.c b/util/nvdimm-utils.c new file mode 100644 index 000000000..aa3d199f2 --- /dev/null +++ b/util/nvdimm-utils.c @@ -0,0 +1,30 @@ +#include "qemu/osdep.h" +#include "qemu/nvdimm-utils.h" +#include "hw/mem/nvdimm.h" + +static int nvdimm_device_list(Object *obj, void *opaque) +{ + GSList **list = opaque; + + if (object_dynamic_cast(obj, TYPE_NVDIMM)) { + *list = g_slist_append(*list, DEVICE(obj)); + } + + object_child_foreach(obj, nvdimm_device_list, opaque); + return 0; +} + +/* + * inquire NVDIMM devices and link them into the list which is + * returned to the caller. + * + * Note: it is the caller's responsibility to free the list to avoid + * memory leak. 
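+ *
+ * Illustrative caller (a sketch, not part of this patch):
+ *
+ *     GSList *list = nvdimm_get_device_list(), *item;
+ *
+ *     for (item = list; item; item = item->next) {
+ *         DeviceState *dev = item->data;
+ *         ...
+ *     }
+ *     g_slist_free(list);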
+ */ +GSList *nvdimm_get_device_list(void) +{ + GSList *list = NULL; + + object_child_foreach(qdev_get_machine(), nvdimm_device_list, &list); + return list; +} diff --git a/util/osdep.c b/util/osdep.c new file mode 100644 index 000000000..42a0a4986 --- /dev/null +++ b/util/osdep.c @@ -0,0 +1,617 @@ +/* + * QEMU low level functions + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu/osdep.h" +#include "qapi/error.h" + +/* Needed early for CONFIG_BSD etc. */ + +#ifdef CONFIG_SOLARIS +#include <sys/statvfs.h> +/* See MySQL bug #7156 (http://bugs.mysql.com/bug.php?id=7156) for + discussion about Solaris header problems */ +extern int madvise(char *, size_t, int); +#endif + +#include "qemu-common.h" +#include "qemu/cutils.h" +#include "qemu/sockets.h" +#include "qemu/error-report.h" +#include "monitor/monitor.h" + +static bool fips_enabled = false; + +static const char *hw_version = QEMU_HW_VERSION; + +int socket_set_cork(int fd, int v) +{ +#if defined(SOL_TCP) && defined(TCP_CORK) + return qemu_setsockopt(fd, SOL_TCP, TCP_CORK, &v, sizeof(v)); +#else + return 0; +#endif +} + +int socket_set_nodelay(int fd) +{ + int v = 1; + return qemu_setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &v, sizeof(v)); +} + +int qemu_madvise(void *addr, size_t len, int advice) +{ + if (advice == QEMU_MADV_INVALID) { + errno = EINVAL; + return -1; + } +#if defined(CONFIG_MADVISE) + return madvise(addr, len, advice); +#elif defined(CONFIG_POSIX_MADVISE) + return posix_madvise(addr, len, advice); +#else + errno = EINVAL; + return -1; +#endif +} + +static int qemu_mprotect__osdep(void *addr, size_t size, int prot) +{ + g_assert(!((uintptr_t)addr & ~qemu_real_host_page_mask)); + g_assert(!(size & ~qemu_real_host_page_mask)); + +#ifdef _WIN32 + DWORD old_protect; + + if (!VirtualProtect(addr, size, prot, &old_protect)) { + g_autofree gchar *emsg = g_win32_error_message(GetLastError()); + error_report("%s: VirtualProtect failed: %s", __func__, emsg); + return -1; + } + return 0; +#else + if (mprotect(addr, size, prot)) { + error_report("%s: mprotect failed: %s", __func__, strerror(errno)); + return -1; + } + return 0; +#endif +} + +int qemu_mprotect_rw(void *addr, size_t size) +{ +#ifdef _WIN32 + return qemu_mprotect__osdep(addr, size, PAGE_READWRITE); +#else + return qemu_mprotect__osdep(addr, size, PROT_READ | PROT_WRITE); +#endif +} + +int qemu_mprotect_rwx(void *addr, size_t size) +{ +#ifdef _WIN32 + return qemu_mprotect__osdep(addr, size, 
PAGE_EXECUTE_READWRITE); +#else + return qemu_mprotect__osdep(addr, size, PROT_READ | PROT_WRITE | PROT_EXEC); +#endif +} + +int qemu_mprotect_none(void *addr, size_t size) +{ +#ifdef _WIN32 + return qemu_mprotect__osdep(addr, size, PAGE_NOACCESS); +#else + return qemu_mprotect__osdep(addr, size, PROT_NONE); +#endif +} + +#ifndef _WIN32 + +static int fcntl_op_setlk = -1; +static int fcntl_op_getlk = -1; + +/* + * Dups an fd and sets the flags + */ +int qemu_dup_flags(int fd, int flags) +{ + int ret; + int serrno; + int dup_flags; + + ret = qemu_dup(fd); + if (ret == -1) { + goto fail; + } + + dup_flags = fcntl(ret, F_GETFL); + if (dup_flags == -1) { + goto fail; + } + + if ((flags & O_SYNC) != (dup_flags & O_SYNC)) { + errno = EINVAL; + goto fail; + } + + /* Set/unset flags that we can with fcntl */ + if (fcntl(ret, F_SETFL, flags) == -1) { + goto fail; + } + + /* Truncate the file in the cases that open() would truncate it */ + if (flags & O_TRUNC || + ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))) { + if (ftruncate(ret, 0) == -1) { + goto fail; + } + } + + return ret; + +fail: + serrno = errno; + if (ret != -1) { + close(ret); + } + errno = serrno; + return -1; +} + +int qemu_dup(int fd) +{ + int ret; +#ifdef F_DUPFD_CLOEXEC + ret = fcntl(fd, F_DUPFD_CLOEXEC, 0); +#else + ret = dup(fd); + if (ret != -1) { + qemu_set_cloexec(ret); + } +#endif + return ret; +} + +static int qemu_parse_fdset(const char *param) +{ + return qemu_parse_fd(param); +} + +static void qemu_probe_lock_ops(void) +{ + if (fcntl_op_setlk == -1) { +#ifdef F_OFD_SETLK + int fd; + int ret; + struct flock fl = { + .l_whence = SEEK_SET, + .l_start = 0, + .l_len = 0, + .l_type = F_WRLCK, + }; + + fd = open("/dev/null", O_RDWR); + if (fd < 0) { + fprintf(stderr, + "Failed to open /dev/null for OFD lock probing: %s\n", + strerror(errno)); + fcntl_op_setlk = F_SETLK; + fcntl_op_getlk = F_GETLK; + return; + } + ret = fcntl(fd, F_OFD_GETLK, &fl); + close(fd); + if (!ret) { + fcntl_op_setlk = F_OFD_SETLK; + fcntl_op_getlk = F_OFD_GETLK; + } else { + fcntl_op_setlk = F_SETLK; + fcntl_op_getlk = F_GETLK; + } +#else + fcntl_op_setlk = F_SETLK; + fcntl_op_getlk = F_GETLK; +#endif + } +} + +bool qemu_has_ofd_lock(void) +{ + qemu_probe_lock_ops(); +#ifdef F_OFD_SETLK + return fcntl_op_setlk == F_OFD_SETLK; +#else + return false; +#endif +} + +static int qemu_lock_fcntl(int fd, int64_t start, int64_t len, int fl_type) +{ + int ret; + struct flock fl = { + .l_whence = SEEK_SET, + .l_start = start, + .l_len = len, + .l_type = fl_type, + }; + qemu_probe_lock_ops(); + do { + ret = fcntl(fd, fcntl_op_setlk, &fl); + } while (ret == -1 && errno == EINTR); + return ret == -1 ? -errno : 0; +} + +int qemu_lock_fd(int fd, int64_t start, int64_t len, bool exclusive) +{ + return qemu_lock_fcntl(fd, start, len, exclusive ? F_WRLCK : F_RDLCK); +} + +int qemu_unlock_fd(int fd, int64_t start, int64_t len) +{ + return qemu_lock_fcntl(fd, start, len, F_UNLCK); +} + +int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive) +{ + int ret; + struct flock fl = { + .l_whence = SEEK_SET, + .l_start = start, + .l_len = len, + .l_type = exclusive ? F_WRLCK : F_RDLCK, + }; + qemu_probe_lock_ops(); + ret = fcntl(fd, fcntl_op_getlk, &fl); + if (ret == -1) { + return -errno; + } else { + return fl.l_type == F_UNLCK ? 
0 : -EAGAIN; + } +} +#endif + +static int qemu_open_cloexec(const char *name, int flags, mode_t mode) +{ + int ret; +#ifdef O_CLOEXEC + ret = open(name, flags | O_CLOEXEC, mode); +#else + ret = open(name, flags, mode); + if (ret >= 0) { + qemu_set_cloexec(ret); + } +#endif + return ret; +} + +/* + * Opens a file with FD_CLOEXEC set + */ +static int +qemu_open_internal(const char *name, int flags, mode_t mode, Error **errp) +{ + int ret; + +#ifndef _WIN32 + const char *fdset_id_str; + + /* Attempt dup of fd from fd set */ + if (strstart(name, "/dev/fdset/", &fdset_id_str)) { + int64_t fdset_id; + int dupfd; + + fdset_id = qemu_parse_fdset(fdset_id_str); + if (fdset_id == -1) { + error_setg(errp, "Could not parse fdset %s", name); + errno = EINVAL; + return -1; + } + + dupfd = monitor_fdset_dup_fd_add(fdset_id, flags); + if (dupfd == -1) { + error_setg_errno(errp, errno, "Could not dup FD for %s flags %x", + name, flags); + return -1; + } + + return dupfd; + } +#endif + + ret = qemu_open_cloexec(name, flags, mode); + + if (ret == -1) { + const char *action = flags & O_CREAT ? "create" : "open"; +#ifdef O_DIRECT + /* Give more helpful error message for O_DIRECT */ + if (errno == EINVAL && (flags & O_DIRECT)) { + ret = open(name, flags & ~O_DIRECT, mode); + if (ret != -1) { + close(ret); + error_setg(errp, "Could not %s '%s': " + "filesystem does not support O_DIRECT", + action, name); + errno = EINVAL; /* restore first open()'s errno */ + return -1; + } + } +#endif /* O_DIRECT */ + error_setg_errno(errp, errno, "Could not %s '%s'", + action, name); + } + + return ret; +} + + +int qemu_open(const char *name, int flags, Error **errp) +{ + assert(!(flags & O_CREAT)); + + return qemu_open_internal(name, flags, 0, errp); +} + + +int qemu_create(const char *name, int flags, mode_t mode, Error **errp) +{ + assert(!(flags & O_CREAT)); + + return qemu_open_internal(name, flags | O_CREAT, mode, errp); +} + + +int qemu_open_old(const char *name, int flags, ...) +{ + va_list ap; + mode_t mode = 0; + int ret; + + va_start(ap, flags); + if (flags & O_CREAT) { + mode = va_arg(ap, int); + } + va_end(ap); + + ret = qemu_open_internal(name, flags, mode, NULL); + +#ifdef O_DIRECT + if (ret == -1 && errno == EINVAL && (flags & O_DIRECT)) { + error_report("file system may not support O_DIRECT"); + errno = EINVAL; /* in case it was clobbered */ + } +#endif /* O_DIRECT */ + + return ret; +} + +int qemu_close(int fd) +{ + int64_t fdset_id; + + /* Close fd that was dup'd from an fdset */ + fdset_id = monitor_fdset_dup_fd_find(fd); + if (fdset_id != -1) { + int ret; + + ret = close(fd); + if (ret == 0) { + monitor_fdset_dup_fd_remove(fd); + } + + return ret; + } + + return close(fd); +} + +/* + * Delete a file from the filesystem, unless the filename is /dev/fdset/... + * + * Returns: On success, zero is returned. On error, -1 is returned, + * and errno is set appropriately. + */ +int qemu_unlink(const char *name) +{ + if (g_str_has_prefix(name, "/dev/fdset/")) { + return 0; + } + + return unlink(name); +} + +/* + * A variant of write(2) which handles partial write. + * + * Return the number of bytes transferred. + * Set errno if fewer than `count' bytes are written. + * + * This function don't work with non-blocking fd's. 
+ * Any of the possibilities with non-blocking fd's is bad: + * - return a short write (then name is wrong) + * - busy wait adding (errno == EAGAIN) to the loop + */ +ssize_t qemu_write_full(int fd, const void *buf, size_t count) +{ + ssize_t ret = 0; + ssize_t total = 0; + + while (count) { + ret = write(fd, buf, count); + if (ret < 0) { + if (errno == EINTR) + continue; + break; + } + + count -= ret; + buf += ret; + total += ret; + } + + return total; +} + +/* + * Opens a socket with FD_CLOEXEC set + */ +int qemu_socket(int domain, int type, int protocol) +{ + int ret; + +#ifdef SOCK_CLOEXEC + ret = socket(domain, type | SOCK_CLOEXEC, protocol); + if (ret != -1 || errno != EINVAL) { + return ret; + } +#endif + ret = socket(domain, type, protocol); + if (ret >= 0) { + qemu_set_cloexec(ret); + } + + return ret; +} + +/* + * Accept a connection and set FD_CLOEXEC + */ +int qemu_accept(int s, struct sockaddr *addr, socklen_t *addrlen) +{ + int ret; + +#ifdef CONFIG_ACCEPT4 + ret = accept4(s, addr, addrlen, SOCK_CLOEXEC); + if (ret != -1 || errno != ENOSYS) { + return ret; + } +#endif + ret = accept(s, addr, addrlen); + if (ret >= 0) { + qemu_set_cloexec(ret); + } + + return ret; +} + +void qemu_set_hw_version(const char *version) +{ + hw_version = version; +} + +const char *qemu_hw_version(void) +{ + return hw_version; +} + +void fips_set_state(bool requested) +{ +#ifdef __linux__ + if (requested) { + FILE *fds = fopen("/proc/sys/crypto/fips_enabled", "r"); + if (fds != NULL) { + fips_enabled = (fgetc(fds) == '1'); + fclose(fds); + } + } +#else + fips_enabled = false; +#endif /* __linux__ */ + +#ifdef _FIPS_DEBUG + fprintf(stderr, "FIPS mode %s (requested %s)\n", + (fips_enabled ? "enabled" : "disabled"), + (requested ? "enabled" : "disabled")); +#endif +} + +bool fips_get_state(void) +{ + return fips_enabled; +} + +#ifdef _WIN32 +static void socket_cleanup(void) +{ + WSACleanup(); +} +#endif + +int socket_init(void) +{ +#ifdef _WIN32 + WSADATA Data; + int ret, err; + + ret = WSAStartup(MAKEWORD(2, 2), &Data); + if (ret != 0) { + err = WSAGetLastError(); + fprintf(stderr, "WSAStartup: %d\n", err); + return -1; + } + atexit(socket_cleanup); +#endif + return 0; +} + + +#ifndef CONFIG_IOVEC +/* helper function for iov_send_recv() */ +static ssize_t +readv_writev(int fd, const struct iovec *iov, int iov_cnt, bool do_write) +{ + unsigned i = 0; + ssize_t ret = 0; + while (i < iov_cnt) { + ssize_t r = do_write + ? write(fd, iov[i].iov_base, iov[i].iov_len) + : read(fd, iov[i].iov_base, iov[i].iov_len); + if (r > 0) { + ret += r; + } else if (!r) { + break; + } else if (errno == EINTR) { + continue; + } else { + /* else it is some "other" error, + * only return if there was no data processed. */ + if (ret == 0) { + ret = -1; + } + break; + } + i++; + } + return ret; +} + +ssize_t +readv(int fd, const struct iovec *iov, int iov_cnt) +{ + return readv_writev(fd, iov, iov_cnt, false); +} + +ssize_t +writev(int fd, const struct iovec *iov, int iov_cnt) +{ + return readv_writev(fd, iov, iov_cnt, true); +} +#endif diff --git a/util/oslib-posix.c b/util/oslib-posix.c new file mode 100644 index 000000000..e8bdb02e1 --- /dev/null +++ b/util/oslib-posix.c @@ -0,0 +1,860 @@ +/* + * os-posix-lib.c + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (c) 2010 Red Hat, Inc. + * + * QEMU library functions on POSIX which are shared between QEMU and + * the QEMU tools. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include <termios.h> + +#include <glib/gprintf.h> + +#include "qemu-common.h" +#include "sysemu/sysemu.h" +#include "trace.h" +#include "qapi/error.h" +#include "qemu/sockets.h" +#include "qemu/thread.h" +#include <libgen.h> +#include "qemu/cutils.h" +#include "qemu/compiler.h" + +#ifdef CONFIG_LINUX +#include <sys/syscall.h> +#endif + +#ifdef __FreeBSD__ +#include <sys/sysctl.h> +#include <sys/user.h> +#include <sys/thr.h> +#include <libutil.h> +#endif + +#ifdef __NetBSD__ +#include <sys/sysctl.h> +#include <lwp.h> +#endif + +#ifdef __APPLE__ +#include <mach-o/dyld.h> +#endif + +#ifdef __HAIKU__ +#include <kernel/image.h> +#endif + +#include "qemu/mmap-alloc.h" + +#ifdef CONFIG_DEBUG_STACK_USAGE +#include "qemu/error-report.h" +#endif + +#define MAX_MEM_PREALLOC_THREAD_COUNT 16 + +struct MemsetThread { + char *addr; + size_t numpages; + size_t hpagesize; + QemuThread pgthread; + sigjmp_buf env; +}; +typedef struct MemsetThread MemsetThread; + +static MemsetThread *memset_thread; +static int memset_num_threads; +static bool memset_thread_failed; + +static QemuMutex page_mutex; +static QemuCond page_cond; +static bool threads_created_flag; + +int qemu_get_thread_id(void) +{ +#if defined(__linux__) + return syscall(SYS_gettid); +#elif defined(__FreeBSD__) + /* thread id is up to INT_MAX */ + long tid; + thr_self(&tid); + return (int)tid; +#elif defined(__NetBSD__) + return _lwp_self(); +#elif defined(__OpenBSD__) + return getthrid(); +#else + return getpid(); +#endif +} + +int qemu_daemon(int nochdir, int noclose) +{ + return daemon(nochdir, noclose); +} + +bool qemu_write_pidfile(const char *path, Error **errp) +{ + int fd; + char pidstr[32]; + + while (1) { + struct stat a, b; + struct flock lock = { + .l_type = F_WRLCK, + .l_whence = SEEK_SET, + .l_len = 0, + }; + + fd = qemu_open_old(path, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR); + if (fd == -1) { + error_setg_errno(errp, errno, "Cannot open pid file"); + return false; + } + + if (fstat(fd, &b) < 0) { + error_setg_errno(errp, errno, "Cannot stat file"); + goto fail_close; + } + + if (fcntl(fd, F_SETLK, &lock)) { + error_setg_errno(errp, errno, "Cannot lock pid file"); + goto fail_close; + } + + /* + * Now make sure the path we locked is the same one that now + * exists on the filesystem. + */ + if (stat(path, &a) < 0) { + /* + * PID file disappeared, someone else must be racing with + * us, so try again. 
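+ *
+ * (The inode comparison below performs the complementary check:
+ * if a.st_ino no longer matches b.st_ino, the file we locked has
+ * been replaced under us and the loop retries as well.)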
+ */ + close(fd); + continue; + } + + if (a.st_ino == b.st_ino) { + break; + } + + /* + * PID file was recreated, someone else must be racing with + * us, so try again. + */ + close(fd); + } + + if (ftruncate(fd, 0) < 0) { + error_setg_errno(errp, errno, "Failed to truncate pid file"); + goto fail_unlink; + } + + snprintf(pidstr, sizeof(pidstr), FMT_pid "\n", getpid()); + if (write(fd, pidstr, strlen(pidstr)) != strlen(pidstr)) { + error_setg(errp, "Failed to write pid file"); + goto fail_unlink; + } + + return true; + +fail_unlink: + unlink(path); +fail_close: + close(fd); + return false; +} + +void *qemu_oom_check(void *ptr) +{ + if (ptr == NULL) { + fprintf(stderr, "Failed to allocate memory: %s\n", strerror(errno)); + abort(); + } + return ptr; +} + +void *qemu_try_memalign(size_t alignment, size_t size) +{ + void *ptr; + + if (alignment < sizeof(void*)) { + alignment = sizeof(void*); + } else { + g_assert(is_power_of_2(alignment)); + } + +#if defined(CONFIG_POSIX_MEMALIGN) + int ret; + ret = posix_memalign(&ptr, alignment, size); + if (ret != 0) { + errno = ret; + ptr = NULL; + } +#elif defined(CONFIG_BSD) + ptr = valloc(size); +#else + ptr = memalign(alignment, size); +#endif + trace_qemu_memalign(alignment, size, ptr); + return ptr; +} + +void *qemu_memalign(size_t alignment, size_t size) +{ + return qemu_oom_check(qemu_try_memalign(alignment, size)); +} + +/* alloc shared memory pages */ +void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared, + bool noreserve) +{ + const uint32_t qemu_map_flags = (shared ? QEMU_MAP_SHARED : 0) | + (noreserve ? QEMU_MAP_NORESERVE : 0); + size_t align = QEMU_VMALLOC_ALIGN; + void *ptr = qemu_ram_mmap(-1, size, align, qemu_map_flags, 0); + + if (ptr == MAP_FAILED) { + return NULL; + } + + if (alignment) { + *alignment = align; + } + + trace_qemu_anon_ram_alloc(size, ptr); + return ptr; +} + +void qemu_vfree(void *ptr) +{ + trace_qemu_vfree(ptr); + free(ptr); +} + +void qemu_anon_ram_free(void *ptr, size_t size) +{ + trace_qemu_anon_ram_free(ptr, size); + qemu_ram_munmap(-1, ptr, size); +} + +void qemu_set_block(int fd) +{ + int f; + f = fcntl(fd, F_GETFL); + assert(f != -1); + f = fcntl(fd, F_SETFL, f & ~O_NONBLOCK); + assert(f != -1); +} + +int qemu_try_set_nonblock(int fd) +{ + int f; + f = fcntl(fd, F_GETFL); + if (f == -1) { + return -errno; + } + if (fcntl(fd, F_SETFL, f | O_NONBLOCK) == -1) { + return -errno; + } + return 0; +} + +void qemu_set_nonblock(int fd) +{ + int f; + f = qemu_try_set_nonblock(fd); + assert(f == 0); +} + +int socket_set_fast_reuse(int fd) +{ + int val = 1, ret; + + ret = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, + (const char *)&val, sizeof(val)); + + assert(ret == 0); + + return ret; +} + +void qemu_set_cloexec(int fd) +{ + int f; + f = fcntl(fd, F_GETFD); + assert(f != -1); + f = fcntl(fd, F_SETFD, f | FD_CLOEXEC); + assert(f != -1); +} + +/* + * Creates a pipe with FD_CLOEXEC set on both file descriptors + */ +int qemu_pipe(int pipefd[2]) +{ + int ret; + +#ifdef CONFIG_PIPE2 + ret = pipe2(pipefd, O_CLOEXEC); + if (ret != -1 || errno != ENOSYS) { + return ret; + } +#endif + ret = pipe(pipefd); + if (ret == 0) { + qemu_set_cloexec(pipefd[0]); + qemu_set_cloexec(pipefd[1]); + } + + return ret; +} + +char * +qemu_get_local_state_pathname(const char *relative_pathname) +{ + g_autofree char *dir = g_strdup_printf("%s/%s", + CONFIG_QEMU_LOCALSTATEDIR, + relative_pathname); + return get_relocated_path(dir); +} + +void qemu_set_tty_echo(int fd, bool echo) +{ + struct termios tty; + + tcgetattr(fd, &tty); + + if 
(echo) { + tty.c_lflag |= ECHO | ECHONL | ICANON | IEXTEN; + } else { + tty.c_lflag &= ~(ECHO | ECHONL | ICANON | IEXTEN); + } + + tcsetattr(fd, TCSANOW, &tty); +} + +static const char *exec_dir; + +void qemu_init_exec_dir(const char *argv0) +{ + char *p = NULL; + char buf[PATH_MAX]; + + if (exec_dir) { + return; + } + +#if defined(__linux__) + { + int len; + len = readlink("/proc/self/exe", buf, sizeof(buf) - 1); + if (len > 0) { + buf[len] = 0; + p = buf; + } + } +#elif defined(__FreeBSD__) \ + || (defined(__NetBSD__) && defined(KERN_PROC_PATHNAME)) + { +#if defined(__FreeBSD__) + static int mib[4] = {CTL_KERN, KERN_PROC, KERN_PROC_PATHNAME, -1}; +#else + static int mib[4] = {CTL_KERN, KERN_PROC_ARGS, -1, KERN_PROC_PATHNAME}; +#endif + size_t len = sizeof(buf) - 1; + + *buf = '\0'; + if (!sysctl(mib, ARRAY_SIZE(mib), buf, &len, NULL, 0) && + *buf) { + buf[sizeof(buf) - 1] = '\0'; + p = buf; + } + } +#elif defined(__APPLE__) + { + char fpath[PATH_MAX]; + uint32_t len = sizeof(fpath); + if (_NSGetExecutablePath(fpath, &len) == 0) { + p = realpath(fpath, buf); + if (!p) { + return; + } + } + } +#elif defined(__HAIKU__) + { + image_info ii; + int32_t c = 0; + + *buf = '\0'; + while (get_next_image_info(0, &c, &ii) == B_OK) { + if (ii.type == B_APP_IMAGE) { + strncpy(buf, ii.name, sizeof(buf)); + buf[sizeof(buf) - 1] = 0; + p = buf; + break; + } + } + } +#endif + /* If we don't have any way of figuring out the actual executable + location then try argv[0]. */ + if (!p && argv0) { + p = realpath(argv0, buf); + } + if (p) { + exec_dir = g_path_get_dirname(p); + } else { + exec_dir = CONFIG_BINDIR; + } +} + +const char *qemu_get_exec_dir(void) +{ + return exec_dir; +} + +static void sigbus_handler(int signal) +{ + int i; + if (memset_thread) { + for (i = 0; i < memset_num_threads; i++) { + if (qemu_thread_is_self(&memset_thread[i].pgthread)) { + siglongjmp(memset_thread[i].env, 1); + } + } + } +} + +static void *do_touch_pages(void *arg) +{ + MemsetThread *memset_args = (MemsetThread *)arg; + sigset_t set, oldset; + + /* + * On Linux, the page faults from the loop below can cause mmap_sem + * contention with allocation of the thread stacks. Do not start + * clearing until all threads have been created. + */ + qemu_mutex_lock(&page_mutex); + while(!threads_created_flag){ + qemu_cond_wait(&page_cond, &page_mutex); + } + qemu_mutex_unlock(&page_mutex); + + /* unblock SIGBUS */ + sigemptyset(&set); + sigaddset(&set, SIGBUS); + pthread_sigmask(SIG_UNBLOCK, &set, &oldset); + + if (sigsetjmp(memset_args->env, 1)) { + memset_thread_failed = true; + } else { + char *addr = memset_args->addr; + size_t numpages = memset_args->numpages; + size_t hpagesize = memset_args->hpagesize; + size_t i; + for (i = 0; i < numpages; i++) { + /* + * Read & write back the same value, so we don't + * corrupt existing user/app data that might be + * stored. + * + * 'volatile' to stop compiler optimizing this away + * to a no-op + * + * TODO: get a better solution from kernel so we + * don't need to write at all so we don't cause + * wear on the storage backing the region... 
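+ *
+ * (Each thread touches exactly one byte per host page: numpages
+ * stores, spaced hpagesize apart, over its assigned range.)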
+ */ + *(volatile char *)addr = *addr; + addr += hpagesize; + } + } + pthread_sigmask(SIG_SETMASK, &oldset, NULL); + return NULL; +} + +static inline int get_memset_num_threads(int smp_cpus) +{ + long host_procs = sysconf(_SC_NPROCESSORS_ONLN); + int ret = 1; + + if (host_procs > 0) { + ret = MIN(MIN(host_procs, MAX_MEM_PREALLOC_THREAD_COUNT), smp_cpus); + } + /* In case sysconf() fails, we fall back to single threaded */ + return ret; +} + +static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages, + int smp_cpus) +{ + static gsize initialized = 0; + size_t numpages_per_thread, leftover; + char *addr = area; + int i = 0; + + if (g_once_init_enter(&initialized)) { + qemu_mutex_init(&page_mutex); + qemu_cond_init(&page_cond); + g_once_init_leave(&initialized, 1); + } + + memset_thread_failed = false; + threads_created_flag = false; + memset_num_threads = get_memset_num_threads(smp_cpus); + memset_thread = g_new0(MemsetThread, memset_num_threads); + numpages_per_thread = numpages / memset_num_threads; + leftover = numpages % memset_num_threads; + for (i = 0; i < memset_num_threads; i++) { + memset_thread[i].addr = addr; + memset_thread[i].numpages = numpages_per_thread + (i < leftover); + memset_thread[i].hpagesize = hpagesize; + qemu_thread_create(&memset_thread[i].pgthread, "touch_pages", + do_touch_pages, &memset_thread[i], + QEMU_THREAD_JOINABLE); + addr += memset_thread[i].numpages * hpagesize; + } + + qemu_mutex_lock(&page_mutex); + threads_created_flag = true; + qemu_cond_broadcast(&page_cond); + qemu_mutex_unlock(&page_mutex); + + for (i = 0; i < memset_num_threads; i++) { + qemu_thread_join(&memset_thread[i].pgthread); + } + g_free(memset_thread); + memset_thread = NULL; + + return memset_thread_failed; +} + +void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus, + Error **errp) +{ + int ret; + struct sigaction act, oldact; + size_t hpagesize = qemu_fd_getpagesize(fd); + size_t numpages = DIV_ROUND_UP(memory, hpagesize); + + memset(&act, 0, sizeof(act)); + act.sa_handler = &sigbus_handler; + act.sa_flags = 0; + + ret = sigaction(SIGBUS, &act, &oldact); + if (ret) { + error_setg_errno(errp, errno, + "os_mem_prealloc: failed to install signal handler"); + return; + } + + /* touch pages simultaneously */ + if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) { + error_setg(errp, "os_mem_prealloc: Insufficient free host memory " + "pages available to allocate guest RAM"); + } + + ret = sigaction(SIGBUS, &oldact, NULL); + if (ret) { + /* Terminate QEMU since it can't recover from error */ + perror("os_mem_prealloc: failed to reinstall signal handler"); + exit(1); + } +} + +char *qemu_get_pid_name(pid_t pid) +{ + char *name = NULL; + +#if defined(__FreeBSD__) + /* BSDs don't have /proc, but they provide a nice substitute */ + struct kinfo_proc *proc = kinfo_getproc(pid); + + if (proc) { + name = g_strdup(proc->ki_comm); + free(proc); + } +#else + /* Assume a system with reasonable procfs */ + char *pid_path; + size_t len; + + pid_path = g_strdup_printf("/proc/%d/cmdline", pid); + g_file_get_contents(pid_path, &name, &len, NULL); + g_free(pid_path); +#endif + + return name; +} + + +pid_t qemu_fork(Error **errp) +{ + sigset_t oldmask, newmask; + struct sigaction sig_action; + int saved_errno; + pid_t pid; + + /* + * Need to block signals now, so that child process can safely + * kill off caller's signal handlers without a race. 
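+ *
+ * Hypothetical caller, for illustration:
+ *
+ *   Error *err = NULL;
+ *   pid_t pid = qemu_fork(&err);
+ *
+ *   if (pid < 0) {
+ *       error_report_err(err);
+ *   } else if (pid == 0) {
+ *       (child: handlers reset to SIG_DFL, signals unblocked)
+ *       _exit(0);
+ *   }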
+ */ + sigfillset(&newmask); + if (pthread_sigmask(SIG_SETMASK, &newmask, &oldmask) != 0) { + error_setg_errno(errp, errno, + "cannot block signals"); + return -1; + } + + pid = fork(); + saved_errno = errno; + + if (pid < 0) { + /* attempt to restore signal mask, but ignore failure, to + * avoid obscuring the fork failure */ + (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL); + error_setg_errno(errp, saved_errno, + "cannot fork child process"); + errno = saved_errno; + return -1; + } else if (pid) { + /* parent process */ + + /* Restore our original signal mask now that the child is + * safely running. Only documented failures are EFAULT (not + * possible, since we are using just-grabbed mask) or EINVAL + * (not possible, since we are using correct arguments). */ + (void)pthread_sigmask(SIG_SETMASK, &oldmask, NULL); + } else { + /* child process */ + size_t i; + + /* Clear out all signal handlers from parent so nothing + * unexpected can happen in our child once we unblock + * signals */ + sig_action.sa_handler = SIG_DFL; + sig_action.sa_flags = 0; + sigemptyset(&sig_action.sa_mask); + + for (i = 1; i < NSIG; i++) { + /* Only possible errors are EFAULT or EINVAL The former + * won't happen, the latter we expect, so no need to check + * return value */ + (void)sigaction(i, &sig_action, NULL); + } + + /* Unmask all signals in child, since we've no idea what the + * caller's done with their signal mask and don't want to + * propagate that to children */ + sigemptyset(&newmask); + if (pthread_sigmask(SIG_SETMASK, &newmask, NULL) != 0) { + Error *local_err = NULL; + error_setg_errno(&local_err, errno, + "cannot unblock signals"); + error_report_err(local_err); + _exit(1); + } + } + return pid; +} + +void *qemu_alloc_stack(size_t *sz) +{ + void *ptr, *guardpage; + int flags; +#ifdef CONFIG_DEBUG_STACK_USAGE + void *ptr2; +#endif + size_t pagesz = qemu_real_host_page_size; +#ifdef _SC_THREAD_STACK_MIN + /* avoid stacks smaller than _SC_THREAD_STACK_MIN */ + long min_stack_sz = sysconf(_SC_THREAD_STACK_MIN); + *sz = MAX(MAX(min_stack_sz, 0), *sz); +#endif + /* adjust stack size to a multiple of the page size */ + *sz = ROUND_UP(*sz, pagesz); + /* allocate one extra page for the guard page */ + *sz += pagesz; + + flags = MAP_PRIVATE | MAP_ANONYMOUS; +#if defined(MAP_STACK) && defined(__OpenBSD__) + /* Only enable MAP_STACK on OpenBSD. Other OS's such as + * Linux/FreeBSD/NetBSD have a flag with the same name + * but have differing functionality. OpenBSD will SEGV + * if it spots execution with a stack pointer pointing + * at memory that was not allocated with MAP_STACK. 
+ */ + flags |= MAP_STACK; +#endif + + ptr = mmap(NULL, *sz, PROT_READ | PROT_WRITE, flags, -1, 0); + if (ptr == MAP_FAILED) { + perror("failed to allocate memory for stack"); + abort(); + } + +#if defined(HOST_IA64) + /* separate register stack */ + guardpage = ptr + (((*sz - pagesz) / 2) & ~pagesz); +#elif defined(HOST_HPPA) + /* stack grows up */ + guardpage = ptr + *sz - pagesz; +#else + /* stack grows down */ + guardpage = ptr; +#endif + if (mprotect(guardpage, pagesz, PROT_NONE) != 0) { + perror("failed to set up stack guard page"); + abort(); + } + +#ifdef CONFIG_DEBUG_STACK_USAGE + for (ptr2 = ptr + pagesz; ptr2 < ptr + *sz; ptr2 += sizeof(uint32_t)) { + *(uint32_t *)ptr2 = 0xdeadbeaf; + } +#endif + + return ptr; +} + +#ifdef CONFIG_DEBUG_STACK_USAGE +static __thread unsigned int max_stack_usage; +#endif + +void qemu_free_stack(void *stack, size_t sz) +{ +#ifdef CONFIG_DEBUG_STACK_USAGE + unsigned int usage; + void *ptr; + + for (ptr = stack + qemu_real_host_page_size; ptr < stack + sz; + ptr += sizeof(uint32_t)) { + if (*(uint32_t *)ptr != 0xdeadbeaf) { + break; + } + } + usage = sz - (uintptr_t) (ptr - stack); + if (usage > max_stack_usage) { + error_report("thread %d max stack usage increased from %u to %u", + qemu_get_thread_id(), max_stack_usage, usage); + max_stack_usage = usage; + } +#endif + + munmap(stack, sz); +} + +/* + * Disable CFI checks. + * We are going to call a signal hander directly. Such handler may or may not + * have been defined in our binary, so there's no guarantee that the pointer + * used to set the handler is a cfi-valid pointer. Since the handlers are + * stored in kernel memory, changing the handler to an attacker-defined + * function requires being able to call a sigaction() syscall, + * which is not as easy as overwriting a pointer in memory. + */ +QEMU_DISABLE_CFI +void sigaction_invoke(struct sigaction *action, + struct qemu_signalfd_siginfo *info) +{ + siginfo_t si = {}; + si.si_signo = info->ssi_signo; + si.si_errno = info->ssi_errno; + si.si_code = info->ssi_code; + + /* Convert the minimal set of fields defined by POSIX. + * Positive si_code values are reserved for kernel-generated + * signals, where the valid siginfo fields are determined by + * the signal number. But according to POSIX, it is unspecified + * whether SI_USER and SI_QUEUE have values less than or equal to + * zero. + */ + if (info->ssi_code == SI_USER || info->ssi_code == SI_QUEUE || + info->ssi_code <= 0) { + /* SIGTERM, etc. */ + si.si_pid = info->ssi_pid; + si.si_uid = info->ssi_uid; + } else if (info->ssi_signo == SIGILL || info->ssi_signo == SIGFPE || + info->ssi_signo == SIGSEGV || info->ssi_signo == SIGBUS) { + si.si_addr = (void *)(uintptr_t)info->ssi_addr; + } else if (info->ssi_signo == SIGCHLD) { + si.si_pid = info->ssi_pid; + si.si_status = info->ssi_status; + si.si_uid = info->ssi_uid; + } + action->sa_sigaction(info->ssi_signo, &si, NULL); +} + +#ifndef HOST_NAME_MAX +# ifdef _POSIX_HOST_NAME_MAX +# define HOST_NAME_MAX _POSIX_HOST_NAME_MAX +# else +# define HOST_NAME_MAX 255 +# endif +#endif + +char *qemu_get_host_name(Error **errp) +{ + long len = -1; + g_autofree char *hostname = NULL; + +#ifdef _SC_HOST_NAME_MAX + len = sysconf(_SC_HOST_NAME_MAX); +#endif /* _SC_HOST_NAME_MAX */ + + if (len < 0) { + len = HOST_NAME_MAX; + } + + /* Unfortunately, gethostname() below does not guarantee a + * NULL terminated string. Therefore, allocate one byte more + * to be sure. 
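+ *
+ * (POSIX leaves it unspecified whether a truncated name is
+ * NUL-terminated; g_new0() below zero-fills len + 1 bytes, so the
+ * final byte is guaranteed to stay 0.)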
*/ + hostname = g_new0(char, len + 1); + + if (gethostname(hostname, len) < 0) { + error_setg_errno(errp, errno, + "cannot get hostname"); + return NULL; + } + + return g_steal_pointer(&hostname); +} + +size_t qemu_get_host_physmem(void) +{ +#ifdef _SC_PHYS_PAGES + long pages = sysconf(_SC_PHYS_PAGES); + if (pages > 0) { + if (pages > SIZE_MAX / qemu_real_host_page_size) { + return SIZE_MAX; + } else { + return pages * qemu_real_host_page_size; + } + } +#endif + return 0; +} diff --git a/util/oslib-win32.c b/util/oslib-win32.c new file mode 100644 index 000000000..af559ef33 --- /dev/null +++ b/util/oslib-win32.c @@ -0,0 +1,654 @@ +/* + * os-win32.c + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (c) 2010-2016 Red Hat, Inc. + * + * QEMU library functions for win32 which are shared between QEMU and + * the QEMU tools. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * The implementation of g_poll (functions poll_rest, g_poll) at the end of + * this file are based on code from GNOME glib-2 and use a different license, + * see the license comment there. + */ + +#include "qemu/osdep.h" +#include <windows.h> +#include "qemu-common.h" +#include "qapi/error.h" +#include "qemu/main-loop.h" +#include "trace.h" +#include "qemu/sockets.h" +#include "qemu/cutils.h" +#include "qemu/error-report.h" +#include <malloc.h> + +/* this must come after including "trace.h" */ +#include <shlobj.h> + +void *qemu_oom_check(void *ptr) +{ + if (ptr == NULL) { + fprintf(stderr, "Failed to allocate memory: %lu\n", GetLastError()); + abort(); + } + return ptr; +} + +void *qemu_try_memalign(size_t alignment, size_t size) +{ + void *ptr; + + g_assert(size != 0); + if (alignment < sizeof(void *)) { + alignment = sizeof(void *); + } else { + g_assert(is_power_of_2(alignment)); + } + ptr = _aligned_malloc(size, alignment); + trace_qemu_memalign(alignment, size, ptr); + return ptr; +} + +void *qemu_memalign(size_t alignment, size_t size) +{ + return qemu_oom_check(qemu_try_memalign(alignment, size)); +} + +static int get_allocation_granularity(void) +{ + SYSTEM_INFO system_info; + + GetSystemInfo(&system_info); + return system_info.dwAllocationGranularity; +} + +void *qemu_anon_ram_alloc(size_t size, uint64_t *align, bool shared, + bool noreserve) +{ + void *ptr; + + if (noreserve) { + /* + * We need a MEM_COMMIT before accessing any memory in a MEM_RESERVE + * area; we cannot easily mimic POSIX MAP_NORESERVE semantics. 
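+ *
+ * (VirtualAlloc() with plain MEM_RESERVE would reserve address
+ * space without backing it, but touching reserved-only pages
+ * faults rather than committing on demand, so it is not an
+ * equivalent of MAP_NORESERVE.)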
+ */ + error_report("Skipping reservation of swap space is not supported."); + return NULL; + } + + ptr = VirtualAlloc(NULL, size, MEM_COMMIT, PAGE_READWRITE); + trace_qemu_anon_ram_alloc(size, ptr); + + if (ptr && align) { + *align = MAX(get_allocation_granularity(), getpagesize()); + } + return ptr; +} + +void qemu_vfree(void *ptr) +{ + trace_qemu_vfree(ptr); + _aligned_free(ptr); +} + +void qemu_anon_ram_free(void *ptr, size_t size) +{ + trace_qemu_anon_ram_free(ptr, size); + if (ptr) { + VirtualFree(ptr, 0, MEM_RELEASE); + } +} + +#ifndef _POSIX_THREAD_SAFE_FUNCTIONS +/* FIXME: add proper locking */ +struct tm *gmtime_r(const time_t *timep, struct tm *result) +{ + struct tm *p = gmtime(timep); + memset(result, 0, sizeof(*result)); + if (p) { + *result = *p; + p = result; + } + return p; +} + +/* FIXME: add proper locking */ +struct tm *localtime_r(const time_t *timep, struct tm *result) +{ + struct tm *p = localtime(timep); + memset(result, 0, sizeof(*result)); + if (p) { + *result = *p; + p = result; + } + return p; +} +#endif /* _POSIX_THREAD_SAFE_FUNCTIONS */ + +static int socket_error(void) +{ + switch (WSAGetLastError()) { + case 0: + return 0; + case WSAEINTR: + return EINTR; + case WSAEINVAL: + return EINVAL; + case WSA_INVALID_HANDLE: + return EBADF; + case WSA_NOT_ENOUGH_MEMORY: + return ENOMEM; + case WSA_INVALID_PARAMETER: + return EINVAL; + case WSAENAMETOOLONG: + return ENAMETOOLONG; + case WSAENOTEMPTY: + return ENOTEMPTY; + case WSAEWOULDBLOCK: + /* not using EWOULDBLOCK as we don't want code to have + * to check both EWOULDBLOCK and EAGAIN */ + return EAGAIN; + case WSAEINPROGRESS: + return EINPROGRESS; + case WSAEALREADY: + return EALREADY; + case WSAENOTSOCK: + return ENOTSOCK; + case WSAEDESTADDRREQ: + return EDESTADDRREQ; + case WSAEMSGSIZE: + return EMSGSIZE; + case WSAEPROTOTYPE: + return EPROTOTYPE; + case WSAENOPROTOOPT: + return ENOPROTOOPT; + case WSAEPROTONOSUPPORT: + return EPROTONOSUPPORT; + case WSAEOPNOTSUPP: + return EOPNOTSUPP; + case WSAEAFNOSUPPORT: + return EAFNOSUPPORT; + case WSAEADDRINUSE: + return EADDRINUSE; + case WSAEADDRNOTAVAIL: + return EADDRNOTAVAIL; + case WSAENETDOWN: + return ENETDOWN; + case WSAENETUNREACH: + return ENETUNREACH; + case WSAENETRESET: + return ENETRESET; + case WSAECONNABORTED: + return ECONNABORTED; + case WSAECONNRESET: + return ECONNRESET; + case WSAENOBUFS: + return ENOBUFS; + case WSAEISCONN: + return EISCONN; + case WSAENOTCONN: + return ENOTCONN; + case WSAETIMEDOUT: + return ETIMEDOUT; + case WSAECONNREFUSED: + return ECONNREFUSED; + case WSAELOOP: + return ELOOP; + case WSAEHOSTUNREACH: + return EHOSTUNREACH; + default: + return EIO; + } +} + +void qemu_set_block(int fd) +{ + unsigned long opt = 0; + WSAEventSelect(fd, NULL, 0); + ioctlsocket(fd, FIONBIO, &opt); +} + +int qemu_try_set_nonblock(int fd) +{ + unsigned long opt = 1; + if (ioctlsocket(fd, FIONBIO, &opt) != NO_ERROR) { + return -socket_error(); + } + return 0; +} + +void qemu_set_nonblock(int fd) +{ + (void)qemu_try_set_nonblock(fd); +} + +int socket_set_fast_reuse(int fd) +{ + /* Enabling the reuse of an endpoint that was used by a socket still in + * TIME_WAIT state is usually performed by setting SO_REUSEADDR. On Windows + * fast reuse is the default and SO_REUSEADDR does strange things. So we + * don't have to do anything here. 
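+ * (In particular, SO_REUSEADDR on Windows lets a socket bind an
+ * address that another socket is actively using, which is not the
+ * fast-reuse behaviour intended here.)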
More info can be found at: + * http://msdn.microsoft.com/en-us/library/windows/desktop/ms740621.aspx */ + return 0; +} + +int inet_aton(const char *cp, struct in_addr *ia) +{ + uint32_t addr = inet_addr(cp); + if (addr == 0xffffffff) { + return 0; + } + ia->s_addr = addr; + return 1; +} + +void qemu_set_cloexec(int fd) +{ +} + +/* Offset between 1/1/1601 and 1/1/1970 in 100 nanosec units */ +#define _W32_FT_OFFSET (116444736000000000ULL) + +int qemu_gettimeofday(qemu_timeval *tp) +{ + union { + unsigned long long ns100; /*time since 1 Jan 1601 in 100ns units */ + FILETIME ft; + } _now; + + if(tp) { + GetSystemTimeAsFileTime (&_now.ft); + tp->tv_usec=(long)((_now.ns100 / 10ULL) % 1000000ULL ); + tp->tv_sec= (long)((_now.ns100 - _W32_FT_OFFSET) / 10000000ULL); + } + /* Always return 0 as per Open Group Base Specifications Issue 6. + Do not set errno on error. */ + return 0; +} + +int qemu_get_thread_id(void) +{ + return GetCurrentThreadId(); +} + +char * +qemu_get_local_state_pathname(const char *relative_pathname) +{ + HRESULT result; + char base_path[MAX_PATH+1] = ""; + + result = SHGetFolderPath(NULL, CSIDL_COMMON_APPDATA, NULL, + /* SHGFP_TYPE_CURRENT */ 0, base_path); + if (result != S_OK) { + /* misconfigured environment */ + g_critical("CSIDL_COMMON_APPDATA unavailable: %ld", (long)result); + abort(); + } + return g_strdup_printf("%s" G_DIR_SEPARATOR_S "%s", base_path, + relative_pathname); +} + +void qemu_set_tty_echo(int fd, bool echo) +{ + HANDLE handle = (HANDLE)_get_osfhandle(fd); + DWORD dwMode = 0; + + if (handle == INVALID_HANDLE_VALUE) { + return; + } + + GetConsoleMode(handle, &dwMode); + + if (echo) { + SetConsoleMode(handle, dwMode | ENABLE_ECHO_INPUT | ENABLE_LINE_INPUT); + } else { + SetConsoleMode(handle, + dwMode & ~(ENABLE_ECHO_INPUT | ENABLE_LINE_INPUT)); + } +} + +static const char *exec_dir; + +void qemu_init_exec_dir(const char *argv0) +{ + + char *p; + char buf[MAX_PATH]; + DWORD len; + + if (exec_dir) { + return; + } + + len = GetModuleFileName(NULL, buf, sizeof(buf) - 1); + if (len == 0) { + return; + } + + buf[len] = 0; + p = buf + len - 1; + while (p != buf && *p != '\\') { + p--; + } + *p = 0; + if (access(buf, R_OK) == 0) { + exec_dir = g_strdup(buf); + } else { + exec_dir = CONFIG_BINDIR; + } +} + +const char *qemu_get_exec_dir(void) +{ + return exec_dir; +} + +int getpagesize(void) +{ + SYSTEM_INFO system_info; + + GetSystemInfo(&system_info); + return system_info.dwPageSize; +} + +void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus, + Error **errp) +{ + int i; + size_t pagesize = qemu_real_host_page_size; + + memory = (memory + pagesize - 1) & -pagesize; + for (i = 0; i < memory / pagesize; i++) { + memset(area + pagesize * i, 0, 1); + } +} + +char *qemu_get_pid_name(pid_t pid) +{ + /* XXX Implement me */ + abort(); +} + + +pid_t qemu_fork(Error **errp) +{ + errno = ENOSYS; + error_setg_errno(errp, errno, + "cannot fork child process"); + return -1; +} + + +#undef connect +int qemu_connect_wrap(int sockfd, const struct sockaddr *addr, + socklen_t addrlen) +{ + int ret; + ret = connect(sockfd, addr, addrlen); + if (ret < 0) { + if (WSAGetLastError() == WSAEWOULDBLOCK) { + errno = EINPROGRESS; + } else { + errno = socket_error(); + } + } + return ret; +} + + +#undef listen +int qemu_listen_wrap(int sockfd, int backlog) +{ + int ret; + ret = listen(sockfd, backlog); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef bind +int qemu_bind_wrap(int sockfd, const struct sockaddr *addr, + socklen_t addrlen) +{ + int ret; 
+ ret = bind(sockfd, addr, addrlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef socket +int qemu_socket_wrap(int domain, int type, int protocol) +{ + int ret; + ret = socket(domain, type, protocol); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef accept +int qemu_accept_wrap(int sockfd, struct sockaddr *addr, + socklen_t *addrlen) +{ + int ret; + ret = accept(sockfd, addr, addrlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef shutdown +int qemu_shutdown_wrap(int sockfd, int how) +{ + int ret; + ret = shutdown(sockfd, how); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef ioctlsocket +int qemu_ioctlsocket_wrap(int fd, int req, void *val) +{ + int ret; + ret = ioctlsocket(fd, req, val); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef closesocket +int qemu_closesocket_wrap(int fd) +{ + int ret; + ret = closesocket(fd); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef getsockopt +int qemu_getsockopt_wrap(int sockfd, int level, int optname, + void *optval, socklen_t *optlen) +{ + int ret; + ret = getsockopt(sockfd, level, optname, optval, optlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef setsockopt +int qemu_setsockopt_wrap(int sockfd, int level, int optname, + const void *optval, socklen_t optlen) +{ + int ret; + ret = setsockopt(sockfd, level, optname, optval, optlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef getpeername +int qemu_getpeername_wrap(int sockfd, struct sockaddr *addr, + socklen_t *addrlen) +{ + int ret; + ret = getpeername(sockfd, addr, addrlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef getsockname +int qemu_getsockname_wrap(int sockfd, struct sockaddr *addr, + socklen_t *addrlen) +{ + int ret; + ret = getsockname(sockfd, addr, addrlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef send +ssize_t qemu_send_wrap(int sockfd, const void *buf, size_t len, int flags) +{ + int ret; + ret = send(sockfd, buf, len, flags); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef sendto +ssize_t qemu_sendto_wrap(int sockfd, const void *buf, size_t len, int flags, + const struct sockaddr *addr, socklen_t addrlen) +{ + int ret; + ret = sendto(sockfd, buf, len, flags, addr, addrlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef recv +ssize_t qemu_recv_wrap(int sockfd, void *buf, size_t len, int flags) +{ + int ret; + ret = recv(sockfd, buf, len, flags); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + + +#undef recvfrom +ssize_t qemu_recvfrom_wrap(int sockfd, void *buf, size_t len, int flags, + struct sockaddr *addr, socklen_t *addrlen) +{ + int ret; + ret = recvfrom(sockfd, buf, len, flags, addr, addrlen); + if (ret < 0) { + errno = socket_error(); + } + return ret; +} + +bool qemu_write_pidfile(const char *filename, Error **errp) +{ + char buffer[128]; + int len; + HANDLE file; + OVERLAPPED overlap; + BOOL ret; + memset(&overlap, 0, sizeof(overlap)); + + file = CreateFile(filename, GENERIC_WRITE, FILE_SHARE_READ, NULL, + OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL); + + if (file == INVALID_HANDLE_VALUE) { + error_setg(errp, "Failed to create PID file"); + return false; + } + len = snprintf(buffer, sizeof(buffer), FMT_pid "\n", (pid_t)getpid()); + ret = WriteFile(file, (LPCVOID)buffer, (DWORD)len, + NULL, &overlap); + 
CloseHandle(file); + if (ret == 0) { + error_setg(errp, "Failed to write PID file"); + return false; + } + return true; +} + +char *qemu_get_host_name(Error **errp) +{ + wchar_t tmp[MAX_COMPUTERNAME_LENGTH + 1]; + DWORD size = G_N_ELEMENTS(tmp); + + if (GetComputerNameW(tmp, &size) == 0) { + error_setg_win32(errp, GetLastError(), "failed close handle"); + return NULL; + } + + return g_utf16_to_utf8(tmp, size, NULL, NULL, NULL); +} + +size_t qemu_get_host_physmem(void) +{ + MEMORYSTATUSEX statex; + statex.dwLength = sizeof(statex); + + if (GlobalMemoryStatusEx(&statex)) { + return statex.ullTotalPhys; + } + return 0; +} diff --git a/util/pagesize.c b/util/pagesize.c new file mode 100644 index 000000000..998632cf6 --- /dev/null +++ b/util/pagesize.c @@ -0,0 +1,18 @@ +/* + * pagesize.c - query the host about its page size + * + * Copyright (C) 2017, Emilio G. Cota <cota@braap.org> + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" + +uintptr_t qemu_real_host_page_size; +intptr_t qemu_real_host_page_mask; + +static void __attribute__((constructor)) init_real_host_page_size(void) +{ + qemu_real_host_page_size = getpagesize(); + qemu_real_host_page_mask = -(intptr_t)qemu_real_host_page_size; +} diff --git a/util/path.c b/util/path.c new file mode 100644 index 000000000..8e174eb43 --- /dev/null +++ b/util/path.c @@ -0,0 +1,70 @@ +/* Code to mangle pathnames into those matching a given prefix. + eg. open("/lib/foo.so") => open("/usr/gnemul/i386-linux/lib/foo.so"); + + The assumption is that this area does not change. +*/ +#include "qemu/osdep.h" +#include <sys/param.h> +#include <dirent.h> +#include "qemu/cutils.h" +#include "qemu/path.h" +#include "qemu/thread.h" + +static const char *base; +static GHashTable *hash; +static QemuMutex lock; + +void init_paths(const char *prefix) +{ + if (prefix[0] == '\0' || !strcmp(prefix, "/")) { + return; + } + + if (prefix[0] == '/') { + base = g_strdup(prefix); + } else { + char *cwd = g_get_current_dir(); + base = g_build_filename(cwd, prefix, NULL); + g_free(cwd); + } + + hash = g_hash_table_new(g_str_hash, g_str_equal); + qemu_mutex_init(&lock); +} + +/* Look for path in emulation dir, otherwise return name. */ +const char *path(const char *name) +{ + gpointer key, value; + const char *ret; + + /* Only do absolute paths: quick and dirty, but should mostly be OK. */ + if (!base || !name || name[0] != '/') { + return name; + } + + qemu_mutex_lock(&lock); + + /* Have we looked up this file before? */ + if (g_hash_table_lookup_extended(hash, name, &key, &value)) { + ret = value ? value : name; + } else { + char *save = g_strdup(name); + char *full = g_build_filename(base, name, NULL); + + /* Look for the path; record the result, pass or fail. */ + if (access(full, F_OK) == 0) { + /* Exists. */ + g_hash_table_insert(hash, save, full); + ret = full; + } else { + /* Does not exist. */ + g_free(full); + g_hash_table_insert(hash, save, NULL); + ret = name; + } + } + + qemu_mutex_unlock(&lock); + return ret; +} diff --git a/util/qdist.c b/util/qdist.c new file mode 100644 index 000000000..5f75e24c2 --- /dev/null +++ b/util/qdist.c @@ -0,0 +1,397 @@ +/* + * qdist.c - QEMU helpers for handling frequency distributions of data. + * + * Copyright (C) 2016, Emilio G. Cota <cota@braap.org> + * + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
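+ *
+ * Minimal usage sketch, for illustration:
+ *
+ *   struct qdist dist;
+ *   char *hist;
+ *
+ *   qdist_init(&dist);
+ *   qdist_inc(&dist, 1.0);
+ *   qdist_inc(&dist, 2.0);
+ *   hist = qdist_pr_plain(&dist, 0);
+ *   (print hist, then g_free(hist))
+ *   qdist_destroy(&dist);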
+ */ +#include "qemu/osdep.h" +#include "qemu/qdist.h" + +#include <math.h> +#ifndef NAN +#define NAN (0.0 / 0.0) +#endif + +#define QDIST_EMPTY_STR "(empty)" + +void qdist_init(struct qdist *dist) +{ + dist->entries = g_new(struct qdist_entry, 1); + dist->size = 1; + dist->n = 0; +} + +void qdist_destroy(struct qdist *dist) +{ + g_free(dist->entries); +} + +static inline int qdist_cmp_double(double a, double b) +{ + if (a > b) { + return 1; + } else if (a < b) { + return -1; + } + return 0; +} + +static int qdist_cmp(const void *ap, const void *bp) +{ + const struct qdist_entry *a = ap; + const struct qdist_entry *b = bp; + + return qdist_cmp_double(a->x, b->x); +} + +void qdist_add(struct qdist *dist, double x, long count) +{ + struct qdist_entry *entry = NULL; + + if (dist->n) { + struct qdist_entry e; + + e.x = x; + entry = bsearch(&e, dist->entries, dist->n, sizeof(e), qdist_cmp); + } + + if (entry) { + entry->count += count; + return; + } + + if (unlikely(dist->n == dist->size)) { + dist->size *= 2; + dist->entries = g_renew(struct qdist_entry, dist->entries, dist->size); + } + dist->n++; + entry = &dist->entries[dist->n - 1]; + entry->x = x; + entry->count = count; + qsort(dist->entries, dist->n, sizeof(*entry), qdist_cmp); +} + +void qdist_inc(struct qdist *dist, double x) +{ + qdist_add(dist, x, 1); +} + +/* + * Unicode for block elements. See: + * https://en.wikipedia.org/wiki/Block_Elements + */ +static const gunichar qdist_blocks[] = { + 0x2581, + 0x2582, + 0x2583, + 0x2584, + 0x2585, + 0x2586, + 0x2587, + 0x2588 +}; + +#define QDIST_NR_BLOCK_CODES ARRAY_SIZE(qdist_blocks) + +/* + * Print a distribution into a string. + * + * This function assumes that appropriate binning has been done on the input; + * see qdist_bin__internal() and qdist_pr_plain(). + * + * Callers must free the returned string with g_free(). + */ +static char *qdist_pr_internal(const struct qdist *dist) +{ + double min, max; + GString *s = g_string_new(""); + size_t i; + + /* if only one entry, its printout will be either full or empty */ + if (dist->n == 1) { + if (dist->entries[0].count) { + g_string_append_unichar(s, qdist_blocks[QDIST_NR_BLOCK_CODES - 1]); + } else { + g_string_append_c(s, ' '); + } + goto out; + } + + /* get min and max counts */ + min = dist->entries[0].count; + max = min; + for (i = 0; i < dist->n; i++) { + struct qdist_entry *e = &dist->entries[i]; + + if (e->count < min) { + min = e->count; + } + if (e->count > max) { + max = e->count; + } + } + + for (i = 0; i < dist->n; i++) { + struct qdist_entry *e = &dist->entries[i]; + int index; + + /* make an exception with 0; instead of using block[0], print a space */ + if (e->count) { + /* divide first to avoid loss of precision when e->count == max */ + index = (e->count - min) / (max - min) * (QDIST_NR_BLOCK_CODES - 1); + g_string_append_unichar(s, qdist_blocks[index]); + } else { + g_string_append_c(s, ' '); + } + } + out: + return g_string_free(s, FALSE); +} + +/* + * Bin the distribution in @from into @n bins of consecutive, non-overlapping + * intervals, copying the result to @to. + * + * This function is internal to qdist: only this file and test code should + * ever call it. + * + * Note: calling this function on an already-binned qdist is a bug. + * + * If @n == 0 or @from->n == 1, use @from->n. 
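+ *
+ * Example (hypothetical input): entries at x = {0, 1, 2, 3} binned
+ * with @n == 2 give step == 1.5; bin 0 captures [0, 1.5) (the
+ * entries at 0 and 1) and the rightmost bin captures [1.5, 3]
+ * (the entries at 2 and 3).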
+ */ +void qdist_bin__internal(struct qdist *to, const struct qdist *from, size_t n) +{ + double xmin, xmax; + double step; + size_t i, j; + + qdist_init(to); + + if (from->n == 0) { + return; + } + if (n == 0 || from->n == 1) { + n = from->n; + } + + /* set equally-sized bins between @from's left and right */ + xmin = qdist_xmin(from); + xmax = qdist_xmax(from); + step = (xmax - xmin) / n; + + if (n == from->n) { + /* if @from's entries are equally spaced, no need to re-bin */ + for (i = 0; i < from->n; i++) { + if (from->entries[i].x != xmin + i * step) { + goto rebin; + } + } + /* they're equally spaced, so copy the dist and bail out */ + to->entries = g_renew(struct qdist_entry, to->entries, n); + to->n = from->n; + memcpy(to->entries, from->entries, sizeof(*to->entries) * to->n); + return; + } + + rebin: + j = 0; + for (i = 0; i < n; i++) { + double x; + double left, right; + + left = xmin + i * step; + right = xmin + (i + 1) * step; + + /* Add x, even if it might not get any counts later */ + x = left; + qdist_add(to, x, 0); + + /* + * To avoid double-counting we capture [left, right) ranges, except for + * the righmost bin, which captures a [left, right] range. + */ + while (j < from->n && (from->entries[j].x < right || i == n - 1)) { + struct qdist_entry *o = &from->entries[j]; + + qdist_add(to, x, o->count); + j++; + } + } +} + +/* + * Print @dist into a string, after re-binning it into @n bins of consecutive, + * non-overlapping intervals. + * + * If @n == 0, use @orig->n. + * + * Callers must free the returned string with g_free(). + */ +char *qdist_pr_plain(const struct qdist *dist, size_t n) +{ + struct qdist binned; + char *ret; + + if (dist->n == 0) { + return g_strdup(QDIST_EMPTY_STR); + } + qdist_bin__internal(&binned, dist, n); + ret = qdist_pr_internal(&binned); + qdist_destroy(&binned); + return ret; +} + +static char *qdist_pr_label(const struct qdist *dist, size_t n_bins, + uint32_t opt, bool is_left) +{ + const char *percent; + const char *lparen; + const char *rparen; + GString *s; + double x1, x2, step; + double x; + double n; + int dec; + + s = g_string_new(""); + if (!(opt & QDIST_PR_LABELS)) { + goto out; + } + + dec = opt & QDIST_PR_NODECIMAL ? 0 : 1; + percent = opt & QDIST_PR_PERCENT ? "%" : ""; + + n = n_bins ? n_bins : dist->n; + x = is_left ? qdist_xmin(dist) : qdist_xmax(dist); + step = (qdist_xmax(dist) - qdist_xmin(dist)) / n; + + if (opt & QDIST_PR_100X) { + x *= 100.0; + step *= 100.0; + } + if (opt & QDIST_PR_NOBINRANGE) { + lparen = rparen = ""; + x1 = x; + x2 = x; /* unnecessary, but a dumb compiler might not figure it out */ + } else { + lparen = "["; + rparen = is_left ? ")" : "]"; + if (is_left) { + x1 = x; + x2 = x + step; + } else { + x1 = x - step; + x2 = x; + } + } + g_string_append_printf(s, "%s%.*f", lparen, dec, x1); + if (!(opt & QDIST_PR_NOBINRANGE)) { + g_string_append_printf(s, ",%.*f%s", dec, x2, rparen); + } + g_string_append(s, percent); + out: + return g_string_free(s, FALSE); +} + +/* + * Print the distribution's histogram into a string. + * + * See also: qdist_pr_plain(). + * + * Callers must free the returned string with g_free(). + */ +char *qdist_pr(const struct qdist *dist, size_t n_bins, uint32_t opt) +{ + const char *border = opt & QDIST_PR_BORDER ? 
"|" : ""; + char *llabel, *rlabel; + char *hgram; + GString *s; + + if (dist->n == 0) { + return g_strdup(QDIST_EMPTY_STR); + } + + s = g_string_new(""); + + llabel = qdist_pr_label(dist, n_bins, opt, true); + rlabel = qdist_pr_label(dist, n_bins, opt, false); + hgram = qdist_pr_plain(dist, n_bins); + g_string_append_printf(s, "%s%s%s%s%s", + llabel, border, hgram, border, rlabel); + g_free(llabel); + g_free(rlabel); + g_free(hgram); + + return g_string_free(s, FALSE); +} + +static inline double qdist_x(const struct qdist *dist, int index) +{ + if (dist->n == 0) { + return NAN; + } + return dist->entries[index].x; +} + +double qdist_xmin(const struct qdist *dist) +{ + return qdist_x(dist, 0); +} + +double qdist_xmax(const struct qdist *dist) +{ + return qdist_x(dist, dist->n - 1); +} + +size_t qdist_unique_entries(const struct qdist *dist) +{ + return dist->n; +} + +unsigned long qdist_sample_count(const struct qdist *dist) +{ + unsigned long count = 0; + size_t i; + + for (i = 0; i < dist->n; i++) { + struct qdist_entry *e = &dist->entries[i]; + + count += e->count; + } + return count; +} + +static double qdist_pairwise_avg(const struct qdist *dist, size_t index, + size_t n, unsigned long count) +{ + /* amortize the recursion by using a base case > 2 */ + if (n <= 8) { + size_t i; + double ret = 0; + + for (i = 0; i < n; i++) { + struct qdist_entry *e = &dist->entries[index + i]; + + ret += e->x * e->count / count; + } + return ret; + } else { + size_t n2 = n / 2; + + return qdist_pairwise_avg(dist, index, n2, count) + + qdist_pairwise_avg(dist, index + n2, n - n2, count); + } +} + +double qdist_avg(const struct qdist *dist) +{ + unsigned long count; + + count = qdist_sample_count(dist); + if (!count) { + return NAN; + } + return qdist_pairwise_avg(dist, 0, dist->n, count); +} diff --git a/util/qemu-co-shared-resource.c b/util/qemu-co-shared-resource.c new file mode 100644 index 000000000..a66cc07e7 --- /dev/null +++ b/util/qemu-co-shared-resource.c @@ -0,0 +1,90 @@ +/* + * Helper functionality for distributing a fixed total amount of + * an abstract resource among multiple coroutines. + * + * Copyright (c) 2019 Virtuozzo International GmbH + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "qemu/osdep.h" +#include "qemu/coroutine.h" +#include "qemu/co-shared-resource.h" + +struct SharedResource { + uint64_t total; /* Set in shres_create() and not changed anymore */ + + /* State fields protected by lock */ + uint64_t available; + CoQueue queue; + + QemuMutex lock; +}; + +SharedResource *shres_create(uint64_t total) +{ + SharedResource *s = g_new0(SharedResource, 1); + + s->total = s->available = total; + qemu_co_queue_init(&s->queue); + qemu_mutex_init(&s->lock); + + return s; +} + +void shres_destroy(SharedResource *s) +{ + assert(s->available == s->total); + qemu_mutex_destroy(&s->lock); + g_free(s); +} + +/* Called with lock held. */ +static bool co_try_get_from_shres_locked(SharedResource *s, uint64_t n) +{ + if (s->available >= n) { + s->available -= n; + return true; + } + + return false; +} + +bool co_try_get_from_shres(SharedResource *s, uint64_t n) +{ + QEMU_LOCK_GUARD(&s->lock); + return co_try_get_from_shres_locked(s, n); +} + +void coroutine_fn co_get_from_shres(SharedResource *s, uint64_t n) +{ + assert(n <= s->total); + QEMU_LOCK_GUARD(&s->lock); + while (!co_try_get_from_shres_locked(s, n)) { + qemu_co_queue_wait(&s->queue, &s->lock); + } +} + +void coroutine_fn co_put_to_shres(SharedResource *s, uint64_t n) +{ + QEMU_LOCK_GUARD(&s->lock); + assert(s->total - s->available >= n); + s->available += n; + qemu_co_queue_restart_all(&s->queue); +} diff --git a/util/qemu-config.c b/util/qemu-config.c new file mode 100644 index 000000000..436ab63b1 --- /dev/null +++ b/util/qemu-config.c @@ -0,0 +1,565 @@ +#include "qemu/osdep.h" +#include "block/qdict.h" /* for qdict_extract_subqdict() */ +#include "qapi/error.h" +#include "qapi/qapi-commands-misc.h" +#include "qapi/qmp/qerror.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qlist.h" +#include "qemu/error-report.h" +#include "qemu/option.h" +#include "qemu/config-file.h" + +static QemuOptsList *vm_config_groups[48]; +static QemuOptsList *drive_config_groups[5]; + +static QemuOptsList *find_list(QemuOptsList **lists, const char *group, + Error **errp) +{ + int i; + + qemu_load_module_for_opts(group); + for (i = 0; lists[i] != NULL; i++) { + if (strcmp(lists[i]->name, group) == 0) + break; + } + if (lists[i] == NULL) { + error_setg(errp, "There is no option group '%s'", group); + } + return lists[i]; +} + +QemuOptsList *qemu_find_opts(const char *group) +{ + QemuOptsList *ret; + Error *local_err = NULL; + + ret = find_list(vm_config_groups, group, &local_err); + if (local_err) { + error_report_err(local_err); + } + + return ret; +} + +QemuOpts *qemu_find_opts_singleton(const char *group) +{ + QemuOptsList *list; + QemuOpts *opts; + + list = qemu_find_opts(group); + assert(list); + opts = qemu_opts_find(list, NULL); + if (!opts) { + opts = qemu_opts_create(list, NULL, 0, &error_abort); + } + return opts; +} + +static CommandLineParameterInfoList *query_option_descs(const QemuOptDesc *desc) +{ + CommandLineParameterInfoList *param_list = NULL; + CommandLineParameterInfo *info; + int i; + + for (i = 0; desc[i].name != NULL; i++) { + info = g_malloc0(sizeof(*info)); + info->name = g_strdup(desc[i].name); + + switch (desc[i].type) { + case QEMU_OPT_STRING: + info->type = COMMAND_LINE_PARAMETER_TYPE_STRING; + break; + case QEMU_OPT_BOOL: + info->type = COMMAND_LINE_PARAMETER_TYPE_BOOLEAN; + break; + case QEMU_OPT_NUMBER: + info->type = COMMAND_LINE_PARAMETER_TYPE_NUMBER; + break; + case QEMU_OPT_SIZE: + info->type = COMMAND_LINE_PARAMETER_TYPE_SIZE; + break; + } + + if (desc[i].help) { + info->has_help = 
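/*
 * Sketch (not part of the patch): the intended SharedResource pattern,
 * e.g. capping how much of some budget a group of coroutines may hold at
 * once.  Create with shres_create(total); destroy only after everything
 * has been returned (shres_destroy() asserts available == total).
 */
static void coroutine_fn bounded_work_example(SharedResource *budget,
                                              uint64_t nbytes)
{
    co_get_from_shres(budget, nbytes);  /* yields while over budget */
    /* ... do the work that accounts for nbytes ... */
    co_put_to_shres(budget, nbytes);    /* returns budget, wakes waiters */
}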
true; + info->help = g_strdup(desc[i].help); + } + if (desc[i].def_value_str) { + info->has_q_default = true; + info->q_default = g_strdup(desc[i].def_value_str); + } + + QAPI_LIST_PREPEND(param_list, info); + } + + return param_list; +} + +/* remove repeated entry from the info list */ +static void cleanup_infolist(CommandLineParameterInfoList *head) +{ + CommandLineParameterInfoList *pre_entry, *cur, *del_entry; + + cur = head; + while (cur->next) { + pre_entry = head; + while (pre_entry != cur->next) { + if (!strcmp(pre_entry->value->name, cur->next->value->name)) { + del_entry = cur->next; + cur->next = cur->next->next; + del_entry->next = NULL; + qapi_free_CommandLineParameterInfoList(del_entry); + break; + } + pre_entry = pre_entry->next; + } + cur = cur->next; + } +} + +/* merge the description items of two parameter infolists */ +static void connect_infolist(CommandLineParameterInfoList *head, + CommandLineParameterInfoList *new) +{ + CommandLineParameterInfoList *cur; + + cur = head; + while (cur->next) { + cur = cur->next; + } + cur->next = new; +} + +/* access all the local QemuOptsLists for drive option */ +static CommandLineParameterInfoList *get_drive_infolist(void) +{ + CommandLineParameterInfoList *head = NULL, *cur; + int i; + + for (i = 0; drive_config_groups[i] != NULL; i++) { + if (!head) { + head = query_option_descs(drive_config_groups[i]->desc); + } else { + cur = query_option_descs(drive_config_groups[i]->desc); + connect_infolist(head, cur); + } + } + cleanup_infolist(head); + + return head; +} + +/* restore machine options that are now machine's properties */ +static QemuOptsList machine_opts = { + .merge_lists = true, + .head = QTAILQ_HEAD_INITIALIZER(machine_opts.head), + .desc = { + { + .name = "type", + .type = QEMU_OPT_STRING, + .help = "emulated machine" + },{ + .name = "accel", + .type = QEMU_OPT_STRING, + .help = "accelerator list", + },{ + .name = "kernel_irqchip", + .type = QEMU_OPT_BOOL, + .help = "use KVM in-kernel irqchip", + },{ + .name = "kvm_shadow_mem", + .type = QEMU_OPT_SIZE, + .help = "KVM shadow MMU size", + },{ + .name = "kernel", + .type = QEMU_OPT_STRING, + .help = "Linux kernel image file", + },{ + .name = "initrd", + .type = QEMU_OPT_STRING, + .help = "Linux initial ramdisk file", + },{ + .name = "append", + .type = QEMU_OPT_STRING, + .help = "Linux kernel command line", + },{ + .name = "dtb", + .type = QEMU_OPT_STRING, + .help = "Linux kernel device tree file", + },{ + .name = "dumpdtb", + .type = QEMU_OPT_STRING, + .help = "Dump current dtb to a file and quit", + },{ + .name = "phandle_start", + .type = QEMU_OPT_NUMBER, + .help = "The first phandle ID we may generate dynamically", + },{ + .name = "dt_compatible", + .type = QEMU_OPT_STRING, + .help = "Overrides the \"compatible\" property of the dt root node", + },{ + .name = "dump-guest-core", + .type = QEMU_OPT_BOOL, + .help = "Include guest memory in a core dump", + },{ + .name = "mem-merge", + .type = QEMU_OPT_BOOL, + .help = "enable/disable memory merge support", + },{ + .name = "usb", + .type = QEMU_OPT_BOOL, + .help = "Set on/off to enable/disable usb", + },{ + .name = "firmware", + .type = QEMU_OPT_STRING, + .help = "firmware image", + },{ + .name = "iommu", + .type = QEMU_OPT_BOOL, + .help = "Set on/off to enable/disable Intel IOMMU (VT-d)", + },{ + .name = "suppress-vmdesc", + .type = QEMU_OPT_BOOL, + .help = "Set on to disable self-describing migration", + },{ + .name = "aes-key-wrap", + .type = QEMU_OPT_BOOL, + .help = "enable/disable AES key wrapping using the CPACF wrapping 
key", + },{ + .name = "dea-key-wrap", + .type = QEMU_OPT_BOOL, + .help = "enable/disable DEA key wrapping using the CPACF wrapping key", + },{ + .name = "loadparm", + .type = QEMU_OPT_STRING, + .help = "Up to 8 chars in set of [A-Za-z0-9. ](lower case chars" + " converted to upper case) to pass to machine" + " loader, boot manager, and guest kernel", + }, + { /* End of list */ } + } +}; + +CommandLineOptionInfoList *qmp_query_command_line_options(bool has_option, + const char *option, + Error **errp) +{ + CommandLineOptionInfoList *conf_list = NULL; + CommandLineOptionInfo *info; + int i; + + for (i = 0; vm_config_groups[i] != NULL; i++) { + if (!has_option || !strcmp(option, vm_config_groups[i]->name)) { + info = g_malloc0(sizeof(*info)); + info->option = g_strdup(vm_config_groups[i]->name); + if (!strcmp("drive", vm_config_groups[i]->name)) { + info->parameters = get_drive_infolist(); + } else { + info->parameters = + query_option_descs(vm_config_groups[i]->desc); + } + QAPI_LIST_PREPEND(conf_list, info); + } + } + + if (!has_option || !strcmp(option, "machine")) { + info = g_malloc0(sizeof(*info)); + info->option = g_strdup("machine"); + info->parameters = query_option_descs(machine_opts.desc); + QAPI_LIST_PREPEND(conf_list, info); + } + + if (conf_list == NULL) { + error_setg(errp, "invalid option name: %s", option); + } + + return conf_list; +} + +QemuOptsList *qemu_find_opts_err(const char *group, Error **errp) +{ + return find_list(vm_config_groups, group, errp); +} + +void qemu_add_drive_opts(QemuOptsList *list) +{ + int entries, i; + + entries = ARRAY_SIZE(drive_config_groups); + entries--; /* keep list NULL terminated */ + for (i = 0; i < entries; i++) { + if (drive_config_groups[i] == NULL) { + drive_config_groups[i] = list; + return; + } + } + fprintf(stderr, "ran out of space in drive_config_groups"); + abort(); +} + +void qemu_add_opts(QemuOptsList *list) +{ + int entries, i; + + entries = ARRAY_SIZE(vm_config_groups); + entries--; /* keep list NULL terminated */ + for (i = 0; i < entries; i++) { + if (vm_config_groups[i] == NULL) { + vm_config_groups[i] = list; + return; + } + } + fprintf(stderr, "ran out of space in vm_config_groups"); + abort(); +} + +struct ConfigWriteData { + QemuOptsList *list; + FILE *fp; +}; + +static int config_write_opt(void *opaque, const char *name, const char *value, + Error **errp) +{ + struct ConfigWriteData *data = opaque; + + fprintf(data->fp, " %s = \"%s\"\n", name, value); + return 0; +} + +static int config_write_opts(void *opaque, QemuOpts *opts, Error **errp) +{ + struct ConfigWriteData *data = opaque; + const char *id = qemu_opts_id(opts); + + if (id) { + fprintf(data->fp, "[%s \"%s\"]\n", data->list->name, id); + } else { + fprintf(data->fp, "[%s]\n", data->list->name); + } + qemu_opt_foreach(opts, config_write_opt, data, NULL); + fprintf(data->fp, "\n"); + return 0; +} + +void qemu_config_write(FILE *fp) +{ + struct ConfigWriteData data = { .fp = fp }; + QemuOptsList **lists = vm_config_groups; + int i; + + fprintf(fp, "# qemu config file\n\n"); + for (i = 0; lists[i] != NULL; i++) { + data.list = lists[i]; + qemu_opts_foreach(data.list, config_write_opts, &data, NULL); + } +} + +/* Returns number of config groups on success, -errno on error */ +static int qemu_config_foreach(FILE *fp, QEMUConfigCB *cb, void *opaque, + const char *fname, Error **errp) +{ + char line[1024], prev_group[64], group[64], arg[64], value[1024]; + Location loc; + Error *local_err = NULL; + QDict *qdict = NULL; + int res = -EINVAL, lno = 0; + int count = 0; + 
+ loc_push_none(&loc); + while (fgets(line, sizeof(line), fp) != NULL) { + ++lno; + if (line[0] == '\n') { + /* skip empty lines */ + continue; + } + if (line[0] == '#') { + /* comment */ + continue; + } + if (line[0] == '[') { + QDict *prev = qdict; + if (sscanf(line, "[%63s \"%63[^\"]\"]", group, value) == 2) { + qdict = qdict_new(); + qdict_put_str(qdict, "id", value); + count++; + } else if (sscanf(line, "[%63[^]]]", group) == 1) { + qdict = qdict_new(); + count++; + } + if (qdict != prev) { + if (prev) { + cb(prev_group, prev, opaque, &local_err); + qobject_unref(prev); + if (local_err) { + error_propagate(errp, local_err); + goto out; + } + } + strcpy(prev_group, group); + continue; + } + } + loc_set_file(fname, lno); + value[0] = '\0'; + if (sscanf(line, " %63s = \"%1023[^\"]\"", arg, value) == 2 || + sscanf(line, " %63s = \"\"", arg) == 1) { + /* arg = value */ + if (qdict == NULL) { + error_setg(errp, "no group defined"); + goto out; + } + qdict_put_str(qdict, arg, value); + continue; + } + error_setg(errp, "parse error"); + goto out; + } + if (ferror(fp)) { + loc_pop(&loc); + error_setg_errno(errp, errno, "Cannot read config file"); + goto out_no_loc; + } + res = count; + if (qdict) { + cb(group, qdict, opaque, errp); + } +out: + loc_pop(&loc); +out_no_loc: + qobject_unref(qdict); + return res; +} + +void qemu_config_do_parse(const char *group, QDict *qdict, void *opaque, Error **errp) +{ + QemuOptsList **lists = opaque; + QemuOptsList *list; + + list = find_list(lists, group, errp); + if (!list) { + return; + } + + qemu_opts_from_qdict(list, qdict, errp); +} + +int qemu_config_parse(FILE *fp, QemuOptsList **lists, const char *fname, Error **errp) +{ + return qemu_config_foreach(fp, qemu_config_do_parse, lists, fname, errp); +} + +int qemu_read_config_file(const char *filename, QEMUConfigCB *cb, Error **errp) +{ + FILE *f = fopen(filename, "r"); + int ret; + + if (f == NULL) { + error_setg_file_open(errp, errno, filename); + return -errno; + } + + ret = qemu_config_foreach(f, cb, vm_config_groups, filename, errp); + fclose(f); + return ret; +} + +static void config_parse_qdict_section(QDict *options, QemuOptsList *opts, + Error **errp) +{ + QemuOpts *subopts; + QDict *subqdict; + QList *list = NULL; + size_t orig_size, enum_size; + char *prefix; + + prefix = g_strdup_printf("%s.", opts->name); + qdict_extract_subqdict(options, &subqdict, prefix); + g_free(prefix); + orig_size = qdict_size(subqdict); + if (!orig_size) { + goto out; + } + + subopts = qemu_opts_create(opts, NULL, 0, errp); + if (!subopts) { + goto out; + } + + if (!qemu_opts_absorb_qdict(subopts, subqdict, errp)) { + goto out; + } + + enum_size = qdict_size(subqdict); + if (enum_size < orig_size && enum_size) { + error_setg(errp, "Unknown option '%s' for [%s]", + qdict_first(subqdict)->key, opts->name); + goto out; + } + + if (enum_size) { + /* Multiple, enumerated sections */ + QListEntry *list_entry; + unsigned i = 0; + + /* Not required anymore */ + qemu_opts_del(subopts); + + qdict_array_split(subqdict, &list); + if (qdict_size(subqdict)) { + error_setg(errp, "Unused option '%s' for [%s]", + qdict_first(subqdict)->key, opts->name); + goto out; + } + + QLIST_FOREACH_ENTRY(list, list_entry) { + QDict *section = qobject_to(QDict, qlist_entry_obj(list_entry)); + char *opt_name; + + if (!section) { + error_setg(errp, "[%s] section (index %u) does not consist of " + "keys", opts->name, i); + goto out; + } + + opt_name = g_strdup_printf("%s.%u", opts->name, i++); + subopts = qemu_opts_create(opts, opt_name, 1, errp); + 
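/*
 * Sketch (not part of the patch): feeding the same options through a QDict
 * instead of a file.  Keys carry a "<group>." prefix, which
 * qdict_extract_subqdict() strips; numbered keys (e.g. "foo.0.bar",
 * "foo.1.bar") are split by qdict_array_split() into enumerated sections.
 * qemu_config_parse_qdict(), defined just below, drives this per group.
 */
static void parse_qdict_example(QemuOptsList **lists, Error **errp)
{
    QDict *options = qdict_new();

    qdict_put_str(options, "machine.accel", "kvm");
    qemu_config_parse_qdict(options, lists, errp);
    qobject_unref(options);
}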
g_free(opt_name); + if (!subopts) { + goto out; + } + + if (!qemu_opts_absorb_qdict(subopts, section, errp)) { + qemu_opts_del(subopts); + goto out; + } + + if (qdict_size(section)) { + error_setg(errp, "[%s] section doesn't support the option '%s'", + opts->name, qdict_first(section)->key); + qemu_opts_del(subopts); + goto out; + } + } + } + +out: + qobject_unref(subqdict); + qobject_unref(list); +} + +void qemu_config_parse_qdict(QDict *options, QemuOptsList **lists, + Error **errp) +{ + int i; + Error *local_err = NULL; + + for (i = 0; lists[i]; i++) { + config_parse_qdict_section(options, lists[i], &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } +} diff --git a/util/qemu-coroutine-io.c b/util/qemu-coroutine-io.c new file mode 100644 index 000000000..5b80bb416 --- /dev/null +++ b/util/qemu-coroutine-io.c @@ -0,0 +1,93 @@ +/* + * Coroutine-aware I/O functions + * + * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation. + * Copyright (c) 2011, Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qemu/sockets.h" +#include "qemu/coroutine.h" +#include "qemu/iov.h" +#include "qemu/main-loop.h" + +ssize_t coroutine_fn +qemu_co_sendv_recvv(int sockfd, struct iovec *iov, unsigned iov_cnt, + size_t offset, size_t bytes, bool do_send) +{ + size_t done = 0; + ssize_t ret; + while (done < bytes) { + ret = iov_send_recv(sockfd, iov, iov_cnt, + offset + done, bytes - done, do_send); + if (ret > 0) { + done += ret; + } else if (ret < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK) { + qemu_coroutine_yield(); + } else if (done == 0) { + return -errno; + } else { + break; + } + } else if (ret == 0 && !do_send) { + /* write (send) should never return 0. + * read (recv) returns 0 for end-of-file (-data). 
+ * In both cases there's little point retrying, + * but we do for write anyway, just in case */ + break; + } + } + return done; +} + +ssize_t coroutine_fn +qemu_co_send_recv(int sockfd, void *buf, size_t bytes, bool do_send) +{ + struct iovec iov = { .iov_base = buf, .iov_len = bytes }; + return qemu_co_sendv_recvv(sockfd, &iov, 1, 0, bytes, do_send); +} + +typedef struct { + AioContext *ctx; + Coroutine *co; + int fd; +} FDYieldUntilData; + +static void fd_coroutine_enter(void *opaque) +{ + FDYieldUntilData *data = opaque; + aio_set_fd_handler(data->ctx, data->fd, false, NULL, NULL, NULL, NULL); + qemu_coroutine_enter(data->co); +} + +void coroutine_fn yield_until_fd_readable(int fd) +{ + FDYieldUntilData data; + + assert(qemu_in_coroutine()); + data.ctx = qemu_get_current_aio_context(); + data.co = qemu_coroutine_self(); + data.fd = fd; + aio_set_fd_handler( + data.ctx, fd, false, fd_coroutine_enter, NULL, NULL, &data); + qemu_coroutine_yield(); +} diff --git a/util/qemu-coroutine-lock.c b/util/qemu-coroutine-lock.c new file mode 100644 index 000000000..266940383 --- /dev/null +++ b/util/qemu-coroutine-lock.c @@ -0,0 +1,467 @@ +/* + * coroutine queues and locks + * + * Copyright (c) 2011 Kevin Wolf <kwolf@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * The lock-free mutex implementation is based on OSv + * (core/lfmutex.cc, include/lockfree/mutex.hh). + * Copyright (C) 2013 Cloudius Systems, Ltd. + */ + +#include "qemu/osdep.h" +#include "qemu/coroutine.h" +#include "qemu/coroutine_int.h" +#include "qemu/processor.h" +#include "qemu/queue.h" +#include "block/aio.h" +#include "trace.h" + +void qemu_co_queue_init(CoQueue *queue) +{ + QSIMPLEQ_INIT(&queue->entries); +} + +void coroutine_fn qemu_co_queue_wait_impl(CoQueue *queue, QemuLockable *lock) +{ + Coroutine *self = qemu_coroutine_self(); + QSIMPLEQ_INSERT_TAIL(&queue->entries, self, co_queue_next); + + if (lock) { + qemu_lockable_unlock(lock); + } + + /* There is no race condition here. Other threads will call + * aio_co_schedule on our AioContext, which can reenter this + * coroutine but only after this yield and after the main loop + * has gone through the next iteration. + */ + qemu_coroutine_yield(); + assert(qemu_in_coroutine()); + + /* TODO: OSv implements wait morphing here, where the wakeup + * primitive automatically places the woken coroutine on the + * mutex's queue. This avoids the thundering herd effect. 
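/*
 * Sketch (not part of the patch): a coroutine read built on the helpers
 * above, assuming a non-blocking socket.  yield_until_fd_readable() parks
 * the coroutine until data arrives; qemu_co_send_recv() then loops over
 * short transfers.  Its EAGAIN path also yields, so in practice the caller
 * keeps an fd handler registered that re-enters the coroutine.
 */
static ssize_t coroutine_fn co_recv_example(int sockfd, void *buf, size_t len)
{
    yield_until_fd_readable(sockfd);
    return qemu_co_send_recv(sockfd, buf, len, false /* do_send */);
}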
+ * This could be implemented for CoMutexes, but not really for
+ * other cases of QemuLockable.
+ */
+    if (lock) {
+        qemu_lockable_lock(lock);
+    }
+}
+
+static bool qemu_co_queue_do_restart(CoQueue *queue, bool single)
+{
+    Coroutine *next;
+
+    if (QSIMPLEQ_EMPTY(&queue->entries)) {
+        return false;
+    }
+
+    while ((next = QSIMPLEQ_FIRST(&queue->entries)) != NULL) {
+        QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
+        aio_co_wake(next);
+        if (single) {
+            break;
+        }
+    }
+    return true;
+}
+
+bool qemu_co_queue_next(CoQueue *queue)
+{
+    return qemu_co_queue_do_restart(queue, true);
+}
+
+void qemu_co_queue_restart_all(CoQueue *queue)
+{
+    qemu_co_queue_do_restart(queue, false);
+}
+
+bool qemu_co_enter_next_impl(CoQueue *queue, QemuLockable *lock)
+{
+    Coroutine *next;
+
+    next = QSIMPLEQ_FIRST(&queue->entries);
+    if (!next) {
+        return false;
+    }
+
+    QSIMPLEQ_REMOVE_HEAD(&queue->entries, co_queue_next);
+    if (lock) {
+        qemu_lockable_unlock(lock);
+    }
+    aio_co_wake(next);
+    if (lock) {
+        qemu_lockable_lock(lock);
+    }
+    return true;
+}
+
+bool qemu_co_queue_empty(CoQueue *queue)
+{
+    return QSIMPLEQ_FIRST(&queue->entries) == NULL;
+}
+
+/* The wait records are handled with a multiple-producer, single-consumer
+ * lock-free queue.  There cannot be two concurrent pop_waiter() calls
+ * because pop_waiter() can only be called while mutex->handoff is zero.
+ * This can happen in three cases:
+ * - in qemu_co_mutex_unlock, before the hand-off protocol has started.
+ *   In this case, qemu_co_mutex_lock will see mutex->handoff == 0 and
+ *   not take part in the handoff.
+ * - in qemu_co_mutex_lock, if it steals the hand-off responsibility from
+ *   qemu_co_mutex_unlock.  In this case, qemu_co_mutex_unlock will fail
+ *   the cmpxchg (it will see either 0 or the next sequence value) and
+ *   exit.  The next hand-off cannot begin until qemu_co_mutex_lock has
+ *   woken up someone.
+ * - in qemu_co_mutex_unlock, if it takes the hand-off token itself.
+ *   In this case another iteration starts with mutex->handoff == 0;
+ *   a concurrent qemu_co_mutex_lock will fail the cmpxchg, and
+ *   qemu_co_mutex_unlock will go back to case (1).
+ *
+ * The following functions manage this queue.
+ */
+typedef struct CoWaitRecord {
+    Coroutine *co;
+    QSLIST_ENTRY(CoWaitRecord) next;
+} CoWaitRecord;
+
+static void push_waiter(CoMutex *mutex, CoWaitRecord *w)
+{
+    w->co = qemu_coroutine_self();
+    QSLIST_INSERT_HEAD_ATOMIC(&mutex->from_push, w, next);
+}
+
+static void move_waiters(CoMutex *mutex)
+{
+    QSLIST_HEAD(, CoWaitRecord) reversed;
+    QSLIST_MOVE_ATOMIC(&reversed, &mutex->from_push);
+    while (!QSLIST_EMPTY(&reversed)) {
+        CoWaitRecord *w = QSLIST_FIRST(&reversed);
+        QSLIST_REMOVE_HEAD(&reversed, next);
+        QSLIST_INSERT_HEAD(&mutex->to_pop, w, next);
+    }
+}
+
+static CoWaitRecord *pop_waiter(CoMutex *mutex)
+{
+    CoWaitRecord *w;
+
+    if (QSLIST_EMPTY(&mutex->to_pop)) {
+        move_waiters(mutex);
+        if (QSLIST_EMPTY(&mutex->to_pop)) {
+            return NULL;
+        }
+    }
+    w = QSLIST_FIRST(&mutex->to_pop);
+    QSLIST_REMOVE_HEAD(&mutex->to_pop, next);
+    return w;
+}
+
+/* True if any coroutine is waiting, on either list */
+static bool has_waiters(CoMutex *mutex)
+{
+    return !QSLIST_EMPTY(&mutex->to_pop) || !QSLIST_EMPTY(&mutex->from_push);
+}
+
+void qemu_co_mutex_init(CoMutex *mutex)
+{
+    memset(mutex, 0, sizeof(*mutex));
+}
+
+static void coroutine_fn qemu_co_mutex_wake(CoMutex *mutex, Coroutine *co)
+{
+    /* Read co before co->ctx; pairs with smp_wmb() in
+     * qemu_coroutine_enter().
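/*
 * Sketch (not part of the patch): the canonical CoQueue pattern served by
 * the functions above.  A consumer sleeps while a condition is false; the
 * producer makes it true and restarts the queue.  Any QemuLockable works
 * as the lock; a CoMutex is used here.
 */
static void coroutine_fn wait_for_flag(CoQueue *queue, CoMutex *lock,
                                       bool *flag)
{
    qemu_co_mutex_lock(lock);
    while (!*flag) {
        /* drops the lock while asleep, re-acquires it before returning */
        qemu_co_queue_wait(queue, lock);
    }
    qemu_co_mutex_unlock(lock);
}

static void coroutine_fn set_flag(CoQueue *queue, CoMutex *lock, bool *flag)
{
    qemu_co_mutex_lock(lock);
    *flag = true;
    qemu_co_queue_restart_all(queue);   /* wake every waiter */
    qemu_co_mutex_unlock(lock);
}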
+ */ + smp_read_barrier_depends(); + mutex->ctx = co->ctx; + aio_co_wake(co); +} + +static void coroutine_fn qemu_co_mutex_lock_slowpath(AioContext *ctx, + CoMutex *mutex) +{ + Coroutine *self = qemu_coroutine_self(); + CoWaitRecord w; + unsigned old_handoff; + + trace_qemu_co_mutex_lock_entry(mutex, self); + push_waiter(mutex, &w); + + /* This is the "Responsibility Hand-Off" protocol; a lock() picks from + * a concurrent unlock() the responsibility of waking somebody up. + */ + old_handoff = qatomic_mb_read(&mutex->handoff); + if (old_handoff && + has_waiters(mutex) && + qatomic_cmpxchg(&mutex->handoff, old_handoff, 0) == old_handoff) { + /* There can be no concurrent pops, because there can be only + * one active handoff at a time. + */ + CoWaitRecord *to_wake = pop_waiter(mutex); + Coroutine *co = to_wake->co; + if (co == self) { + /* We got the lock ourselves! */ + assert(to_wake == &w); + mutex->ctx = ctx; + return; + } + + qemu_co_mutex_wake(mutex, co); + } + + qemu_coroutine_yield(); + trace_qemu_co_mutex_lock_return(mutex, self); +} + +void coroutine_fn qemu_co_mutex_lock(CoMutex *mutex) +{ + AioContext *ctx = qemu_get_current_aio_context(); + Coroutine *self = qemu_coroutine_self(); + int waiters, i; + + /* Running a very small critical section on pthread_mutex_t and CoMutex + * shows that pthread_mutex_t is much faster because it doesn't actually + * go to sleep. What happens is that the critical section is shorter + * than the latency of entering the kernel and thus FUTEX_WAIT always + * fails. With CoMutex there is no such latency but you still want to + * avoid wait and wakeup. So introduce it artificially. + */ + i = 0; +retry_fast_path: + waiters = qatomic_cmpxchg(&mutex->locked, 0, 1); + if (waiters != 0) { + while (waiters == 1 && ++i < 1000) { + if (qatomic_read(&mutex->ctx) == ctx) { + break; + } + if (qatomic_read(&mutex->locked) == 0) { + goto retry_fast_path; + } + cpu_relax(); + } + waiters = qatomic_fetch_inc(&mutex->locked); + } + + if (waiters == 0) { + /* Uncontended. */ + trace_qemu_co_mutex_lock_uncontended(mutex, self); + mutex->ctx = ctx; + } else { + qemu_co_mutex_lock_slowpath(ctx, mutex); + } + mutex->holder = self; + self->locks_held++; +} + +void coroutine_fn qemu_co_mutex_unlock(CoMutex *mutex) +{ + Coroutine *self = qemu_coroutine_self(); + + trace_qemu_co_mutex_unlock_entry(mutex, self); + + assert(mutex->locked); + assert(mutex->holder == self); + assert(qemu_in_coroutine()); + + mutex->ctx = NULL; + mutex->holder = NULL; + self->locks_held--; + if (qatomic_fetch_dec(&mutex->locked) == 1) { + /* No waiting qemu_co_mutex_lock(). Pfew, that was easy! */ + return; + } + + for (;;) { + CoWaitRecord *to_wake = pop_waiter(mutex); + unsigned our_handoff; + + if (to_wake) { + qemu_co_mutex_wake(mutex, to_wake->co); + break; + } + + /* Some concurrent lock() is in progress (we know this because + * mutex->locked was >1) but it hasn't yet put itself on the wait + * queue. Pick a sequence number for the handoff protocol (not 0). + */ + if (++mutex->sequence == 0) { + mutex->sequence = 1; + } + + our_handoff = mutex->sequence; + qatomic_mb_set(&mutex->handoff, our_handoff); + if (!has_waiters(mutex)) { + /* The concurrent lock has not added itself yet, so it + * will be able to pick our handoff. + */ + break; + } + + /* Try to do the handoff protocol ourselves; if somebody else has + * already taken it, however, we're done and they're responsible. 
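/*
 * Sketch (not part of the patch): CoMutex from the caller's side.  The
 * fast path above is one cmpxchg of ->locked from 0 to 1; under contention
 * the locker parks itself in a CoWaitRecord and the hand-off protocol
 * decides who wakes it.
 */
static void coroutine_fn counter_inc_example(CoMutex *mutex,
                                             uint64_t *counter)
{
    qemu_co_mutex_lock(mutex);     /* may yield if contended */
    (*counter)++;
    qemu_co_mutex_unlock(mutex);   /* hands off to one waiter, if any */
}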
+ */ + if (qatomic_cmpxchg(&mutex->handoff, our_handoff, 0) != our_handoff) { + break; + } + } + + trace_qemu_co_mutex_unlock_return(mutex, self); +} + +struct CoRwTicket { + bool read; + Coroutine *co; + QSIMPLEQ_ENTRY(CoRwTicket) next; +}; + +void qemu_co_rwlock_init(CoRwlock *lock) +{ + qemu_co_mutex_init(&lock->mutex); + lock->owners = 0; + QSIMPLEQ_INIT(&lock->tickets); +} + +/* Releases the internal CoMutex. */ +static void qemu_co_rwlock_maybe_wake_one(CoRwlock *lock) +{ + CoRwTicket *tkt = QSIMPLEQ_FIRST(&lock->tickets); + Coroutine *co = NULL; + + /* + * Setting lock->owners here prevents rdlock and wrlock from + * sneaking in between unlock and wake. + */ + + if (tkt) { + if (tkt->read) { + if (lock->owners >= 0) { + lock->owners++; + co = tkt->co; + } + } else { + if (lock->owners == 0) { + lock->owners = -1; + co = tkt->co; + } + } + } + + if (co) { + QSIMPLEQ_REMOVE_HEAD(&lock->tickets, next); + qemu_co_mutex_unlock(&lock->mutex); + aio_co_wake(co); + } else { + qemu_co_mutex_unlock(&lock->mutex); + } +} + +void qemu_co_rwlock_rdlock(CoRwlock *lock) +{ + Coroutine *self = qemu_coroutine_self(); + + qemu_co_mutex_lock(&lock->mutex); + /* For fairness, wait if a writer is in line. */ + if (lock->owners == 0 || (lock->owners > 0 && QSIMPLEQ_EMPTY(&lock->tickets))) { + lock->owners++; + qemu_co_mutex_unlock(&lock->mutex); + } else { + CoRwTicket my_ticket = { true, self }; + + QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next); + qemu_co_mutex_unlock(&lock->mutex); + qemu_coroutine_yield(); + assert(lock->owners >= 1); + + /* Possibly wake another reader, which will wake the next in line. */ + qemu_co_mutex_lock(&lock->mutex); + qemu_co_rwlock_maybe_wake_one(lock); + } + + self->locks_held++; +} + +void qemu_co_rwlock_unlock(CoRwlock *lock) +{ + Coroutine *self = qemu_coroutine_self(); + + assert(qemu_in_coroutine()); + self->locks_held--; + + qemu_co_mutex_lock(&lock->mutex); + if (lock->owners > 0) { + lock->owners--; + } else { + assert(lock->owners == -1); + lock->owners = 0; + } + + qemu_co_rwlock_maybe_wake_one(lock); +} + +void qemu_co_rwlock_downgrade(CoRwlock *lock) +{ + qemu_co_mutex_lock(&lock->mutex); + assert(lock->owners == -1); + lock->owners = 1; + + /* Possibly wake another reader, which will wake the next in line. */ + qemu_co_rwlock_maybe_wake_one(lock); +} + +void qemu_co_rwlock_wrlock(CoRwlock *lock) +{ + Coroutine *self = qemu_coroutine_self(); + + qemu_co_mutex_lock(&lock->mutex); + if (lock->owners == 0) { + lock->owners = -1; + qemu_co_mutex_unlock(&lock->mutex); + } else { + CoRwTicket my_ticket = { false, qemu_coroutine_self() }; + + QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next); + qemu_co_mutex_unlock(&lock->mutex); + qemu_coroutine_yield(); + assert(lock->owners == -1); + } + + self->locks_held++; +} + +void qemu_co_rwlock_upgrade(CoRwlock *lock) +{ + qemu_co_mutex_lock(&lock->mutex); + assert(lock->owners > 0); + /* For fairness, wait if a writer is in line. 
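/*
 * Sketch (not part of the patch): reader/writer usage of the CoRwlock
 * above.  owners > 0 counts active readers, owners == -1 is an exclusive
 * writer, and the ticket queue keeps FIFO fairness.  Note that upgrading
 * is not atomic: a queued writer may run first, so re-check state after
 * qemu_co_rwlock_upgrade() returns.
 */
static void coroutine_fn repair_if_negative(CoRwlock *lock, int *shared)
{
    qemu_co_rwlock_rdlock(lock);
    if (*shared < 0) {
        qemu_co_rwlock_upgrade(lock);      /* read lock -> write lock */
        if (*shared < 0) {                 /* re-check under write lock */
            *shared = 0;
        }
        qemu_co_rwlock_downgrade(lock);    /* write lock -> read lock */
    }
    qemu_co_rwlock_unlock(lock);
}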
*/ + if (lock->owners == 1 && QSIMPLEQ_EMPTY(&lock->tickets)) { + lock->owners = -1; + qemu_co_mutex_unlock(&lock->mutex); + } else { + CoRwTicket my_ticket = { false, qemu_coroutine_self() }; + + lock->owners--; + QSIMPLEQ_INSERT_TAIL(&lock->tickets, &my_ticket, next); + qemu_co_rwlock_maybe_wake_one(lock); + qemu_coroutine_yield(); + assert(lock->owners == -1); + } +} diff --git a/util/qemu-coroutine-sleep.c b/util/qemu-coroutine-sleep.c new file mode 100644 index 000000000..571ab521f --- /dev/null +++ b/util/qemu-coroutine-sleep.c @@ -0,0 +1,80 @@ +/* + * QEMU coroutine sleep + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu/coroutine.h" +#include "qemu/coroutine_int.h" +#include "qemu/timer.h" +#include "block/aio.h" + +static const char *qemu_co_sleep_ns__scheduled = "qemu_co_sleep_ns"; + +void qemu_co_sleep_wake(QemuCoSleep *w) +{ + Coroutine *co; + + co = w->to_wake; + w->to_wake = NULL; + if (co) { + /* Write of schedule protected by barrier write in aio_co_schedule */ + const char *scheduled = qatomic_cmpxchg(&co->scheduled, + qemu_co_sleep_ns__scheduled, NULL); + + assert(scheduled == qemu_co_sleep_ns__scheduled); + aio_co_wake(co); + } +} + +static void co_sleep_cb(void *opaque) +{ + QemuCoSleep *w = opaque; + qemu_co_sleep_wake(w); +} + +void coroutine_fn qemu_co_sleep(QemuCoSleep *w) +{ + Coroutine *co = qemu_coroutine_self(); + + const char *scheduled = qatomic_cmpxchg(&co->scheduled, NULL, + qemu_co_sleep_ns__scheduled); + if (scheduled) { + fprintf(stderr, + "%s: Co-routine was already scheduled in '%s'\n", + __func__, scheduled); + abort(); + } + + w->to_wake = co; + qemu_coroutine_yield(); + + /* w->to_wake is cleared before resuming this coroutine. */ + assert(w->to_wake == NULL); +} + +void coroutine_fn qemu_co_sleep_ns_wakeable(QemuCoSleep *w, + QEMUClockType type, int64_t ns) +{ + AioContext *ctx = qemu_get_current_aio_context(); + QEMUTimer ts; + + aio_timer_init(ctx, &ts, type, SCALE_NS, co_sleep_cb, w); + timer_mod(&ts, qemu_clock_get_ns(type) + ns); + + /* + * The timer will fire in the current AiOContext, so the callback + * must happen after qemu_co_sleep yields and there is no race + * between timer_mod and qemu_co_sleep. + */ + qemu_co_sleep(w); + timer_del(&ts); +} diff --git a/util/qemu-coroutine.c b/util/qemu-coroutine.c new file mode 100644 index 000000000..38fb6d308 --- /dev/null +++ b/util/qemu-coroutine.c @@ -0,0 +1,204 @@ +/* + * QEMU coroutines + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Kevin Wolf <kwolf@redhat.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. 
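/*
 * Sketch (not part of the patch): a cancellable delay on top of the
 * primitives above.  Whoever holds a pointer to the same QemuCoSleep can
 * cut the sleep short with qemu_co_sleep_wake(), which is a no-op when
 * nobody is currently sleeping on it.
 */
static void coroutine_fn delay_example(QemuCoSleep *w)
{
    /* sleep 100 ms on the realtime clock, unless woken earlier */
    qemu_co_sleep_ns_wakeable(w, QEMU_CLOCK_REALTIME, 100 * SCALE_MS);
}

static void cancel_delay(QemuCoSleep *w)
{
    qemu_co_sleep_wake(w);
}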
+ * + */ + +#include "qemu/osdep.h" +#include "trace.h" +#include "qemu/thread.h" +#include "qemu/atomic.h" +#include "qemu/coroutine.h" +#include "qemu/coroutine_int.h" +#include "block/aio.h" + +enum { + POOL_BATCH_SIZE = 64, +}; + +/** Free list to speed up creation */ +static QSLIST_HEAD(, Coroutine) release_pool = QSLIST_HEAD_INITIALIZER(pool); +static unsigned int release_pool_size; +static __thread QSLIST_HEAD(, Coroutine) alloc_pool = QSLIST_HEAD_INITIALIZER(pool); +static __thread unsigned int alloc_pool_size; +static __thread Notifier coroutine_pool_cleanup_notifier; + +static void coroutine_pool_cleanup(Notifier *n, void *value) +{ + Coroutine *co; + Coroutine *tmp; + + QSLIST_FOREACH_SAFE(co, &alloc_pool, pool_next, tmp) { + QSLIST_REMOVE_HEAD(&alloc_pool, pool_next); + qemu_coroutine_delete(co); + } +} + +Coroutine *qemu_coroutine_create(CoroutineEntry *entry, void *opaque) +{ + Coroutine *co = NULL; + + if (CONFIG_COROUTINE_POOL) { + co = QSLIST_FIRST(&alloc_pool); + if (!co) { + if (release_pool_size > POOL_BATCH_SIZE) { + /* Slow path; a good place to register the destructor, too. */ + if (!coroutine_pool_cleanup_notifier.notify) { + coroutine_pool_cleanup_notifier.notify = coroutine_pool_cleanup; + qemu_thread_atexit_add(&coroutine_pool_cleanup_notifier); + } + + /* This is not exact; there could be a little skew between + * release_pool_size and the actual size of release_pool. But + * it is just a heuristic, it does not need to be perfect. + */ + alloc_pool_size = qatomic_xchg(&release_pool_size, 0); + QSLIST_MOVE_ATOMIC(&alloc_pool, &release_pool); + co = QSLIST_FIRST(&alloc_pool); + } + } + if (co) { + QSLIST_REMOVE_HEAD(&alloc_pool, pool_next); + alloc_pool_size--; + } + } + + if (!co) { + co = qemu_coroutine_new(); + } + + co->entry = entry; + co->entry_arg = opaque; + QSIMPLEQ_INIT(&co->co_queue_wakeup); + return co; +} + +static void coroutine_delete(Coroutine *co) +{ + co->caller = NULL; + + if (CONFIG_COROUTINE_POOL) { + if (release_pool_size < POOL_BATCH_SIZE * 2) { + QSLIST_INSERT_HEAD_ATOMIC(&release_pool, co, pool_next); + qatomic_inc(&release_pool_size); + return; + } + if (alloc_pool_size < POOL_BATCH_SIZE) { + QSLIST_INSERT_HEAD(&alloc_pool, co, pool_next); + alloc_pool_size++; + return; + } + } + + qemu_coroutine_delete(co); +} + +void qemu_aio_coroutine_enter(AioContext *ctx, Coroutine *co) +{ + QSIMPLEQ_HEAD(, Coroutine) pending = QSIMPLEQ_HEAD_INITIALIZER(pending); + Coroutine *from = qemu_coroutine_self(); + + QSIMPLEQ_INSERT_TAIL(&pending, co, co_queue_next); + + /* Run co and any queued coroutines */ + while (!QSIMPLEQ_EMPTY(&pending)) { + Coroutine *to = QSIMPLEQ_FIRST(&pending); + CoroutineAction ret; + + /* Cannot rely on the read barrier for to in aio_co_wake(), as there are + * callers outside of aio_co_wake() */ + const char *scheduled = qatomic_mb_read(&to->scheduled); + + QSIMPLEQ_REMOVE_HEAD(&pending, co_queue_next); + + trace_qemu_aio_coroutine_enter(ctx, from, to, to->entry_arg); + + /* if the Coroutine has already been scheduled, entering it again will + * cause us to enter it twice, potentially even after the coroutine has + * been deleted */ + if (scheduled) { + fprintf(stderr, + "%s: Co-routine was already scheduled in '%s'\n", + __func__, scheduled); + abort(); + } + + if (to->caller) { + fprintf(stderr, "Co-routine re-entered recursively\n"); + abort(); + } + + to->caller = from; + to->ctx = ctx; + + /* Store to->ctx before anything that stores to. Matches + * barrier in aio_co_wake and qemu_co_mutex_wake. 
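/*
 * Sketch (not part of the patch): the lifecycle served by the pool above.
 * qemu_coroutine_create() reuses a pooled coroutine when
 * CONFIG_COROUTINE_POOL is set; each qemu_coroutine_enter() (defined just
 * below) runs it until its next yield or until it terminates, after which
 * coroutine_delete() recycles it.
 */
static void coroutine_fn my_entry(void *opaque)
{
    int *step = opaque;

    *step = 1;
    qemu_coroutine_yield();     /* control returns to the enterer */
    *step = 2;
}

static void lifecycle_example(void)
{
    int step = 0;
    Coroutine *co = qemu_coroutine_create(my_entry, &step);

    qemu_coroutine_enter(co);   /* runs to the yield; step == 1 */
    qemu_coroutine_enter(co);   /* resumes and terminates; step == 2 */
}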
+ */ + smp_wmb(); + + ret = qemu_coroutine_switch(from, to, COROUTINE_ENTER); + + /* Queued coroutines are run depth-first; previously pending coroutines + * run after those queued more recently. + */ + QSIMPLEQ_PREPEND(&pending, &to->co_queue_wakeup); + + switch (ret) { + case COROUTINE_YIELD: + break; + case COROUTINE_TERMINATE: + assert(!to->locks_held); + trace_qemu_coroutine_terminate(to); + coroutine_delete(to); + break; + default: + abort(); + } + } +} + +void qemu_coroutine_enter(Coroutine *co) +{ + qemu_aio_coroutine_enter(qemu_get_current_aio_context(), co); +} + +void qemu_coroutine_enter_if_inactive(Coroutine *co) +{ + if (!qemu_coroutine_entered(co)) { + qemu_coroutine_enter(co); + } +} + +void coroutine_fn qemu_coroutine_yield(void) +{ + Coroutine *self = qemu_coroutine_self(); + Coroutine *to = self->caller; + + trace_qemu_coroutine_yield(self, to); + + if (!to) { + fprintf(stderr, "Co-routine is yielding to no one\n"); + abort(); + } + + self->caller = NULL; + qemu_coroutine_switch(self, to, COROUTINE_YIELD); +} + +bool qemu_coroutine_entered(Coroutine *co) +{ + return co->caller; +} + +AioContext *coroutine_fn qemu_coroutine_get_aio_context(Coroutine *co) +{ + return co->ctx; +} diff --git a/util/qemu-error.c b/util/qemu-error.c new file mode 100644 index 000000000..52a9e013c --- /dev/null +++ b/util/qemu-error.c @@ -0,0 +1,413 @@ +/* + * Error reporting + * + * Copyright (C) 2010 Red Hat Inc. + * + * Authors: + * Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "monitor/monitor.h" +#include "qemu/error-report.h" + +/* + * @report_type is the type of message: error, warning or + * informational. + */ +typedef enum { + REPORT_TYPE_ERROR, + REPORT_TYPE_WARNING, + REPORT_TYPE_INFO, +} report_type; + +/* Prepend timestamp to messages */ +bool message_with_timestamp; +bool error_with_guestname; +const char *error_guest_name; + +int error_printf(const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = error_vprintf(fmt, ap); + va_end(ap); + return ret; +} + +int error_printf_unless_qmp(const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = error_vprintf_unless_qmp(fmt, ap); + va_end(ap); + return ret; +} + +static Location std_loc = { + .kind = LOC_NONE +}; +static Location *cur_loc = &std_loc; + +/* + * Push location saved in LOC onto the location stack, return it. + * The top of that stack is the current location. + * Needs a matching loc_pop(). + */ +Location *loc_push_restore(Location *loc) +{ + assert(!loc->prev); + loc->prev = cur_loc; + cur_loc = loc; + return loc; +} + +/* + * Initialize *LOC to "nowhere", push it onto the location stack. + * The top of that stack is the current location. + * Needs a matching loc_pop(). + * Return LOC. + */ +Location *loc_push_none(Location *loc) +{ + loc->kind = LOC_NONE; + loc->prev = NULL; + return loc_push_restore(loc); +} + +/* + * Pop the location stack. + * LOC must be the current location, i.e. the top of the stack. + */ +Location *loc_pop(Location *loc) +{ + assert(cur_loc == loc && loc->prev); + cur_loc = loc->prev; + loc->prev = NULL; + return loc; +} + +/* + * Save the current location in LOC, return LOC. + */ +Location *loc_save(Location *loc) +{ + *loc = *cur_loc; + loc->prev = NULL; + return loc; +} + +/* + * Change the current location to the one saved in LOC. 
+ */ +void loc_restore(Location *loc) +{ + Location *prev = cur_loc->prev; + assert(!loc->prev); + *cur_loc = *loc; + cur_loc->prev = prev; +} + +/* + * Change the current location to "nowhere in particular". + */ +void loc_set_none(void) +{ + cur_loc->kind = LOC_NONE; +} + +/* + * Change the current location to argument ARGV[IDX..IDX+CNT-1]. + */ +void loc_set_cmdline(char **argv, int idx, int cnt) +{ + cur_loc->kind = LOC_CMDLINE; + cur_loc->num = cnt; + cur_loc->ptr = argv + idx; +} + +/* + * Change the current location to file FNAME, line LNO. + */ +void loc_set_file(const char *fname, int lno) +{ + assert (fname || cur_loc->kind == LOC_FILE); + cur_loc->kind = LOC_FILE; + cur_loc->num = lno; + if (fname) { + cur_loc->ptr = fname; + } +} + +static const char *progname; + +/* + * Set the program name for error_print_loc(). + */ +static void error_set_progname(const char *argv0) +{ + const char *p = strrchr(argv0, '/'); + progname = p ? p + 1 : argv0; +} + +const char *error_get_progname(void) +{ + return progname; +} + +/* + * Print current location to current monitor if we have one, else to stderr. + */ +static void print_loc(void) +{ + const char *sep = ""; + int i; + const char *const *argp; + + if (!monitor_cur() && progname) { + fprintf(stderr, "%s:", progname); + sep = " "; + } + switch (cur_loc->kind) { + case LOC_CMDLINE: + argp = cur_loc->ptr; + for (i = 0; i < cur_loc->num; i++) { + error_printf("%s%s", sep, argp[i]); + sep = " "; + } + error_printf(": "); + break; + case LOC_FILE: + error_printf("%s:", (const char *)cur_loc->ptr); + if (cur_loc->num) { + error_printf("%d:", cur_loc->num); + } + error_printf(" "); + break; + default: + error_printf("%s", sep); + } +} + +/* + * Print a message to current monitor if we have one, else to stderr. + * @report_type is the type of message: error, warning or informational. + * Format arguments like vsprintf(). The resulting message should be + * a single phrase, with no newline or trailing punctuation. + * Prepend the current location and append a newline. + */ +static void vreport(report_type type, const char *fmt, va_list ap) +{ + GTimeVal tv; + gchar *timestr; + + if (message_with_timestamp && !monitor_cur()) { + g_get_current_time(&tv); + timestr = g_time_val_to_iso8601(&tv); + error_printf("%s ", timestr); + g_free(timestr); + } + + /* Only prepend guest name if -msg guest-name and -name guest=... are set */ + if (error_with_guestname && error_guest_name && !monitor_cur()) { + error_printf("%s ", error_guest_name); + } + + print_loc(); + + switch (type) { + case REPORT_TYPE_ERROR: + break; + case REPORT_TYPE_WARNING: + error_printf("warning: "); + break; + case REPORT_TYPE_INFO: + error_printf("info: "); + break; + } + + error_vprintf(fmt, ap); + error_printf("\n"); +} + +/* + * Print an error message to current monitor if we have one, else to stderr. + * Format arguments like vsprintf(). The resulting message should be + * a single phrase, with no newline or trailing punctuation. + * Prepend the current location and append a newline. + * It's wrong to call this in a QMP monitor. Use error_setg() there. + */ +void error_vreport(const char *fmt, va_list ap) +{ + vreport(REPORT_TYPE_ERROR, fmt, ap); +} + +/* + * Print a warning message to current monitor if we have one, else to stderr. + * Format arguments like vsprintf(). The resulting message should be + * a single phrase, with no newline or trailing punctuation. + * Prepend the current location and append a newline. 
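/*
 * Sketch (not part of the patch): how the Location stack above shapes
 * report output.  With a file location pushed, error_report() (defined
 * below) prefixes the program name and "fname:lineno:" to the message.
 */
static void report_with_location(const char *fname, int lineno)
{
    Location loc;

    loc_push_none(&loc);
    loc_set_file(fname, lineno);
    error_report("bad value");   /* e.g. "qemu:foo.cfg:12: bad value" */
    loc_pop(&loc);
}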
+ */ +void warn_vreport(const char *fmt, va_list ap) +{ + vreport(REPORT_TYPE_WARNING, fmt, ap); +} + +/* + * Print an information message to current monitor if we have one, else to + * stderr. + * Format arguments like vsprintf(). The resulting message should be + * a single phrase, with no newline or trailing punctuation. + * Prepend the current location and append a newline. + */ +void info_vreport(const char *fmt, va_list ap) +{ + vreport(REPORT_TYPE_INFO, fmt, ap); +} + +/* + * Print an error message to current monitor if we have one, else to stderr. + * Format arguments like sprintf(). The resulting message should be + * a single phrase, with no newline or trailing punctuation. + * Prepend the current location and append a newline. + * It's wrong to call this in a QMP monitor. Use error_setg() there. + */ +void error_report(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vreport(REPORT_TYPE_ERROR, fmt, ap); + va_end(ap); +} + +/* + * Print a warning message to current monitor if we have one, else to stderr. + * Format arguments like sprintf(). The resulting message should be a + * single phrase, with no newline or trailing punctuation. + * Prepend the current location and append a newline. + */ +void warn_report(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vreport(REPORT_TYPE_WARNING, fmt, ap); + va_end(ap); +} + +/* + * Print an information message to current monitor if we have one, else to + * stderr. + * Format arguments like sprintf(). The resulting message should be a + * single phrase, with no newline or trailing punctuation. + * Prepend the current location and append a newline. + */ +void info_report(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vreport(REPORT_TYPE_INFO, fmt, ap); + va_end(ap); +} + +/* + * Like error_report(), except print just once. + * If *printed is false, print the message, and flip *printed to true. + * Return whether the message was printed. + */ +bool error_report_once_cond(bool *printed, const char *fmt, ...) +{ + va_list ap; + + assert(printed); + if (*printed) { + return false; + } + *printed = true; + va_start(ap, fmt); + vreport(REPORT_TYPE_ERROR, fmt, ap); + va_end(ap); + return true; +} + +/* + * Like warn_report(), except print just once. + * If *printed is false, print the message, and flip *printed to true. + * Return whether the message was printed. + */ +bool warn_report_once_cond(bool *printed, const char *fmt, ...) +{ + va_list ap; + + assert(printed); + if (*printed) { + return false; + } + *printed = true; + va_start(ap, fmt); + vreport(REPORT_TYPE_WARNING, fmt, ap); + va_end(ap); + return true; +} + +static char *qemu_glog_domains; + +static void qemu_log_func(const gchar *log_domain, + GLogLevelFlags log_level, + const gchar *message, + gpointer user_data) +{ + switch (log_level & G_LOG_LEVEL_MASK) { + case G_LOG_LEVEL_DEBUG: + case G_LOG_LEVEL_INFO: + /* + * Use same G_MESSAGES_DEBUG logic as glib to enable/disable debug + * messages + */ + if (qemu_glog_domains == NULL) { + break; + } + if (strcmp(qemu_glog_domains, "all") != 0 && + (log_domain == NULL || !strstr(qemu_glog_domains, log_domain))) { + break; + } + /* Fall through */ + case G_LOG_LEVEL_MESSAGE: + info_report("%s%s%s", + log_domain ?: "", log_domain ? ": " : "", message); + + break; + case G_LOG_LEVEL_WARNING: + warn_report("%s%s%s", + log_domain ?: "", log_domain ? ": " : "", message); + break; + case G_LOG_LEVEL_CRITICAL: + case G_LOG_LEVEL_ERROR: + error_report("%s%s%s", + log_domain ?: "", log_domain ? 
": " : "", message); + break; + } +} + +void error_init(const char *argv0) +{ + /* Set the program name for error_print_loc(). */ + error_set_progname(argv0); + + /* + * This sets up glib logging so libraries using it also print their logs + * through error_report(), warn_report(), info_report(). + */ + g_log_set_default_handler(qemu_log_func, NULL); + g_warn_if_fail(qemu_glog_domains == NULL); + qemu_glog_domains = g_strdup(g_getenv("G_MESSAGES_DEBUG")); +} diff --git a/util/qemu-openpty.c b/util/qemu-openpty.c new file mode 100644 index 000000000..427f43a76 --- /dev/null +++ b/util/qemu-openpty.c @@ -0,0 +1,139 @@ +/* + * qemu-openpty.c + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (c) 2010 Red Hat, Inc. + * + * Wrapper function qemu_openpty() implementation. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/* + * This is not part of oslib-posix.c because this function + * uses openpty() which often in -lutil, and if we add this + * dependency to oslib-posix.o, every app will have to be + * linked with -lutil. + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +#if defined HAVE_PTY_H +# include <pty.h> +#elif defined CONFIG_BSD +# include <termios.h> +# if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__DragonFly__) +# include <libutil.h> +# else +# include <util.h> +# endif +#elif defined CONFIG_SOLARIS +# include <termios.h> +# include <stropts.h> +#else +# include <termios.h> +#endif + +#ifdef __sun__ + +#if !defined(HAVE_OPENPTY) +/* Once illumos has openpty(), this is going to be removed. 
*/ +static int openpty(int *amaster, int *aslave, char *name, + struct termios *termp, struct winsize *winp) +{ + const char *slave; + int mfd = -1, sfd = -1; + + *amaster = *aslave = -1; + + mfd = open("/dev/ptmx", O_RDWR | O_NOCTTY); + if (mfd < 0) + goto err; + + if (grantpt(mfd) == -1 || unlockpt(mfd) == -1) + goto err; + + if ((slave = ptsname(mfd)) == NULL) + goto err; + + if ((sfd = open(slave, O_RDONLY | O_NOCTTY)) == -1) + goto err; + + if (ioctl(sfd, I_PUSH, "ptem") == -1 || + (termp != NULL && tcgetattr(sfd, termp) < 0)) + goto err; + + *amaster = mfd; + *aslave = sfd; + + if (winp) + ioctl(sfd, TIOCSWINSZ, winp); + + return 0; + +err: + if (sfd != -1) + close(sfd); + close(mfd); + return -1; +} +#endif + +static void cfmakeraw (struct termios *termios_p) +{ + termios_p->c_iflag &= + ~(IGNBRK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR|ICRNL|IXON); + termios_p->c_oflag &= ~OPOST; + termios_p->c_lflag &= ~(ECHO|ECHONL|ICANON|ISIG|IEXTEN); + termios_p->c_cflag &= ~(CSIZE|PARENB); + termios_p->c_cflag |= CS8; + + termios_p->c_cc[VMIN] = 0; + termios_p->c_cc[VTIME] = 0; +} +#endif + +int qemu_openpty_raw(int *aslave, char *pty_name) +{ + int amaster; + struct termios tty; +#if defined(__OpenBSD__) || defined(__DragonFly__) + char pty_buf[PATH_MAX]; +#define q_ptsname(x) pty_buf +#else + char *pty_buf = NULL; +#define q_ptsname(x) ptsname(x) +#endif + + if (openpty(&amaster, aslave, pty_buf, NULL, NULL) < 0) { + return -1; + } + + /* Set raw attributes on the pty. */ + tcgetattr(*aslave, &tty); + cfmakeraw(&tty); + tcsetattr(*aslave, TCSAFLUSH, &tty); + + if (pty_name) { + strcpy(pty_name, q_ptsname(amaster)); + } + + return amaster; +} diff --git a/util/qemu-option.c b/util/qemu-option.c new file mode 100644 index 000000000..eedd08929 --- /dev/null +++ b/util/qemu-option.c @@ -0,0 +1,1226 @@ +/* + * Commandline option parsing functions + * + * Copyright (c) 2003-2008 Fabrice Bellard + * Copyright (c) 2009 Kevin Wolf <kwolf@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
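/*
 * Sketch (not part of the patch): typical use of qemu_openpty_raw() above.
 * The master fd is kept for I/O; pty_name, if non-NULL, receives the slave
 * path and must be large enough for it (a PATH_MAX buffer is assumed here).
 */
static int open_raw_pty_example(void)
{
    int slave_fd;
    char pty_name[PATH_MAX];
    int master_fd = qemu_openpty_raw(&slave_fd, pty_name);

    if (master_fd < 0) {
        return -1;
    }
    /* announce pty_name (e.g. "/dev/pts/3") to the user, use master_fd */
    return master_fd;
}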
+ */ + +#include "qemu/osdep.h" + +#include "qapi/error.h" +#include "qemu/error-report.h" +#include "qapi/qmp/qbool.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qnum.h" +#include "qapi/qmp/qstring.h" +#include "qapi/qmp/qerror.h" +#include "qemu/option_int.h" +#include "qemu/cutils.h" +#include "qemu/id.h" +#include "qemu/help_option.h" + +/* + * Extracts the name of an option from the parameter string (@p points at the + * first byte of the option name) + * + * The option name is @len characters long and is copied into @option. The + * caller is responsible for free'ing @option when no longer required. + * + * The return value is the position of the delimiter/zero byte after the option + * name in @p. + */ +static const char *get_opt_name(const char *p, char **option, size_t len) +{ + *option = g_strndup(p, len); + return p + len; +} + +/* + * Extracts the value of an option from the parameter string p (p points at the + * first byte of the option value) + * + * This function is comparable to get_opt_name with the difference that the + * delimiter is fixed to be comma which starts a new option. To specify an + * option value that contains commas, double each comma. + */ +const char *get_opt_value(const char *p, char **value) +{ + size_t capacity = 0, length; + const char *offset; + + *value = NULL; + while (1) { + offset = qemu_strchrnul(p, ','); + length = offset - p; + if (*offset != '\0' && *(offset + 1) == ',') { + length++; + } + *value = g_renew(char, *value, capacity + length + 1); + strncpy(*value + capacity, p, length); + (*value)[capacity + length] = '\0'; + capacity += length; + if (*offset == '\0' || + *(offset + 1) != ',') { + break; + } + + p += (offset - p) + 2; + } + + return offset; +} + +static bool parse_option_number(const char *name, const char *value, + uint64_t *ret, Error **errp) +{ + uint64_t number; + int err; + + err = qemu_strtou64(value, NULL, 0, &number); + if (err == -ERANGE) { + error_setg(errp, "Value '%s' is too large for parameter '%s'", + value, name); + return false; + } + if (err) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, name, "a number"); + return false; + } + *ret = number; + return true; +} + +static const QemuOptDesc *find_desc_by_name(const QemuOptDesc *desc, + const char *name) +{ + int i; + + for (i = 0; desc[i].name != NULL; i++) { + if (strcmp(desc[i].name, name) == 0) { + return &desc[i]; + } + } + + return NULL; +} + +static const char *find_default_by_name(QemuOpts *opts, const char *name) +{ + const QemuOptDesc *desc = find_desc_by_name(opts->list->desc, name); + + return desc ? 
desc->def_value_str : NULL; +} + +bool parse_option_size(const char *name, const char *value, + uint64_t *ret, Error **errp) +{ + uint64_t size; + int err; + + err = qemu_strtosz(value, NULL, &size); + if (err == -ERANGE) { + error_setg(errp, "Value '%s' is out of range for parameter '%s'", + value, name); + return false; + } + if (err) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, name, + "a non-negative number below 2^64"); + error_append_hint(errp, "Optional suffix k, M, G, T, P or E means" + " kilo-, mega-, giga-, tera-, peta-\n" + "and exabytes, respectively.\n"); + return false; + } + *ret = size; + return true; +} + +static const char *opt_type_to_string(enum QemuOptType type) +{ + switch (type) { + case QEMU_OPT_STRING: + return "str"; + case QEMU_OPT_BOOL: + return "bool (on/off)"; + case QEMU_OPT_NUMBER: + return "num"; + case QEMU_OPT_SIZE: + return "size"; + } + + g_assert_not_reached(); +} + +/** + * Print the list of options available in the given list. If + * @print_caption is true, a caption (including the list name, if it + * exists) is printed. The options itself will be indented, so + * @print_caption should only be set to false if the caller prints its + * own custom caption (so that the indentation makes sense). + */ +void qemu_opts_print_help(QemuOptsList *list, bool print_caption) +{ + QemuOptDesc *desc; + int i; + GPtrArray *array = g_ptr_array_new(); + + assert(list); + desc = list->desc; + while (desc && desc->name) { + GString *str = g_string_new(NULL); + g_string_append_printf(str, " %s=<%s>", desc->name, + opt_type_to_string(desc->type)); + if (desc->help) { + if (str->len < 24) { + g_string_append_printf(str, "%*s", 24 - (int)str->len, ""); + } + g_string_append_printf(str, " - %s", desc->help); + } + g_ptr_array_add(array, g_string_free(str, false)); + desc++; + } + + g_ptr_array_sort(array, (GCompareFunc)qemu_pstrcmp0); + if (print_caption && array->len > 0) { + if (list->name) { + printf("%s options:\n", list->name); + } else { + printf("Options:\n"); + } + } else if (array->len == 0) { + if (list->name) { + printf("There are no options for %s.\n", list->name); + } else { + printf("No options available.\n"); + } + } + for (i = 0; i < array->len; i++) { + printf("%s\n", (char *)array->pdata[i]); + } + g_ptr_array_set_free_func(array, g_free); + g_ptr_array_free(array, true); + +} +/* ------------------------------------------------------------------ */ + +QemuOpt *qemu_opt_find(QemuOpts *opts, const char *name) +{ + QemuOpt *opt; + + QTAILQ_FOREACH_REVERSE(opt, &opts->head, next) { + if (strcmp(opt->name, name) != 0) + continue; + return opt; + } + return NULL; +} + +static void qemu_opt_del(QemuOpt *opt) +{ + QTAILQ_REMOVE(&opt->opts->head, opt, next); + g_free(opt->name); + g_free(opt->str); + g_free(opt); +} + +/* qemu_opt_set allows many settings for the same option. + * This function deletes all settings for an option. 
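+ *
+ * A hedged usage sketch against the API above: after
+ *
+ *     qemu_opt_set(opts, "cache", "none",      &error_abort);
+ *     qemu_opt_set(opts, "cache", "writeback", &error_abort);
+ *
+ * qemu_opt_get(opts, "cache") returns "writeback", because
+ * qemu_opt_find() walks the list in reverse so the last setting wins,
+ * while qemu_opt_del_all(opts, "cache") removes both entries at once.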
+ */ +static void qemu_opt_del_all(QemuOpts *opts, const char *name) +{ + QemuOpt *opt, *next_opt; + + QTAILQ_FOREACH_SAFE(opt, &opts->head, next, next_opt) { + if (!strcmp(opt->name, name)) { + qemu_opt_del(opt); + } + } +} + +const char *qemu_opt_get(QemuOpts *opts, const char *name) +{ + QemuOpt *opt; + + if (opts == NULL) { + return NULL; + } + + opt = qemu_opt_find(opts, name); + if (!opt) { + return find_default_by_name(opts, name); + } + + return opt->str; +} + +void qemu_opt_iter_init(QemuOptsIter *iter, QemuOpts *opts, const char *name) +{ + iter->opts = opts; + iter->opt = QTAILQ_FIRST(&opts->head); + iter->name = name; +} + +const char *qemu_opt_iter_next(QemuOptsIter *iter) +{ + QemuOpt *ret = iter->opt; + if (iter->name) { + while (ret && !g_str_equal(iter->name, ret->name)) { + ret = QTAILQ_NEXT(ret, next); + } + } + iter->opt = ret ? QTAILQ_NEXT(ret, next) : NULL; + return ret ? ret->str : NULL; +} + +/* Get a known option (or its default) and remove it from the list + * all in one action. Return a malloced string of the option value. + * Result must be freed by caller with g_free(). + */ +char *qemu_opt_get_del(QemuOpts *opts, const char *name) +{ + QemuOpt *opt; + char *str; + + if (opts == NULL) { + return NULL; + } + + opt = qemu_opt_find(opts, name); + if (!opt) { + return g_strdup(find_default_by_name(opts, name)); + } + str = opt->str; + opt->str = NULL; + qemu_opt_del_all(opts, name); + return str; +} + +bool qemu_opt_has_help_opt(QemuOpts *opts) +{ + QemuOpt *opt; + + QTAILQ_FOREACH_REVERSE(opt, &opts->head, next) { + if (is_help_option(opt->name)) { + return true; + } + } + return false; +} + +static bool qemu_opt_get_bool_helper(QemuOpts *opts, const char *name, + bool defval, bool del) +{ + QemuOpt *opt; + const char *def_val; + bool ret = defval; + + if (opts == NULL) { + return ret; + } + + opt = qemu_opt_find(opts, name); + if (opt == NULL) { + def_val = find_default_by_name(opts, name); + if (def_val) { + qapi_bool_parse(name, def_val, &ret, &error_abort); + } + return ret; + } + assert(opt->desc && opt->desc->type == QEMU_OPT_BOOL); + ret = opt->value.boolean; + if (del) { + qemu_opt_del_all(opts, name); + } + return ret; +} + +bool qemu_opt_get_bool(QemuOpts *opts, const char *name, bool defval) +{ + return qemu_opt_get_bool_helper(opts, name, defval, false); +} + +bool qemu_opt_get_bool_del(QemuOpts *opts, const char *name, bool defval) +{ + return qemu_opt_get_bool_helper(opts, name, defval, true); +} + +static uint64_t qemu_opt_get_number_helper(QemuOpts *opts, const char *name, + uint64_t defval, bool del) +{ + QemuOpt *opt; + const char *def_val; + uint64_t ret = defval; + + if (opts == NULL) { + return ret; + } + + opt = qemu_opt_find(opts, name); + if (opt == NULL) { + def_val = find_default_by_name(opts, name); + if (def_val) { + parse_option_number(name, def_val, &ret, &error_abort); + } + return ret; + } + assert(opt->desc && opt->desc->type == QEMU_OPT_NUMBER); + ret = opt->value.uint; + if (del) { + qemu_opt_del_all(opts, name); + } + return ret; +} + +uint64_t qemu_opt_get_number(QemuOpts *opts, const char *name, uint64_t defval) +{ + return qemu_opt_get_number_helper(opts, name, defval, false); +} + +uint64_t qemu_opt_get_number_del(QemuOpts *opts, const char *name, + uint64_t defval) +{ + return qemu_opt_get_number_helper(opts, name, defval, true); +} + +static uint64_t qemu_opt_get_size_helper(QemuOpts *opts, const char *name, + uint64_t defval, bool del) +{ + QemuOpt *opt; + const char *def_val; + uint64_t ret = defval; + + if (opts == 
NULL) { + return ret; + } + + opt = qemu_opt_find(opts, name); + if (opt == NULL) { + def_val = find_default_by_name(opts, name); + if (def_val) { + parse_option_size(name, def_val, &ret, &error_abort); + } + return ret; + } + assert(opt->desc && opt->desc->type == QEMU_OPT_SIZE); + ret = opt->value.uint; + if (del) { + qemu_opt_del_all(opts, name); + } + return ret; +} + +uint64_t qemu_opt_get_size(QemuOpts *opts, const char *name, uint64_t defval) +{ + return qemu_opt_get_size_helper(opts, name, defval, false); +} + +uint64_t qemu_opt_get_size_del(QemuOpts *opts, const char *name, + uint64_t defval) +{ + return qemu_opt_get_size_helper(opts, name, defval, true); +} + +static bool qemu_opt_parse(QemuOpt *opt, Error **errp) +{ + if (opt->desc == NULL) + return true; + + switch (opt->desc->type) { + case QEMU_OPT_STRING: + /* nothing */ + return true; + case QEMU_OPT_BOOL: + return qapi_bool_parse(opt->name, opt->str, &opt->value.boolean, errp); + case QEMU_OPT_NUMBER: + return parse_option_number(opt->name, opt->str, &opt->value.uint, + errp); + case QEMU_OPT_SIZE: + return parse_option_size(opt->name, opt->str, &opt->value.uint, + errp); + default: + abort(); + } +} + +static bool opts_accepts_any(const QemuOptsList *list) +{ + return list->desc[0].name == NULL; +} + +int qemu_opt_unset(QemuOpts *opts, const char *name) +{ + QemuOpt *opt = qemu_opt_find(opts, name); + + assert(opts_accepts_any(opts->list)); + + if (opt == NULL) { + return -1; + } else { + qemu_opt_del(opt); + return 0; + } +} + +static QemuOpt *opt_create(QemuOpts *opts, const char *name, char *value) +{ + QemuOpt *opt = g_malloc0(sizeof(*opt)); + + opt->name = g_strdup(name); + opt->str = value; + opt->opts = opts; + QTAILQ_INSERT_TAIL(&opts->head, opt, next); + + return opt; +} + +static bool opt_validate(QemuOpt *opt, Error **errp) +{ + const QemuOptDesc *desc; + const QemuOptsList *list = opt->opts->list; + + desc = find_desc_by_name(list->desc, opt->name); + if (!desc && !opts_accepts_any(list)) { + error_setg(errp, QERR_INVALID_PARAMETER, opt->name); + return false; + } + + opt->desc = desc; + if (!qemu_opt_parse(opt, errp)) { + return false; + } + + return true; +} + +bool qemu_opt_set(QemuOpts *opts, const char *name, const char *value, + Error **errp) +{ + QemuOpt *opt = opt_create(opts, name, g_strdup(value)); + + if (!opt_validate(opt, errp)) { + qemu_opt_del(opt); + return false; + } + return true; +} + +bool qemu_opt_set_bool(QemuOpts *opts, const char *name, bool val, + Error **errp) +{ + QemuOpt *opt; + const QemuOptDesc *desc; + const QemuOptsList *list = opts->list; + + desc = find_desc_by_name(list->desc, name); + if (!desc && !opts_accepts_any(list)) { + error_setg(errp, QERR_INVALID_PARAMETER, name); + return false; + } + + opt = g_malloc0(sizeof(*opt)); + opt->name = g_strdup(name); + opt->opts = opts; + opt->desc = desc; + opt->value.boolean = !!val; + opt->str = g_strdup(val ? 
"on" : "off"); + QTAILQ_INSERT_TAIL(&opts->head, opt, next); + return true; +} + +bool qemu_opt_set_number(QemuOpts *opts, const char *name, int64_t val, + Error **errp) +{ + QemuOpt *opt; + const QemuOptDesc *desc; + const QemuOptsList *list = opts->list; + + desc = find_desc_by_name(list->desc, name); + if (!desc && !opts_accepts_any(list)) { + error_setg(errp, QERR_INVALID_PARAMETER, name); + return false; + } + + opt = g_malloc0(sizeof(*opt)); + opt->name = g_strdup(name); + opt->opts = opts; + opt->desc = desc; + opt->value.uint = val; + opt->str = g_strdup_printf("%" PRId64, val); + QTAILQ_INSERT_TAIL(&opts->head, opt, next); + return true; +} + +/** + * For each member of @opts, call @func(@opaque, name, value, @errp). + * @func() may store an Error through @errp, but must return non-zero then. + * When @func() returns non-zero, break the loop and return that value. + * Return zero when the loop completes. + */ +int qemu_opt_foreach(QemuOpts *opts, qemu_opt_loopfunc func, void *opaque, + Error **errp) +{ + QemuOpt *opt; + int rc; + + QTAILQ_FOREACH(opt, &opts->head, next) { + rc = func(opaque, opt->name, opt->str, errp); + if (rc) { + return rc; + } + assert(!errp || !*errp); + } + return 0; +} + +QemuOpts *qemu_opts_find(QemuOptsList *list, const char *id) +{ + QemuOpts *opts; + + QTAILQ_FOREACH(opts, &list->head, next) { + if (!opts->id && !id) { + return opts; + } + if (opts->id && id && !strcmp(opts->id, id)) { + return opts; + } + } + return NULL; +} + +QemuOpts *qemu_opts_create(QemuOptsList *list, const char *id, + int fail_if_exists, Error **errp) +{ + QemuOpts *opts = NULL; + + if (list->merge_lists) { + if (id) { + error_setg(errp, QERR_INVALID_PARAMETER, "id"); + return NULL; + } + opts = qemu_opts_find(list, NULL); + if (opts) { + return opts; + } + } else if (id) { + assert(fail_if_exists); + if (!id_wellformed(id)) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "id", + "an identifier"); + error_append_hint(errp, "Identifiers consist of letters, digits, " + "'-', '.', '_', starting with a letter.\n"); + return NULL; + } + opts = qemu_opts_find(list, id); + if (opts != NULL) { + error_setg(errp, "Duplicate ID '%s' for %s", id, list->name); + return NULL; + } + } + opts = g_malloc0(sizeof(*opts)); + opts->id = g_strdup(id); + opts->list = list; + loc_save(&opts->loc); + QTAILQ_INIT(&opts->head); + QTAILQ_INSERT_TAIL(&list->head, opts, next); + return opts; +} + +void qemu_opts_reset(QemuOptsList *list) +{ + QemuOpts *opts, *next_opts; + + QTAILQ_FOREACH_SAFE(opts, &list->head, next, next_opts) { + qemu_opts_del(opts); + } +} + +void qemu_opts_loc_restore(QemuOpts *opts) +{ + loc_restore(&opts->loc); +} + +const char *qemu_opts_id(QemuOpts *opts) +{ + return opts->id; +} + +/* The id string will be g_free()d by qemu_opts_del */ +void qemu_opts_set_id(QemuOpts *opts, char *id) +{ + opts->id = id; +} + +void qemu_opts_del(QemuOpts *opts) +{ + QemuOpt *opt; + + if (opts == NULL) { + return; + } + + for (;;) { + opt = QTAILQ_FIRST(&opts->head); + if (opt == NULL) + break; + qemu_opt_del(opt); + } + QTAILQ_REMOVE(&opts->list->head, opts, next); + g_free(opts->id); + g_free(opts); +} + +/* print value, escaping any commas in value */ +static void escaped_print(const char *value) +{ + const char *ptr; + + for (ptr = value; *ptr; ++ptr) { + if (*ptr == ',') { + putchar(','); + } + putchar(*ptr); + } +} + +void qemu_opts_print(QemuOpts *opts, const char *separator) +{ + QemuOpt *opt; + QemuOptDesc *desc = opts->list->desc; + const char *sep = ""; + + if (opts->id) { + 
printf("id=%s", opts->id); /* passed id_wellformed -> no commas */ + sep = separator; + } + + if (desc[0].name == NULL) { + QTAILQ_FOREACH(opt, &opts->head, next) { + printf("%s%s=", sep, opt->name); + escaped_print(opt->str); + sep = separator; + } + return; + } + for (; desc && desc->name; desc++) { + const char *value; + opt = qemu_opt_find(opts, desc->name); + + value = opt ? opt->str : desc->def_value_str; + if (!value) { + continue; + } + if (desc->type == QEMU_OPT_STRING) { + printf("%s%s=", sep, desc->name); + escaped_print(value); + } else if ((desc->type == QEMU_OPT_SIZE || + desc->type == QEMU_OPT_NUMBER) && opt) { + printf("%s%s=%" PRId64, sep, desc->name, opt->value.uint); + } else { + printf("%s%s=%s", sep, desc->name, value); + } + sep = separator; + } +} + +static const char *get_opt_name_value(const char *params, + const char *firstname, + bool warn_on_flag, + bool *help_wanted, + char **name, char **value) +{ + const char *p; + const char *prefix = ""; + size_t len; + bool is_help = false; + + len = strcspn(params, "=,"); + if (params[len] != '=') { + /* found "foo,more" */ + if (firstname) { + /* implicitly named first option */ + *name = g_strdup(firstname); + p = get_opt_value(params, value); + } else { + /* option without value, must be a flag */ + p = get_opt_name(params, name, len); + if (strncmp(*name, "no", 2) == 0) { + memmove(*name, *name + 2, strlen(*name + 2) + 1); + *value = g_strdup("off"); + prefix = "no"; + } else { + *value = g_strdup("on"); + is_help = is_help_option(*name); + } + if (!is_help && warn_on_flag) { + warn_report("short-form boolean option '%s%s' deprecated", prefix, *name); + if (g_str_equal(*name, "delay")) { + error_printf("Please use nodelay=%s instead\n", prefix[0] ? "on" : "off"); + } else { + error_printf("Please use %s=%s instead\n", *name, *value); + } + } + } + } else { + /* found "foo=bar,more" */ + p = get_opt_name(params, name, len); + assert(*p == '='); + p++; + p = get_opt_value(p, value); + } + + assert(!*p || *p == ','); + if (help_wanted && is_help) { + *help_wanted = true; + } + if (*p == ',') { + p++; + } + return p; +} + +static bool opts_do_parse(QemuOpts *opts, const char *params, + const char *firstname, + bool warn_on_flag, bool *help_wanted, Error **errp) +{ + char *option, *value; + const char *p; + QemuOpt *opt; + + for (p = params; *p;) { + p = get_opt_name_value(p, firstname, warn_on_flag, help_wanted, &option, &value); + if (help_wanted && *help_wanted) { + g_free(option); + g_free(value); + return false; + } + firstname = NULL; + + if (!strcmp(option, "id")) { + g_free(option); + g_free(value); + continue; + } + + opt = opt_create(opts, option, value); + g_free(option); + if (!opt_validate(opt, errp)) { + qemu_opt_del(opt); + return false; + } + } + + return true; +} + +static char *opts_parse_id(const char *params) +{ + const char *p; + char *name, *value; + + for (p = params; *p;) { + p = get_opt_name_value(p, NULL, false, NULL, &name, &value); + if (!strcmp(name, "id")) { + g_free(name); + return value; + } + g_free(name); + g_free(value); + } + + return NULL; +} + +bool has_help_option(const char *params) +{ + const char *p; + char *name, *value; + bool ret = false; + + for (p = params; *p;) { + p = get_opt_name_value(p, NULL, false, &ret, &name, &value); + g_free(name); + g_free(value); + if (ret) { + return true; + } + } + + return false; +} + +/** + * Store options parsed from @params into @opts. 
+ * If @firstname is non-null, the first key=value in @params may omit + * key=, and is treated as if key was @firstname. + * On error, store an error object through @errp if non-null. + */ +bool qemu_opts_do_parse(QemuOpts *opts, const char *params, + const char *firstname, Error **errp) +{ + return opts_do_parse(opts, params, firstname, false, NULL, errp); +} + +static QemuOpts *opts_parse(QemuOptsList *list, const char *params, + bool permit_abbrev, + bool warn_on_flag, bool *help_wanted, Error **errp) +{ + const char *firstname; + char *id = opts_parse_id(params); + QemuOpts *opts; + + assert(!permit_abbrev || list->implied_opt_name); + firstname = permit_abbrev ? list->implied_opt_name : NULL; + + opts = qemu_opts_create(list, id, !list->merge_lists, errp); + g_free(id); + if (opts == NULL) { + return NULL; + } + + if (!opts_do_parse(opts, params, firstname, + warn_on_flag, help_wanted, errp)) { + qemu_opts_del(opts); + return NULL; + } + + return opts; +} + +/** + * Create a QemuOpts in @list and with options parsed from @params. + * If @permit_abbrev, the first key=value in @params may omit key=, + * and is treated as if key was @list->implied_opt_name. + * On error, store an error object through @errp if non-null. + * Return the new QemuOpts on success, null pointer on error. + */ +QemuOpts *qemu_opts_parse(QemuOptsList *list, const char *params, + bool permit_abbrev, Error **errp) +{ + return opts_parse(list, params, permit_abbrev, false, NULL, errp); +} + +/** + * Create a QemuOpts in @list and with options parsed from @params. + * If @permit_abbrev, the first key=value in @params may omit key=, + * and is treated as if key was @list->implied_opt_name. + * Report errors with error_report_err(). This is inappropriate in + * QMP context. Do not use this function there! + * Return the new QemuOpts on success, null pointer on error. + */ +QemuOpts *qemu_opts_parse_noisily(QemuOptsList *list, const char *params, + bool permit_abbrev) +{ + Error *err = NULL; + QemuOpts *opts; + bool help_wanted = false; + + opts = opts_parse(list, params, permit_abbrev, true, + opts_accepts_any(list) ? NULL : &help_wanted, + &err); + if (!opts) { + assert(!!err + !!help_wanted == 1); + if (help_wanted) { + qemu_opts_print_help(list, true); + } else { + error_report_err(err); + } + } + return opts; +} + +static bool qemu_opts_from_qdict_entry(QemuOpts *opts, + const QDictEntry *entry, + Error **errp) +{ + const char *key = qdict_entry_key(entry); + QObject *obj = qdict_entry_value(entry); + char buf[32]; + g_autofree char *tmp = NULL; + const char *value; + + if (!strcmp(key, "id")) { + return true; + } + + switch (qobject_type(obj)) { + case QTYPE_QSTRING: + value = qstring_get_str(qobject_to(QString, obj)); + break; + case QTYPE_QNUM: + tmp = qnum_to_string(qobject_to(QNum, obj)); + value = tmp; + break; + case QTYPE_QBOOL: + pstrcpy(buf, sizeof(buf), + qbool_get_bool(qobject_to(QBool, obj)) ? "on" : "off"); + value = buf; + break; + default: + return true; + } + + return qemu_opt_set(opts, key, value, errp); +} + +/* + * Create QemuOpts from a QDict. + * Use value of key "id" as ID if it exists and is a QString. Only + * QStrings, QNums and QBools are copied. Entries with other types + * are silently ignored. 
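+ *
+ * Minimal sketch of the intended use (names are illustrative only):
+ *
+ *     QDict *dict = qdict_new();
+ *     qdict_put_str(dict, "id", "disk0");
+ *     qdict_put_bool(dict, "readonly", true);
+ *     QemuOpts *opts = qemu_opts_from_qdict(list, dict, &error_abort);
+ *
+ * Here the QBool is stringified to "on" before being handed to
+ * qemu_opt_set(), as done by qemu_opts_from_qdict_entry() above.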
+ */ +QemuOpts *qemu_opts_from_qdict(QemuOptsList *list, const QDict *qdict, + Error **errp) +{ + QemuOpts *opts; + const QDictEntry *entry; + + opts = qemu_opts_create(list, qdict_get_try_str(qdict, "id"), 1, errp); + if (!opts) { + return NULL; + } + + for (entry = qdict_first(qdict); + entry; + entry = qdict_next(qdict, entry)) { + if (!qemu_opts_from_qdict_entry(opts, entry, errp)) { + qemu_opts_del(opts); + return NULL; + } + } + + return opts; +} + +/* + * Adds all QDict entries to the QemuOpts that can be added and removes them + * from the QDict. When this function returns, the QDict contains only those + * entries that couldn't be added to the QemuOpts. + */ +bool qemu_opts_absorb_qdict(QemuOpts *opts, QDict *qdict, Error **errp) +{ + const QDictEntry *entry, *next; + + entry = qdict_first(qdict); + + while (entry != NULL) { + next = qdict_next(qdict, entry); + + if (opts_accepts_any(opts->list) || + find_desc_by_name(opts->list->desc, entry->key)) { + if (!qemu_opts_from_qdict_entry(opts, entry, errp)) { + return false; + } + qdict_del(qdict, entry->key); + } + + entry = next; + } + + return true; +} + +/* + * Convert from QemuOpts to QDict. The QDict values are of type QString. + * + * If @list is given, only add those options to the QDict that are contained in + * the list. If @del is true, any options added to the QDict are removed from + * the QemuOpts, otherwise they remain there. + * + * If two options in @opts have the same name, they are processed in order + * so that the last one wins (consistent with the reverse iteration in + * qemu_opt_find()), but all of them are deleted if @del is true. + * + * TODO We'll want to use types appropriate for opt->desc->type, but + * this is enough for now. + */ +QDict *qemu_opts_to_qdict_filtered(QemuOpts *opts, QDict *qdict, + QemuOptsList *list, bool del) +{ + QemuOpt *opt, *next; + + if (!qdict) { + qdict = qdict_new(); + } + if (opts->id) { + qdict_put_str(qdict, "id", opts->id); + } + QTAILQ_FOREACH_SAFE(opt, &opts->head, next, next) { + if (list) { + QemuOptDesc *desc; + bool found = false; + for (desc = list->desc; desc->name; desc++) { + if (!strcmp(desc->name, opt->name)) { + found = true; + break; + } + } + if (!found) { + continue; + } + } + qdict_put_str(qdict, opt->name, opt->str); + if (del) { + qemu_opt_del(opt); + } + } + return qdict; +} + +/* Copy all options in a QemuOpts to the given QDict. See + * qemu_opts_to_qdict_filtered() for details. */ +QDict *qemu_opts_to_qdict(QemuOpts *opts, QDict *qdict) +{ + return qemu_opts_to_qdict_filtered(opts, qdict, NULL, false); +} + +/* Validate parsed opts against descriptions where no + * descriptions were provided in the QemuOptsList. + */ +bool qemu_opts_validate(QemuOpts *opts, const QemuOptDesc *desc, Error **errp) +{ + QemuOpt *opt; + + assert(opts_accepts_any(opts->list)); + + QTAILQ_FOREACH(opt, &opts->head, next) { + opt->desc = find_desc_by_name(desc, opt->name); + if (!opt->desc) { + error_setg(errp, QERR_INVALID_PARAMETER, opt->name); + return false; + } + + if (!qemu_opt_parse(opt, errp)) { + return false; + } + } + + return true; +} + +/** + * For each member of @list, call @func(@opaque, member, @errp). + * Call it with the current location temporarily set to the member's. + * @func() may store an Error through @errp, but must return non-zero then. + * When @func() returns non-zero, break the loop and return that value. + * Return zero when the loop completes. 
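+ *
+ * Sketch of a conforming callback (illustrative only):
+ *
+ *     static int count_cb(void *opaque, QemuOpts *opts, Error **errp)
+ *     {
+ *         ++*(int *)opaque;
+ *         return 0;                      // keep iterating
+ *     }
+ *
+ *     int n = 0;
+ *     qemu_opts_foreach(list, count_cb, &n, NULL);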
+ */ +int qemu_opts_foreach(QemuOptsList *list, qemu_opts_loopfunc func, + void *opaque, Error **errp) +{ + Location loc; + QemuOpts *opts, *next; + int rc = 0; + + loc_push_none(&loc); + QTAILQ_FOREACH_SAFE(opts, &list->head, next, next) { + loc_restore(&opts->loc); + rc = func(opaque, opts, errp); + if (rc) { + break; + } + assert(!errp || !*errp); + } + loc_pop(&loc); + return rc; +} + +static size_t count_opts_list(QemuOptsList *list) +{ + QemuOptDesc *desc = NULL; + size_t num_opts = 0; + + if (!list) { + return 0; + } + + desc = list->desc; + while (desc && desc->name) { + num_opts++; + desc++; + } + + return num_opts; +} + +void qemu_opts_free(QemuOptsList *list) +{ + g_free(list); +} + +/* Realloc dst option list and append options from an option list (list) + * to it. dst could be NULL or a malloced list. + * The lifetime of dst must be shorter than the input list because the + * QemuOptDesc->name, ->help, and ->def_value_str strings are shared. + */ +QemuOptsList *qemu_opts_append(QemuOptsList *dst, + QemuOptsList *list) +{ + size_t num_opts, num_dst_opts; + QemuOptDesc *desc; + bool need_init = false; + bool need_head_update; + + if (!list) { + return dst; + } + + /* If dst is NULL, after realloc, some area of dst should be initialized + * before adding options to it. + */ + if (!dst) { + need_init = true; + need_head_update = true; + } else { + /* Moreover, even if dst is not NULL, the realloc may move it to a + * different address in which case we may get a stale tail pointer + * in dst->head. */ + need_head_update = QTAILQ_EMPTY(&dst->head); + } + + num_opts = count_opts_list(dst); + num_dst_opts = num_opts; + num_opts += count_opts_list(list); + dst = g_realloc(dst, sizeof(QemuOptsList) + + (num_opts + 1) * sizeof(QemuOptDesc)); + if (need_init) { + dst->name = NULL; + dst->implied_opt_name = NULL; + dst->merge_lists = false; + } + if (need_head_update) { + QTAILQ_INIT(&dst->head); + } + dst->desc[num_dst_opts].name = NULL; + + /* append list->desc to dst->desc */ + if (list) { + desc = list->desc; + while (desc && desc->name) { + if (find_desc_by_name(dst->desc, desc->name) == NULL) { + dst->desc[num_dst_opts++] = *desc; + dst->desc[num_dst_opts].name = NULL; + } + desc++; + } + } + + return dst; +} diff --git a/util/qemu-print.c b/util/qemu-print.c new file mode 100644 index 000000000..69ba612f5 --- /dev/null +++ b/util/qemu-print.c @@ -0,0 +1,70 @@ +/* + * Print to stream or current monitor + * + * Copyright (C) 2019 Red Hat Inc. + * + * Authors: + * Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "monitor/monitor.h" +#include "qemu/qemu-print.h" + +/* + * Print like vprintf(). + * Print to current monitor if we have one, else to stdout. + */ +int qemu_vprintf(const char *fmt, va_list ap) +{ + Monitor *cur_mon = monitor_cur(); + if (cur_mon) { + return monitor_vprintf(cur_mon, fmt, ap); + } + return vprintf(fmt, ap); +} + +/* + * Print like printf(). + * Print to current monitor if we have one, else to stdout. + */ +int qemu_printf(const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = qemu_vprintf(fmt, ap); + va_end(ap); + return ret; +} + +/* + * Print like vfprintf() + * Print to @stream if non-null, else to current monitor. 
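+ *
+ * This lets a shared helper print sensibly in both contexts, e.g.
+ * (hedged sketch, pct is illustrative):
+ *
+ *     qemu_fprintf(NULL,   "%d%% done\n", pct);    to the monitor
+ *     qemu_fprintf(stderr, "%d%% done\n", pct);    to stderr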
+ */ +int qemu_vfprintf(FILE *stream, const char *fmt, va_list ap) +{ + if (!stream) { + return monitor_vprintf(monitor_cur(), fmt, ap); + } + return vfprintf(stream, fmt, ap); +} + +/* + * Print like fprintf(). + * Print to @stream if non-null, else to current monitor. + */ +int qemu_fprintf(FILE *stream, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = qemu_vfprintf(stream, fmt, ap); + va_end(ap); + return ret; +} diff --git a/util/qemu-progress.c b/util/qemu-progress.c new file mode 100644 index 000000000..20d51f8c1 --- /dev/null +++ b/util/qemu-progress.c @@ -0,0 +1,162 @@ +/* + * QEMU progress printing utility functions + * + * Copyright (C) 2011 Jes Sorensen <Jes.Sorensen@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" + +struct progress_state { + float current; + float last_print; + float min_skip; + void (*print)(void); + void (*end)(void); +}; + +static struct progress_state state; +static volatile sig_atomic_t print_pending; + +/* + * Simple progress print function. + * @percent relative percent of current operation + * @max percent of total operation + */ +static void progress_simple_print(void) +{ + printf(" (%3.2f/100%%)\r", state.current); + fflush(stdout); +} + +static void progress_simple_end(void) +{ + printf("\n"); +} + +static void progress_simple_init(void) +{ + state.print = progress_simple_print; + state.end = progress_simple_end; +} + +#ifdef CONFIG_POSIX +static void sigusr_print(int signal) +{ + print_pending = 1; +} +#endif + +static void progress_dummy_print(void) +{ + if (print_pending) { + fprintf(stderr, " (%3.2f/100%%)\n", state.current); + print_pending = 0; + } +} + +static void progress_dummy_end(void) +{ +} + +static void progress_dummy_init(void) +{ +#ifdef CONFIG_POSIX + struct sigaction action; + sigset_t set; + + memset(&action, 0, sizeof(action)); + sigfillset(&action.sa_mask); + action.sa_handler = sigusr_print; + action.sa_flags = 0; + sigaction(SIGUSR1, &action, NULL); +#ifdef SIGINFO + sigaction(SIGINFO, &action, NULL); +#endif + + /* + * SIGUSR1 is SIG_IPI and gets blocked in qemu_init_main_loop(). In the + * tools that use the progress report SIGUSR1 isn't used in this meaning + * and instead should print the progress, so reenable it. 
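+ *
+ * The effect: even with reporting disabled, a user can prod a
+ * long-running tool (e.g. qemu-img) with "kill -USR1 <pid>" and the
+ * next call into progress_dummy_print() emits one line to stderr.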
+ */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + pthread_sigmask(SIG_UNBLOCK, &set, NULL); +#endif + + state.print = progress_dummy_print; + state.end = progress_dummy_end; +} + +/* + * Initialize progress reporting. + * If @enabled is false, actual reporting is suppressed. The user can + * still trigger a report by sending a SIGUSR1. + * Reports are also suppressed unless we've had at least @min_skip + * percent progress since the last report. + */ +void qemu_progress_init(int enabled, float min_skip) +{ + state.min_skip = min_skip; + if (enabled) { + progress_simple_init(); + } else { + progress_dummy_init(); + } +} + +void qemu_progress_end(void) +{ + state.end(); +} + +/* + * Report progress. + * @delta is how much progress we made. + * If @max is zero, @delta is an absolute value of the total job done. + * Else, @delta is a progress delta since the last call, as a fraction + * of @max. I.e. the delta is @delta * @max / 100. This allows + * relative accounting of functions which may be a different fraction of + * the full job, depending on the context they are called in. I.e. + * a function might be considered 40% of the full job if used from + * bdrv_img_create() but only 20% if called from img_convert(). + */ +void qemu_progress_print(float delta, int max) +{ + float current; + + if (max == 0) { + current = delta; + } else { + current = state.current + delta / 100 * max; + } + if (current > 100) { + current = 100; + } + state.current = current; + + if (current > (state.last_print + state.min_skip) || + current < (state.last_print - state.min_skip) || + current == 100 || current == 0) { + state.last_print = state.current; + state.print(); + } +} diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c new file mode 100644 index 000000000..0585e7a62 --- /dev/null +++ b/util/qemu-sockets.c @@ -0,0 +1,1482 @@ +/* + * inet and unix socket functions for qemu + * + * (c) 2008 Gerd Hoffmann <kraxel@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. 
+ */ +#include "qemu/osdep.h" + +#ifdef CONFIG_AF_VSOCK +#include <linux/vm_sockets.h> +#endif /* CONFIG_AF_VSOCK */ + +#include "qemu-common.h" +#include "monitor/monitor.h" +#include "qapi/clone-visitor.h" +#include "qapi/error.h" +#include "qapi/qapi-visit-sockets.h" +#include "qemu/sockets.h" +#include "qemu/main-loop.h" +#include "qapi/qobject-input-visitor.h" +#include "qapi/qobject-output-visitor.h" +#include "qemu/cutils.h" +#include "trace.h" + +#ifndef AI_ADDRCONFIG +# define AI_ADDRCONFIG 0 +#endif + +#ifndef AI_V4MAPPED +# define AI_V4MAPPED 0 +#endif + +#ifndef AI_NUMERICSERV +# define AI_NUMERICSERV 0 +#endif + + +static int inet_getport(struct addrinfo *e) +{ + struct sockaddr_in *i4; + struct sockaddr_in6 *i6; + + switch (e->ai_family) { + case PF_INET6: + i6 = (void*)e->ai_addr; + return ntohs(i6->sin6_port); + case PF_INET: + i4 = (void*)e->ai_addr; + return ntohs(i4->sin_port); + default: + return 0; + } +} + +static void inet_setport(struct addrinfo *e, int port) +{ + struct sockaddr_in *i4; + struct sockaddr_in6 *i6; + + switch (e->ai_family) { + case PF_INET6: + i6 = (void*)e->ai_addr; + i6->sin6_port = htons(port); + break; + case PF_INET: + i4 = (void*)e->ai_addr; + i4->sin_port = htons(port); + break; + } +} + +NetworkAddressFamily inet_netfamily(int family) +{ + switch (family) { + case PF_INET6: return NETWORK_ADDRESS_FAMILY_IPV6; + case PF_INET: return NETWORK_ADDRESS_FAMILY_IPV4; + case PF_UNIX: return NETWORK_ADDRESS_FAMILY_UNIX; +#ifdef CONFIG_AF_VSOCK + case PF_VSOCK: return NETWORK_ADDRESS_FAMILY_VSOCK; +#endif /* CONFIG_AF_VSOCK */ + } + return NETWORK_ADDRESS_FAMILY_UNKNOWN; +} + +bool fd_is_socket(int fd) +{ + int optval; + socklen_t optlen = sizeof(optval); + return !qemu_getsockopt(fd, SOL_SOCKET, SO_TYPE, &optval, &optlen); +} + + +/* + * Matrix we're trying to apply + * + * ipv4 ipv6 family + * - - PF_UNSPEC + * - f PF_INET + * - t PF_INET6 + * f - PF_INET6 + * f f <error> + * f t PF_INET6 + * t - PF_INET + * t f PF_INET + * t t PF_INET6/PF_UNSPEC + * + * NB, this matrix is only about getting the necessary results + * from getaddrinfo(). Some of the cases require further work + * after reading results from getaddrinfo in order to fully + * apply the logic the end user wants. + * + * In the first and last cases, we must set IPV6_V6ONLY=0 + * when binding, to allow a single listener to potentially + * accept both IPv4+6 addresses. + */ +int inet_ai_family_from_address(InetSocketAddress *addr, + Error **errp) +{ + if (addr->has_ipv6 && addr->has_ipv4 && + !addr->ipv6 && !addr->ipv4) { + error_setg(errp, "Cannot disable IPv4 and IPv6 at same time"); + return PF_UNSPEC; + } + if ((addr->has_ipv6 && addr->ipv6) && (addr->has_ipv4 && addr->ipv4)) { + /* + * Some backends can only do a single listener. In that case + * we want empty hostname to resolve to "::" and then use the + * flag IPV6_V6ONLY==0 to get both protocols on 1 socket. This + * doesn't work for addresses other than "", so they're just + * inevitably broken until multiple listeners can be used, + * and thus we honour getaddrinfo automatic protocol detection + * Once all backends do multi-listener, remove the PF_INET6 + * branch entirely. 
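+ *
+ * Worked examples of the mapping implemented below:
+ *
+ *     ipv4=off, ipv6=off   -> error (cannot disable both)
+ *     ipv4=on,  ipv6=on    -> PF_INET6 for host "", else PF_UNSPEC
+ *     ipv6=on or ipv4=off  -> PF_INET6
+ *     ipv4=on or ipv6=off  -> PF_INET
+ *     neither given        -> PF_UNSPEC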
+ */ + if (!addr->host || g_str_equal(addr->host, "")) { + return PF_INET6; + } else { + return PF_UNSPEC; + } + } + if ((addr->has_ipv6 && addr->ipv6) || (addr->has_ipv4 && !addr->ipv4)) { + return PF_INET6; + } + if ((addr->has_ipv4 && addr->ipv4) || (addr->has_ipv6 && !addr->ipv6)) { + return PF_INET; + } + return PF_UNSPEC; +} + +static int create_fast_reuse_socket(struct addrinfo *e) +{ + int slisten = qemu_socket(e->ai_family, e->ai_socktype, e->ai_protocol); + if (slisten < 0) { + return -1; + } + socket_set_fast_reuse(slisten); + return slisten; +} + +static int try_bind(int socket, InetSocketAddress *saddr, struct addrinfo *e) +{ +#ifndef IPV6_V6ONLY + return bind(socket, e->ai_addr, e->ai_addrlen); +#else + /* + * Deals with first & last cases in matrix in comment + * for inet_ai_family_from_address(). + */ + int v6only = + ((!saddr->has_ipv4 && !saddr->has_ipv6) || + (saddr->has_ipv4 && saddr->ipv4 && + saddr->has_ipv6 && saddr->ipv6)) ? 0 : 1; + int stat; + + rebind: + if (e->ai_family == PF_INET6) { + qemu_setsockopt(socket, IPPROTO_IPV6, IPV6_V6ONLY, &v6only, + sizeof(v6only)); + } + + stat = bind(socket, e->ai_addr, e->ai_addrlen); + if (!stat) { + return 0; + } + + /* If we got EADDRINUSE from an IPv6 bind & v6only is unset, + * it could be that the IPv4 port is already claimed, so retry + * with v6only set + */ + if (e->ai_family == PF_INET6 && errno == EADDRINUSE && !v6only) { + v6only = 1; + goto rebind; + } + return stat; +#endif +} + +static int inet_listen_saddr(InetSocketAddress *saddr, + int port_offset, + int num, + Error **errp) +{ + struct addrinfo ai,*res,*e; + char port[33]; + char uaddr[INET6_ADDRSTRLEN+1]; + char uport[33]; + int rc, port_min, port_max, p; + int slisten = -1; + int saved_errno = 0; + bool socket_created = false; + Error *err = NULL; + + if (saddr->keep_alive) { + error_setg(errp, "keep-alive option is not supported for passive " + "sockets"); + return -1; + } + + memset(&ai,0, sizeof(ai)); + ai.ai_flags = AI_PASSIVE; + if (saddr->has_numeric && saddr->numeric) { + ai.ai_flags |= AI_NUMERICHOST | AI_NUMERICSERV; + } + ai.ai_family = inet_ai_family_from_address(saddr, &err); + ai.ai_socktype = SOCK_STREAM; + + if (err) { + error_propagate(errp, err); + return -1; + } + + if (saddr->host == NULL) { + error_setg(errp, "host not specified"); + return -1; + } + if (saddr->port != NULL) { + pstrcpy(port, sizeof(port), saddr->port); + } else { + port[0] = '\0'; + } + + /* lookup */ + if (port_offset) { + unsigned long long baseport; + if (strlen(port) == 0) { + error_setg(errp, "port not specified"); + return -1; + } + if (parse_uint_full(port, &baseport, 10) < 0) { + error_setg(errp, "can't convert to a number: %s", port); + return -1; + } + if (baseport > 65535 || + baseport + port_offset > 65535) { + error_setg(errp, "port %s out of range", port); + return -1; + } + snprintf(port, sizeof(port), "%d", (int)baseport + port_offset); + } + rc = getaddrinfo(strlen(saddr->host) ? saddr->host : NULL, + strlen(port) ? 
port : NULL, &ai, &res); + if (rc != 0) { + error_setg(errp, "address resolution failed for %s:%s: %s", + saddr->host, port, gai_strerror(rc)); + return -1; + } + + /* create socket + bind/listen */ + for (e = res; e != NULL; e = e->ai_next) { +#ifdef HAVE_IPPROTO_MPTCP + if (saddr->has_mptcp && saddr->mptcp) { + e->ai_protocol = IPPROTO_MPTCP; + } +#endif + getnameinfo((struct sockaddr*)e->ai_addr,e->ai_addrlen, + uaddr,INET6_ADDRSTRLEN,uport,32, + NI_NUMERICHOST | NI_NUMERICSERV); + + port_min = inet_getport(e); + port_max = saddr->has_to ? saddr->to + port_offset : port_min; + for (p = port_min; p <= port_max; p++) { + inet_setport(e, p); + + slisten = create_fast_reuse_socket(e); + if (slisten < 0) { + /* First time we expect we might fail to create the socket + * eg if 'e' has AF_INET6 but ipv6 kmod is not loaded. + * Later iterations should always succeed if first iteration + * worked though, so treat that as fatal. + */ + if (p == port_min) { + continue; + } else { + error_setg_errno(errp, errno, + "Failed to recreate failed listening socket"); + goto listen_failed; + } + } + socket_created = true; + + rc = try_bind(slisten, saddr, e); + if (rc < 0) { + if (errno != EADDRINUSE) { + error_setg_errno(errp, errno, "Failed to bind socket"); + goto listen_failed; + } + } else { + if (!listen(slisten, num)) { + goto listen_ok; + } + if (errno != EADDRINUSE) { + error_setg_errno(errp, errno, "Failed to listen on socket"); + goto listen_failed; + } + } + /* Someone else managed to bind to the same port and beat us + * to listen on it! Socket semantics does not allow us to + * recover from this situation, so we need to recreate the + * socket to allow bind attempts for subsequent ports: + */ + closesocket(slisten); + slisten = -1; + } + } + error_setg_errno(errp, errno, + socket_created ? 
+ "Failed to find an available port" : + "Failed to create a socket"); +listen_failed: + saved_errno = errno; + if (slisten >= 0) { + closesocket(slisten); + } + freeaddrinfo(res); + errno = saved_errno; + return -1; + +listen_ok: + freeaddrinfo(res); + return slisten; +} + +#ifdef _WIN32 +#define QEMU_SOCKET_RC_INPROGRESS(rc) \ + ((rc) == -EINPROGRESS || (rc) == -EWOULDBLOCK || (rc) == -WSAEALREADY) +#else +#define QEMU_SOCKET_RC_INPROGRESS(rc) \ + ((rc) == -EINPROGRESS) +#endif + +static int inet_connect_addr(const InetSocketAddress *saddr, + struct addrinfo *addr, Error **errp) +{ + int sock, rc; + + sock = qemu_socket(addr->ai_family, addr->ai_socktype, addr->ai_protocol); + if (sock < 0) { + error_setg_errno(errp, errno, "Failed to create socket family %d", + addr->ai_family); + return -1; + } + socket_set_fast_reuse(sock); + + /* connect to peer */ + do { + rc = 0; + if (connect(sock, addr->ai_addr, addr->ai_addrlen) < 0) { + rc = -errno; + } + } while (rc == -EINTR); + + if (rc < 0) { + error_setg_errno(errp, errno, "Failed to connect to '%s:%s'", + saddr->host, saddr->port); + closesocket(sock); + return -1; + } + + return sock; +} + +static struct addrinfo *inet_parse_connect_saddr(InetSocketAddress *saddr, + Error **errp) +{ + struct addrinfo ai, *res; + int rc; + Error *err = NULL; + static int useV4Mapped = 1; + + memset(&ai, 0, sizeof(ai)); + + ai.ai_flags = AI_CANONNAME | AI_ADDRCONFIG; + if (qatomic_read(&useV4Mapped)) { + ai.ai_flags |= AI_V4MAPPED; + } + ai.ai_family = inet_ai_family_from_address(saddr, &err); + ai.ai_socktype = SOCK_STREAM; + + if (err) { + error_propagate(errp, err); + return NULL; + } + + if (saddr->host == NULL || saddr->port == NULL) { + error_setg(errp, "host and/or port not specified"); + return NULL; + } + + /* lookup */ + rc = getaddrinfo(saddr->host, saddr->port, &ai, &res); + + /* At least FreeBSD and OS-X 10.6 declare AI_V4MAPPED but + * then don't implement it in their getaddrinfo(). Detect + * this and retry without the flag since that's preferable + * to a fatal error + */ + if (rc == EAI_BADFLAGS && + (ai.ai_flags & AI_V4MAPPED)) { + qatomic_set(&useV4Mapped, 0); + ai.ai_flags &= ~AI_V4MAPPED; + rc = getaddrinfo(saddr->host, saddr->port, &ai, &res); + } + if (rc != 0) { + error_setg(errp, "address resolution failed for %s:%s: %s", + saddr->host, saddr->port, gai_strerror(rc)); + return NULL; + } + return res; +} + +/** + * Create a socket and connect it to an address. + * + * @saddr: Inet socket address specification + * @errp: set on error + * + * Returns: -1 on error, file descriptor on success. 
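+ *
+ * Hedged usage sketch (InetSocketAddress is the qapi-generated
+ * struct; values are illustrative):
+ *
+ *     Error *err = NULL;
+ *     InetSocketAddress saddr = {
+ *         .host = (char *)"localhost",
+ *         .port = (char *)"5900",
+ *     };
+ *     int fd = inet_connect_saddr(&saddr, &err);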
+ */ +int inet_connect_saddr(InetSocketAddress *saddr, Error **errp) +{ + Error *local_err = NULL; + struct addrinfo *res, *e; + int sock = -1; + + res = inet_parse_connect_saddr(saddr, errp); + if (!res) { + return -1; + } + + for (e = res; e != NULL; e = e->ai_next) { + error_free(local_err); + local_err = NULL; + +#ifdef HAVE_IPPROTO_MPTCP + if (saddr->has_mptcp && saddr->mptcp) { + e->ai_protocol = IPPROTO_MPTCP; + } +#endif + + sock = inet_connect_addr(saddr, e, &local_err); + if (sock >= 0) { + break; + } + } + + freeaddrinfo(res); + + if (sock < 0) { + error_propagate(errp, local_err); + return sock; + } + + if (saddr->keep_alive) { + int val = 1; + int ret = qemu_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, + &val, sizeof(val)); + + if (ret < 0) { + error_setg_errno(errp, errno, "Unable to set KEEPALIVE"); + close(sock); + return -1; + } + } + + return sock; +} + +static int inet_dgram_saddr(InetSocketAddress *sraddr, + InetSocketAddress *sladdr, + Error **errp) +{ + struct addrinfo ai, *peer = NULL, *local = NULL; + const char *addr; + const char *port; + int sock = -1, rc; + Error *err = NULL; + + /* lookup peer addr */ + memset(&ai,0, sizeof(ai)); + ai.ai_flags = AI_CANONNAME | AI_V4MAPPED | AI_ADDRCONFIG; + ai.ai_family = inet_ai_family_from_address(sraddr, &err); + ai.ai_socktype = SOCK_DGRAM; + + if (err) { + error_propagate(errp, err); + goto err; + } + + addr = sraddr->host; + port = sraddr->port; + if (addr == NULL || strlen(addr) == 0) { + addr = "localhost"; + } + if (port == NULL || strlen(port) == 0) { + error_setg(errp, "remote port not specified"); + goto err; + } + + if ((rc = getaddrinfo(addr, port, &ai, &peer)) != 0) { + error_setg(errp, "address resolution failed for %s:%s: %s", addr, port, + gai_strerror(rc)); + goto err; + } + + /* lookup local addr */ + memset(&ai,0, sizeof(ai)); + ai.ai_flags = AI_PASSIVE; + ai.ai_family = peer->ai_family; + ai.ai_socktype = SOCK_DGRAM; + + if (sladdr) { + addr = sladdr->host; + port = sladdr->port; + if (addr == NULL || strlen(addr) == 0) { + addr = NULL; + } + if (!port || strlen(port) == 0) { + port = "0"; + } + } else { + addr = NULL; + port = "0"; + } + + if ((rc = getaddrinfo(addr, port, &ai, &local)) != 0) { + error_setg(errp, "address resolution failed for %s:%s: %s", addr, port, + gai_strerror(rc)); + goto err; + } + + /* create socket */ + sock = qemu_socket(peer->ai_family, peer->ai_socktype, peer->ai_protocol); + if (sock < 0) { + error_setg_errno(errp, errno, "Failed to create socket family %d", + peer->ai_family); + goto err; + } + socket_set_fast_reuse(sock); + + /* bind socket */ + if (bind(sock, local->ai_addr, local->ai_addrlen) < 0) { + error_setg_errno(errp, errno, "Failed to bind socket"); + goto err; + } + + /* connect to peer */ + if (connect(sock,peer->ai_addr,peer->ai_addrlen) < 0) { + error_setg_errno(errp, errno, "Failed to connect to '%s:%s'", + addr, port); + goto err; + } + + freeaddrinfo(local); + freeaddrinfo(peer); + return sock; + +err: + if (sock != -1) { + closesocket(sock); + } + if (local) { + freeaddrinfo(local); + } + if (peer) { + freeaddrinfo(peer); + } + + return -1; +} + +/* compatibility wrapper */ +static int inet_parse_flag(const char *flagname, const char *optstr, bool *val, + Error **errp) +{ + char *end; + size_t len; + + end = strstr(optstr, ","); + if (end) { + if (end[1] == ',') { /* Reject 'ipv6=on,,foo' */ + error_setg(errp, "error parsing '%s' flag '%s'", flagname, optstr); + return -1; + } + len = end - optstr; + } else { + len = strlen(optstr); + } + if (len == 0 || (len 
== 3 && strncmp(optstr, "=on", len) == 0)) { + *val = true; + } else if (len == 4 && strncmp(optstr, "=off", len) == 0) { + *val = false; + } else { + error_setg(errp, "error parsing '%s' flag '%s'", flagname, optstr); + return -1; + } + return 0; +} + +int inet_parse(InetSocketAddress *addr, const char *str, Error **errp) +{ + const char *optstr, *h; + char host[65]; + char port[33]; + int to; + int pos; + char *begin; + + memset(addr, 0, sizeof(*addr)); + + /* parse address */ + if (str[0] == ':') { + /* no host given */ + host[0] = '\0'; + if (sscanf(str, ":%32[^,]%n", port, &pos) != 1) { + error_setg(errp, "error parsing port in address '%s'", str); + return -1; + } + } else if (str[0] == '[') { + /* IPv6 addr */ + if (sscanf(str, "[%64[^]]]:%32[^,]%n", host, port, &pos) != 2) { + error_setg(errp, "error parsing IPv6 address '%s'", str); + return -1; + } + } else { + /* hostname or IPv4 addr */ + if (sscanf(str, "%64[^:]:%32[^,]%n", host, port, &pos) != 2) { + error_setg(errp, "error parsing address '%s'", str); + return -1; + } + } + + addr->host = g_strdup(host); + addr->port = g_strdup(port); + + /* parse options */ + optstr = str + pos; + h = strstr(optstr, ",to="); + if (h) { + h += 4; + if (sscanf(h, "%d%n", &to, &pos) != 1 || + (h[pos] != '\0' && h[pos] != ',')) { + error_setg(errp, "error parsing to= argument"); + return -1; + } + addr->has_to = true; + addr->to = to; + } + begin = strstr(optstr, ",ipv4"); + if (begin) { + if (inet_parse_flag("ipv4", begin + 5, &addr->ipv4, errp) < 0) { + return -1; + } + addr->has_ipv4 = true; + } + begin = strstr(optstr, ",ipv6"); + if (begin) { + if (inet_parse_flag("ipv6", begin + 5, &addr->ipv6, errp) < 0) { + return -1; + } + addr->has_ipv6 = true; + } + begin = strstr(optstr, ",keep-alive"); + if (begin) { + if (inet_parse_flag("keep-alive", begin + strlen(",keep-alive"), + &addr->keep_alive, errp) < 0) + { + return -1; + } + addr->has_keep_alive = true; + } +#ifdef HAVE_IPPROTO_MPTCP + begin = strstr(optstr, ",mptcp"); + if (begin) { + if (inet_parse_flag("mptcp", begin + strlen(",mptcp"), + &addr->mptcp, errp) < 0) + { + return -1; + } + addr->has_mptcp = true; + } +#endif + return 0; +} + + +/** + * Create a blocking socket and connect it to an address. 
+ * + * @str: address string + * @errp: set in case of an error + * + * Returns -1 in case of error, file descriptor on success + **/ +int inet_connect(const char *str, Error **errp) +{ + int sock = -1; + InetSocketAddress *addr = g_new(InetSocketAddress, 1); + + if (!inet_parse(addr, str, errp)) { + sock = inet_connect_saddr(addr, errp); + } + qapi_free_InetSocketAddress(addr); + return sock; +} + +#ifdef CONFIG_AF_VSOCK +static bool vsock_parse_vaddr_to_sockaddr(const VsockSocketAddress *vaddr, + struct sockaddr_vm *svm, + Error **errp) +{ + unsigned long long val; + + memset(svm, 0, sizeof(*svm)); + svm->svm_family = AF_VSOCK; + + if (parse_uint_full(vaddr->cid, &val, 10) < 0 || + val > UINT32_MAX) { + error_setg(errp, "Failed to parse cid '%s'", vaddr->cid); + return false; + } + svm->svm_cid = val; + + if (parse_uint_full(vaddr->port, &val, 10) < 0 || + val > UINT32_MAX) { + error_setg(errp, "Failed to parse port '%s'", vaddr->port); + return false; + } + svm->svm_port = val; + + return true; +} + +static int vsock_connect_addr(const VsockSocketAddress *vaddr, + const struct sockaddr_vm *svm, Error **errp) +{ + int sock, rc; + + sock = qemu_socket(AF_VSOCK, SOCK_STREAM, 0); + if (sock < 0) { + error_setg_errno(errp, errno, "Failed to create socket family %d", + AF_VSOCK); + return -1; + } + + /* connect to peer */ + do { + rc = 0; + if (connect(sock, (const struct sockaddr *)svm, sizeof(*svm)) < 0) { + rc = -errno; + } + } while (rc == -EINTR); + + if (rc < 0) { + error_setg_errno(errp, errno, "Failed to connect to '%s:%s'", + vaddr->cid, vaddr->port); + closesocket(sock); + return -1; + } + + return sock; +} + +static int vsock_connect_saddr(VsockSocketAddress *vaddr, Error **errp) +{ + struct sockaddr_vm svm; + + if (!vsock_parse_vaddr_to_sockaddr(vaddr, &svm, errp)) { + return -1; + } + + return vsock_connect_addr(vaddr, &svm, errp); +} + +static int vsock_listen_saddr(VsockSocketAddress *vaddr, + int num, + Error **errp) +{ + struct sockaddr_vm svm; + int slisten; + + if (!vsock_parse_vaddr_to_sockaddr(vaddr, &svm, errp)) { + return -1; + } + + slisten = qemu_socket(AF_VSOCK, SOCK_STREAM, 0); + if (slisten < 0) { + error_setg_errno(errp, errno, "Failed to create socket"); + return -1; + } + + if (bind(slisten, (const struct sockaddr *)&svm, sizeof(svm)) != 0) { + error_setg_errno(errp, errno, "Failed to bind socket"); + closesocket(slisten); + return -1; + } + + if (listen(slisten, num) != 0) { + error_setg_errno(errp, errno, "Failed to listen on socket"); + closesocket(slisten); + return -1; + } + return slisten; +} + +static int vsock_parse(VsockSocketAddress *addr, const char *str, + Error **errp) +{ + char cid[33]; + char port[33]; + int n; + + if (sscanf(str, "%32[^:]:%32[^,]%n", cid, port, &n) != 2) { + error_setg(errp, "error parsing address '%s'", str); + return -1; + } + if (str[n] != '\0') { + error_setg(errp, "trailing characters in address '%s'", str); + return -1; + } + + addr->cid = g_strdup(cid); + addr->port = g_strdup(port); + return 0; +} +#else +static void vsock_unsupported(Error **errp) +{ + error_setg(errp, "socket family AF_VSOCK unsupported"); +} + +static int vsock_connect_saddr(VsockSocketAddress *vaddr, Error **errp) +{ + vsock_unsupported(errp); + return -1; +} + +static int vsock_listen_saddr(VsockSocketAddress *vaddr, + int num, + Error **errp) +{ + vsock_unsupported(errp); + return -1; +} + +static int vsock_parse(VsockSocketAddress *addr, const char *str, + Error **errp) +{ + vsock_unsupported(errp); + return -1; +} +#endif /* CONFIG_AF_VSOCK */ + 
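+/*
+ * Hedged example for the vsock helpers above: seen from a guest, the
+ * host is the well-known CID 2, so the string "2:1234" parses via
+ * vsock_parse() into cid="2", port="1234", and connecting is:
+ *
+ *     Error *err = NULL;
+ *     VsockSocketAddress vaddr = {
+ *         .cid  = (char *)"2",
+ *         .port = (char *)"1234",
+ *     };
+ *     int fd = vsock_connect_saddr(&vaddr, &err);
+ */
+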
+#ifndef _WIN32 + +static bool saddr_is_abstract(UnixSocketAddress *saddr) +{ +#ifdef CONFIG_LINUX + return saddr->abstract; +#else + return false; +#endif +} + +static bool saddr_is_tight(UnixSocketAddress *saddr) +{ +#ifdef CONFIG_LINUX + return !saddr->has_tight || saddr->tight; +#else + return false; +#endif +} + +static int unix_listen_saddr(UnixSocketAddress *saddr, + int num, + Error **errp) +{ + bool abstract = saddr_is_abstract(saddr); + struct sockaddr_un un; + int sock, fd; + char *pathbuf = NULL; + const char *path; + size_t pathlen; + size_t addrlen; + + sock = qemu_socket(PF_UNIX, SOCK_STREAM, 0); + if (sock < 0) { + error_setg_errno(errp, errno, "Failed to create Unix socket"); + return -1; + } + + if (saddr->path[0] || abstract) { + path = saddr->path; + } else { + const char *tmpdir = getenv("TMPDIR"); + tmpdir = tmpdir ? tmpdir : "/tmp"; + path = pathbuf = g_strdup_printf("%s/qemu-socket-XXXXXX", tmpdir); + } + + pathlen = strlen(path); + if (pathlen > sizeof(un.sun_path) || + (abstract && pathlen > (sizeof(un.sun_path) - 1))) { + error_setg(errp, "UNIX socket path '%s' is too long", path); + error_append_hint(errp, "Path must be less than %zu bytes\n", + abstract ? sizeof(un.sun_path) - 1 : + sizeof(un.sun_path)); + goto err; + } + + if (pathbuf != NULL) { + /* + * This dummy fd usage silences the mktemp() unsecure warning. + * Using mkstemp() doesn't make things more secure here + * though. bind() complains about existing files, so we have + * to unlink first and thus re-open the race window. The + * worst case possible is bind() failing, i.e. a DoS attack. + */ + fd = mkstemp(pathbuf); + if (fd < 0) { + error_setg_errno(errp, errno, + "Failed to make a temporary socket %s", pathbuf); + goto err; + } + close(fd); + } + + if (!abstract && unlink(path) < 0 && errno != ENOENT) { + error_setg_errno(errp, errno, + "Failed to unlink socket %s", path); + goto err; + } + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + addrlen = sizeof(un); + + if (abstract) { + un.sun_path[0] = '\0'; + memcpy(&un.sun_path[1], path, pathlen); + if (saddr_is_tight(saddr)) { + addrlen = offsetof(struct sockaddr_un, sun_path) + 1 + pathlen; + } + } else { + memcpy(un.sun_path, path, pathlen); + } + + if (bind(sock, (struct sockaddr *) &un, addrlen) < 0) { + error_setg_errno(errp, errno, "Failed to bind socket to %s", path); + goto err; + } + if (listen(sock, num) < 0) { + error_setg_errno(errp, errno, "Failed to listen on socket"); + goto err; + } + + g_free(pathbuf); + return sock; + +err: + g_free(pathbuf); + closesocket(sock); + return -1; +} + +static int unix_connect_saddr(UnixSocketAddress *saddr, Error **errp) +{ + bool abstract = saddr_is_abstract(saddr); + struct sockaddr_un un; + int sock, rc; + size_t pathlen; + size_t addrlen; + + if (saddr->path == NULL) { + error_setg(errp, "unix connect: no path specified"); + return -1; + } + + sock = qemu_socket(PF_UNIX, SOCK_STREAM, 0); + if (sock < 0) { + error_setg_errno(errp, errno, "Failed to create socket"); + return -1; + } + + pathlen = strlen(saddr->path); + if (pathlen > sizeof(un.sun_path) || + (abstract && pathlen > (sizeof(un.sun_path) - 1))) { + error_setg(errp, "UNIX socket path '%s' is too long", saddr->path); + error_append_hint(errp, "Path must be less than %zu bytes\n", + abstract ? 
sizeof(un.sun_path) - 1 : + sizeof(un.sun_path)); + goto err; + } + + memset(&un, 0, sizeof(un)); + un.sun_family = AF_UNIX; + addrlen = sizeof(un); + + if (abstract) { + un.sun_path[0] = '\0'; + memcpy(&un.sun_path[1], saddr->path, pathlen); + if (saddr_is_tight(saddr)) { + addrlen = offsetof(struct sockaddr_un, sun_path) + 1 + pathlen; + } + } else { + memcpy(un.sun_path, saddr->path, pathlen); + } + /* connect to peer */ + do { + rc = 0; + if (connect(sock, (struct sockaddr *) &un, addrlen) < 0) { + rc = -errno; + } + } while (rc == -EINTR); + + if (rc < 0) { + error_setg_errno(errp, -rc, "Failed to connect to '%s'", + saddr->path); + goto err; + } + + return sock; + + err: + close(sock); + return -1; +} + +#else + +static int unix_listen_saddr(UnixSocketAddress *saddr, + int num, + Error **errp) +{ + error_setg(errp, "unix sockets are not available on windows"); + errno = ENOTSUP; + return -1; +} + +static int unix_connect_saddr(UnixSocketAddress *saddr, Error **errp) +{ + error_setg(errp, "unix sockets are not available on windows"); + errno = ENOTSUP; + return -1; +} +#endif + +/* compatibility wrapper */ +int unix_listen(const char *str, Error **errp) +{ + UnixSocketAddress *saddr; + int sock; + + saddr = g_new0(UnixSocketAddress, 1); + saddr->path = g_strdup(str); + sock = unix_listen_saddr(saddr, 1, errp); + qapi_free_UnixSocketAddress(saddr); + return sock; +} + +int unix_connect(const char *path, Error **errp) +{ + UnixSocketAddress *saddr; + int sock; + + saddr = g_new0(UnixSocketAddress, 1); + saddr->path = g_strdup(path); + sock = unix_connect_saddr(saddr, errp); + qapi_free_UnixSocketAddress(saddr); + return sock; +} + + +SocketAddress *socket_parse(const char *str, Error **errp) +{ + SocketAddress *addr; + + addr = g_new0(SocketAddress, 1); + if (strstart(str, "unix:", NULL)) { + if (str[5] == '\0') { + error_setg(errp, "invalid Unix socket address"); + goto fail; + } else { + addr->type = SOCKET_ADDRESS_TYPE_UNIX; + addr->u.q_unix.path = g_strdup(str + 5); + } + } else if (strstart(str, "fd:", NULL)) { + if (str[3] == '\0') { + error_setg(errp, "invalid file descriptor address"); + goto fail; + } else { + addr->type = SOCKET_ADDRESS_TYPE_FD; + addr->u.fd.str = g_strdup(str + 3); + } + } else if (strstart(str, "vsock:", NULL)) { + addr->type = SOCKET_ADDRESS_TYPE_VSOCK; + if (vsock_parse(&addr->u.vsock, str + strlen("vsock:"), errp)) { + goto fail; + } + } else { + addr->type = SOCKET_ADDRESS_TYPE_INET; + if (inet_parse(&addr->u.inet, str, errp)) { + goto fail; + } + } + return addr; + +fail: + qapi_free_SocketAddress(addr); + return NULL; +} + +static int socket_get_fd(const char *fdstr, Error **errp) +{ + Monitor *cur_mon = monitor_cur(); + int fd; + if (cur_mon) { + fd = monitor_get_fd(cur_mon, fdstr, errp); + if (fd < 0) { + return -1; + } + } else { + if (qemu_strtoi(fdstr, NULL, 10, &fd) < 0) { + error_setg_errno(errp, errno, + "Unable to parse FD number %s", + fdstr); + return -1; + } + } + if (!fd_is_socket(fd)) { + error_setg(errp, "File descriptor '%s' is not a socket", fdstr); + close(fd); + return -1; + } + return fd; +} + +int socket_address_parse_named_fd(SocketAddress *addr, Error **errp) +{ + int fd; + + if (addr->type != SOCKET_ADDRESS_TYPE_FD) { + return 0; + } + + fd = socket_get_fd(addr->u.fd.str, errp); + if (fd < 0) { + return fd; + } + + g_free(addr->u.fd.str); + addr->u.fd.str = g_strdup_printf("%d", fd); + + return 0; +} + +int socket_connect(SocketAddress *addr, Error **errp) +{ + int fd; + + switch (addr->type) { + case SOCKET_ADDRESS_TYPE_INET: + 
fd = inet_connect_saddr(&addr->u.inet, errp); + break; + + case SOCKET_ADDRESS_TYPE_UNIX: + fd = unix_connect_saddr(&addr->u.q_unix, errp); + break; + + case SOCKET_ADDRESS_TYPE_FD: + fd = socket_get_fd(addr->u.fd.str, errp); + break; + + case SOCKET_ADDRESS_TYPE_VSOCK: + fd = vsock_connect_saddr(&addr->u.vsock, errp); + break; + + default: + abort(); + } + return fd; +} + +int socket_listen(SocketAddress *addr, int num, Error **errp) +{ + int fd; + + trace_socket_listen(num); + switch (addr->type) { + case SOCKET_ADDRESS_TYPE_INET: + fd = inet_listen_saddr(&addr->u.inet, 0, num, errp); + break; + + case SOCKET_ADDRESS_TYPE_UNIX: + fd = unix_listen_saddr(&addr->u.q_unix, num, errp); + break; + + case SOCKET_ADDRESS_TYPE_FD: + fd = socket_get_fd(addr->u.fd.str, errp); + if (fd < 0) { + return -1; + } + + /* + * If the socket is not yet in the listen state, then transition it to + * the listen state now. + * + * If it's already listening then this updates the backlog value as + * requested. + * + * If this socket cannot listen because it's already in another state + * (e.g. unbound or connected) then we'll catch the error here. + */ + if (listen(fd, num) != 0) { + error_setg_errno(errp, errno, "Failed to listen on fd socket"); + closesocket(fd); + return -1; + } + break; + + case SOCKET_ADDRESS_TYPE_VSOCK: + fd = vsock_listen_saddr(&addr->u.vsock, num, errp); + break; + + default: + abort(); + } + return fd; +} + +void socket_listen_cleanup(int fd, Error **errp) +{ + SocketAddress *addr; + + addr = socket_local_address(fd, errp); + if (!addr) { + return; + } + + if (addr->type == SOCKET_ADDRESS_TYPE_UNIX + && addr->u.q_unix.path) { + if (unlink(addr->u.q_unix.path) < 0 && errno != ENOENT) { + error_setg_errno(errp, errno, + "Failed to unlink socket %s", + addr->u.q_unix.path); + } + } + + qapi_free_SocketAddress(addr); +} + +int socket_dgram(SocketAddress *remote, SocketAddress *local, Error **errp) +{ + int fd; + + /* + * TODO SOCKET_ADDRESS_TYPE_FD when fd is AF_INET or AF_INET6 + * (although other address families can do SOCK_DGRAM, too) + */ + switch (remote->type) { + case SOCKET_ADDRESS_TYPE_INET: + fd = inet_dgram_saddr(&remote->u.inet, + local ? 
&local->u.inet : NULL, errp); + break; + + default: + error_setg(errp, "socket type unsupported for datagram"); + fd = -1; + } + return fd; +} + + +static SocketAddress * +socket_sockaddr_to_address_inet(struct sockaddr_storage *sa, + socklen_t salen, + Error **errp) +{ + char host[NI_MAXHOST]; + char serv[NI_MAXSERV]; + SocketAddress *addr; + InetSocketAddress *inet; + int ret; + + ret = getnameinfo((struct sockaddr *)sa, salen, + host, sizeof(host), + serv, sizeof(serv), + NI_NUMERICHOST | NI_NUMERICSERV); + if (ret != 0) { + error_setg(errp, "Cannot format numeric socket address: %s", + gai_strerror(ret)); + return NULL; + } + + addr = g_new0(SocketAddress, 1); + addr->type = SOCKET_ADDRESS_TYPE_INET; + inet = &addr->u.inet; + inet->host = g_strdup(host); + inet->port = g_strdup(serv); + if (sa->ss_family == AF_INET) { + inet->has_ipv4 = inet->ipv4 = true; + } else { + inet->has_ipv6 = inet->ipv6 = true; + } + + return addr; +} + + +#ifndef WIN32 +static SocketAddress * +socket_sockaddr_to_address_unix(struct sockaddr_storage *sa, + socklen_t salen, + Error **errp) +{ + SocketAddress *addr; + struct sockaddr_un *su = (struct sockaddr_un *)sa; + + addr = g_new0(SocketAddress, 1); + addr->type = SOCKET_ADDRESS_TYPE_UNIX; + salen -= offsetof(struct sockaddr_un, sun_path); +#ifdef CONFIG_LINUX + if (salen > 0 && !su->sun_path[0]) { + /* Linux abstract socket */ + addr->u.q_unix.path = g_strndup(su->sun_path + 1, salen - 1); + addr->u.q_unix.has_abstract = true; + addr->u.q_unix.abstract = true; + addr->u.q_unix.has_tight = true; + addr->u.q_unix.tight = salen < sizeof(su->sun_path); + return addr; + } +#endif + + addr->u.q_unix.path = g_strndup(su->sun_path, salen); + return addr; +} +#endif /* WIN32 */ + +#ifdef CONFIG_AF_VSOCK +static SocketAddress * +socket_sockaddr_to_address_vsock(struct sockaddr_storage *sa, + socklen_t salen, + Error **errp) +{ + SocketAddress *addr; + VsockSocketAddress *vaddr; + struct sockaddr_vm *svm = (struct sockaddr_vm *)sa; + + addr = g_new0(SocketAddress, 1); + addr->type = SOCKET_ADDRESS_TYPE_VSOCK; + vaddr = &addr->u.vsock; + vaddr->cid = g_strdup_printf("%u", svm->svm_cid); + vaddr->port = g_strdup_printf("%u", svm->svm_port); + + return addr; +} +#endif /* CONFIG_AF_VSOCK */ + +SocketAddress * +socket_sockaddr_to_address(struct sockaddr_storage *sa, + socklen_t salen, + Error **errp) +{ + switch (sa->ss_family) { + case AF_INET: + case AF_INET6: + return socket_sockaddr_to_address_inet(sa, salen, errp); + +#ifndef WIN32 + case AF_UNIX: + return socket_sockaddr_to_address_unix(sa, salen, errp); +#endif /* WIN32 */ + +#ifdef CONFIG_AF_VSOCK + case AF_VSOCK: + return socket_sockaddr_to_address_vsock(sa, salen, errp); +#endif + + default: + error_setg(errp, "socket family %d unsupported", + sa->ss_family); + return NULL; + } + return 0; +} + + +SocketAddress *socket_local_address(int fd, Error **errp) +{ + struct sockaddr_storage ss; + socklen_t sslen = sizeof(ss); + + if (getsockname(fd, (struct sockaddr *)&ss, &sslen) < 0) { + error_setg_errno(errp, errno, "%s", + "Unable to query local socket address"); + return NULL; + } + + return socket_sockaddr_to_address(&ss, sslen, errp); +} + + +SocketAddress *socket_remote_address(int fd, Error **errp) +{ + struct sockaddr_storage ss; + socklen_t sslen = sizeof(ss); + + if (getpeername(fd, (struct sockaddr *)&ss, &sslen) < 0) { + error_setg_errno(errp, errno, "%s", + "Unable to query remote socket address"); + return NULL; + } + + return socket_sockaddr_to_address(&ss, sslen, errp); +} + + +SocketAddress 
*socket_address_flatten(SocketAddressLegacy *addr_legacy) +{ + SocketAddress *addr; + + if (!addr_legacy) { + return NULL; + } + + addr = g_new(SocketAddress, 1); + + switch (addr_legacy->type) { + case SOCKET_ADDRESS_TYPE_INET: + addr->type = SOCKET_ADDRESS_TYPE_INET; + QAPI_CLONE_MEMBERS(InetSocketAddress, &addr->u.inet, + addr_legacy->u.inet.data); + break; + case SOCKET_ADDRESS_TYPE_UNIX: + addr->type = SOCKET_ADDRESS_TYPE_UNIX; + QAPI_CLONE_MEMBERS(UnixSocketAddress, &addr->u.q_unix, + addr_legacy->u.q_unix.data); + break; + case SOCKET_ADDRESS_TYPE_VSOCK: + addr->type = SOCKET_ADDRESS_TYPE_VSOCK; + QAPI_CLONE_MEMBERS(VsockSocketAddress, &addr->u.vsock, + addr_legacy->u.vsock.data); + break; + case SOCKET_ADDRESS_TYPE_FD: + addr->type = SOCKET_ADDRESS_TYPE_FD; + QAPI_CLONE_MEMBERS(String, &addr->u.fd, addr_legacy->u.fd.data); + break; + default: + abort(); + } + + return addr; +} diff --git a/util/qemu-thread-common.h b/util/qemu-thread-common.h new file mode 100644 index 000000000..2af6b1208 --- /dev/null +++ b/util/qemu-thread-common.h @@ -0,0 +1,54 @@ +/* + * Common qemu-thread implementation header file. + * + * Copyright Red Hat, Inc. 2018 + * + * Authors: + * Peter Xu <peterx@redhat.com>, + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#ifndef QEMU_THREAD_COMMON_H +#define QEMU_THREAD_COMMON_H + +#include "qemu/thread.h" +#include "trace.h" + +static inline void qemu_mutex_post_init(QemuMutex *mutex) +{ +#ifdef CONFIG_DEBUG_MUTEX + mutex->file = NULL; + mutex->line = 0; +#endif + mutex->initialized = true; +} + +static inline void qemu_mutex_pre_lock(QemuMutex *mutex, + const char *file, int line) +{ + trace_qemu_mutex_lock(mutex, file, line); +} + +static inline void qemu_mutex_post_lock(QemuMutex *mutex, + const char *file, int line) +{ +#ifdef CONFIG_DEBUG_MUTEX + mutex->file = file; + mutex->line = line; +#endif + trace_qemu_mutex_locked(mutex, file, line); +} + +static inline void qemu_mutex_pre_unlock(QemuMutex *mutex, + const char *file, int line) +{ +#ifdef CONFIG_DEBUG_MUTEX + mutex->file = NULL; + mutex->line = 0; +#endif + trace_qemu_mutex_unlock(mutex, file, line); +} + +#endif diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c new file mode 100644 index 000000000..e1225b63b --- /dev/null +++ b/util/qemu-thread-posix.c @@ -0,0 +1,632 @@ +/* + * Wrappers around mutex/cond/thread functions + * + * Copyright Red Hat, Inc. 2009 + * + * Author: + * Marcelo Tosatti <mtosatti@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ * + */ +#include "qemu/osdep.h" +#include "qemu/thread.h" +#include "qemu/atomic.h" +#include "qemu/notify.h" +#include "qemu-thread-common.h" +#include "qemu/tsan.h" + +static bool name_threads; + +void qemu_thread_naming(bool enable) +{ + name_threads = enable; + +#if !defined CONFIG_PTHREAD_SETNAME_NP_W_TID && \ + !defined CONFIG_PTHREAD_SETNAME_NP_WO_TID + /* This is a debugging option, not fatal */ + if (enable) { + fprintf(stderr, "qemu: thread naming not supported on this host\n"); + } +#endif +} + +static void error_exit(int err, const char *msg) +{ + fprintf(stderr, "qemu: %s: %s\n", msg, strerror(err)); + abort(); +} + +static void compute_abs_deadline(struct timespec *ts, int ms) +{ + struct timeval tv; + gettimeofday(&tv, NULL); + ts->tv_nsec = tv.tv_usec * 1000 + (ms % 1000) * 1000000; + ts->tv_sec = tv.tv_sec + ms / 1000; + if (ts->tv_nsec >= 1000000000) { + ts->tv_sec++; + ts->tv_nsec -= 1000000000; + } +} + +void qemu_mutex_init(QemuMutex *mutex) +{ + int err; + + err = pthread_mutex_init(&mutex->lock, NULL); + if (err) + error_exit(err, __func__); + qemu_mutex_post_init(mutex); +} + +void qemu_mutex_destroy(QemuMutex *mutex) +{ + int err; + + assert(mutex->initialized); + mutex->initialized = false; + err = pthread_mutex_destroy(&mutex->lock); + if (err) + error_exit(err, __func__); +} + +void qemu_mutex_lock_impl(QemuMutex *mutex, const char *file, const int line) +{ + int err; + + assert(mutex->initialized); + qemu_mutex_pre_lock(mutex, file, line); + err = pthread_mutex_lock(&mutex->lock); + if (err) + error_exit(err, __func__); + qemu_mutex_post_lock(mutex, file, line); +} + +int qemu_mutex_trylock_impl(QemuMutex *mutex, const char *file, const int line) +{ + int err; + + assert(mutex->initialized); + err = pthread_mutex_trylock(&mutex->lock); + if (err == 0) { + qemu_mutex_post_lock(mutex, file, line); + return 0; + } + if (err != EBUSY) { + error_exit(err, __func__); + } + return -EBUSY; +} + +void qemu_mutex_unlock_impl(QemuMutex *mutex, const char *file, const int line) +{ + int err; + + assert(mutex->initialized); + qemu_mutex_pre_unlock(mutex, file, line); + err = pthread_mutex_unlock(&mutex->lock); + if (err) + error_exit(err, __func__); +} + +void qemu_rec_mutex_init(QemuRecMutex *mutex) +{ + int err; + pthread_mutexattr_t attr; + + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE); + err = pthread_mutex_init(&mutex->m.lock, &attr); + pthread_mutexattr_destroy(&attr); + if (err) { + error_exit(err, __func__); + } + mutex->m.initialized = true; +} + +void qemu_rec_mutex_destroy(QemuRecMutex *mutex) +{ + qemu_mutex_destroy(&mutex->m); +} + +void qemu_rec_mutex_lock_impl(QemuRecMutex *mutex, const char *file, int line) +{ + qemu_mutex_lock_impl(&mutex->m, file, line); +} + +int qemu_rec_mutex_trylock_impl(QemuRecMutex *mutex, const char *file, int line) +{ + return qemu_mutex_trylock_impl(&mutex->m, file, line); +} + +void qemu_rec_mutex_unlock_impl(QemuRecMutex *mutex, const char *file, int line) +{ + qemu_mutex_unlock_impl(&mutex->m, file, line); +} + +void qemu_cond_init(QemuCond *cond) +{ + int err; + + err = pthread_cond_init(&cond->cond, NULL); + if (err) + error_exit(err, __func__); + cond->initialized = true; +} + +void qemu_cond_destroy(QemuCond *cond) +{ + int err; + + assert(cond->initialized); + cond->initialized = false; + err = pthread_cond_destroy(&cond->cond); + if (err) + error_exit(err, __func__); +} + +void qemu_cond_signal(QemuCond *cond) +{ + int err; + + assert(cond->initialized); + err = 
pthread_cond_signal(&cond->cond); + if (err) + error_exit(err, __func__); +} + +void qemu_cond_broadcast(QemuCond *cond) +{ + int err; + + assert(cond->initialized); + err = pthread_cond_broadcast(&cond->cond); + if (err) + error_exit(err, __func__); +} + +void qemu_cond_wait_impl(QemuCond *cond, QemuMutex *mutex, const char *file, const int line) +{ + int err; + + assert(cond->initialized); + qemu_mutex_pre_unlock(mutex, file, line); + err = pthread_cond_wait(&cond->cond, &mutex->lock); + qemu_mutex_post_lock(mutex, file, line); + if (err) + error_exit(err, __func__); +} + +bool qemu_cond_timedwait_impl(QemuCond *cond, QemuMutex *mutex, int ms, + const char *file, const int line) +{ + int err; + struct timespec ts; + + assert(cond->initialized); + trace_qemu_mutex_unlock(mutex, file, line); + compute_abs_deadline(&ts, ms); + err = pthread_cond_timedwait(&cond->cond, &mutex->lock, &ts); + trace_qemu_mutex_locked(mutex, file, line); + if (err && err != ETIMEDOUT) { + error_exit(err, __func__); + } + return err != ETIMEDOUT; +} + +void qemu_sem_init(QemuSemaphore *sem, int init) +{ + int rc; + +#ifndef CONFIG_SEM_TIMEDWAIT + rc = pthread_mutex_init(&sem->lock, NULL); + if (rc != 0) { + error_exit(rc, __func__); + } + rc = pthread_cond_init(&sem->cond, NULL); + if (rc != 0) { + error_exit(rc, __func__); + } + if (init < 0) { + error_exit(EINVAL, __func__); + } + sem->count = init; +#else + rc = sem_init(&sem->sem, 0, init); + if (rc < 0) { + error_exit(errno, __func__); + } +#endif + sem->initialized = true; +} + +void qemu_sem_destroy(QemuSemaphore *sem) +{ + int rc; + + assert(sem->initialized); + sem->initialized = false; +#ifndef CONFIG_SEM_TIMEDWAIT + rc = pthread_cond_destroy(&sem->cond); + if (rc < 0) { + error_exit(rc, __func__); + } + rc = pthread_mutex_destroy(&sem->lock); + if (rc < 0) { + error_exit(rc, __func__); + } +#else + rc = sem_destroy(&sem->sem); + if (rc < 0) { + error_exit(errno, __func__); + } +#endif +} + +void qemu_sem_post(QemuSemaphore *sem) +{ + int rc; + + assert(sem->initialized); +#ifndef CONFIG_SEM_TIMEDWAIT + pthread_mutex_lock(&sem->lock); + if (sem->count == UINT_MAX) { + rc = EINVAL; + } else { + sem->count++; + rc = pthread_cond_signal(&sem->cond); + } + pthread_mutex_unlock(&sem->lock); + if (rc != 0) { + error_exit(rc, __func__); + } +#else + rc = sem_post(&sem->sem); + if (rc < 0) { + error_exit(errno, __func__); + } +#endif +} + +int qemu_sem_timedwait(QemuSemaphore *sem, int ms) +{ + int rc; + struct timespec ts; + + assert(sem->initialized); +#ifndef CONFIG_SEM_TIMEDWAIT + rc = 0; + compute_abs_deadline(&ts, ms); + pthread_mutex_lock(&sem->lock); + while (sem->count == 0) { + rc = pthread_cond_timedwait(&sem->cond, &sem->lock, &ts); + if (rc == ETIMEDOUT) { + break; + } + if (rc != 0) { + error_exit(rc, __func__); + } + } + if (rc != ETIMEDOUT) { + --sem->count; + } + pthread_mutex_unlock(&sem->lock); + return (rc == ETIMEDOUT ? -1 : 0); +#else + if (ms <= 0) { + /* This is cheaper than sem_timedwait. 
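+         * A non-blocking sem_trywait() avoids computing an absolute
+         * deadline; it is retried on EINTR, and EAGAIN simply means the
+         * count is zero, which we report as a timeout.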
*/ + do { + rc = sem_trywait(&sem->sem); + } while (rc == -1 && errno == EINTR); + if (rc == -1 && errno == EAGAIN) { + return -1; + } + } else { + compute_abs_deadline(&ts, ms); + do { + rc = sem_timedwait(&sem->sem, &ts); + } while (rc == -1 && errno == EINTR); + if (rc == -1 && errno == ETIMEDOUT) { + return -1; + } + } + if (rc < 0) { + error_exit(errno, __func__); + } + return 0; +#endif +} + +void qemu_sem_wait(QemuSemaphore *sem) +{ + int rc; + + assert(sem->initialized); +#ifndef CONFIG_SEM_TIMEDWAIT + pthread_mutex_lock(&sem->lock); + while (sem->count == 0) { + rc = pthread_cond_wait(&sem->cond, &sem->lock); + if (rc != 0) { + error_exit(rc, __func__); + } + } + --sem->count; + pthread_mutex_unlock(&sem->lock); +#else + do { + rc = sem_wait(&sem->sem); + } while (rc == -1 && errno == EINTR); + if (rc < 0) { + error_exit(errno, __func__); + } +#endif +} + +#ifdef __linux__ +#include "qemu/futex.h" +#else +static inline void qemu_futex_wake(QemuEvent *ev, int n) +{ + assert(ev->initialized); + pthread_mutex_lock(&ev->lock); + if (n == 1) { + pthread_cond_signal(&ev->cond); + } else { + pthread_cond_broadcast(&ev->cond); + } + pthread_mutex_unlock(&ev->lock); +} + +static inline void qemu_futex_wait(QemuEvent *ev, unsigned val) +{ + assert(ev->initialized); + pthread_mutex_lock(&ev->lock); + if (ev->value == val) { + pthread_cond_wait(&ev->cond, &ev->lock); + } + pthread_mutex_unlock(&ev->lock); +} +#endif + +/* Valid transitions: + * - free->set, when setting the event + * - busy->set, when setting the event, followed by qemu_futex_wake + * - set->free, when resetting the event + * - free->busy, when waiting + * + * set->busy does not happen (it can be observed from the outside but + * it really is set->free->busy). + * + * busy->free provably cannot happen; to enforce it, the set->free transition + * is done with an OR, which becomes a no-op if the event has concurrently + * transitioned to free or busy. + */ + +#define EV_SET 0 +#define EV_FREE 1 +#define EV_BUSY -1 + +void qemu_event_init(QemuEvent *ev, bool init) +{ +#ifndef __linux__ + pthread_mutex_init(&ev->lock, NULL); + pthread_cond_init(&ev->cond, NULL); +#endif + + ev->value = (init ? EV_SET : EV_FREE); + ev->initialized = true; +} + +void qemu_event_destroy(QemuEvent *ev) +{ + assert(ev->initialized); + ev->initialized = false; +#ifndef __linux__ + pthread_mutex_destroy(&ev->lock); + pthread_cond_destroy(&ev->cond); +#endif +} + +void qemu_event_set(QemuEvent *ev) +{ + /* qemu_event_set has release semantics, but because it *loads* + * ev->value we need a full memory barrier here. + */ + assert(ev->initialized); + smp_mb(); + if (qatomic_read(&ev->value) != EV_SET) { + if (qatomic_xchg(&ev->value, EV_SET) == EV_BUSY) { + /* There were waiters, wake them up. */ + qemu_futex_wake(ev, INT_MAX); + } + } +} + +void qemu_event_reset(QemuEvent *ev) +{ + unsigned value; + + assert(ev->initialized); + value = qatomic_read(&ev->value); + smp_mb_acquire(); + if (value == EV_SET) { + /* + * If there was a concurrent reset (or even reset+wait), + * do nothing. Otherwise change EV_SET->EV_FREE. + */ + qatomic_or(&ev->value, EV_FREE); + } +} + +void qemu_event_wait(QemuEvent *ev) +{ + unsigned value; + + assert(ev->initialized); + value = qatomic_read(&ev->value); + smp_mb_acquire(); + if (value != EV_SET) { + if (value == EV_FREE) { + /* + * Leave the event reset and tell qemu_event_set that there + * are waiters. No need to retry, because there cannot be + * a concurrent busy->free transition. 
After the CAS, the + * event will be either set or busy. + */ + if (qatomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) { + return; + } + } + qemu_futex_wait(ev, EV_BUSY); + } +} + +static __thread NotifierList thread_exit; + +/* + * Note that in this implementation you can register a thread-exit + * notifier for the main thread, but it will never be called. + * This is OK because main thread exit can only happen when the + * entire process is exiting, and the API allows notifiers to not + * be called on process exit. + */ +void qemu_thread_atexit_add(Notifier *notifier) +{ + notifier_list_add(&thread_exit, notifier); +} + +void qemu_thread_atexit_remove(Notifier *notifier) +{ + notifier_remove(notifier); +} + +static void qemu_thread_atexit_notify(void *arg) +{ + /* + * Called when non-main thread exits (via qemu_thread_exit() + * or by returning from its start routine.) + */ + notifier_list_notify(&thread_exit, NULL); +} + +typedef struct { + void *(*start_routine)(void *); + void *arg; + char *name; +} QemuThreadArgs; + +static void *qemu_thread_start(void *args) +{ + QemuThreadArgs *qemu_thread_args = args; + void *(*start_routine)(void *) = qemu_thread_args->start_routine; + void *arg = qemu_thread_args->arg; + void *r; + + /* Attempt to set the threads name; note that this is for debug, so + * we're not going to fail if we can't set it. + */ + if (name_threads && qemu_thread_args->name) { +# if defined(CONFIG_PTHREAD_SETNAME_NP_W_TID) + pthread_setname_np(pthread_self(), qemu_thread_args->name); +# elif defined(CONFIG_PTHREAD_SETNAME_NP_WO_TID) + pthread_setname_np(qemu_thread_args->name); +# endif + } + QEMU_TSAN_ANNOTATE_THREAD_NAME(qemu_thread_args->name); + g_free(qemu_thread_args->name); + g_free(qemu_thread_args); + + /* + * GCC 11 with glibc 2.17 on PowerPC reports + * + * qemu-thread-posix.c:540:5: error: ‘__sigsetjmp’ accessing 656 bytes + * in a region of size 528 [-Werror=stringop-overflow=] + * 540 | pthread_cleanup_push(qemu_thread_atexit_notify, NULL); + * | ^~~~~~~~~~~~~~~~~~~~ + * + * which is clearly nonsense. + */ +#pragma GCC diagnostic push +#ifndef __clang__ +#pragma GCC diagnostic ignored "-Wstringop-overflow" +#endif + + pthread_cleanup_push(qemu_thread_atexit_notify, NULL); + r = start_routine(arg); + pthread_cleanup_pop(1); + +#pragma GCC diagnostic pop + + return r; +} + +void qemu_thread_create(QemuThread *thread, const char *name, + void *(*start_routine)(void*), + void *arg, int mode) +{ + sigset_t set, oldset; + int err; + pthread_attr_t attr; + QemuThreadArgs *qemu_thread_args; + + err = pthread_attr_init(&attr); + if (err) { + error_exit(err, __func__); + } + + if (mode == QEMU_THREAD_DETACHED) { + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + } + + /* Leave signal handling to the iothread. */ + sigfillset(&set); + /* Blocking the signals can result in undefined behaviour. 
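+     * SIGSEGV, SIGFPE and SIGILL are raised synchronously by the faulting
+     * thread itself, and POSIX leaves the behaviour undefined when such a
+     * synchronously generated signal is blocked, so keep them deliverable.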
*/ + sigdelset(&set, SIGSEGV); + sigdelset(&set, SIGFPE); + sigdelset(&set, SIGILL); + /* TODO avoid SIGBUS loss on macOS */ + pthread_sigmask(SIG_SETMASK, &set, &oldset); + + qemu_thread_args = g_new0(QemuThreadArgs, 1); + qemu_thread_args->name = g_strdup(name); + qemu_thread_args->start_routine = start_routine; + qemu_thread_args->arg = arg; + + err = pthread_create(&thread->thread, &attr, + qemu_thread_start, qemu_thread_args); + + if (err) + error_exit(err, __func__); + + pthread_sigmask(SIG_SETMASK, &oldset, NULL); + + pthread_attr_destroy(&attr); +} + +void qemu_thread_get_self(QemuThread *thread) +{ + thread->thread = pthread_self(); +} + +bool qemu_thread_is_self(QemuThread *thread) +{ + return pthread_equal(pthread_self(), thread->thread); +} + +void qemu_thread_exit(void *retval) +{ + pthread_exit(retval); +} + +void *qemu_thread_join(QemuThread *thread) +{ + int err; + void *ret; + + err = pthread_join(thread->thread, &ret); + if (err) { + error_exit(err, __func__); + } + return ret; +} diff --git a/util/qemu-thread-win32.c b/util/qemu-thread-win32.c new file mode 100644 index 000000000..52eb19f35 --- /dev/null +++ b/util/qemu-thread-win32.c @@ -0,0 +1,461 @@ +/* + * Win32 implementation for mutex/cond/thread functions + * + * Copyright Red Hat, Inc. 2010 + * + * Author: + * Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "qemu/thread.h" +#include "qemu/notify.h" +#include "qemu-thread-common.h" +#include <process.h> + +static bool name_threads; + +void qemu_thread_naming(bool enable) +{ + /* But note we don't actually name them on Windows yet */ + name_threads = enable; + + fprintf(stderr, "qemu: thread naming not supported on this host\n"); +} + +static void error_exit(int err, const char *msg) +{ + char *pstr; + + FormatMessage(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_ALLOCATE_BUFFER, + NULL, err, 0, (LPTSTR)&pstr, 2, NULL); + fprintf(stderr, "qemu: %s: %s\n", msg, pstr); + LocalFree(pstr); + abort(); +} + +void qemu_mutex_init(QemuMutex *mutex) +{ + InitializeSRWLock(&mutex->lock); + qemu_mutex_post_init(mutex); +} + +void qemu_mutex_destroy(QemuMutex *mutex) +{ + assert(mutex->initialized); + mutex->initialized = false; + InitializeSRWLock(&mutex->lock); +} + +void qemu_mutex_lock_impl(QemuMutex *mutex, const char *file, const int line) +{ + assert(mutex->initialized); + qemu_mutex_pre_lock(mutex, file, line); + AcquireSRWLockExclusive(&mutex->lock); + qemu_mutex_post_lock(mutex, file, line); +} + +int qemu_mutex_trylock_impl(QemuMutex *mutex, const char *file, const int line) +{ + int owned; + + assert(mutex->initialized); + owned = TryAcquireSRWLockExclusive(&mutex->lock); + if (owned) { + qemu_mutex_post_lock(mutex, file, line); + return 0; + } + return -EBUSY; +} + +void qemu_mutex_unlock_impl(QemuMutex *mutex, const char *file, const int line) +{ + assert(mutex->initialized); + qemu_mutex_pre_unlock(mutex, file, line); + ReleaseSRWLockExclusive(&mutex->lock); +} + +void qemu_rec_mutex_init(QemuRecMutex *mutex) +{ + InitializeCriticalSection(&mutex->lock); + mutex->initialized = true; +} + +void qemu_rec_mutex_destroy(QemuRecMutex *mutex) +{ + assert(mutex->initialized); + mutex->initialized = false; + DeleteCriticalSection(&mutex->lock); +} + +void qemu_rec_mutex_lock_impl(QemuRecMutex *mutex, const char *file, int line) +{ + assert(mutex->initialized); + 
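+    /* The file/line arguments exist for signature parity with the POSIX
+     * implementation, which forwards to the traced QemuMutex path; this
+     * Win32 variant does not record them. */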
EnterCriticalSection(&mutex->lock); +} + +int qemu_rec_mutex_trylock_impl(QemuRecMutex *mutex, const char *file, int line) +{ + assert(mutex->initialized); + return !TryEnterCriticalSection(&mutex->lock); +} + +void qemu_rec_mutex_unlock_impl(QemuRecMutex *mutex, const char *file, int line) +{ + assert(mutex->initialized); + LeaveCriticalSection(&mutex->lock); +} + +void qemu_cond_init(QemuCond *cond) +{ + memset(cond, 0, sizeof(*cond)); + InitializeConditionVariable(&cond->var); + cond->initialized = true; +} + +void qemu_cond_destroy(QemuCond *cond) +{ + assert(cond->initialized); + cond->initialized = false; + InitializeConditionVariable(&cond->var); +} + +void qemu_cond_signal(QemuCond *cond) +{ + assert(cond->initialized); + WakeConditionVariable(&cond->var); +} + +void qemu_cond_broadcast(QemuCond *cond) +{ + assert(cond->initialized); + WakeAllConditionVariable(&cond->var); +} + +void qemu_cond_wait_impl(QemuCond *cond, QemuMutex *mutex, const char *file, const int line) +{ + assert(cond->initialized); + qemu_mutex_pre_unlock(mutex, file, line); + SleepConditionVariableSRW(&cond->var, &mutex->lock, INFINITE, 0); + qemu_mutex_post_lock(mutex, file, line); +} + +bool qemu_cond_timedwait_impl(QemuCond *cond, QemuMutex *mutex, int ms, + const char *file, const int line) +{ + int rc = 0; + + assert(cond->initialized); + trace_qemu_mutex_unlock(mutex, file, line); + if (!SleepConditionVariableSRW(&cond->var, &mutex->lock, ms, 0)) { + rc = GetLastError(); + } + trace_qemu_mutex_locked(mutex, file, line); + if (rc && rc != ERROR_TIMEOUT) { + error_exit(rc, __func__); + } + return rc != ERROR_TIMEOUT; +} + +void qemu_sem_init(QemuSemaphore *sem, int init) +{ + /* Manual reset. */ + sem->sema = CreateSemaphore(NULL, init, LONG_MAX, NULL); + sem->initialized = true; +} + +void qemu_sem_destroy(QemuSemaphore *sem) +{ + assert(sem->initialized); + sem->initialized = false; + CloseHandle(sem->sema); +} + +void qemu_sem_post(QemuSemaphore *sem) +{ + assert(sem->initialized); + ReleaseSemaphore(sem->sema, 1, NULL); +} + +int qemu_sem_timedwait(QemuSemaphore *sem, int ms) +{ + int rc; + + assert(sem->initialized); + rc = WaitForSingleObject(sem->sema, ms); + if (rc == WAIT_OBJECT_0) { + return 0; + } + if (rc != WAIT_TIMEOUT) { + error_exit(GetLastError(), __func__); + } + return -1; +} + +void qemu_sem_wait(QemuSemaphore *sem) +{ + assert(sem->initialized); + if (WaitForSingleObject(sem->sema, INFINITE) != WAIT_OBJECT_0) { + error_exit(GetLastError(), __func__); + } +} + +/* Wrap a Win32 manual-reset event with a fast userspace path. The idea + * is to reset the Win32 event lazily, as part of a test-reset-test-wait + * sequence. Such a sequence is, indeed, how QemuEvents are used by + * RCU and other subsystems! + * + * Valid transitions: + * - free->set, when setting the event + * - busy->set, when setting the event, followed by SetEvent + * - set->free, when resetting the event + * - free->busy, when waiting + * + * set->busy does not happen (it can be observed from the outside but + * it really is set->free->busy). + * + * busy->free provably cannot happen; to enforce it, the set->free transition + * is done with an OR, which becomes a no-op if the event has concurrently + * transitioned to free or busy (and is faster than cmpxchg). + */ + +#define EV_SET 0 +#define EV_FREE 1 +#define EV_BUSY -1 + +void qemu_event_init(QemuEvent *ev, bool init) +{ + /* Manual reset. */ + ev->event = CreateEvent(NULL, TRUE, TRUE, NULL); + ev->value = (init ? 
EV_SET : EV_FREE); + ev->initialized = true; +} + +void qemu_event_destroy(QemuEvent *ev) +{ + assert(ev->initialized); + ev->initialized = false; + CloseHandle(ev->event); +} + +void qemu_event_set(QemuEvent *ev) +{ + assert(ev->initialized); + /* qemu_event_set has release semantics, but because it *loads* + * ev->value we need a full memory barrier here. + */ + smp_mb(); + if (qatomic_read(&ev->value) != EV_SET) { + if (qatomic_xchg(&ev->value, EV_SET) == EV_BUSY) { + /* There were waiters, wake them up. */ + SetEvent(ev->event); + } + } +} + +void qemu_event_reset(QemuEvent *ev) +{ + unsigned value; + + assert(ev->initialized); + value = qatomic_read(&ev->value); + smp_mb_acquire(); + if (value == EV_SET) { + /* If there was a concurrent reset (or even reset+wait), + * do nothing. Otherwise change EV_SET->EV_FREE. + */ + qatomic_or(&ev->value, EV_FREE); + } +} + +void qemu_event_wait(QemuEvent *ev) +{ + unsigned value; + + assert(ev->initialized); + value = qatomic_read(&ev->value); + smp_mb_acquire(); + if (value != EV_SET) { + if (value == EV_FREE) { + /* qemu_event_set is not yet going to call SetEvent, but we are + * going to do another check for EV_SET below when setting EV_BUSY. + * At that point it is safe to call WaitForSingleObject. + */ + ResetEvent(ev->event); + + /* Tell qemu_event_set that there are waiters. No need to retry + * because there cannot be a concurrent busy->free transition. + * After the CAS, the event will be either set or busy. + */ + if (qatomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) { + value = EV_SET; + } else { + value = EV_BUSY; + } + } + if (value == EV_BUSY) { + WaitForSingleObject(ev->event, INFINITE); + } + } +} + +struct QemuThreadData { + /* Passed to win32_start_routine. */ + void *(*start_routine)(void *); + void *arg; + short mode; + NotifierList exit; + + /* Only used for joinable threads. */ + bool exited; + void *ret; + CRITICAL_SECTION cs; +}; + +static bool atexit_registered; +static NotifierList main_thread_exit; + +static __thread QemuThreadData *qemu_thread_data; + +static void run_main_thread_exit(void) +{ + notifier_list_notify(&main_thread_exit, NULL); +} + +void qemu_thread_atexit_add(Notifier *notifier) +{ + if (!qemu_thread_data) { + if (!atexit_registered) { + atexit_registered = true; + atexit(run_main_thread_exit); + } + notifier_list_add(&main_thread_exit, notifier); + } else { + notifier_list_add(&qemu_thread_data->exit, notifier); + } +} + +void qemu_thread_atexit_remove(Notifier *notifier) +{ + notifier_remove(notifier); +} + +static unsigned __stdcall win32_start_routine(void *arg) +{ + QemuThreadData *data = (QemuThreadData *) arg; + void *(*start_routine)(void *) = data->start_routine; + void *thread_arg = data->arg; + + qemu_thread_data = data; + qemu_thread_exit(start_routine(thread_arg)); + abort(); +} + +void qemu_thread_exit(void *arg) +{ + QemuThreadData *data = qemu_thread_data; + + notifier_list_notify(&data->exit, NULL); + if (data->mode == QEMU_THREAD_JOINABLE) { + data->ret = arg; + EnterCriticalSection(&data->cs); + data->exited = true; + LeaveCriticalSection(&data->cs); + } else { + g_free(data); + } + _endthreadex(0); +} + +void *qemu_thread_join(QemuThread *thread) +{ + QemuThreadData *data; + void *ret; + HANDLE handle; + + data = thread->data; + if (data->mode == QEMU_THREAD_DETACHED) { + return NULL; + } + + /* + * Because multiple copies of the QemuThread can exist via + * qemu_thread_get_self, we need to store a value that cannot + * leak there. 
The simplest, non racy way is to store the TID, + * discard the handle that _beginthreadex gives back, and + * get another copy of the handle here. + */ + handle = qemu_thread_get_handle(thread); + if (handle) { + WaitForSingleObject(handle, INFINITE); + CloseHandle(handle); + } + ret = data->ret; + DeleteCriticalSection(&data->cs); + g_free(data); + return ret; +} + +void qemu_thread_create(QemuThread *thread, const char *name, + void *(*start_routine)(void *), + void *arg, int mode) +{ + HANDLE hThread; + struct QemuThreadData *data; + + data = g_malloc(sizeof *data); + data->start_routine = start_routine; + data->arg = arg; + data->mode = mode; + data->exited = false; + notifier_list_init(&data->exit); + + if (data->mode != QEMU_THREAD_DETACHED) { + InitializeCriticalSection(&data->cs); + } + + hThread = (HANDLE) _beginthreadex(NULL, 0, win32_start_routine, + data, 0, &thread->tid); + if (!hThread) { + error_exit(GetLastError(), __func__); + } + CloseHandle(hThread); + thread->data = data; +} + +void qemu_thread_get_self(QemuThread *thread) +{ + thread->data = qemu_thread_data; + thread->tid = GetCurrentThreadId(); +} + +HANDLE qemu_thread_get_handle(QemuThread *thread) +{ + QemuThreadData *data; + HANDLE handle; + + data = thread->data; + if (data->mode == QEMU_THREAD_DETACHED) { + return NULL; + } + + EnterCriticalSection(&data->cs); + if (!data->exited) { + handle = OpenThread(SYNCHRONIZE | THREAD_SUSPEND_RESUME | + THREAD_SET_CONTEXT, FALSE, thread->tid); + } else { + handle = NULL; + } + LeaveCriticalSection(&data->cs); + return handle; +} + +bool qemu_thread_is_self(QemuThread *thread) +{ + return GetCurrentThreadId() == thread->tid; +} diff --git a/util/qemu-timer-common.c b/util/qemu-timer-common.c new file mode 100644 index 000000000..cc1326f72 --- /dev/null +++ b/util/qemu-timer-common.c @@ -0,0 +1,63 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#include "qemu/osdep.h" +#include "qemu/timer.h" + +/***********************************************************/ +/* real time host monotonic timer */ + +int64_t clock_start; + +#ifdef _WIN32 + +int64_t clock_freq; + +static void __attribute__((constructor)) init_get_clock(void) +{ + LARGE_INTEGER freq; + int ret; + ret = QueryPerformanceFrequency(&freq); + if (ret == 0) { + fprintf(stderr, "Could not calibrate ticks\n"); + exit(1); + } + clock_freq = freq.QuadPart; + clock_start = get_clock(); +} + +#else + +int use_rt_clock; + +static void __attribute__((constructor)) init_get_clock(void) +{ + struct timespec ts; + + use_rt_clock = 0; + if (clock_gettime(CLOCK_MONOTONIC, &ts) == 0) { + use_rt_clock = 1; + } + clock_start = get_clock(); +} +#endif diff --git a/util/qemu-timer.c b/util/qemu-timer.c new file mode 100644 index 000000000..f36c75e59 --- /dev/null +++ b/util/qemu-timer.c @@ -0,0 +1,674 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu/osdep.h" +#include "qemu/main-loop.h" +#include "qemu/timer.h" +#include "qemu/lockable.h" +#include "sysemu/cpu-timers.h" +#include "sysemu/replay.h" +#include "sysemu/cpus.h" + +#ifdef CONFIG_POSIX +#include <pthread.h> +#endif + +#ifdef CONFIG_PPOLL +#include <poll.h> +#endif + +#ifdef CONFIG_PRCTL_PR_SET_TIMERSLACK +#include <sys/prctl.h> +#endif + +/***********************************************************/ +/* timers */ + +typedef struct QEMUClock { + /* We rely on BQL to protect the timerlists */ + QLIST_HEAD(, QEMUTimerList) timerlists; + + QEMUClockType type; + bool enabled; +} QEMUClock; + +QEMUTimerListGroup main_loop_tlg; +static QEMUClock qemu_clocks[QEMU_CLOCK_MAX]; + +/* A QEMUTimerList is a list of timers attached to a clock. More + * than one QEMUTimerList can be attached to each clock, for instance + * used by different AioContexts / threads. Each clock also has + * a list of the QEMUTimerLists associated with it, in order that + * reenabling the clock can call all the notifiers. + */ + +struct QEMUTimerList { + QEMUClock *clock; + QemuMutex active_timers_lock; + QEMUTimer *active_timers; + QLIST_ENTRY(QEMUTimerList) list; + QEMUTimerListNotifyCB *notify_cb; + void *notify_opaque; + + /* lightweight method to mark the end of timerlist's running */ + QemuEvent timers_done_ev; +}; + +/** + * qemu_clock_ptr: + * @type: type of clock + * + * Translate a clock type into a pointer to QEMUClock object. 
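+ * For example, qemu_clock_ptr(QEMU_CLOCK_REALTIME) simply yields
+ * &qemu_clocks[QEMU_CLOCK_REALTIME].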
+ * + * Returns: a pointer to the QEMUClock object + */ +static inline QEMUClock *qemu_clock_ptr(QEMUClockType type) +{ + return &qemu_clocks[type]; +} + +static bool timer_expired_ns(QEMUTimer *timer_head, int64_t current_time) +{ + return timer_head && (timer_head->expire_time <= current_time); +} + +QEMUTimerList *timerlist_new(QEMUClockType type, + QEMUTimerListNotifyCB *cb, + void *opaque) +{ + QEMUTimerList *timer_list; + QEMUClock *clock = qemu_clock_ptr(type); + + timer_list = g_malloc0(sizeof(QEMUTimerList)); + qemu_event_init(&timer_list->timers_done_ev, true); + timer_list->clock = clock; + timer_list->notify_cb = cb; + timer_list->notify_opaque = opaque; + qemu_mutex_init(&timer_list->active_timers_lock); + QLIST_INSERT_HEAD(&clock->timerlists, timer_list, list); + return timer_list; +} + +void timerlist_free(QEMUTimerList *timer_list) +{ + assert(!timerlist_has_timers(timer_list)); + if (timer_list->clock) { + QLIST_REMOVE(timer_list, list); + } + qemu_mutex_destroy(&timer_list->active_timers_lock); + g_free(timer_list); +} + +static void qemu_clock_init(QEMUClockType type, QEMUTimerListNotifyCB *notify_cb) +{ + QEMUClock *clock = qemu_clock_ptr(type); + + /* Assert that the clock of type TYPE has not been initialized yet. */ + assert(main_loop_tlg.tl[type] == NULL); + + clock->type = type; + clock->enabled = (type == QEMU_CLOCK_VIRTUAL ? false : true); + QLIST_INIT(&clock->timerlists); + main_loop_tlg.tl[type] = timerlist_new(type, notify_cb, NULL); +} + +bool qemu_clock_use_for_deadline(QEMUClockType type) +{ + return !(icount_enabled() && (type == QEMU_CLOCK_VIRTUAL)); +} + +void qemu_clock_notify(QEMUClockType type) +{ + QEMUTimerList *timer_list; + QEMUClock *clock = qemu_clock_ptr(type); + QLIST_FOREACH(timer_list, &clock->timerlists, list) { + timerlist_notify(timer_list); + } +} + +/* Disabling the clock will wait for related timerlists to stop + * executing qemu_run_timers. Thus, this functions should not + * be used from the callback of a timer that is based on @clock. + * Doing so would cause a deadlock. + * + * Caller should hold BQL. + */ +void qemu_clock_enable(QEMUClockType type, bool enabled) +{ + QEMUClock *clock = qemu_clock_ptr(type); + QEMUTimerList *tl; + bool old = clock->enabled; + clock->enabled = enabled; + if (enabled && !old) { + qemu_clock_notify(type); + } else if (!enabled && old) { + QLIST_FOREACH(tl, &clock->timerlists, list) { + qemu_event_wait(&tl->timers_done_ev); + } + } +} + +bool timerlist_has_timers(QEMUTimerList *timer_list) +{ + return !!qatomic_read(&timer_list->active_timers); +} + +bool qemu_clock_has_timers(QEMUClockType type) +{ + return timerlist_has_timers( + main_loop_tlg.tl[type]); +} + +bool timerlist_expired(QEMUTimerList *timer_list) +{ + int64_t expire_time; + + if (!qatomic_read(&timer_list->active_timers)) { + return false; + } + + WITH_QEMU_LOCK_GUARD(&timer_list->active_timers_lock) { + if (!timer_list->active_timers) { + return false; + } + expire_time = timer_list->active_timers->expire_time; + } + + return expire_time <= qemu_clock_get_ns(timer_list->clock->type); +} + +bool qemu_clock_expired(QEMUClockType type) +{ + return timerlist_expired( + main_loop_tlg.tl[type]); +} + +/* + * As above, but return -1 for no deadline, and do not cap to 2^32 + * as we know the result is always positive. 
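+ *
+ * The value returned is the delta, in nanoseconds, until the earliest
+ * pending timer on the list, or 0 if that timer has already expired.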
+ */ + +int64_t timerlist_deadline_ns(QEMUTimerList *timer_list) +{ + int64_t delta; + int64_t expire_time; + + if (!qatomic_read(&timer_list->active_timers)) { + return -1; + } + + if (!timer_list->clock->enabled) { + return -1; + } + + /* The active timers list may be modified before the caller uses our return + * value but ->notify_cb() is called when the deadline changes. Therefore + * the caller should notice the change and there is no race condition. + */ + WITH_QEMU_LOCK_GUARD(&timer_list->active_timers_lock) { + if (!timer_list->active_timers) { + return -1; + } + expire_time = timer_list->active_timers->expire_time; + } + + delta = expire_time - qemu_clock_get_ns(timer_list->clock->type); + + if (delta <= 0) { + return 0; + } + + return delta; +} + +/* Calculate the soonest deadline across all timerlists attached + * to the clock. This is used for the icount timeout so we + * ignore whether or not the clock should be used in deadline + * calculations. + */ +int64_t qemu_clock_deadline_ns_all(QEMUClockType type, int attr_mask) +{ + int64_t deadline = -1; + int64_t delta; + int64_t expire_time; + QEMUTimer *ts; + QEMUTimerList *timer_list; + QEMUClock *clock = qemu_clock_ptr(type); + + if (!clock->enabled) { + return -1; + } + + QLIST_FOREACH(timer_list, &clock->timerlists, list) { + qemu_mutex_lock(&timer_list->active_timers_lock); + ts = timer_list->active_timers; + /* Skip all external timers */ + while (ts && (ts->attributes & ~attr_mask)) { + ts = ts->next; + } + if (!ts) { + qemu_mutex_unlock(&timer_list->active_timers_lock); + continue; + } + expire_time = ts->expire_time; + qemu_mutex_unlock(&timer_list->active_timers_lock); + + delta = expire_time - qemu_clock_get_ns(type); + if (delta <= 0) { + delta = 0; + } + deadline = qemu_soonest_timeout(deadline, delta); + } + return deadline; +} + +QEMUClockType timerlist_get_clock(QEMUTimerList *timer_list) +{ + return timer_list->clock->type; +} + +QEMUTimerList *qemu_clock_get_main_loop_timerlist(QEMUClockType type) +{ + return main_loop_tlg.tl[type]; +} + +void timerlist_notify(QEMUTimerList *timer_list) +{ + if (timer_list->notify_cb) { + timer_list->notify_cb(timer_list->notify_opaque, timer_list->clock->type); + } else { + qemu_notify_event(); + } +} + +/* Transition function to convert a nanosecond timeout to ms + * This is used where a system does not support ppoll + */ +int qemu_timeout_ns_to_ms(int64_t ns) +{ + int64_t ms; + if (ns < 0) { + return -1; + } + + if (!ns) { + return 0; + } + + /* Always round up, because it's better to wait too long than to wait too + * little and effectively busy-wait + */ + ms = DIV_ROUND_UP(ns, SCALE_MS); + + /* To avoid overflow problems, limit this to 2^31, i.e. approx 25 days */ + return MIN(ms, INT32_MAX); +} + + +/* qemu implementation of g_poll which uses a nanosecond timeout but is + * otherwise identical to g_poll + */ +int qemu_poll_ns(GPollFD *fds, guint nfds, int64_t timeout) +{ +#ifdef CONFIG_PPOLL + if (timeout < 0) { + return ppoll((struct pollfd *)fds, nfds, NULL, NULL); + } else { + struct timespec ts; + int64_t tvsec = timeout / 1000000000LL; + /* Avoid possibly overflowing and specifying a negative number of + * seconds, which would turn a very long timeout into a busy-wait. 
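+         * For example, a timeout of INT64_MAX ns gives a tvsec of roughly
+         * 9.2e9, which does not fit in a 32-bit tv_sec; clamping it to
+         * INT32_MAX seconds (about 68 years) is still effectively infinite.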
+ */ + if (tvsec > (int64_t)INT32_MAX) { + tvsec = INT32_MAX; + } + ts.tv_sec = tvsec; + ts.tv_nsec = timeout % 1000000000LL; + return ppoll((struct pollfd *)fds, nfds, &ts, NULL); + } +#else + return g_poll(fds, nfds, qemu_timeout_ns_to_ms(timeout)); +#endif +} + + +void timer_init_full(QEMUTimer *ts, + QEMUTimerListGroup *timer_list_group, QEMUClockType type, + int scale, int attributes, + QEMUTimerCB *cb, void *opaque) +{ + if (!timer_list_group) { + timer_list_group = &main_loop_tlg; + } + ts->timer_list = timer_list_group->tl[type]; + ts->cb = cb; + ts->opaque = opaque; + ts->scale = scale; + ts->attributes = attributes; + ts->expire_time = -1; +} + +void timer_deinit(QEMUTimer *ts) +{ + assert(ts->expire_time == -1); + ts->timer_list = NULL; +} + +static void timer_del_locked(QEMUTimerList *timer_list, QEMUTimer *ts) +{ + QEMUTimer **pt, *t; + + ts->expire_time = -1; + pt = &timer_list->active_timers; + for(;;) { + t = *pt; + if (!t) + break; + if (t == ts) { + qatomic_set(pt, t->next); + break; + } + pt = &t->next; + } +} + +static bool timer_mod_ns_locked(QEMUTimerList *timer_list, + QEMUTimer *ts, int64_t expire_time) +{ + QEMUTimer **pt, *t; + + /* add the timer in the sorted list */ + pt = &timer_list->active_timers; + for (;;) { + t = *pt; + if (!timer_expired_ns(t, expire_time)) { + break; + } + pt = &t->next; + } + ts->expire_time = MAX(expire_time, 0); + ts->next = *pt; + qatomic_set(pt, ts); + + return pt == &timer_list->active_timers; +} + +static void timerlist_rearm(QEMUTimerList *timer_list) +{ + /* Interrupt execution to force deadline recalculation. */ + if (icount_enabled() && timer_list->clock->type == QEMU_CLOCK_VIRTUAL) { + icount_start_warp_timer(); + } + timerlist_notify(timer_list); +} + +/* stop a timer, but do not dealloc it */ +void timer_del(QEMUTimer *ts) +{ + QEMUTimerList *timer_list = ts->timer_list; + + if (timer_list) { + qemu_mutex_lock(&timer_list->active_timers_lock); + timer_del_locked(timer_list, ts); + qemu_mutex_unlock(&timer_list->active_timers_lock); + } +} + +/* modify the current timer so that it will be fired when current_time + >= expire_time. The corresponding callback will be called. */ +void timer_mod_ns(QEMUTimer *ts, int64_t expire_time) +{ + QEMUTimerList *timer_list = ts->timer_list; + bool rearm; + + qemu_mutex_lock(&timer_list->active_timers_lock); + timer_del_locked(timer_list, ts); + rearm = timer_mod_ns_locked(timer_list, ts, expire_time); + qemu_mutex_unlock(&timer_list->active_timers_lock); + + if (rearm) { + timerlist_rearm(timer_list); + } +} + +/* modify the current timer so that it will be fired when current_time + >= expire_time or the current deadline, whichever comes earlier. + The corresponding callback will be called. 
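+   Note that the expiry time can therefore only move earlier: an
+   expire_time later than the one already pending is ignored.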
 */
+void timer_mod_anticipate_ns(QEMUTimer *ts, int64_t expire_time)
+{
+    QEMUTimerList *timer_list = ts->timer_list;
+    bool rearm;
+
+    WITH_QEMU_LOCK_GUARD(&timer_list->active_timers_lock) {
+        if (ts->expire_time == -1 || ts->expire_time > expire_time) {
+            if (ts->expire_time != -1) {
+                timer_del_locked(timer_list, ts);
+            }
+            rearm = timer_mod_ns_locked(timer_list, ts, expire_time);
+        } else {
+            rearm = false;
+        }
+    }
+    if (rearm) {
+        timerlist_rearm(timer_list);
+    }
+}
+
+void timer_mod(QEMUTimer *ts, int64_t expire_time)
+{
+    timer_mod_ns(ts, expire_time * ts->scale);
+}
+
+void timer_mod_anticipate(QEMUTimer *ts, int64_t expire_time)
+{
+    timer_mod_anticipate_ns(ts, expire_time * ts->scale);
+}
+
+bool timer_pending(QEMUTimer *ts)
+{
+    return ts->expire_time >= 0;
+}
+
+bool timer_expired(QEMUTimer *timer_head, int64_t current_time)
+{
+    return timer_expired_ns(timer_head, current_time * timer_head->scale);
+}
+
+bool timerlist_run_timers(QEMUTimerList *timer_list)
+{
+    QEMUTimer *ts;
+    int64_t current_time;
+    bool progress = false;
+    QEMUTimerCB *cb;
+    void *opaque;
+
+    if (!qatomic_read(&timer_list->active_timers)) {
+        return false;
+    }
+
+    qemu_event_reset(&timer_list->timers_done_ev);
+    if (!timer_list->clock->enabled) {
+        goto out;
+    }
+
+    switch (timer_list->clock->type) {
+    case QEMU_CLOCK_REALTIME:
+        break;
+    default:
+    case QEMU_CLOCK_VIRTUAL:
+        break;
+    case QEMU_CLOCK_HOST:
+        if (!replay_checkpoint(CHECKPOINT_CLOCK_HOST)) {
+            goto out;
+        }
+        break;
+    case QEMU_CLOCK_VIRTUAL_RT:
+        if (!replay_checkpoint(CHECKPOINT_CLOCK_VIRTUAL_RT)) {
+            goto out;
+        }
+        break;
+    }
+
+    /*
+     * Extract expired timers from the active timers list and process them.
+     *
+     * In rr mode we need "filtered" checkpointing for the virtual clock. The
+     * checkpoint must be recorded/replayed before processing any non-EXTERNAL
+     * timer, and that must only be done once since the clock value stays the
+     * same. Because non-EXTERNAL timers may appear in the timers list while it
+     * is being processed, the checkpoint can be issued at any time until no
+     * timers are left and we are done.
+     */
+    current_time = qemu_clock_get_ns(timer_list->clock->type);
+    qemu_mutex_lock(&timer_list->active_timers_lock);
+    while ((ts = timer_list->active_timers)) {
+        if (!timer_expired_ns(ts, current_time)) {
+            /* No expired timers left. The checkpoint can be skipped
+             * if no timers fired or they were all external.
+             */
+            break;
+        }
+        /* A checkpoint for the virtual clock is redundant in cases where
+         * it is being triggered with only non-EXTERNAL timers, because
+         * these timers don't change guest state directly.
+ */ + if (replay_mode != REPLAY_MODE_NONE + && timer_list->clock->type == QEMU_CLOCK_VIRTUAL + && !(ts->attributes & QEMU_TIMER_ATTR_EXTERNAL) + && !replay_checkpoint(CHECKPOINT_CLOCK_VIRTUAL)) { + qemu_mutex_unlock(&timer_list->active_timers_lock); + goto out; + } + + /* remove timer from the list before calling the callback */ + timer_list->active_timers = ts->next; + ts->next = NULL; + ts->expire_time = -1; + cb = ts->cb; + opaque = ts->opaque; + + /* run the callback (the timer list can be modified) */ + qemu_mutex_unlock(&timer_list->active_timers_lock); + cb(opaque); + qemu_mutex_lock(&timer_list->active_timers_lock); + + progress = true; + } + qemu_mutex_unlock(&timer_list->active_timers_lock); + +out: + qemu_event_set(&timer_list->timers_done_ev); + return progress; +} + +bool qemu_clock_run_timers(QEMUClockType type) +{ + return timerlist_run_timers(main_loop_tlg.tl[type]); +} + +void timerlistgroup_init(QEMUTimerListGroup *tlg, + QEMUTimerListNotifyCB *cb, void *opaque) +{ + QEMUClockType type; + for (type = 0; type < QEMU_CLOCK_MAX; type++) { + tlg->tl[type] = timerlist_new(type, cb, opaque); + } +} + +void timerlistgroup_deinit(QEMUTimerListGroup *tlg) +{ + QEMUClockType type; + for (type = 0; type < QEMU_CLOCK_MAX; type++) { + timerlist_free(tlg->tl[type]); + } +} + +bool timerlistgroup_run_timers(QEMUTimerListGroup *tlg) +{ + QEMUClockType type; + bool progress = false; + for (type = 0; type < QEMU_CLOCK_MAX; type++) { + progress |= timerlist_run_timers(tlg->tl[type]); + } + return progress; +} + +int64_t timerlistgroup_deadline_ns(QEMUTimerListGroup *tlg) +{ + int64_t deadline = -1; + QEMUClockType type; + for (type = 0; type < QEMU_CLOCK_MAX; type++) { + if (qemu_clock_use_for_deadline(type)) { + deadline = qemu_soonest_timeout(deadline, + timerlist_deadline_ns(tlg->tl[type])); + } + } + return deadline; +} + +int64_t qemu_clock_get_ns(QEMUClockType type) +{ + switch (type) { + case QEMU_CLOCK_REALTIME: + return get_clock(); + default: + case QEMU_CLOCK_VIRTUAL: + return cpus_get_virtual_clock(); + case QEMU_CLOCK_HOST: + return REPLAY_CLOCK(REPLAY_CLOCK_HOST, get_clock_realtime()); + case QEMU_CLOCK_VIRTUAL_RT: + return REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT, cpu_get_clock()); + } +} + +void init_clocks(QEMUTimerListNotifyCB *notify_cb) +{ + QEMUClockType type; + for (type = 0; type < QEMU_CLOCK_MAX; type++) { + qemu_clock_init(type, notify_cb); + } + +#ifdef CONFIG_PRCTL_PR_SET_TIMERSLACK + prctl(PR_SET_TIMERSLACK, 1, 0, 0, 0); +#endif +} + +uint64_t timer_expire_time_ns(QEMUTimer *ts) +{ + return timer_pending(ts) ? ts->expire_time : -1; +} + +bool qemu_clock_run_all_timers(void) +{ + bool progress = false; + QEMUClockType type; + + for (type = 0; type < QEMU_CLOCK_MAX; type++) { + if (qemu_clock_use_for_deadline(type)) { + progress |= qemu_clock_run_timers(type); + } + } + + return progress; +} diff --git a/util/qht.c b/util/qht.c new file mode 100644 index 000000000..079605121 --- /dev/null +++ b/util/qht.c @@ -0,0 +1,963 @@ +/* + * qht.c - QEMU Hash Table, designed to scale for read-mostly workloads. + * + * Copyright (C) 2016, Emilio G. Cota <cota@braap.org> + * + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * Assumptions: + * - NULL cannot be inserted/removed as a pointer value. + * - Trying to insert an already-existing hash-pointer pair is OK. However, + * it is not OK to insert into the same hash table different hash-pointer + * pairs that have the same pointer value, but not the hashes. 
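+ *   (Illustrative only: after a successful qht_insert(ht, p, h, NULL),
+ *   inserting p again with a hash other than h is a bug.)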
+ * - Lookups are performed under an RCU read-critical section; removals + * must wait for a grace period to elapse before freeing removed objects. + * + * Features: + * - Reads (i.e. lookups and iterators) can be concurrent with other reads. + * Lookups that are concurrent with writes to the same bucket will retry + * via a seqlock; iterators acquire all bucket locks and therefore can be + * concurrent with lookups and are serialized wrt writers. + * - Writes (i.e. insertions/removals) can be concurrent with writes to + * different buckets; writes to the same bucket are serialized through a lock. + * - Optional auto-resizing: the hash table resizes up if the load surpasses + * a certain threshold. Resizing is done concurrently with readers; writes + * are serialized with the resize operation. + * + * The key structure is the bucket, which is cacheline-sized. Buckets + * contain a few hash values and pointers; the u32 hash values are stored in + * full so that resizing is fast. Having this structure instead of directly + * chaining items has two advantages: + * - Failed lookups fail fast, and touch a minimum number of cache lines. + * - Resizing the hash table with concurrent lookups is easy. + * + * There are two types of buckets: + * 1. "head" buckets are the ones allocated in the array of buckets in qht_map. + * 2. all "non-head" buckets (i.e. all others) are members of a chain that + * starts from a head bucket. + * Note that the seqlock and spinlock of a head bucket applies to all buckets + * chained to it; these two fields are unused in non-head buckets. + * + * On removals, we move the last valid item in the chain to the position of the + * just-removed entry. This makes lookups slightly faster, since the moment an + * invalid entry is found, the (failed) lookup is over. + * + * Resizing is done by taking all bucket spinlocks (so that no other writers can + * race with us) and then copying all entries into a new hash map. Then, the + * ht->map pointer is set, and the old map is freed once no RCU readers can see + * it anymore. + * + * Writers check for concurrent resizes by comparing ht->map before and after + * acquiring their bucket lock. If they don't match, a resize has occurred + * while the bucket spinlock was being acquired. + * + * Related Work: + * - Idea of cacheline-sized buckets with full hashes taken from: + * David, Guerraoui & Trigonakis, "Asynchronized Concurrency: + * The Secret to Scaling Concurrent Search Data Structures", ASPLOS'15. + * - Why not RCU-based hash tables? They would allow us to get rid of the + * seqlock, but resizing would take forever since RCU read critical + * sections in QEMU take quite a long time. + * More info on relativistic hash tables: + * + Triplett, McKenney & Walpole, "Resizable, Scalable, Concurrent Hash + * Tables via Relativistic Programming", USENIX ATC'11. + * + Corbet, "Relativistic hash tables, part 1: Algorithms", @ lwn.net, 2014. + * https://lwn.net/Articles/612021/ + */ +#include "qemu/osdep.h" +#include "qemu/qht.h" +#include "qemu/atomic.h" +#include "qemu/rcu.h" + +//#define QHT_DEBUG + +/* + * We want to avoid false sharing of cache lines. Most systems have 64-byte + * cache lines so we go with it for simplicity. + * + * Note that systems with smaller cache lines will be fine (the struct is + * almost 64-bytes); systems with larger cache lines might suffer from + * some false sharing. 
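+ *
+ * (Rough arithmetic for illustration: on a 64-bit host a bucket packs a
+ * 4-byte spinlock, a 4-byte seqlock, four u32 hashes (16 bytes), four
+ * pointers (32 bytes) and a next pointer (8 bytes): 64 bytes in total.)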
+ */ +#define QHT_BUCKET_ALIGN 64 + +/* define these to keep sizeof(qht_bucket) within QHT_BUCKET_ALIGN */ +#if HOST_LONG_BITS == 32 +#define QHT_BUCKET_ENTRIES 6 +#else /* 64-bit */ +#define QHT_BUCKET_ENTRIES 4 +#endif + +enum qht_iter_type { + QHT_ITER_VOID, /* do nothing; use retvoid */ + QHT_ITER_RM, /* remove element if retbool returns true */ +}; + +struct qht_iter { + union { + qht_iter_func_t retvoid; + qht_iter_bool_func_t retbool; + } f; + enum qht_iter_type type; +}; + +/* + * Do _not_ use qemu_mutex_[try]lock directly! Use these macros, otherwise + * the profiler (QSP) will deadlock. + */ +static inline void qht_lock(struct qht *ht) +{ + if (ht->mode & QHT_MODE_RAW_MUTEXES) { + qemu_mutex_lock__raw(&ht->lock); + } else { + qemu_mutex_lock(&ht->lock); + } +} + +static inline int qht_trylock(struct qht *ht) +{ + if (ht->mode & QHT_MODE_RAW_MUTEXES) { + return qemu_mutex_trylock__raw(&(ht)->lock); + } + return qemu_mutex_trylock(&(ht)->lock); +} + +/* this inline is not really necessary, but it helps keep code consistent */ +static inline void qht_unlock(struct qht *ht) +{ + qemu_mutex_unlock(&ht->lock); +} + +/* + * Note: reading partially-updated pointers in @pointers could lead to + * segfaults. We thus access them with qatomic_read/set; this guarantees + * that the compiler makes all those accesses atomic. We also need the + * volatile-like behavior in qatomic_read, since otherwise the compiler + * might refetch the pointer. + * qatomic_read's are of course not necessary when the bucket lock is held. + * + * If both ht->lock and b->lock are grabbed, ht->lock should always + * be grabbed first. + */ +struct qht_bucket { + QemuSpin lock; + QemuSeqLock sequence; + uint32_t hashes[QHT_BUCKET_ENTRIES]; + void *pointers[QHT_BUCKET_ENTRIES]; + struct qht_bucket *next; +} QEMU_ALIGNED(QHT_BUCKET_ALIGN); + +QEMU_BUILD_BUG_ON(sizeof(struct qht_bucket) > QHT_BUCKET_ALIGN); + +/** + * struct qht_map - structure to track an array of buckets + * @rcu: used by RCU. Keep it as the top field in the struct to help valgrind + * find the whole struct. + * @buckets: array of head buckets. It is constant once the map is created. + * @n_buckets: number of head buckets. It is constant once the map is created. + * @n_added_buckets: number of added (i.e. "non-head") buckets + * @n_added_buckets_threshold: threshold to trigger an upward resize once the + * number of added buckets surpasses it. + * + * Buckets are tracked in what we call a "map", i.e. this structure. 
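+ *
+ * Worked example: with QHT_NR_ADDED_BUCKETS_THRESHOLD_DIV == 8 (see below),
+ * a map with 1024 head buckets resizes upward once more than 128 chained
+ * buckets have been added.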
+ */ +struct qht_map { + struct rcu_head rcu; + struct qht_bucket *buckets; + size_t n_buckets; + size_t n_added_buckets; + size_t n_added_buckets_threshold; +}; + +/* trigger a resize when n_added_buckets > n_buckets / div */ +#define QHT_NR_ADDED_BUCKETS_THRESHOLD_DIV 8 + +static void qht_do_resize_reset(struct qht *ht, struct qht_map *new, + bool reset); +static void qht_grow_maybe(struct qht *ht); + +#ifdef QHT_DEBUG + +#define qht_debug_assert(X) do { assert(X); } while (0) + +static void qht_bucket_debug__locked(struct qht_bucket *b) +{ + bool seen_empty = false; + bool corrupt = false; + int i; + + do { + for (i = 0; i < QHT_BUCKET_ENTRIES; i++) { + if (b->pointers[i] == NULL) { + seen_empty = true; + continue; + } + if (seen_empty) { + fprintf(stderr, "%s: b: %p, pos: %i, hash: 0x%x, p: %p\n", + __func__, b, i, b->hashes[i], b->pointers[i]); + corrupt = true; + } + } + b = b->next; + } while (b); + qht_debug_assert(!corrupt); +} + +static void qht_map_debug__all_locked(struct qht_map *map) +{ + int i; + + for (i = 0; i < map->n_buckets; i++) { + qht_bucket_debug__locked(&map->buckets[i]); + } +} +#else + +#define qht_debug_assert(X) do { (void)(X); } while (0) + +static inline void qht_bucket_debug__locked(struct qht_bucket *b) +{ } + +static inline void qht_map_debug__all_locked(struct qht_map *map) +{ } +#endif /* QHT_DEBUG */ + +static inline size_t qht_elems_to_buckets(size_t n_elems) +{ + return pow2ceil(n_elems / QHT_BUCKET_ENTRIES); +} + +static inline void qht_head_init(struct qht_bucket *b) +{ + memset(b, 0, sizeof(*b)); + qemu_spin_init(&b->lock); + seqlock_init(&b->sequence); +} + +static inline +struct qht_bucket *qht_map_to_bucket(const struct qht_map *map, uint32_t hash) +{ + return &map->buckets[hash & (map->n_buckets - 1)]; +} + +/* acquire all bucket locks from a map */ +static void qht_map_lock_buckets(struct qht_map *map) +{ + size_t i; + + for (i = 0; i < map->n_buckets; i++) { + struct qht_bucket *b = &map->buckets[i]; + + qemu_spin_lock(&b->lock); + } +} + +static void qht_map_unlock_buckets(struct qht_map *map) +{ + size_t i; + + for (i = 0; i < map->n_buckets; i++) { + struct qht_bucket *b = &map->buckets[i]; + + qemu_spin_unlock(&b->lock); + } +} + +/* + * Call with at least a bucket lock held. + * @map should be the value read before acquiring the lock (or locks). + */ +static inline bool qht_map_is_stale__locked(const struct qht *ht, + const struct qht_map *map) +{ + return map != ht->map; +} + +/* + * Grab all bucket locks, and set @pmap after making sure the map isn't stale. + * + * Pairs with qht_map_unlock_buckets(), hence the pass-by-reference. + * + * Note: callers cannot have ht->lock held. + */ +static inline +void qht_map_lock_buckets__no_stale(struct qht *ht, struct qht_map **pmap) +{ + struct qht_map *map; + + map = qatomic_rcu_read(&ht->map); + qht_map_lock_buckets(map); + if (likely(!qht_map_is_stale__locked(ht, map))) { + *pmap = map; + return; + } + qht_map_unlock_buckets(map); + + /* we raced with a resize; acquire ht->lock to see the updated ht->map */ + qht_lock(ht); + map = ht->map; + qht_map_lock_buckets(map); + qht_unlock(ht); + *pmap = map; + return; +} + +/* + * Get a head bucket and lock it, making sure its parent map is not stale. + * @pmap is filled with a pointer to the bucket's parent map. + * + * Unlock with qemu_spin_unlock(&b->lock). + * + * Note: callers cannot have ht->lock held. 
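+ *
+ * Typical caller pattern, as a sketch (this mirrors qht_insert() and
+ * qht_remove() below):
+ *   b = qht_bucket_lock__no_stale(ht, hash, &map);
+ *   ...read or modify the bucket chain...
+ *   qemu_spin_unlock(&b->lock);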
+ */
+static inline
+struct qht_bucket *qht_bucket_lock__no_stale(struct qht *ht, uint32_t hash,
+                                             struct qht_map **pmap)
+{
+    struct qht_bucket *b;
+    struct qht_map *map;
+
+    map = qatomic_rcu_read(&ht->map);
+    b = qht_map_to_bucket(map, hash);
+
+    qemu_spin_lock(&b->lock);
+    if (likely(!qht_map_is_stale__locked(ht, map))) {
+        *pmap = map;
+        return b;
+    }
+    qemu_spin_unlock(&b->lock);
+
+    /* we raced with a resize; acquire ht->lock to see the updated ht->map */
+    qht_lock(ht);
+    map = ht->map;
+    b = qht_map_to_bucket(map, hash);
+    qemu_spin_lock(&b->lock);
+    qht_unlock(ht);
+    *pmap = map;
+    return b;
+}
+
+static inline bool qht_map_needs_resize(const struct qht_map *map)
+{
+    return qatomic_read(&map->n_added_buckets) >
+           map->n_added_buckets_threshold;
+}
+
+static inline void qht_chain_destroy(const struct qht_bucket *head)
+{
+    struct qht_bucket *curr = head->next;
+    struct qht_bucket *prev;
+
+    qemu_spin_destroy(&head->lock);
+    while (curr) {
+        prev = curr;
+        curr = curr->next;
+        qemu_vfree(prev);
+    }
+}
+
+/* pass only an orphan map */
+static void qht_map_destroy(struct qht_map *map)
+{
+    size_t i;
+
+    for (i = 0; i < map->n_buckets; i++) {
+        qht_chain_destroy(&map->buckets[i]);
+    }
+    qemu_vfree(map->buckets);
+    g_free(map);
+}
+
+static struct qht_map *qht_map_create(size_t n_buckets)
+{
+    struct qht_map *map;
+    size_t i;
+
+    map = g_malloc(sizeof(*map));
+    map->n_buckets = n_buckets;
+
+    map->n_added_buckets = 0;
+    map->n_added_buckets_threshold = n_buckets /
+        QHT_NR_ADDED_BUCKETS_THRESHOLD_DIV;
+
+    /* let tiny hash tables add at least one non-head bucket */
+    if (unlikely(map->n_added_buckets_threshold == 0)) {
+        map->n_added_buckets_threshold = 1;
+    }
+
+    map->buckets = qemu_memalign(QHT_BUCKET_ALIGN,
+                                 sizeof(*map->buckets) * n_buckets);
+    for (i = 0; i < n_buckets; i++) {
+        qht_head_init(&map->buckets[i]);
+    }
+    return map;
+}
+
+void qht_init(struct qht *ht, qht_cmp_func_t cmp, size_t n_elems,
+              unsigned int mode)
+{
+    struct qht_map *map;
+    size_t n_buckets = qht_elems_to_buckets(n_elems);
+
+    g_assert(cmp);
+    ht->cmp = cmp;
+    ht->mode = mode;
+    qemu_mutex_init(&ht->lock);
+    map = qht_map_create(n_buckets);
+    qatomic_rcu_set(&ht->map, map);
+}
+
+/* call only when there are no readers/writers left */
+void qht_destroy(struct qht *ht)
+{
+    qht_map_destroy(ht->map);
+    memset(ht, 0, sizeof(*ht));
+}
+
+static void qht_bucket_reset__locked(struct qht_bucket *head)
+{
+    struct qht_bucket *b = head;
+    int i;
+
+    seqlock_write_begin(&head->sequence);
+    do {
+        for (i = 0; i < QHT_BUCKET_ENTRIES; i++) {
+            if (b->pointers[i] == NULL) {
+                goto done;
+            }
+            qatomic_set(&b->hashes[i], 0);
+            qatomic_set(&b->pointers[i], NULL);
+        }
+        b = b->next;
+    } while (b);
+ done:
+    seqlock_write_end(&head->sequence);
+}
+
+/* call with all bucket locks held */
+static void qht_map_reset__all_locked(struct qht_map *map)
+{
+    size_t i;
+
+    for (i = 0; i < map->n_buckets; i++) {
+        qht_bucket_reset__locked(&map->buckets[i]);
+    }
+    qht_map_debug__all_locked(map);
+}
+
+void qht_reset(struct qht *ht)
+{
+    struct qht_map *map;
+
+    qht_map_lock_buckets__no_stale(ht, &map);
+    qht_map_reset__all_locked(map);
+    qht_map_unlock_buckets(map);
+}
+
+static inline void qht_do_resize(struct qht *ht, struct qht_map *new)
+{
+    qht_do_resize_reset(ht, new, false);
+}
+
+static inline void qht_do_resize_and_reset(struct qht *ht, struct qht_map *new)
+{
+    qht_do_resize_reset(ht, new, true);
+}
+
+bool qht_reset_size(struct qht *ht, size_t n_elems)
+{
+    struct qht_map *new = NULL;
+    struct qht_map *map;
+    size_t
n_buckets; + + n_buckets = qht_elems_to_buckets(n_elems); + + qht_lock(ht); + map = ht->map; + if (n_buckets != map->n_buckets) { + new = qht_map_create(n_buckets); + } + qht_do_resize_and_reset(ht, new); + qht_unlock(ht); + + return !!new; +} + +static inline +void *qht_do_lookup(const struct qht_bucket *head, qht_lookup_func_t func, + const void *userp, uint32_t hash) +{ + const struct qht_bucket *b = head; + int i; + + do { + for (i = 0; i < QHT_BUCKET_ENTRIES; i++) { + if (qatomic_read(&b->hashes[i]) == hash) { + /* The pointer is dereferenced before seqlock_read_retry, + * so (unlike qht_insert__locked) we need to use + * qatomic_rcu_read here. + */ + void *p = qatomic_rcu_read(&b->pointers[i]); + + if (likely(p) && likely(func(p, userp))) { + return p; + } + } + } + b = qatomic_rcu_read(&b->next); + } while (b); + + return NULL; +} + +static __attribute__((noinline)) +void *qht_lookup__slowpath(const struct qht_bucket *b, qht_lookup_func_t func, + const void *userp, uint32_t hash) +{ + unsigned int version; + void *ret; + + do { + version = seqlock_read_begin(&b->sequence); + ret = qht_do_lookup(b, func, userp, hash); + } while (seqlock_read_retry(&b->sequence, version)); + return ret; +} + +void *qht_lookup_custom(const struct qht *ht, const void *userp, uint32_t hash, + qht_lookup_func_t func) +{ + const struct qht_bucket *b; + const struct qht_map *map; + unsigned int version; + void *ret; + + map = qatomic_rcu_read(&ht->map); + b = qht_map_to_bucket(map, hash); + + version = seqlock_read_begin(&b->sequence); + ret = qht_do_lookup(b, func, userp, hash); + if (likely(!seqlock_read_retry(&b->sequence, version))) { + return ret; + } + /* + * Removing the do/while from the fastpath gives a 4% perf. increase when + * running a 100%-lookup microbenchmark. + */ + return qht_lookup__slowpath(b, func, userp, hash); +} + +void *qht_lookup(const struct qht *ht, const void *userp, uint32_t hash) +{ + return qht_lookup_custom(ht, userp, hash, ht->cmp); +} + +/* + * call with head->lock held + * @ht is const since it is only used for ht->cmp() + */ +static void *qht_insert__locked(const struct qht *ht, struct qht_map *map, + struct qht_bucket *head, void *p, uint32_t hash, + bool *needs_resize) +{ + struct qht_bucket *b = head; + struct qht_bucket *prev = NULL; + struct qht_bucket *new = NULL; + int i; + + do { + for (i = 0; i < QHT_BUCKET_ENTRIES; i++) { + if (b->pointers[i]) { + if (unlikely(b->hashes[i] == hash && + ht->cmp(b->pointers[i], p))) { + return b->pointers[i]; + } + } else { + goto found; + } + } + prev = b; + b = b->next; + } while (b); + + b = qemu_memalign(QHT_BUCKET_ALIGN, sizeof(*b)); + memset(b, 0, sizeof(*b)); + new = b; + i = 0; + qatomic_inc(&map->n_added_buckets); + if (unlikely(qht_map_needs_resize(map)) && needs_resize) { + *needs_resize = true; + } + + found: + /* found an empty key: acquire the seqlock and write */ + seqlock_write_begin(&head->sequence); + if (new) { + qatomic_rcu_set(&prev->next, b); + } + /* smp_wmb() implicit in seqlock_write_begin. */ + qatomic_set(&b->hashes[i], hash); + qatomic_set(&b->pointers[i], p); + seqlock_write_end(&head->sequence); + return NULL; +} + +static __attribute__((noinline)) void qht_grow_maybe(struct qht *ht) +{ + struct qht_map *map; + + /* + * If the lock is taken it probably means there's an ongoing resize, + * so bail out. 
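+     * (qht_trylock() follows the usual trylock convention: it returns
+     * non-zero when the lock could not be acquired.)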
+ */ + if (qht_trylock(ht)) { + return; + } + map = ht->map; + /* another thread might have just performed the resize we were after */ + if (qht_map_needs_resize(map)) { + struct qht_map *new = qht_map_create(map->n_buckets * 2); + + qht_do_resize(ht, new); + } + qht_unlock(ht); +} + +bool qht_insert(struct qht *ht, void *p, uint32_t hash, void **existing) +{ + struct qht_bucket *b; + struct qht_map *map; + bool needs_resize = false; + void *prev; + + /* NULL pointers are not supported */ + qht_debug_assert(p); + + b = qht_bucket_lock__no_stale(ht, hash, &map); + prev = qht_insert__locked(ht, map, b, p, hash, &needs_resize); + qht_bucket_debug__locked(b); + qemu_spin_unlock(&b->lock); + + if (unlikely(needs_resize) && ht->mode & QHT_MODE_AUTO_RESIZE) { + qht_grow_maybe(ht); + } + if (likely(prev == NULL)) { + return true; + } + if (existing) { + *existing = prev; + } + return false; +} + +static inline bool qht_entry_is_last(const struct qht_bucket *b, int pos) +{ + if (pos == QHT_BUCKET_ENTRIES - 1) { + if (b->next == NULL) { + return true; + } + return b->next->pointers[0] == NULL; + } + return b->pointers[pos + 1] == NULL; +} + +static void +qht_entry_move(struct qht_bucket *to, int i, struct qht_bucket *from, int j) +{ + qht_debug_assert(!(to == from && i == j)); + qht_debug_assert(to->pointers[i]); + qht_debug_assert(from->pointers[j]); + + qatomic_set(&to->hashes[i], from->hashes[j]); + qatomic_set(&to->pointers[i], from->pointers[j]); + + qatomic_set(&from->hashes[j], 0); + qatomic_set(&from->pointers[j], NULL); +} + +/* + * Find the last valid entry in @orig, and swap it with @orig[pos], which has + * just been invalidated. + */ +static inline void qht_bucket_remove_entry(struct qht_bucket *orig, int pos) +{ + struct qht_bucket *b = orig; + struct qht_bucket *prev = NULL; + int i; + + if (qht_entry_is_last(orig, pos)) { + orig->hashes[pos] = 0; + qatomic_set(&orig->pointers[pos], NULL); + return; + } + do { + for (i = 0; i < QHT_BUCKET_ENTRIES; i++) { + if (b->pointers[i]) { + continue; + } + if (i > 0) { + return qht_entry_move(orig, pos, b, i - 1); + } + qht_debug_assert(prev); + return qht_entry_move(orig, pos, prev, QHT_BUCKET_ENTRIES - 1); + } + prev = b; + b = b->next; + } while (b); + /* no free entries other than orig[pos], so swap it with the last one */ + qht_entry_move(orig, pos, prev, QHT_BUCKET_ENTRIES - 1); +} + +/* call with b->lock held */ +static inline +bool qht_remove__locked(struct qht_bucket *head, const void *p, uint32_t hash) +{ + struct qht_bucket *b = head; + int i; + + do { + for (i = 0; i < QHT_BUCKET_ENTRIES; i++) { + void *q = b->pointers[i]; + + if (unlikely(q == NULL)) { + return false; + } + if (q == p) { + qht_debug_assert(b->hashes[i] == hash); + seqlock_write_begin(&head->sequence); + qht_bucket_remove_entry(b, i); + seqlock_write_end(&head->sequence); + return true; + } + } + b = b->next; + } while (b); + return false; +} + +bool qht_remove(struct qht *ht, const void *p, uint32_t hash) +{ + struct qht_bucket *b; + struct qht_map *map; + bool ret; + + /* NULL pointers are not supported */ + qht_debug_assert(p); + + b = qht_bucket_lock__no_stale(ht, hash, &map); + ret = qht_remove__locked(b, p, hash); + qht_bucket_debug__locked(b); + qemu_spin_unlock(&b->lock); + return ret; +} + +static inline void qht_bucket_iter(struct qht_bucket *head, + const struct qht_iter *iter, void *userp) +{ + struct qht_bucket *b = head; + int i; + + do { + for (i = 0; i < QHT_BUCKET_ENTRIES; i++) { + if (b->pointers[i] == NULL) { + return; + } + switch (iter->type) { + 
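+            /* dispatch on the iterator flavor; QHT_ITER_RM may shrink the bucket */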
case QHT_ITER_VOID: + iter->f.retvoid(b->pointers[i], b->hashes[i], userp); + break; + case QHT_ITER_RM: + if (iter->f.retbool(b->pointers[i], b->hashes[i], userp)) { + /* replace i with the last valid element in the bucket */ + seqlock_write_begin(&head->sequence); + qht_bucket_remove_entry(b, i); + seqlock_write_end(&head->sequence); + qht_bucket_debug__locked(b); + /* reevaluate i, since it just got replaced */ + i--; + continue; + } + break; + default: + g_assert_not_reached(); + } + } + b = b->next; + } while (b); +} + +/* call with all of the map's locks held */ +static inline void qht_map_iter__all_locked(struct qht_map *map, + const struct qht_iter *iter, + void *userp) +{ + size_t i; + + for (i = 0; i < map->n_buckets; i++) { + qht_bucket_iter(&map->buckets[i], iter, userp); + } +} + +static inline void +do_qht_iter(struct qht *ht, const struct qht_iter *iter, void *userp) +{ + struct qht_map *map; + + map = qatomic_rcu_read(&ht->map); + qht_map_lock_buckets(map); + qht_map_iter__all_locked(map, iter, userp); + qht_map_unlock_buckets(map); +} + +void qht_iter(struct qht *ht, qht_iter_func_t func, void *userp) +{ + const struct qht_iter iter = { + .f.retvoid = func, + .type = QHT_ITER_VOID, + }; + + do_qht_iter(ht, &iter, userp); +} + +void qht_iter_remove(struct qht *ht, qht_iter_bool_func_t func, void *userp) +{ + const struct qht_iter iter = { + .f.retbool = func, + .type = QHT_ITER_RM, + }; + + do_qht_iter(ht, &iter, userp); +} + +struct qht_map_copy_data { + struct qht *ht; + struct qht_map *new; +}; + +static void qht_map_copy(void *p, uint32_t hash, void *userp) +{ + struct qht_map_copy_data *data = userp; + struct qht *ht = data->ht; + struct qht_map *new = data->new; + struct qht_bucket *b = qht_map_to_bucket(new, hash); + + /* no need to acquire b->lock because no thread has seen this map yet */ + qht_insert__locked(ht, new, b, p, hash, NULL); +} + +/* + * Atomically perform a resize and/or reset. + * Call with ht->lock held. 
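+ *
+ * The old map is handed to call_rcu() at the end, so concurrent RCU readers
+ * keep seeing a consistent (old) map until a grace period has elapsed.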
+ */ +static void qht_do_resize_reset(struct qht *ht, struct qht_map *new, bool reset) +{ + struct qht_map *old; + const struct qht_iter iter = { + .f.retvoid = qht_map_copy, + .type = QHT_ITER_VOID, + }; + struct qht_map_copy_data data; + + old = ht->map; + qht_map_lock_buckets(old); + + if (reset) { + qht_map_reset__all_locked(old); + } + + if (new == NULL) { + qht_map_unlock_buckets(old); + return; + } + + g_assert(new->n_buckets != old->n_buckets); + data.ht = ht; + data.new = new; + qht_map_iter__all_locked(old, &iter, &data); + qht_map_debug__all_locked(new); + + qatomic_rcu_set(&ht->map, new); + qht_map_unlock_buckets(old); + call_rcu(old, qht_map_destroy, rcu); +} + +bool qht_resize(struct qht *ht, size_t n_elems) +{ + size_t n_buckets = qht_elems_to_buckets(n_elems); + size_t ret = false; + + qht_lock(ht); + if (n_buckets != ht->map->n_buckets) { + struct qht_map *new; + + new = qht_map_create(n_buckets); + qht_do_resize(ht, new); + ret = true; + } + qht_unlock(ht); + + return ret; +} + +/* pass @stats to qht_statistics_destroy() when done */ +void qht_statistics_init(const struct qht *ht, struct qht_stats *stats) +{ + const struct qht_map *map; + int i; + + map = qatomic_rcu_read(&ht->map); + + stats->used_head_buckets = 0; + stats->entries = 0; + qdist_init(&stats->chain); + qdist_init(&stats->occupancy); + /* bail out if the qht has not yet been initialized */ + if (unlikely(map == NULL)) { + stats->head_buckets = 0; + return; + } + stats->head_buckets = map->n_buckets; + + for (i = 0; i < map->n_buckets; i++) { + const struct qht_bucket *head = &map->buckets[i]; + const struct qht_bucket *b; + unsigned int version; + size_t buckets; + size_t entries; + int j; + + do { + version = seqlock_read_begin(&head->sequence); + buckets = 0; + entries = 0; + b = head; + do { + for (j = 0; j < QHT_BUCKET_ENTRIES; j++) { + if (qatomic_read(&b->pointers[j]) == NULL) { + break; + } + entries++; + } + buckets++; + b = qatomic_rcu_read(&b->next); + } while (b); + } while (seqlock_read_retry(&head->sequence, version)); + + if (entries) { + qdist_inc(&stats->chain, buckets); + qdist_inc(&stats->occupancy, + (double)entries / QHT_BUCKET_ENTRIES / buckets); + stats->used_head_buckets++; + stats->entries += entries; + } else { + qdist_inc(&stats->occupancy, 0); + } + } +} + +void qht_statistics_destroy(struct qht_stats *stats) +{ + qdist_destroy(&stats->occupancy); + qdist_destroy(&stats->chain); +} diff --git a/util/qsp.c b/util/qsp.c new file mode 100644 index 000000000..8562b14a8 --- /dev/null +++ b/util/qsp.c @@ -0,0 +1,813 @@ +/* + * qsp.c - QEMU Synchronization Profiler + * + * Copyright (C) 2018, Emilio G. Cota <cota@braap.org> + * + * License: GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + * QSP profiles the time spent in synchronization primitives, which can + * help diagnose performance problems, e.g. scalability issues when + * contention is high. + * + * The primitives currently supported are mutexes, recursive mutexes and + * condition variables. Note that not all related functions are intercepted; + * instead we profile only those functions that can have a performance impact, + * either due to blocking (e.g. cond_wait, mutex_lock) or cache line + * contention (e.g. mutex_lock, mutex_trylock). + * + * QSP's design focuses on speed and scalability. This is achieved + * by having threads do their profiling entirely on thread-local data. + * The appropriate thread-local data is found via a QHT, i.e. a concurrent hash + * table. 
To aggregate data in order to generate a report, we iterate over + * all entries in the hash table. Depending on the number of threads and + * synchronization objects this might be expensive, but note that it is + * very rarely called -- reports are generated only when requested by users. + * + * Reports are generated as a table where each row represents a call site. A + * call site is the triplet formed by the __file__ and __LINE__ of the caller + * as well as the address of the "object" (i.e. mutex, rec. mutex or condvar) + * being operated on. Optionally, call sites that operate on different objects + * of the same type can be coalesced, which can be particularly useful when + * profiling dynamically-allocated objects. + * + * Alternative designs considered: + * + * - Use an off-the-shelf profiler such as mutrace. This is not a viable option + * for us because QEMU has __malloc_hook set (by one of the libraries it + * uses); leaving this hook unset is required to avoid deadlock in mutrace. + * + * - Use a glib HT for each thread, protecting each HT with its own lock. + * This isn't simpler than the current design, and is 10% slower in the + * atomic_add-bench microbenchmark (-m option). + * + * - For reports, just use a binary tree as we aggregate data, instead of having + * an intermediate hash table. This would simplify the code only slightly, but + * would perform badly if there were many threads and objects to track. + * + * - Wrap operations on qsp entries with RCU read-side critical sections, so + * that qsp_reset() can delete entries. Unfortunately, the overhead of calling + * rcu_read_lock/unlock slows down atomic_add-bench -m by 24%. Having + * a snapshot that is updated on qsp_reset() avoids this overhead. + * + * Related Work: + * - Lennart Poettering's mutrace: http://0pointer.de/blog/projects/mutrace.html + * - Lozi, David, Thomas, Lawall and Muller. "Remote Core Locking: Migrating + * Critical-Section Execution to Improve the Performance of Multithreaded + * Applications", USENIX ATC'12. + */ + +#include "qemu/osdep.h" +#include "qemu/qemu-print.h" +#include "qemu/thread.h" +#include "qemu/timer.h" +#include "qemu/qht.h" +#include "qemu/rcu.h" +#include "qemu/xxhash.h" + +enum QSPType { + QSP_MUTEX, + QSP_BQL_MUTEX, + QSP_REC_MUTEX, + QSP_CONDVAR, +}; + +struct QSPCallSite { + const void *obj; + const char *file; /* i.e. __FILE__; shortened later */ + int line; + enum QSPType type; +}; +typedef struct QSPCallSite QSPCallSite; + +struct QSPEntry { + void *thread_ptr; + const QSPCallSite *callsite; + aligned_uint64_t n_acqs; + aligned_uint64_t ns; + unsigned int n_objs; /* count of coalesced objs; only used for reporting */ +}; +typedef struct QSPEntry QSPEntry; + +struct QSPSnapshot { + struct rcu_head rcu; + struct qht ht; +}; +typedef struct QSPSnapshot QSPSnapshot; + +/* initial sizing for hash tables */ +#define QSP_INITIAL_SIZE 64 + +/* If this file is moved, QSP_REL_PATH should be updated accordingly */ +#define QSP_REL_PATH "util/qsp.c" + +/* this file's full path. Used to present all call sites with relative paths */ +static size_t qsp_qemu_path_len; + +/* the address of qsp_thread gives us a unique 'thread ID' */ +static __thread int qsp_thread; + +/* + * Call sites are the same for all threads, so we track them in a separate hash + * table to save memory. 
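+ * Each per-thread QSPEntry then just points at its shared QSPCallSite.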
+ */ +static struct qht qsp_callsite_ht; + +static struct qht qsp_ht; +static QSPSnapshot *qsp_snapshot; +static bool qsp_initialized, qsp_initializing; + +static const char * const qsp_typenames[] = { + [QSP_MUTEX] = "mutex", + [QSP_BQL_MUTEX] = "BQL mutex", + [QSP_REC_MUTEX] = "rec_mutex", + [QSP_CONDVAR] = "condvar", +}; + +QemuMutexLockFunc qemu_bql_mutex_lock_func = qemu_mutex_lock_impl; +QemuMutexLockFunc qemu_mutex_lock_func = qemu_mutex_lock_impl; +QemuMutexTrylockFunc qemu_mutex_trylock_func = qemu_mutex_trylock_impl; +QemuRecMutexLockFunc qemu_rec_mutex_lock_func = qemu_rec_mutex_lock_impl; +QemuRecMutexTrylockFunc qemu_rec_mutex_trylock_func = + qemu_rec_mutex_trylock_impl; +QemuCondWaitFunc qemu_cond_wait_func = qemu_cond_wait_impl; +QemuCondTimedWaitFunc qemu_cond_timedwait_func = qemu_cond_timedwait_impl; + +/* + * It pays off to _not_ hash callsite->file; hashing a string is slow, and + * without it we still get a pretty unique hash. + */ +static inline +uint32_t do_qsp_callsite_hash(const QSPCallSite *callsite, uint64_t ab) +{ + uint64_t cd = (uint64_t)(uintptr_t)callsite->obj; + uint32_t e = callsite->line; + uint32_t f = callsite->type; + + return qemu_xxhash6(ab, cd, e, f); +} + +static inline +uint32_t qsp_callsite_hash(const QSPCallSite *callsite) +{ + return do_qsp_callsite_hash(callsite, 0); +} + +static inline uint32_t do_qsp_entry_hash(const QSPEntry *entry, uint64_t a) +{ + return do_qsp_callsite_hash(entry->callsite, a); +} + +static uint32_t qsp_entry_hash(const QSPEntry *entry) +{ + return do_qsp_entry_hash(entry, (uint64_t)(uintptr_t)entry->thread_ptr); +} + +static uint32_t qsp_entry_no_thread_hash(const QSPEntry *entry) +{ + return do_qsp_entry_hash(entry, 0); +} + +/* without the objects we need to hash the file name to get a decent hash */ +static uint32_t qsp_entry_no_thread_obj_hash(const QSPEntry *entry) +{ + const QSPCallSite *callsite = entry->callsite; + uint64_t ab = g_str_hash(callsite->file); + uint64_t cd = callsite->line; + uint32_t e = callsite->type; + + return qemu_xxhash5(ab, cd, e); +} + +static bool qsp_callsite_cmp(const void *ap, const void *bp) +{ + const QSPCallSite *a = ap; + const QSPCallSite *b = bp; + + return a == b || + (a->obj == b->obj && + a->line == b->line && + a->type == b->type && + (a->file == b->file || !strcmp(a->file, b->file))); +} + +static bool qsp_callsite_no_obj_cmp(const void *ap, const void *bp) +{ + const QSPCallSite *a = ap; + const QSPCallSite *b = bp; + + return a == b || + (a->line == b->line && + a->type == b->type && + (a->file == b->file || !strcmp(a->file, b->file))); +} + +static bool qsp_entry_no_thread_cmp(const void *ap, const void *bp) +{ + const QSPEntry *a = ap; + const QSPEntry *b = bp; + + return qsp_callsite_cmp(a->callsite, b->callsite); +} + +static bool qsp_entry_no_thread_obj_cmp(const void *ap, const void *bp) +{ + const QSPEntry *a = ap; + const QSPEntry *b = bp; + + return qsp_callsite_no_obj_cmp(a->callsite, b->callsite); +} + +static bool qsp_entry_cmp(const void *ap, const void *bp) +{ + const QSPEntry *a = ap; + const QSPEntry *b = bp; + + return a->thread_ptr == b->thread_ptr && + qsp_callsite_cmp(a->callsite, b->callsite); +} + +/* + * Normally we'd call this from a constructor function, but we want it to work + * via libutil as well. 
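+ *
+ * Intended usage, as a sketch (the workload in the middle is hypothetical):
+ *   qsp_enable();
+ *   ...run the code to be profiled...
+ *   qsp_report(20, QSP_SORT_BY_TOTAL_WAIT_TIME, false);
+ *   qsp_disable();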
+ */ +static void qsp_do_init(void) +{ + /* make sure this file's path in the tree is up to date with QSP_REL_PATH */ + g_assert(strstr(__FILE__, QSP_REL_PATH)); + qsp_qemu_path_len = strlen(__FILE__) - strlen(QSP_REL_PATH); + + qht_init(&qsp_ht, qsp_entry_cmp, QSP_INITIAL_SIZE, + QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES); + qht_init(&qsp_callsite_ht, qsp_callsite_cmp, QSP_INITIAL_SIZE, + QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES); +} + +static __attribute__((noinline)) void qsp_init__slowpath(void) +{ + if (qatomic_cmpxchg(&qsp_initializing, false, true) == false) { + qsp_do_init(); + qatomic_set(&qsp_initialized, true); + } else { + while (!qatomic_read(&qsp_initialized)) { + cpu_relax(); + } + } +} + +/* qsp_init() must be called from _all_ exported functions */ +static inline void qsp_init(void) +{ + if (likely(qatomic_read(&qsp_initialized))) { + return; + } + qsp_init__slowpath(); +} + +static QSPCallSite *qsp_callsite_find(const QSPCallSite *orig) +{ + QSPCallSite *callsite; + uint32_t hash; + + hash = qsp_callsite_hash(orig); + callsite = qht_lookup(&qsp_callsite_ht, orig, hash); + if (callsite == NULL) { + void *existing = NULL; + + callsite = g_new(QSPCallSite, 1); + memcpy(callsite, orig, sizeof(*callsite)); + qht_insert(&qsp_callsite_ht, callsite, hash, &existing); + if (unlikely(existing)) { + g_free(callsite); + callsite = existing; + } + } + return callsite; +} + +static QSPEntry * +qsp_entry_create(struct qht *ht, const QSPEntry *entry, uint32_t hash) +{ + QSPEntry *e; + void *existing = NULL; + + e = g_new0(QSPEntry, 1); + e->thread_ptr = entry->thread_ptr; + e->callsite = qsp_callsite_find(entry->callsite); + + qht_insert(ht, e, hash, &existing); + if (unlikely(existing)) { + g_free(e); + e = existing; + } + return e; +} + +static QSPEntry * +qsp_entry_find(struct qht *ht, const QSPEntry *entry, uint32_t hash) +{ + QSPEntry *e; + + e = qht_lookup(ht, entry, hash); + if (e == NULL) { + e = qsp_entry_create(ht, entry, hash); + } + return e; +} + +/* + * Note: Entries are never removed, so callers do not have to be in an RCU + * read-side critical section. + */ +static QSPEntry *qsp_entry_get(const void *obj, const char *file, int line, + enum QSPType type) +{ + QSPCallSite callsite = { + .obj = obj, + .file = file, + .line = line, + .type = type, + }; + QSPEntry orig; + uint32_t hash; + + qsp_init(); + + orig.thread_ptr = &qsp_thread; + orig.callsite = &callsite; + + hash = qsp_entry_hash(&orig); + return qsp_entry_find(&qsp_ht, &orig, hash); +} + +/* + * @e is in the global hash table; it is only written to by the current thread, + * so we write to it atomically (as in "write once") to prevent torn reads. 
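+ * Readers (the report code) pair these writes with qatomic_read_u64(), so
+ * each 64-bit counter is always observed whole, even on 32-bit hosts.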
+ */ +static inline void do_qsp_entry_record(QSPEntry *e, int64_t delta, bool acq) +{ + qatomic_set_u64(&e->ns, e->ns + delta); + if (acq) { + qatomic_set_u64(&e->n_acqs, e->n_acqs + 1); + } +} + +static inline void qsp_entry_record(QSPEntry *e, int64_t delta) +{ + do_qsp_entry_record(e, delta, true); +} + +#define QSP_GEN_VOID(type_, qsp_t_, func_, impl_) \ + static void func_(type_ *obj, const char *file, int line) \ + { \ + QSPEntry *e; \ + int64_t t0, t1; \ + \ + t0 = get_clock(); \ + impl_(obj, file, line); \ + t1 = get_clock(); \ + \ + e = qsp_entry_get(obj, file, line, qsp_t_); \ + qsp_entry_record(e, t1 - t0); \ + } + +#define QSP_GEN_RET1(type_, qsp_t_, func_, impl_) \ + static int func_(type_ *obj, const char *file, int line) \ + { \ + QSPEntry *e; \ + int64_t t0, t1; \ + int err; \ + \ + t0 = get_clock(); \ + err = impl_(obj, file, line); \ + t1 = get_clock(); \ + \ + e = qsp_entry_get(obj, file, line, qsp_t_); \ + do_qsp_entry_record(e, t1 - t0, !err); \ + return err; \ + } + +QSP_GEN_VOID(QemuMutex, QSP_BQL_MUTEX, qsp_bql_mutex_lock, qemu_mutex_lock_impl) +QSP_GEN_VOID(QemuMutex, QSP_MUTEX, qsp_mutex_lock, qemu_mutex_lock_impl) +QSP_GEN_RET1(QemuMutex, QSP_MUTEX, qsp_mutex_trylock, qemu_mutex_trylock_impl) + +QSP_GEN_VOID(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_lock, + qemu_rec_mutex_lock_impl) +QSP_GEN_RET1(QemuRecMutex, QSP_REC_MUTEX, qsp_rec_mutex_trylock, + qemu_rec_mutex_trylock_impl) + +#undef QSP_GEN_RET1 +#undef QSP_GEN_VOID + +static void +qsp_cond_wait(QemuCond *cond, QemuMutex *mutex, const char *file, int line) +{ + QSPEntry *e; + int64_t t0, t1; + + t0 = get_clock(); + qemu_cond_wait_impl(cond, mutex, file, line); + t1 = get_clock(); + + e = qsp_entry_get(cond, file, line, QSP_CONDVAR); + qsp_entry_record(e, t1 - t0); +} + +static bool +qsp_cond_timedwait(QemuCond *cond, QemuMutex *mutex, int ms, + const char *file, int line) +{ + QSPEntry *e; + int64_t t0, t1; + bool ret; + + t0 = get_clock(); + ret = qemu_cond_timedwait_impl(cond, mutex, ms, file, line); + t1 = get_clock(); + + e = qsp_entry_get(cond, file, line, QSP_CONDVAR); + qsp_entry_record(e, t1 - t0); + return ret; +} + +bool qsp_is_enabled(void) +{ + return qatomic_read(&qemu_mutex_lock_func) == qsp_mutex_lock; +} + +void qsp_enable(void) +{ + qatomic_set(&qemu_mutex_lock_func, qsp_mutex_lock); + qatomic_set(&qemu_mutex_trylock_func, qsp_mutex_trylock); + qatomic_set(&qemu_bql_mutex_lock_func, qsp_bql_mutex_lock); + qatomic_set(&qemu_rec_mutex_lock_func, qsp_rec_mutex_lock); + qatomic_set(&qemu_rec_mutex_trylock_func, qsp_rec_mutex_trylock); + qatomic_set(&qemu_cond_wait_func, qsp_cond_wait); + qatomic_set(&qemu_cond_timedwait_func, qsp_cond_timedwait); +} + +void qsp_disable(void) +{ + qatomic_set(&qemu_mutex_lock_func, qemu_mutex_lock_impl); + qatomic_set(&qemu_mutex_trylock_func, qemu_mutex_trylock_impl); + qatomic_set(&qemu_bql_mutex_lock_func, qemu_mutex_lock_impl); + qatomic_set(&qemu_rec_mutex_lock_func, qemu_rec_mutex_lock_impl); + qatomic_set(&qemu_rec_mutex_trylock_func, qemu_rec_mutex_trylock_impl); + qatomic_set(&qemu_cond_wait_func, qemu_cond_wait_impl); + qatomic_set(&qemu_cond_timedwait_func, qemu_cond_timedwait_impl); +} + +static gint qsp_tree_cmp(gconstpointer ap, gconstpointer bp, gpointer up) +{ + const QSPEntry *a = ap; + const QSPEntry *b = bp; + enum QSPSortBy sort_by = *(enum QSPSortBy *)up; + const QSPCallSite *ca; + const QSPCallSite *cb; + + switch (sort_by) { + case QSP_SORT_BY_TOTAL_WAIT_TIME: + if (a->ns > b->ns) { + return -1; + } else if (a->ns < b->ns) { + return 1; + 
} + break; + case QSP_SORT_BY_AVG_WAIT_TIME: + { + double avg_a = a->n_acqs ? a->ns / a->n_acqs : 0; + double avg_b = b->n_acqs ? b->ns / b->n_acqs : 0; + + if (avg_a > avg_b) { + return -1; + } else if (avg_a < avg_b) { + return 1; + } + break; + } + default: + g_assert_not_reached(); + } + + ca = a->callsite; + cb = b->callsite; + /* Break the tie with the object's address */ + if (ca->obj < cb->obj) { + return -1; + } else if (ca->obj > cb->obj) { + return 1; + } else { + int cmp; + + /* same obj. Break the tie with the callsite's file */ + cmp = strcmp(ca->file, cb->file); + if (cmp) { + return cmp; + } + /* same callsite file. Break the tie with the callsite's line */ + g_assert(ca->line != cb->line); + if (ca->line < cb->line) { + return -1; + } else if (ca->line > cb->line) { + return 1; + } else { + /* break the tie with the callsite's type */ + return cb->type - ca->type; + } + } +} + +static void qsp_sort(void *p, uint32_t h, void *userp) +{ + QSPEntry *e = p; + GTree *tree = userp; + + g_tree_insert(tree, e, NULL); +} + +static void qsp_aggregate(void *p, uint32_t h, void *up) +{ + struct qht *ht = up; + const QSPEntry *e = p; + QSPEntry *agg; + uint32_t hash; + + hash = qsp_entry_no_thread_hash(e); + agg = qsp_entry_find(ht, e, hash); + /* + * The entry is in the global hash table; read from it atomically (as in + * "read once"). + */ + agg->ns += qatomic_read_u64(&e->ns); + agg->n_acqs += qatomic_read_u64(&e->n_acqs); +} + +static void qsp_iter_diff(void *p, uint32_t hash, void *htp) +{ + struct qht *ht = htp; + QSPEntry *old = p; + QSPEntry *new; + + new = qht_lookup(ht, old, hash); + /* entries are never deleted, so we must have this one */ + g_assert(new != NULL); + /* our reading of the stats happened after the snapshot was taken */ + g_assert(new->n_acqs >= old->n_acqs); + g_assert(new->ns >= old->ns); + + new->n_acqs -= old->n_acqs; + new->ns -= old->ns; + + /* No point in reporting an empty entry */ + if (new->n_acqs == 0 && new->ns == 0) { + bool removed = qht_remove(ht, new, hash); + + g_assert(removed); + g_free(new); + } +} + +static void qsp_diff(struct qht *orig, struct qht *new) +{ + qht_iter(orig, qsp_iter_diff, new); +} + +static void qsp_iter_callsite_coalesce(void *p, uint32_t h, void *htp) +{ + struct qht *ht = htp; + QSPEntry *old = p; + QSPEntry *e; + uint32_t hash; + + hash = qsp_entry_no_thread_obj_hash(old); + e = qht_lookup(ht, old, hash); + if (e == NULL) { + e = qsp_entry_create(ht, old, hash); + e->n_objs = 1; + } else if (e->callsite->obj != old->callsite->obj) { + e->n_objs++; + } + e->ns += old->ns; + e->n_acqs += old->n_acqs; +} + +static void qsp_ht_delete(void *p, uint32_t h, void *htp) +{ + g_free(p); +} + +static void qsp_mktree(GTree *tree, bool callsite_coalesce) +{ + struct qht ht, coalesce_ht; + struct qht *htp; + + /* + * First, see if there's a prior snapshot, so that we read the global hash + * table _after_ the snapshot has been created, which guarantees that + * the entries we'll read will be a superset of the snapshot's entries. + * + * We must remain in an RCU read-side critical section until we're done + * with the snapshot. 
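+     * (Hence the WITH_RCU_READ_LOCK_GUARD() block below.)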
+ */ + WITH_RCU_READ_LOCK_GUARD() { + QSPSnapshot *snap = qatomic_rcu_read(&qsp_snapshot); + + /* Aggregate all results from the global hash table into a local one */ + qht_init(&ht, qsp_entry_no_thread_cmp, QSP_INITIAL_SIZE, + QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES); + qht_iter(&qsp_ht, qsp_aggregate, &ht); + + /* compute the difference wrt the snapshot, if any */ + if (snap) { + qsp_diff(&snap->ht, &ht); + } + } + + htp = &ht; + if (callsite_coalesce) { + qht_init(&coalesce_ht, qsp_entry_no_thread_obj_cmp, QSP_INITIAL_SIZE, + QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES); + qht_iter(&ht, qsp_iter_callsite_coalesce, &coalesce_ht); + + /* free the previous hash table, and point htp to coalesce_ht */ + qht_iter(&ht, qsp_ht_delete, NULL); + qht_destroy(&ht); + htp = &coalesce_ht; + } + + /* sort the hash table elements by using a tree */ + qht_iter(htp, qsp_sort, tree); + + /* free the hash table, but keep the elements (those are in the tree now) */ + qht_destroy(htp); +} + +/* free string with g_free */ +static char *qsp_at(const QSPCallSite *callsite) +{ + GString *s = g_string_new(NULL); + const char *shortened; + + /* remove the absolute path to qemu */ + if (unlikely(strlen(callsite->file) < qsp_qemu_path_len)) { + shortened = callsite->file; + } else { + shortened = callsite->file + qsp_qemu_path_len; + } + g_string_append_printf(s, "%s:%u", shortened, callsite->line); + return g_string_free(s, FALSE); +} + +struct QSPReportEntry { + const void *obj; + char *callsite_at; + const char *typename; + double time_s; + double ns_avg; + uint64_t n_acqs; + unsigned int n_objs; +}; +typedef struct QSPReportEntry QSPReportEntry; + +struct QSPReport { + QSPReportEntry *entries; + size_t n_entries; + size_t max_n_entries; +}; +typedef struct QSPReport QSPReport; + +static gboolean qsp_tree_report(gpointer key, gpointer value, gpointer udata) +{ + const QSPEntry *e = key; + QSPReport *report = udata; + QSPReportEntry *entry; + + if (report->n_entries == report->max_n_entries) { + return TRUE; + } + entry = &report->entries[report->n_entries]; + report->n_entries++; + + entry->obj = e->callsite->obj; + entry->n_objs = e->n_objs; + entry->callsite_at = qsp_at(e->callsite); + entry->typename = qsp_typenames[e->callsite->type]; + entry->time_s = e->ns * 1e-9; + entry->n_acqs = e->n_acqs; + entry->ns_avg = e->n_acqs ? 
e->ns / e->n_acqs : 0; + return FALSE; +} + +static void pr_report(const QSPReport *rep) +{ + char *dashes; + size_t max_len = 0; + int callsite_len = 0; + int callsite_rspace; + int n_dashes; + size_t i; + + /* find out the maximum length of all 'callsite' fields */ + for (i = 0; i < rep->n_entries; i++) { + const QSPReportEntry *e = &rep->entries[i]; + size_t len = strlen(e->callsite_at); + + if (len > max_len) { + max_len = len; + } + } + + callsite_len = MAX(max_len, strlen("Call site")); + /* white space to leave to the right of "Call site" */ + callsite_rspace = callsite_len - strlen("Call site"); + + qemu_printf("Type Object Call site%*s Wait Time (s) " + " Count Average (us)\n", callsite_rspace, ""); + + /* build a horizontal rule with dashes */ + n_dashes = 79 + callsite_rspace; + dashes = g_malloc(n_dashes + 1); + memset(dashes, '-', n_dashes); + dashes[n_dashes] = '\0'; + qemu_printf("%s\n", dashes); + + for (i = 0; i < rep->n_entries; i++) { + const QSPReportEntry *e = &rep->entries[i]; + GString *s = g_string_new(NULL); + + g_string_append_printf(s, "%-9s ", e->typename); + if (e->n_objs > 1) { + g_string_append_printf(s, "[%12u]", e->n_objs); + } else { + g_string_append_printf(s, "%14p", e->obj); + } + g_string_append_printf(s, " %s%*s %13.5f %12" PRIu64 " %12.2f\n", + e->callsite_at, + callsite_len - (int)strlen(e->callsite_at), "", + e->time_s, e->n_acqs, e->ns_avg * 1e-3); + qemu_printf("%s", s->str); + g_string_free(s, TRUE); + } + + qemu_printf("%s\n", dashes); + g_free(dashes); +} + +static void report_destroy(QSPReport *rep) +{ + size_t i; + + for (i = 0; i < rep->n_entries; i++) { + QSPReportEntry *e = &rep->entries[i]; + + g_free(e->callsite_at); + } + g_free(rep->entries); +} + +void qsp_report(size_t max, enum QSPSortBy sort_by, + bool callsite_coalesce) +{ + GTree *tree = g_tree_new_full(qsp_tree_cmp, &sort_by, g_free, NULL); + QSPReport rep; + + qsp_init(); + + rep.entries = g_new0(QSPReportEntry, max); + rep.n_entries = 0; + rep.max_n_entries = max; + + qsp_mktree(tree, callsite_coalesce); + g_tree_foreach(tree, qsp_tree_report, &rep); + g_tree_destroy(tree); + + pr_report(&rep); + report_destroy(&rep); +} + +static void qsp_snapshot_destroy(QSPSnapshot *snap) +{ + qht_iter(&snap->ht, qsp_ht_delete, NULL); + qht_destroy(&snap->ht); + g_free(snap); +} + +void qsp_reset(void) +{ + QSPSnapshot *new = g_new(QSPSnapshot, 1); + QSPSnapshot *old; + + qsp_init(); + + qht_init(&new->ht, qsp_entry_cmp, QSP_INITIAL_SIZE, + QHT_MODE_AUTO_RESIZE | QHT_MODE_RAW_MUTEXES); + + /* take a snapshot of the current state */ + qht_iter(&qsp_ht, qsp_aggregate, &new->ht); + + /* replace the previous snapshot, if any */ + old = qatomic_xchg(&qsp_snapshot, new); + if (old) { + call_rcu(old, qsp_snapshot_destroy, rcu); + } +} diff --git a/util/range.c b/util/range.c new file mode 100644 index 000000000..098d9d2dc --- /dev/null +++ b/util/range.c @@ -0,0 +1,72 @@ +/* + * QEMU 64-bit address ranges + * + * Copyright (c) 2015-2016 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/range.h"
+
+/*
+ * Return -1 if @a < @b, 1 if @a > @b, and 0 if they touch or overlap.
+ * Neither @a nor @b may be empty.
+ */
+static inline int range_compare(Range *a, Range *b)
+{
+    assert(!range_is_empty(a) && !range_is_empty(b));
+
+    /* Careful, avoid wraparound */
+    if (b->lob && b->lob - 1 > a->upb) {
+        return -1;
+    }
+    if (a->lob && a->lob - 1 > b->upb) {
+        return 1;
+    }
+    return 0;
+}
+
+/* Insert @data into @list of ranges; caller no longer owns @data */
+GList *range_list_insert(GList *list, Range *data)
+{
+    GList *l;
+
+    assert(!range_is_empty(data));
+
+    /* Skip all list elements strictly less than data */
+    for (l = list; l && range_compare(l->data, data) < 0; l = l->next) {
+    }
+
+    if (!l || range_compare(l->data, data) > 0) {
+        /* Rest of the list (if any) is strictly greater than @data */
+        return g_list_insert_before(list, l, data);
+    }
+
+    /* Current list element overlaps @data, merge the two */
+    range_extend(l->data, data);
+    g_free(data);
+
+    /* Merge any subsequent list elements that now also overlap */
+    while (l->next && range_compare(l->data, l->next->data) == 0) {
+        GList *new_l;
+
+        range_extend(l->data, l->next->data);
+        g_free(l->next->data);
+        new_l = g_list_delete_link(list, l->next);
+        assert(new_l == list);
+    }
+
+    return list;
+}
diff --git a/util/rcu.c b/util/rcu.c
new file mode 100644
index 000000000..c91da9f13
--- /dev/null
+++ b/util/rcu.c
@@ -0,0 +1,455 @@
+/*
+ * urcu-mb.c
+ *
+ * Userspace RCU library with explicit memory barriers
+ *
+ * Copyright (c) 2009 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * Copyright (c) 2009 Paul E. McKenney, IBM Corporation.
+ * Copyright 2015 Red Hat, Inc.
+ *
+ * Ported to QEMU by Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * IBM's contributions to this file may be relicensed under LGPLv2 or later.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/rcu.h"
+#include "qemu/atomic.h"
+#include "qemu/thread.h"
+#include "qemu/main-loop.h"
+#include "qemu/lockable.h"
+#if defined(CONFIG_MALLOC_TRIM)
+#include <malloc.h>
+#endif
+
+/*
+ * Global grace period counter. Bit 0 is always one in rcu_gp_ctr.
+ * Bits 1 and above are defined in synchronize_rcu.
+ */
+#define RCU_GP_LOCKED (1UL << 0)
+#define RCU_GP_CTR (1UL << 1)
+
+unsigned long rcu_gp_ctr = RCU_GP_LOCKED;
+
+QemuEvent rcu_gp_event;
+static int in_drain_call_rcu;
+static QemuMutex rcu_registry_lock;
+static QemuMutex rcu_sync_lock;
+
+/*
+ * Check whether a quiescent state was crossed between the beginning of
+ * update_counter_and_wait and now.
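+ *
+ * A reader is quiescent when its counter is 0 (it is outside any read-side
+ * critical section) or equals the current rcu_gp_ctr (its critical section
+ * started after this grace period began); both cases make the check below
+ * return false.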
+ */
+static inline int rcu_gp_ongoing(unsigned long *ctr)
+{
+    unsigned long v;
+
+    v = qatomic_read(ctr);
+    return v && (v != rcu_gp_ctr);
+}
+
+/* Written to only by each individual reader. Read by both the reader and the
+ * writers.
+ */
+__thread struct rcu_reader_data rcu_reader;
+
+/* Protected by rcu_registry_lock. */
+typedef QLIST_HEAD(, rcu_reader_data) ThreadList;
+static ThreadList registry = QLIST_HEAD_INITIALIZER(registry);
+
+/* Wait for previous parity/grace period to be empty of readers. */
+static void wait_for_readers(void)
+{
+    ThreadList qsreaders = QLIST_HEAD_INITIALIZER(qsreaders);
+    struct rcu_reader_data *index, *tmp;
+
+    for (;;) {
+        /* We want to be notified of changes made to rcu_gp_ongoing
+         * while we walk the list.
+         */
+        qemu_event_reset(&rcu_gp_event);
+
+        /* Instead of using qatomic_mb_set for index->waiting, and
+         * qatomic_mb_read for index->ctr, memory barriers are placed
+         * manually since writes to different threads are independent.
+         * qemu_event_reset has acquire semantics, so no memory barrier
+         * is needed here.
+         */
+        QLIST_FOREACH(index, &registry, node) {
+            qatomic_set(&index->waiting, true);
+        }
+
+        /* Here, order the stores to index->waiting before the loads of
+         * index->ctr. Pairs with smp_mb_placeholder() in rcu_read_unlock(),
+         * ensuring that the loads of index->ctr are sequentially consistent.
+         */
+        smp_mb_global();
+
+        QLIST_FOREACH_SAFE(index, &registry, node, tmp) {
+            if (!rcu_gp_ongoing(&index->ctr)) {
+                QLIST_REMOVE(index, node);
+                QLIST_INSERT_HEAD(&qsreaders, index, node);
+
+                /* No need for mb_set here, worst of all we
+                 * get some extra futex wakeups.
+                 */
+                qatomic_set(&index->waiting, false);
+            } else if (qatomic_read(&in_drain_call_rcu)) {
+                notifier_list_notify(&index->force_rcu, NULL);
+            }
+        }
+
+        if (QLIST_EMPTY(&registry)) {
+            break;
+        }
+
+        /* Wait for one thread to report a quiescent state and try again.
+         * Release rcu_registry_lock, so rcu_(un)register_thread() doesn't
+         * wait too much time.
+         *
+         * rcu_register_thread() may add nodes to &registry; it will not
+         * wake up synchronize_rcu, but that is okay because at least another
+         * thread must exit its RCU read-side critical section before
+         * synchronize_rcu is done. The next iteration of the loop will
+         * move the new thread's rcu_reader from &registry to &qsreaders,
+         * because rcu_gp_ongoing() will return false.
+         *
+         * rcu_unregister_thread() may remove nodes from &qsreaders instead
+         * of &registry if it runs during qemu_event_wait. That's okay;
+         * the node then will not be added back to &registry by QLIST_SWAP
+         * below. The invariant is that the node is part of one list when
+         * rcu_registry_lock is released.
+         */
+        qemu_mutex_unlock(&rcu_registry_lock);
+        qemu_event_wait(&rcu_gp_event);
+        qemu_mutex_lock(&rcu_registry_lock);
+    }
+
+    /* put back the reader list in the registry */
+    QLIST_SWAP(&registry, &qsreaders, node);
+}
+
+void synchronize_rcu(void)
+{
+    QEMU_LOCK_GUARD(&rcu_sync_lock);
+
+    /* Write RCU-protected pointers before reading p_rcu_reader->ctr.
+     * Pairs with smp_mb_placeholder() in rcu_read_lock().
+     */
+    smp_mb_global();
+
+    QEMU_LOCK_GUARD(&rcu_registry_lock);
+    if (!QLIST_EMPTY(&registry)) {
+        /* In either case, the qatomic_mb_set below blocks stores that free
+         * old RCU-protected pointers.
+         */
+        if (sizeof(rcu_gp_ctr) < 8) {
+            /* For architectures with 32-bit longs, a two-subphases algorithm
+             * ensures we do not encounter overflow bugs.
+             *
+             * Switch parity: 0 -> 1, 1 -> 0.
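+             * E.g. with RCU_GP_CTR == 2, rcu_gp_ctr toggles 0b01 -> 0b11 ->
+             * 0b01; the RCU_GP_LOCKED bit (bit 0) is never cleared.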
+             */
+            qatomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR);
+            wait_for_readers();
+            qatomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR);
+        } else {
+            /* Increment current grace period. */
+            qatomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr + RCU_GP_CTR);
+        }
+
+        wait_for_readers();
+    }
+}
+
+
+#define RCU_CALL_MIN_SIZE 30
+
+/* Multi-producer, single-consumer queue based on urcu/static/wfqueue.h
+ * from liburcu. Note that head is only used by the consumer.
+ */
+static struct rcu_head dummy;
+static struct rcu_head *head = &dummy, **tail = &dummy.next;
+static int rcu_call_count;
+static QemuEvent rcu_call_ready_event;
+
+static void enqueue(struct rcu_head *node)
+{
+    struct rcu_head **old_tail;
+
+    node->next = NULL;
+    old_tail = qatomic_xchg(&tail, &node->next);
+    qatomic_mb_set(old_tail, node);
+}
+
+static struct rcu_head *try_dequeue(void)
+{
+    struct rcu_head *node, *next;
+
+retry:
+    /* Test for an empty list, which we do not expect. Note that for
+     * the consumer head and tail are always consistent. The head
+     * is consistent because only the consumer reads/writes it.
+     * The tail is consistent because updating it is the first step
+     * in enqueuing. It is only the next pointers that might be
+     * inconsistent.
+     */
+    if (head == &dummy && qatomic_mb_read(&tail) == &dummy.next) {
+        abort();
+    }
+
+    /* If the head node has NULL in its next pointer, the value is
+     * wrong and we need to wait until its enqueuer finishes the update.
+     */
+    node = head;
+    next = qatomic_mb_read(&head->next);
+    if (!next) {
+        return NULL;
+    }
+
+    /* Since we are the sole consumer, and we excluded the empty case
+     * above, the queue will always have at least two nodes: the
+     * dummy node, and the one being removed. So we do not need to update
+     * the tail pointer.
+     */
+    head = next;
+
+    /* If we dequeued the dummy node, add it back at the end and retry. */
+    if (node == &dummy) {
+        enqueue(node);
+        goto retry;
+    }
+
+    return node;
+}
+
+static void *call_rcu_thread(void *opaque)
+{
+    struct rcu_head *node;
+
+    rcu_register_thread();
+
+    for (;;) {
+        int tries = 0;
+        int n = qatomic_read(&rcu_call_count);
+
+        /* Heuristically wait for a decent number of callbacks to pile up.
+         * Fetch rcu_call_count now; we need only process elements that were
+         * added before synchronize_rcu() starts.
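+         * (RCU_CALL_MIN_SIZE, combined with the 10 ms sleeps below, bounds
+         * how long a batch is allowed to accumulate.)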
+         */
+        while (n == 0 || (n < RCU_CALL_MIN_SIZE && ++tries <= 5)) {
+            g_usleep(10000);
+            if (n == 0) {
+                qemu_event_reset(&rcu_call_ready_event);
+                n = qatomic_read(&rcu_call_count);
+                if (n == 0) {
+#if defined(CONFIG_MALLOC_TRIM)
+                    malloc_trim(4 * 1024 * 1024);
+#endif
+                    qemu_event_wait(&rcu_call_ready_event);
+                }
+            }
+            n = qatomic_read(&rcu_call_count);
+        }
+
+        qatomic_sub(&rcu_call_count, n);
+        synchronize_rcu();
+        qemu_mutex_lock_iothread();
+        while (n > 0) {
+            node = try_dequeue();
+            while (!node) {
+                qemu_mutex_unlock_iothread();
+                qemu_event_reset(&rcu_call_ready_event);
+                node = try_dequeue();
+                if (!node) {
+                    qemu_event_wait(&rcu_call_ready_event);
+                    node = try_dequeue();
+                }
+                qemu_mutex_lock_iothread();
+            }
+
+            n--;
+            node->func(node);
+        }
+        qemu_mutex_unlock_iothread();
+    }
+    abort();
+}
+
+void call_rcu1(struct rcu_head *node, void (*func)(struct rcu_head *node))
+{
+    node->func = func;
+    enqueue(node);
+    qatomic_inc(&rcu_call_count);
+    qemu_event_set(&rcu_call_ready_event);
+}
+
+
+struct rcu_drain {
+    struct rcu_head rcu;
+    QemuEvent drain_complete_event;
+};
+
+static void drain_rcu_callback(struct rcu_head *node)
+{
+    struct rcu_drain *event = (struct rcu_drain *)node;
+    qemu_event_set(&event->drain_complete_event);
+}
+
+/*
+ * This function ensures that all pending RCU callbacks on the current
+ * thread are done executing, and drops the big QEMU lock during the wait
+ * so that the RCU thread can process the callbacks.
+ */
+
+void drain_call_rcu(void)
+{
+    struct rcu_drain rcu_drain;
+    bool locked = qemu_mutex_iothread_locked();
+
+    memset(&rcu_drain, 0, sizeof(struct rcu_drain));
+    qemu_event_init(&rcu_drain.drain_complete_event, false);
+
+    if (locked) {
+        qemu_mutex_unlock_iothread();
+    }
+
+
+    /*
+     * RCU callbacks are invoked in the same order as in which they
+     * are registered, thus we can be sure that when 'drain_rcu_callback'
+     * is called, all RCU callbacks that were registered on this thread
+     * prior to calling this function are completed.
+     *
+     * Note that since we have only one global queue of the RCU callbacks,
+     * we also end up waiting for most of the RCU callbacks that were
+     * registered on the other threads, but this is a side effect that
+     * shouldn't be assumed.
+     */
+
+    qatomic_inc(&in_drain_call_rcu);
+    call_rcu1(&rcu_drain.rcu, drain_rcu_callback);
+    qemu_event_wait(&rcu_drain.drain_complete_event);
+    qatomic_dec(&in_drain_call_rcu);
+
+    if (locked) {
+        qemu_mutex_lock_iothread();
+    }
+
+}
+
+void rcu_register_thread(void)
+{
+    assert(rcu_reader.ctr == 0);
+    qemu_mutex_lock(&rcu_registry_lock);
+    QLIST_INSERT_HEAD(&registry, &rcu_reader, node);
+    qemu_mutex_unlock(&rcu_registry_lock);
+}
+
+void rcu_unregister_thread(void)
+{
+    qemu_mutex_lock(&rcu_registry_lock);
+    QLIST_REMOVE(&rcu_reader, node);
+    qemu_mutex_unlock(&rcu_registry_lock);
+}
+
+void rcu_add_force_rcu_notifier(Notifier *n)
+{
+    qemu_mutex_lock(&rcu_registry_lock);
+    notifier_list_add(&rcu_reader.force_rcu, n);
+    qemu_mutex_unlock(&rcu_registry_lock);
+}
+
+void rcu_remove_force_rcu_notifier(Notifier *n)
+{
+    qemu_mutex_lock(&rcu_registry_lock);
+    notifier_remove(n);
+    qemu_mutex_unlock(&rcu_registry_lock);
+}
+
+static void rcu_init_complete(void)
+{
+    QemuThread thread;
+
+    qemu_mutex_init(&rcu_registry_lock);
+    qemu_mutex_init(&rcu_sync_lock);
+    qemu_event_init(&rcu_gp_event, true);
+
+    qemu_event_init(&rcu_call_ready_event, false);
+
+    /* The caller is assumed to have iothread lock, so the call_rcu thread
+     * must have been quiescent even after forking, just recreate it.
+     */
+    qemu_thread_create(&thread, "call_rcu", call_rcu_thread,
+                       NULL, QEMU_THREAD_DETACHED);
+
+    rcu_register_thread();
+}
+
+static int atfork_depth = 1;
+
+void rcu_enable_atfork(void)
+{
+    atfork_depth++;
+}
+
+void rcu_disable_atfork(void)
+{
+    atfork_depth--;
+}
+
+#ifdef CONFIG_POSIX
+static void rcu_init_lock(void)
+{
+    if (atfork_depth < 1) {
+        return;
+    }
+
+    qemu_mutex_lock(&rcu_sync_lock);
+    qemu_mutex_lock(&rcu_registry_lock);
+}
+
+static void rcu_init_unlock(void)
+{
+    if (atfork_depth < 1) {
+        return;
+    }
+
+    qemu_mutex_unlock(&rcu_registry_lock);
+    qemu_mutex_unlock(&rcu_sync_lock);
+}
+
+static void rcu_init_child(void)
+{
+    if (atfork_depth < 1) {
+        return;
+    }
+
+    memset(&registry, 0, sizeof(registry));
+    rcu_init_complete();
+}
+#endif
+
+static void __attribute__((__constructor__)) rcu_init(void)
+{
+    smp_mb_global_init();
+#ifdef CONFIG_POSIX
+    pthread_atfork(rcu_init_lock, rcu_init_unlock, rcu_init_child);
+#endif
+    rcu_init_complete();
+}
diff --git a/util/readline.c b/util/readline.c
new file mode 100644
index 000000000..f1ac6e476
--- /dev/null
+++ b/util/readline.c
@@ -0,0 +1,549 @@
+/*
+ * QEMU readline utility
+ *
+ * Copyright (c) 2003-2004 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */ + +#include "qemu/osdep.h" +#include "qemu/readline.h" +#include "qemu/ctype.h" +#include "qemu/cutils.h" + +#define IS_NORM 0 +#define IS_ESC 1 +#define IS_CSI 2 +#define IS_SS3 3 + +void readline_show_prompt(ReadLineState *rs) +{ + rs->printf_func(rs->opaque, "%s", rs->prompt); + rs->flush_func(rs->opaque); + rs->last_cmd_buf_index = 0; + rs->last_cmd_buf_size = 0; + rs->esc_state = IS_NORM; +} + +/* update the displayed command line */ +static void readline_update(ReadLineState *rs) +{ + int i, delta, len; + + if (rs->cmd_buf_size != rs->last_cmd_buf_size || + memcmp(rs->cmd_buf, rs->last_cmd_buf, rs->cmd_buf_size) != 0) { + for (i = 0; i < rs->last_cmd_buf_index; i++) { + rs->printf_func(rs->opaque, "\033[D"); + } + rs->cmd_buf[rs->cmd_buf_size] = '\0'; + if (rs->read_password) { + len = strlen(rs->cmd_buf); + for (i = 0; i < len; i++) { + rs->printf_func(rs->opaque, "*"); + } + } else { + rs->printf_func(rs->opaque, "%s", rs->cmd_buf); + } + rs->printf_func(rs->opaque, "\033[K"); + memcpy(rs->last_cmd_buf, rs->cmd_buf, rs->cmd_buf_size); + rs->last_cmd_buf_size = rs->cmd_buf_size; + rs->last_cmd_buf_index = rs->cmd_buf_size; + } + if (rs->cmd_buf_index != rs->last_cmd_buf_index) { + delta = rs->cmd_buf_index - rs->last_cmd_buf_index; + if (delta > 0) { + for (i = 0; i < delta; i++) { + rs->printf_func(rs->opaque, "\033[C"); + } + } else { + delta = -delta; + for (i = 0; i < delta; i++) { + rs->printf_func(rs->opaque, "\033[D"); + } + } + rs->last_cmd_buf_index = rs->cmd_buf_index; + } + rs->flush_func(rs->opaque); +} + +static void readline_insert_char(ReadLineState *rs, int ch) +{ + if (rs->cmd_buf_index < READLINE_CMD_BUF_SIZE) { + memmove(rs->cmd_buf + rs->cmd_buf_index + 1, + rs->cmd_buf + rs->cmd_buf_index, + rs->cmd_buf_size - rs->cmd_buf_index); + rs->cmd_buf[rs->cmd_buf_index] = ch; + rs->cmd_buf_size++; + rs->cmd_buf_index++; + } +} + +static void readline_backward_char(ReadLineState *rs) +{ + if (rs->cmd_buf_index > 0) { + rs->cmd_buf_index--; + } +} + +static void readline_forward_char(ReadLineState *rs) +{ + if (rs->cmd_buf_index < rs->cmd_buf_size) { + rs->cmd_buf_index++; + } +} + +static void readline_delete_char(ReadLineState *rs) +{ + if (rs->cmd_buf_index < rs->cmd_buf_size) { + memmove(rs->cmd_buf + rs->cmd_buf_index, + rs->cmd_buf + rs->cmd_buf_index + 1, + rs->cmd_buf_size - rs->cmd_buf_index - 1); + rs->cmd_buf_size--; + } +} + +static void readline_backspace(ReadLineState *rs) +{ + if (rs->cmd_buf_index > 0) { + readline_backward_char(rs); + readline_delete_char(rs); + } +} + +static void readline_backword(ReadLineState *rs) +{ + int start; + + if (rs->cmd_buf_index == 0 || rs->cmd_buf_index > rs->cmd_buf_size) { + return; + } + + start = rs->cmd_buf_index - 1; + + /* find first word (backwards) */ + while (start > 0) { + if (!qemu_isspace(rs->cmd_buf[start])) { + break; + } + + --start; + } + + /* find first space (backwards) */ + while (start > 0) { + if (qemu_isspace(rs->cmd_buf[start])) { + ++start; + break; + } + + --start; + } + + /* remove word */ + if (start < rs->cmd_buf_index) { + memmove(rs->cmd_buf + start, + rs->cmd_buf + rs->cmd_buf_index, + rs->cmd_buf_size - rs->cmd_buf_index); + rs->cmd_buf_size -= rs->cmd_buf_index - start; + rs->cmd_buf_index = start; + } +} + +static void readline_bol(ReadLineState *rs) +{ + rs->cmd_buf_index = 0; +} + +static void readline_eol(ReadLineState *rs) +{ + rs->cmd_buf_index = rs->cmd_buf_size; +} + +static void readline_up_char(ReadLineState *rs) +{ + int idx; + + if (rs->hist_entry == 0) { + return; + } + if 
(rs->hist_entry == -1) { + /* Find latest entry */ + for (idx = 0; idx < READLINE_MAX_CMDS; idx++) { + if (rs->history[idx] == NULL) { + break; + } + } + rs->hist_entry = idx; + } + rs->hist_entry--; + if (rs->hist_entry >= 0) { + pstrcpy(rs->cmd_buf, sizeof(rs->cmd_buf), + rs->history[rs->hist_entry]); + rs->cmd_buf_index = rs->cmd_buf_size = strlen(rs->cmd_buf); + } +} + +static void readline_down_char(ReadLineState *rs) +{ + if (rs->hist_entry == -1) { + return; + } + if (rs->hist_entry < READLINE_MAX_CMDS - 1 && + rs->history[++rs->hist_entry] != NULL) { + pstrcpy(rs->cmd_buf, sizeof(rs->cmd_buf), + rs->history[rs->hist_entry]); + } else { + rs->cmd_buf[0] = 0; + rs->hist_entry = -1; + } + rs->cmd_buf_index = rs->cmd_buf_size = strlen(rs->cmd_buf); +} + +static void readline_hist_add(ReadLineState *rs, const char *cmdline) +{ + char *hist_entry, *new_entry; + int idx; + + if (cmdline[0] == '\0') { + return; + } + new_entry = NULL; + if (rs->hist_entry != -1) { + /* We were editing an existing history entry: replace it */ + hist_entry = rs->history[rs->hist_entry]; + idx = rs->hist_entry; + if (strcmp(hist_entry, cmdline) == 0) { + goto same_entry; + } + } + /* Search cmdline in history buffers */ + for (idx = 0; idx < READLINE_MAX_CMDS; idx++) { + hist_entry = rs->history[idx]; + if (hist_entry == NULL) { + break; + } + if (strcmp(hist_entry, cmdline) == 0) { + same_entry: + if (idx == READLINE_MAX_CMDS - 1) { + return; + } + new_entry = hist_entry; + /* Put this entry at the end of history */ + memmove(&rs->history[idx], &rs->history[idx + 1], + (READLINE_MAX_CMDS - (idx + 1)) * sizeof(char *)); + rs->history[READLINE_MAX_CMDS - 1] = NULL; + for (; idx < READLINE_MAX_CMDS; idx++) { + if (rs->history[idx] == NULL) { + break; + } + } + break; + } + } + if (idx == READLINE_MAX_CMDS) { + /* Need to get one free slot */ + g_free(rs->history[0]); + memmove(rs->history, &rs->history[1], + (READLINE_MAX_CMDS - 1) * sizeof(char *)); + rs->history[READLINE_MAX_CMDS - 1] = NULL; + idx = READLINE_MAX_CMDS - 1; + } + if (new_entry == NULL) { + new_entry = g_strdup(cmdline); + } + rs->history[idx] = new_entry; + rs->hist_entry = -1; +} + +/* completion support */ + +void readline_add_completion(ReadLineState *rs, const char *str) +{ + if (rs->nb_completions < READLINE_MAX_COMPLETIONS) { + int i; + for (i = 0; i < rs->nb_completions; i++) { + if (!strcmp(rs->completions[i], str)) { + return; + } + } + rs->completions[rs->nb_completions++] = g_strdup(str); + } +} + +void readline_set_completion_index(ReadLineState *rs, int index) +{ + rs->completion_index = index; +} + +static int completion_comp(const void *a, const void *b) +{ + return strcmp(*(const char **) a, *(const char **) b); +} + +static void readline_completion(ReadLineState *rs) +{ + int len, i, j, max_width, nb_cols, max_prefix; + char *cmdline; + + rs->nb_completions = 0; + + cmdline = g_strndup(rs->cmd_buf, rs->cmd_buf_index); + rs->completion_finder(rs->opaque, cmdline); + g_free(cmdline); + + /* no completion found */ + if (rs->nb_completions <= 0) { + return; + } + if (rs->nb_completions == 1) { + len = strlen(rs->completions[0]); + for (i = rs->completion_index; i < len; i++) { + readline_insert_char(rs, rs->completions[0][i]); + } + /* extra space for next argument. 
XXX: make it more generic */
+        if (len > 0 && rs->completions[0][len - 1] != '/') {
+            readline_insert_char(rs, ' ');
+        }
+    } else {
+        qsort(rs->completions, rs->nb_completions, sizeof(char *),
+              completion_comp);
+        rs->printf_func(rs->opaque, "\n");
+        max_width = 0;
+        max_prefix = 0;
+        for (i = 0; i < rs->nb_completions; i++) {
+            len = strlen(rs->completions[i]);
+            if (i == 0) {
+                max_prefix = len;
+            } else {
+                if (len < max_prefix) {
+                    max_prefix = len;
+                }
+                for (j = 0; j < max_prefix; j++) {
+                    if (rs->completions[i][j] != rs->completions[0][j]) {
+                        max_prefix = j;
+                    }
+                }
+            }
+            if (len > max_width) {
+                max_width = len;
+            }
+        }
+        if (max_prefix > 0) {
+            for (i = rs->completion_index; i < max_prefix; i++) {
+                readline_insert_char(rs, rs->completions[0][i]);
+            }
+        }
+        max_width += 2;
+        if (max_width < 10) {
+            max_width = 10;
+        } else if (max_width > 80) {
+            max_width = 80;
+        }
+        nb_cols = 80 / max_width;
+        j = 0;
+        for (i = 0; i < rs->nb_completions; i++) {
+            rs->printf_func(rs->opaque, "%-*s", max_width, rs->completions[i]);
+            if (++j == nb_cols || i == (rs->nb_completions - 1)) {
+                rs->printf_func(rs->opaque, "\n");
+                j = 0;
+            }
+        }
+        readline_show_prompt(rs);
+    }
+    for (i = 0; i < rs->nb_completions; i++) {
+        g_free(rs->completions[i]);
+    }
+}
+
+static void readline_clear_screen(ReadLineState *rs)
+{
+    rs->printf_func(rs->opaque, "\033[2J\033[1;1H");
+    readline_show_prompt(rs);
+}
+
+/* handle one byte of terminal input */
+void readline_handle_byte(ReadLineState *rs, int ch)
+{
+    switch (rs->esc_state) {
+    case IS_NORM:
+        switch (ch) {
+        case 1:         /* ^A */
+            readline_bol(rs);
+            break;
+        case 4:         /* ^D */
+            readline_delete_char(rs);
+            break;
+        case 5:         /* ^E */
+            readline_eol(rs);
+            break;
+        case 9:         /* TAB */
+            readline_completion(rs);
+            break;
+        case 12:        /* ^L */
+            readline_clear_screen(rs);
+            break;
+        case 10:        /* LF */
+        case 13:        /* CR */
+            rs->cmd_buf[rs->cmd_buf_size] = '\0';
+            if (!rs->read_password) {
+                readline_hist_add(rs, rs->cmd_buf);
+            }
+            rs->printf_func(rs->opaque, "\n");
+            rs->cmd_buf_index = 0;
+            rs->cmd_buf_size = 0;
+            rs->last_cmd_buf_index = 0;
+            rs->last_cmd_buf_size = 0;
+            rs->readline_func(rs->opaque, rs->cmd_buf, rs->readline_opaque);
+            break;
+        case 23:        /* ^W */
+            readline_backword(rs);
+            break;
+        case 27:        /* ESC */
+            rs->esc_state = IS_ESC;
+            break;
+        case 127:       /* DEL */
+        case 8:         /* BS */
+            readline_backspace(rs);
+            break;
+        case 155:       /* 8-bit CSI */
+            rs->esc_state = IS_CSI;
+            break;
+        default:
+            if (ch >= 32) {
+                readline_insert_char(rs, ch);
+            }
+            break;
+        }
+        break;
+    case IS_ESC:
+        if (ch == '[') {
+            rs->esc_state = IS_CSI;
+            rs->esc_param = 0;
+        } else if (ch == 'O') {
+            rs->esc_state = IS_SS3;
+            rs->esc_param = 0;
+        } else {
+            rs->esc_state = IS_NORM;
+        }
+        break;
+    case IS_CSI:
+        switch (ch) {
+        case 'A':
+        case 'F':
+            readline_up_char(rs);
+            break;
+        case 'B':
+        case 'E':
+            readline_down_char(rs);
+            break;
+        case 'D':
+            readline_backward_char(rs);
+            break;
+        case 'C':
+            readline_forward_char(rs);
+            break;
+        case '0' ...
'9': + rs->esc_param = rs->esc_param * 10 + (ch - '0'); + goto the_end; + case '~': + switch (rs->esc_param) { + case 1: + readline_bol(rs); + break; + case 3: + readline_delete_char(rs); + break; + case 4: + readline_eol(rs); + break; + } + break; + default: + break; + } + rs->esc_state = IS_NORM; + the_end: + break; + case IS_SS3: + switch (ch) { + case 'F': + readline_eol(rs); + break; + case 'H': + readline_bol(rs); + break; + } + rs->esc_state = IS_NORM; + break; + } + readline_update(rs); +} + +void readline_start(ReadLineState *rs, const char *prompt, int read_password, + ReadLineFunc *readline_func, void *opaque) +{ + pstrcpy(rs->prompt, sizeof(rs->prompt), prompt); + rs->readline_func = readline_func; + rs->readline_opaque = opaque; + rs->read_password = read_password; + readline_restart(rs); +} + +void readline_restart(ReadLineState *rs) +{ + rs->cmd_buf_index = 0; + rs->cmd_buf_size = 0; +} + +const char *readline_get_history(ReadLineState *rs, unsigned int index) +{ + if (index >= READLINE_MAX_CMDS) { + return NULL; + } + return rs->history[index]; +} + +void readline_free(ReadLineState *rs) +{ + int i; + + if (!rs) { + return; + } + for (i = 0; i < READLINE_MAX_CMDS; i++) { + g_free(rs->history[i]); + } + g_free(rs); +} + +ReadLineState *readline_init(ReadLinePrintfFunc *printf_func, + ReadLineFlushFunc *flush_func, + void *opaque, + ReadLineCompletionFunc *completion_finder) +{ + ReadLineState *rs = g_new0(ReadLineState, 1); + + rs->hist_entry = -1; + rs->opaque = opaque; + rs->printf_func = printf_func; + rs->flush_func = flush_func; + rs->completion_finder = completion_finder; + + return rs; +} diff --git a/util/selfmap.c b/util/selfmap.c new file mode 100644 index 000000000..2c14f019c --- /dev/null +++ b/util/selfmap.c @@ -0,0 +1,83 @@ +/* + * Utility function to get QEMU's own process map + * + * Copyright (c) 2020 Linaro Ltd + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" +#include "qemu/selfmap.h" + +GSList *read_self_maps(void) +{ + gchar *maps; + GSList *map_info = NULL; + + if (g_file_get_contents("/proc/self/maps", &maps, NULL, NULL)) { + gchar **lines = g_strsplit(maps, "\n", 0); + int i, entries = g_strv_length(lines); + + for (i = 0; i < entries; i++) { + gchar **fields = g_strsplit(lines[i], " ", 6); + if (g_strv_length(fields) > 4) { + MapInfo *e = g_new0(MapInfo, 1); + int errors = 0; + const char *end; + + errors |= qemu_strtoul(fields[0], &end, 16, &e->start); + errors |= qemu_strtoul(end + 1, NULL, 16, &e->end); + + e->is_read = fields[1][0] == 'r'; + e->is_write = fields[1][1] == 'w'; + e->is_exec = fields[1][2] == 'x'; + e->is_priv = fields[1][3] == 'p'; + + errors |= qemu_strtoul(fields[2], NULL, 16, &e->offset); + e->dev = g_strdup(fields[3]); + errors |= qemu_strtou64(fields[4], NULL, 10, &e->inode); + + if (!errors) { + /* + * The last field may have leading spaces which we + * need to strip. + */ + if (g_strv_length(fields) == 6) { + e->path = g_strdup(g_strchug(fields[5])); + } + map_info = g_slist_prepend(map_info, e); + } else { + g_free(e->dev); + g_free(e); + } + } + + g_strfreev(fields); + } + g_strfreev(lines); + g_free(maps); + } + + /* ensure the map data is in the same order we collected it */ + return g_slist_reverse(map_info); +} + +/** + * free_self_maps: + * @info: a GSlist + * + * Free a list of MapInfo structures. 
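+ *
+ * Typical pairing with read_self_maps() above (illustrative sketch):
+ *
+ *     GSList *maps = read_self_maps();
+ *     for (GSList *it = maps; it; it = it->next) {
+ *         MapInfo *e = it->data;
+ *         // inspect e->start, e->end, e->is_exec, e->path, ...
+ *     }
+ *     free_self_maps(maps);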
+ */ +static void free_info(gpointer data) +{ + MapInfo *e = (MapInfo *) data; + g_free(e->dev); + g_free(e->path); + g_free(e); +} + +void free_self_maps(GSList *info) +{ + g_slist_free_full(info, &free_info); +} diff --git a/util/stats64.c b/util/stats64.c new file mode 100644 index 000000000..897613c94 --- /dev/null +++ b/util/stats64.c @@ -0,0 +1,137 @@ +/* + * Atomic operations on 64-bit quantities. + * + * Copyright (C) 2017 Red Hat, Inc. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/atomic.h" +#include "qemu/stats64.h" +#include "qemu/processor.h" + +#ifndef CONFIG_ATOMIC64 +static inline void stat64_rdlock(Stat64 *s) +{ + /* Keep out incoming writers to avoid them starving us. */ + qatomic_add(&s->lock, 2); + + /* If there is a concurrent writer, wait for it. */ + while (qatomic_read(&s->lock) & 1) { + cpu_relax(); + } +} + +static inline void stat64_rdunlock(Stat64 *s) +{ + qatomic_sub(&s->lock, 2); +} + +static inline bool stat64_wrtrylock(Stat64 *s) +{ + return qatomic_cmpxchg(&s->lock, 0, 1) == 0; +} + +static inline void stat64_wrunlock(Stat64 *s) +{ + qatomic_dec(&s->lock); +} + +uint64_t stat64_get(const Stat64 *s) +{ + uint32_t high, low; + + stat64_rdlock((Stat64 *)s); + + /* 64-bit writes always take the lock, so we can read in + * any order. + */ + high = qatomic_read(&s->high); + low = qatomic_read(&s->low); + stat64_rdunlock((Stat64 *)s); + + return ((uint64_t)high << 32) | low; +} + +bool stat64_add32_carry(Stat64 *s, uint32_t low, uint32_t high) +{ + uint32_t old; + + if (!stat64_wrtrylock(s)) { + cpu_relax(); + return false; + } + + /* 64-bit reads always take the lock, so they don't care about the + * order of our update. By updating s->low first, we can check + * whether we have to carry into s->high. + */ + old = qatomic_fetch_add(&s->low, low); + high += (old + low) < old; + qatomic_add(&s->high, high); + stat64_wrunlock(s); + return true; +} + +bool stat64_min_slow(Stat64 *s, uint64_t value) +{ + uint32_t high, low; + uint64_t orig; + + if (!stat64_wrtrylock(s)) { + cpu_relax(); + return false; + } + + high = qatomic_read(&s->high); + low = qatomic_read(&s->low); + + orig = ((uint64_t)high << 32) | low; + if (value < orig) { + /* We have to set low before high, just like stat64_min reads + * high before low. The value may become higher temporarily, but + * stat64_get does not notice (it takes the lock) and the only ill + * effect on stat64_min is that the slow path may be triggered + * unnecessarily. + */ + qatomic_set(&s->low, (uint32_t)value); + smp_wmb(); + qatomic_set(&s->high, value >> 32); + } + stat64_wrunlock(s); + return true; +} + +bool stat64_max_slow(Stat64 *s, uint64_t value) +{ + uint32_t high, low; + uint64_t orig; + + if (!stat64_wrtrylock(s)) { + cpu_relax(); + return false; + } + + high = qatomic_read(&s->high); + low = qatomic_read(&s->low); + + orig = ((uint64_t)high << 32) | low; + if (value > orig) { + /* We have to set low before high, just like stat64_max reads + * high before low. The value may become lower temporarily, but + * stat64_get does not notice (it takes the lock) and the only ill + * effect on stat64_max is that the slow path may be triggered + * unnecessarily. 
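+         *
+         * Worked example (illustrative): raising the maximum from
+         * 0xFFFFFFFF to 0x1_0000_0000 stores low = 0 first, so a
+         * lockless fast-path reader may briefly observe the value 0
+         * until high = 1 is stored as well; the worst it does about
+         * that is take this slow path once more.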
+ */ + qatomic_set(&s->low, (uint32_t)value); + smp_wmb(); + qatomic_set(&s->high, value >> 32); + } + stat64_wrunlock(s); + return true; +} +#endif diff --git a/util/sys_membarrier.c b/util/sys_membarrier.c new file mode 100644 index 000000000..1362c0c4c --- /dev/null +++ b/util/sys_membarrier.c @@ -0,0 +1,50 @@ +/* + * Process-global memory barriers + * + * Copyright (c) 2018 Red Hat, Inc. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + */ + +#include "qemu/osdep.h" +#include "qemu/sys_membarrier.h" +#include "qemu/error-report.h" + +#ifdef CONFIG_LINUX +#include <linux/membarrier.h> +#include <sys/syscall.h> + +static int +membarrier(int cmd, int flags) +{ + return syscall(__NR_membarrier, cmd, flags); +} +#endif + +void smp_mb_global(void) +{ +#if defined CONFIG_WIN32 + FlushProcessWriteBuffers(); +#elif defined CONFIG_LINUX + membarrier(MEMBARRIER_CMD_SHARED, 0); +#else +#error --enable-membarrier is not supported on this operating system. +#endif +} + +void smp_mb_global_init(void) +{ +#ifdef CONFIG_LINUX + int ret = membarrier(MEMBARRIER_CMD_QUERY, 0); + if (ret < 0) { + error_report("This QEMU binary requires the membarrier system call."); + error_report("Please upgrade your system to a newer version of Linux"); + exit(1); + } + if (!(ret & MEMBARRIER_CMD_SHARED)) { + error_report("This QEMU binary requires MEMBARRIER_CMD_SHARED support."); + error_report("Please upgrade your system to a newer version of Linux"); + exit(1); + } +#endif +} diff --git a/util/systemd.c b/util/systemd.c new file mode 100644 index 000000000..5bcac9b40 --- /dev/null +++ b/util/systemd.c @@ -0,0 +1,79 @@ +/* + * systemd socket activation support + * + * Copyright 2017 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Richard W.M. Jones <rjones@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/systemd.h" +#include "qemu/cutils.h" +#include "qemu/error-report.h" + +#ifndef _WIN32 +unsigned int check_socket_activation(void) +{ + const char *s; + unsigned long pid; + unsigned long nr_fds; + unsigned int i; + int fd; + int f; + int err; + + s = getenv("LISTEN_PID"); + if (s == NULL) { + return 0; + } + err = qemu_strtoul(s, NULL, 10, &pid); + if (err) { + return 0; + } + if (pid != getpid()) { + return 0; + } + + s = getenv("LISTEN_FDS"); + if (s == NULL) { + return 0; + } + err = qemu_strtoul(s, NULL, 10, &nr_fds); + if (err) { + return 0; + } + assert(nr_fds <= UINT_MAX); + + /* So these are not passed to any child processes we might start. */ + unsetenv("LISTEN_FDS"); + unsetenv("LISTEN_PID"); + + /* So the file descriptors don't leak into child processes. */ + for (i = 0; i < nr_fds; ++i) { + fd = FIRST_SOCKET_ACTIVATION_FD + i; + f = fcntl(fd, F_GETFD); + if (f == -1 || fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1) { + /* If we cannot set FD_CLOEXEC then it probably means the file + * descriptor is invalid, so socket activation has gone wrong + * and we should exit. 
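+             *
+             * (systemd hands the activated sockets to us as consecutive
+             * descriptors starting at FIRST_SOCKET_ACTIVATION_FD, i.e.
+             * fd 3, per the sd_listen_fds(3) protocol, so every fd in
+             * this range should be valid.)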
+ */ + error_report("Socket activation failed: " + "invalid file descriptor fd = %d: %s", + fd, g_strerror(errno)); + exit(EXIT_FAILURE); + } + } + + return (unsigned int) nr_fds; +} + +#else /* !_WIN32 */ +unsigned int check_socket_activation(void) +{ + return 0; +} +#endif diff --git a/util/thread-pool.c b/util/thread-pool.c new file mode 100644 index 000000000..d763cea50 --- /dev/null +++ b/util/thread-pool.c @@ -0,0 +1,352 @@ +/* + * QEMU block layer thread pool + * + * Copyright IBM, Corp. 2008 + * Copyright Red Hat, Inc. 2012 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ +#include "qemu/osdep.h" +#include "qemu/queue.h" +#include "qemu/thread.h" +#include "qemu/coroutine.h" +#include "trace.h" +#include "block/thread-pool.h" +#include "qemu/main-loop.h" + +static void do_spawn_thread(ThreadPool *pool); + +typedef struct ThreadPoolElement ThreadPoolElement; + +enum ThreadState { + THREAD_QUEUED, + THREAD_ACTIVE, + THREAD_DONE, +}; + +struct ThreadPoolElement { + BlockAIOCB common; + ThreadPool *pool; + ThreadPoolFunc *func; + void *arg; + + /* Moving state out of THREAD_QUEUED is protected by lock. After + * that, only the worker thread can write to it. Reads and writes + * of state and ret are ordered with memory barriers. + */ + enum ThreadState state; + int ret; + + /* Access to this list is protected by lock. */ + QTAILQ_ENTRY(ThreadPoolElement) reqs; + + /* Access to this list is protected by the global mutex. */ + QLIST_ENTRY(ThreadPoolElement) all; +}; + +struct ThreadPool { + AioContext *ctx; + QEMUBH *completion_bh; + QemuMutex lock; + QemuCond worker_stopped; + QemuSemaphore sem; + int max_threads; + QEMUBH *new_thread_bh; + + /* The following variables are only accessed from one AioContext. */ + QLIST_HEAD(, ThreadPoolElement) head; + + /* The following variables are protected by lock. */ + QTAILQ_HEAD(, ThreadPoolElement) request_list; + int cur_threads; + int idle_threads; + int new_threads; /* backlog of threads we need to create */ + int pending_threads; /* threads created but not running yet */ + bool stopping; +}; + +static void *worker_thread(void *opaque) +{ + ThreadPool *pool = opaque; + + qemu_mutex_lock(&pool->lock); + pool->pending_threads--; + do_spawn_thread(pool); + + while (!pool->stopping) { + ThreadPoolElement *req; + int ret; + + do { + pool->idle_threads++; + qemu_mutex_unlock(&pool->lock); + ret = qemu_sem_timedwait(&pool->sem, 10000); + qemu_mutex_lock(&pool->lock); + pool->idle_threads--; + } while (ret == -1 && !QTAILQ_EMPTY(&pool->request_list)); + if (ret == -1 || pool->stopping) { + break; + } + + req = QTAILQ_FIRST(&pool->request_list); + QTAILQ_REMOVE(&pool->request_list, req, reqs); + req->state = THREAD_ACTIVE; + qemu_mutex_unlock(&pool->lock); + + ret = req->func(req->arg); + + req->ret = ret; + /* Write ret before state. */ + smp_wmb(); + req->state = THREAD_DONE; + + qemu_mutex_lock(&pool->lock); + + qemu_bh_schedule(pool->completion_bh); + } + + pool->cur_threads--; + qemu_cond_signal(&pool->worker_stopped); + qemu_mutex_unlock(&pool->lock); + return NULL; +} + +static void do_spawn_thread(ThreadPool *pool) +{ + QemuThread t; + + /* Runs with lock taken. 
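+     * (pool->lock, that is: both callers, worker_thread() and
+     * spawn_thread_bh_fn(), hold it around this call.)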
*/
+    if (!pool->new_threads) {
+        return;
+    }
+
+    pool->new_threads--;
+    pool->pending_threads++;
+
+    qemu_thread_create(&t, "worker", worker_thread, pool, QEMU_THREAD_DETACHED);
+}
+
+static void spawn_thread_bh_fn(void *opaque)
+{
+    ThreadPool *pool = opaque;
+
+    qemu_mutex_lock(&pool->lock);
+    do_spawn_thread(pool);
+    qemu_mutex_unlock(&pool->lock);
+}
+
+static void spawn_thread(ThreadPool *pool)
+{
+    pool->cur_threads++;
+    pool->new_threads++;
+    /* If there are threads being created, they will spawn new workers, so
+     * we don't spend time creating many threads in a loop holding a mutex or
+     * starving the current vcpu.
+     *
+     * If there are no idle threads, ask the main thread to create one, so we
+     * inherit the correct affinity instead of the vcpu affinity.
+     */
+    if (!pool->pending_threads) {
+        qemu_bh_schedule(pool->new_thread_bh);
+    }
+}
+
+static void thread_pool_completion_bh(void *opaque)
+{
+    ThreadPool *pool = opaque;
+    ThreadPoolElement *elem, *next;
+
+    aio_context_acquire(pool->ctx);
+restart:
+    QLIST_FOREACH_SAFE(elem, &pool->head, all, next) {
+        if (elem->state != THREAD_DONE) {
+            continue;
+        }
+
+        trace_thread_pool_complete(pool, elem, elem->common.opaque,
+                                   elem->ret);
+        QLIST_REMOVE(elem, all);
+
+        if (elem->common.cb) {
+            /* Read state before ret. */
+            smp_rmb();
+
+            /* Schedule ourselves in case elem->common.cb() calls aio_poll() to
+             * wait for another request that completed at the same time.
+             */
+            qemu_bh_schedule(pool->completion_bh);
+
+            aio_context_release(pool->ctx);
+            elem->common.cb(elem->common.opaque, elem->ret);
+            aio_context_acquire(pool->ctx);
+
+            /* We can safely cancel the completion_bh here regardless of someone
+             * else having scheduled it meanwhile because we reenter the
+             * completion function anyway (goto restart).
+             */
+            qemu_bh_cancel(pool->completion_bh);
+
+            qemu_aio_unref(elem);
+            goto restart;
+        } else {
+            qemu_aio_unref(elem);
+        }
+    }
+    aio_context_release(pool->ctx);
+}
+
+static void thread_pool_cancel(BlockAIOCB *acb)
+{
+    ThreadPoolElement *elem = (ThreadPoolElement *)acb;
+    ThreadPool *pool = elem->pool;
+
+    trace_thread_pool_cancel(elem, elem->common.opaque);
+
+    QEMU_LOCK_GUARD(&pool->lock);
+    if (elem->state == THREAD_QUEUED &&
+        /* No thread has yet started working on elem, so we can try to
+         * "steal" the item from the worker if we can get a signal from the
+         * semaphore.  Because this is non-blocking, we can do it with
+         * the lock taken and ensure that elem will remain THREAD_QUEUED.
+ */ + qemu_sem_timedwait(&pool->sem, 0) == 0) { + QTAILQ_REMOVE(&pool->request_list, elem, reqs); + qemu_bh_schedule(pool->completion_bh); + + elem->state = THREAD_DONE; + elem->ret = -ECANCELED; + } + +} + +static AioContext *thread_pool_get_aio_context(BlockAIOCB *acb) +{ + ThreadPoolElement *elem = (ThreadPoolElement *)acb; + ThreadPool *pool = elem->pool; + return pool->ctx; +} + +static const AIOCBInfo thread_pool_aiocb_info = { + .aiocb_size = sizeof(ThreadPoolElement), + .cancel_async = thread_pool_cancel, + .get_aio_context = thread_pool_get_aio_context, +}; + +BlockAIOCB *thread_pool_submit_aio(ThreadPool *pool, + ThreadPoolFunc *func, void *arg, + BlockCompletionFunc *cb, void *opaque) +{ + ThreadPoolElement *req; + + req = qemu_aio_get(&thread_pool_aiocb_info, NULL, cb, opaque); + req->func = func; + req->arg = arg; + req->state = THREAD_QUEUED; + req->pool = pool; + + QLIST_INSERT_HEAD(&pool->head, req, all); + + trace_thread_pool_submit(pool, req, arg); + + qemu_mutex_lock(&pool->lock); + if (pool->idle_threads == 0 && pool->cur_threads < pool->max_threads) { + spawn_thread(pool); + } + QTAILQ_INSERT_TAIL(&pool->request_list, req, reqs); + qemu_mutex_unlock(&pool->lock); + qemu_sem_post(&pool->sem); + return &req->common; +} + +typedef struct ThreadPoolCo { + Coroutine *co; + int ret; +} ThreadPoolCo; + +static void thread_pool_co_cb(void *opaque, int ret) +{ + ThreadPoolCo *co = opaque; + + co->ret = ret; + aio_co_wake(co->co); +} + +int coroutine_fn thread_pool_submit_co(ThreadPool *pool, ThreadPoolFunc *func, + void *arg) +{ + ThreadPoolCo tpc = { .co = qemu_coroutine_self(), .ret = -EINPROGRESS }; + assert(qemu_in_coroutine()); + thread_pool_submit_aio(pool, func, arg, thread_pool_co_cb, &tpc); + qemu_coroutine_yield(); + return tpc.ret; +} + +void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg) +{ + thread_pool_submit_aio(pool, func, arg, NULL, NULL); +} + +static void thread_pool_init_one(ThreadPool *pool, AioContext *ctx) +{ + if (!ctx) { + ctx = qemu_get_aio_context(); + } + + memset(pool, 0, sizeof(*pool)); + pool->ctx = ctx; + pool->completion_bh = aio_bh_new(ctx, thread_pool_completion_bh, pool); + qemu_mutex_init(&pool->lock); + qemu_cond_init(&pool->worker_stopped); + qemu_sem_init(&pool->sem, 0); + pool->max_threads = 64; + pool->new_thread_bh = aio_bh_new(ctx, spawn_thread_bh_fn, pool); + + QLIST_INIT(&pool->head); + QTAILQ_INIT(&pool->request_list); +} + +ThreadPool *thread_pool_new(AioContext *ctx) +{ + ThreadPool *pool = g_new(ThreadPool, 1); + thread_pool_init_one(pool, ctx); + return pool; +} + +void thread_pool_free(ThreadPool *pool) +{ + if (!pool) { + return; + } + + assert(QLIST_EMPTY(&pool->head)); + + qemu_mutex_lock(&pool->lock); + + /* Stop new threads from spawning */ + qemu_bh_delete(pool->new_thread_bh); + pool->cur_threads -= pool->new_threads; + pool->new_threads = 0; + + /* Wait for worker threads to terminate */ + pool->stopping = true; + while (pool->cur_threads > 0) { + qemu_sem_post(&pool->sem); + qemu_cond_wait(&pool->worker_stopped, &pool->lock); + } + + qemu_mutex_unlock(&pool->lock); + + qemu_bh_delete(pool->completion_bh); + qemu_sem_destroy(&pool->sem); + qemu_cond_destroy(&pool->worker_stopped); + qemu_mutex_destroy(&pool->lock); + g_free(pool); +} diff --git a/util/throttle.c b/util/throttle.c new file mode 100644 index 000000000..81f247a8d --- /dev/null +++ b/util/throttle.c @@ -0,0 +1,637 @@ +/* + * QEMU throttling infrastructure + * + * Copyright (C) Nodalink, EURL. 2013-2014 + * Copyright (C) Igalia, S.L. 
2015
+ *
+ * Authors:
+ *   Benoît Canet <benoit.canet@nodalink.com>
+ *   Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/throttle.h"
+#include "qemu/timer.h"
+#include "block/aio.h"
+
+/* This function makes a bucket leak
+ *
+ * @bkt: the bucket to leak
+ * @delta_ns: the time delta
+ */
+void throttle_leak_bucket(LeakyBucket *bkt, int64_t delta_ns)
+{
+    double leak;
+
+    /* compute how much to leak */
+    leak = (bkt->avg * (double) delta_ns) / NANOSECONDS_PER_SECOND;
+
+    /* make the bucket leak */
+    bkt->level = MAX(bkt->level - leak, 0);
+
+    /* if we allow bursts for more than one second we also need to
+     * keep track of bkt->burst_level so the bkt->max goal per second
+     * is attained */
+    if (bkt->burst_length > 1) {
+        leak = (bkt->max * (double) delta_ns) / NANOSECONDS_PER_SECOND;
+        bkt->burst_level = MAX(bkt->burst_level - leak, 0);
+    }
+}
+
+/* Calculate the time delta since the last leak and make proportional leaks
+ *
+ * @now: the current timestamp in ns
+ */
+static void throttle_do_leak(ThrottleState *ts, int64_t now)
+{
+    /* compute the time elapsed since the last leak */
+    int64_t delta_ns = now - ts->previous_leak;
+    int i;
+
+    ts->previous_leak = now;
+
+    if (delta_ns <= 0) {
+        return;
+    }
+
+    /* make each bucket leak */
+    for (i = 0; i < BUCKETS_COUNT; i++) {
+        throttle_leak_bucket(&ts->cfg.buckets[i], delta_ns);
+    }
+}
+
+/* do the real job of computing the time to wait
+ *
+ * @limit: the throttling limit
+ * @extra: the number of operations to delay
+ * @ret: the time to wait in ns
+ */
+static int64_t throttle_do_compute_wait(double limit, double extra)
+{
+    double wait = extra * NANOSECONDS_PER_SECOND;
+    wait /= limit;
+    return wait;
+}
+
+/* This function computes the wait time in ns that a leaky bucket should
+ * trigger
+ *
+ * @bkt: the leaky bucket we operate on
+ * @ret: the resulting wait time in ns or 0 if the operation can go through
+ */
+int64_t throttle_compute_wait(LeakyBucket *bkt)
+{
+    double extra; /* the number of extra units blocking the io */
+    double bucket_size;   /* I/O before throttling to bkt->avg */
+    double burst_bucket_size; /* Before throttling to bkt->max */
+
+    if (!bkt->avg) {
+        return 0;
+    }
+
+    if (!bkt->max) {
+        /* If bkt->max is 0 we still want to allow short bursts of I/O
+         * from the guest, otherwise every other request will be throttled
+         * and performance will suffer considerably.
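+         *
+         * Worked example (illustrative numbers): with bkt->avg set to
+         * 100 MB/s and bkt->max == 0, bucket_size below is 10 MB, so
+         * roughly 100 ms worth of I/O may pass at full speed before
+         * waits start being imposed.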
*/
+        bucket_size = (double) bkt->avg / 10;
+        burst_bucket_size = 0;
+    } else {
+        /* If we have a burst limit then we have to wait until all I/O
+         * at burst rate has finished before throttling to bkt->avg */
+        bucket_size = bkt->max * bkt->burst_length;
+        burst_bucket_size = (double) bkt->max / 10;
+    }
+
+    /* If the main bucket is full then we have to wait */
+    extra = bkt->level - bucket_size;
+    if (extra > 0) {
+        return throttle_do_compute_wait(bkt->avg, extra);
+    }
+
+    /* If the main bucket is not full yet we still have to check the
+     * burst bucket in order to enforce the burst limit */
+    if (bkt->burst_length > 1) {
+        assert(bkt->max > 0); /* see throttle_is_valid() */
+        extra = bkt->burst_level - burst_bucket_size;
+        if (extra > 0) {
+            return throttle_do_compute_wait(bkt->max, extra);
+        }
+    }
+
+    return 0;
+}
+
+/* This function computes the time that must be waited before this I/O
+ * can go through
+ *
+ * @is_write: true if the current IO is a write, false if it's a read
+ * @ret: time to wait
+ */
+static int64_t throttle_compute_wait_for(ThrottleState *ts,
+                                         bool is_write)
+{
+    BucketType to_check[2][4] = { {THROTTLE_BPS_TOTAL,
+                                   THROTTLE_OPS_TOTAL,
+                                   THROTTLE_BPS_READ,
+                                   THROTTLE_OPS_READ},
+                                  {THROTTLE_BPS_TOTAL,
+                                   THROTTLE_OPS_TOTAL,
+                                   THROTTLE_BPS_WRITE,
+                                   THROTTLE_OPS_WRITE}, };
+    int64_t wait, max_wait = 0;
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        BucketType index = to_check[is_write][i];
+        wait = throttle_compute_wait(&ts->cfg.buckets[index]);
+        if (wait > max_wait) {
+            max_wait = wait;
+        }
+    }
+
+    return max_wait;
+}
+
+/* compute the timer for this type of operation
+ *
+ * @is_write: the type of operation
+ * @now: the current clock timestamp
+ * @next_timestamp: the resulting timer deadline
+ * @ret: true if a timer must be set
+ */
+static bool throttle_compute_timer(ThrottleState *ts,
+                                   bool is_write,
+                                   int64_t now,
+                                   int64_t *next_timestamp)
+{
+    int64_t wait;
+
+    /* leak proportionally to the time elapsed */
+    throttle_do_leak(ts, now);
+
+    /* compute the wait time if any */
+    wait = throttle_compute_wait_for(ts, is_write);
+
+    /* if the code must wait, compute when the next timer should fire */
+    if (wait) {
+        *next_timestamp = now + wait;
+        return true;
+    }
+
+    /* else no need to wait at all */
+    *next_timestamp = now;
+    return false;
+}
+
+/* Add timers to event loop */
+void throttle_timers_attach_aio_context(ThrottleTimers *tt,
+                                        AioContext *new_context)
+{
+    tt->timers[0] = aio_timer_new(new_context, tt->clock_type, SCALE_NS,
+                                  tt->read_timer_cb, tt->timer_opaque);
+    tt->timers[1] = aio_timer_new(new_context, tt->clock_type, SCALE_NS,
+                                  tt->write_timer_cb, tt->timer_opaque);
+}
+
+/*
+ * Initialize the ThrottleConfig structure to a valid state
+ * @cfg: the config to initialize
+ */
+void throttle_config_init(ThrottleConfig *cfg)
+{
+    unsigned i;
+    memset(cfg, 0, sizeof(*cfg));
+    for (i = 0; i < BUCKETS_COUNT; i++) {
+        cfg->buckets[i].burst_length = 1;
+    }
+}
+
+/* To be called first on the ThrottleState */
+void throttle_init(ThrottleState *ts)
+{
+    memset(ts, 0, sizeof(ThrottleState));
+    throttle_config_init(&ts->cfg);
+}
+
+/* To be called first on the ThrottleTimers */
+void throttle_timers_init(ThrottleTimers *tt,
+                          AioContext *aio_context,
+                          QEMUClockType clock_type,
+                          QEMUTimerCB *read_timer_cb,
+                          QEMUTimerCB *write_timer_cb,
+                          void *timer_opaque)
+{
+    memset(tt, 0, sizeof(ThrottleTimers));
+
+    tt->clock_type = clock_type;
+    tt->read_timer_cb = read_timer_cb;
+    tt->write_timer_cb = write_timer_cb;
+    tt->timer_opaque = timer_opaque;
+    throttle_timers_attach_aio_context(tt,
aio_context);
+}
+
+/* destroy a timer */
+static void throttle_timer_destroy(QEMUTimer **timer)
+{
+    assert(*timer != NULL);
+
+    timer_free(*timer);
+    *timer = NULL;
+}
+
+/* Remove timers from event loop */
+void throttle_timers_detach_aio_context(ThrottleTimers *tt)
+{
+    int i;
+
+    for (i = 0; i < 2; i++) {
+        throttle_timer_destroy(&tt->timers[i]);
+    }
+}
+
+/* To be called last on the ThrottleTimers */
+void throttle_timers_destroy(ThrottleTimers *tt)
+{
+    throttle_timers_detach_aio_context(tt);
+}
+
+/* Is any throttling timer configured? */
+bool throttle_timers_are_initialized(ThrottleTimers *tt)
+{
+    if (tt->timers[0]) {
+        return true;
+    }
+
+    return false;
+}
+
+/* Does any throttling need to be done?
+ *
+ * @cfg: the throttling configuration to inspect
+ * @ret: true if throttling must be done, else false
+ */
+bool throttle_enabled(ThrottleConfig *cfg)
+{
+    int i;
+
+    for (i = 0; i < BUCKETS_COUNT; i++) {
+        if (cfg->buckets[i].avg > 0) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/* Check if a throttling configuration is valid
+ * @cfg: the throttling configuration to inspect
+ * @ret: true if valid, else false
+ * @errp: error object
+ */
+bool throttle_is_valid(ThrottleConfig *cfg, Error **errp)
+{
+    int i;
+    bool bps_flag, ops_flag;
+    bool bps_max_flag, ops_max_flag;
+
+    bps_flag = cfg->buckets[THROTTLE_BPS_TOTAL].avg &&
+               (cfg->buckets[THROTTLE_BPS_READ].avg ||
+                cfg->buckets[THROTTLE_BPS_WRITE].avg);
+
+    ops_flag = cfg->buckets[THROTTLE_OPS_TOTAL].avg &&
+               (cfg->buckets[THROTTLE_OPS_READ].avg ||
+                cfg->buckets[THROTTLE_OPS_WRITE].avg);
+
+    bps_max_flag = cfg->buckets[THROTTLE_BPS_TOTAL].max &&
+                   (cfg->buckets[THROTTLE_BPS_READ].max ||
+                    cfg->buckets[THROTTLE_BPS_WRITE].max);
+
+    ops_max_flag = cfg->buckets[THROTTLE_OPS_TOTAL].max &&
+                   (cfg->buckets[THROTTLE_OPS_READ].max ||
+                    cfg->buckets[THROTTLE_OPS_WRITE].max);
+
+    if (bps_flag || ops_flag || bps_max_flag || ops_max_flag) {
+        error_setg(errp, "bps/iops/max total values and read/write values"
+                   " cannot be used at the same time");
+        return false;
+    }
+
+    if (cfg->op_size &&
+        !cfg->buckets[THROTTLE_OPS_TOTAL].avg &&
+        !cfg->buckets[THROTTLE_OPS_READ].avg &&
+        !cfg->buckets[THROTTLE_OPS_WRITE].avg) {
+        error_setg(errp, "iops size requires an iops value to be set");
+        return false;
+    }
+
+    for (i = 0; i < BUCKETS_COUNT; i++) {
+        LeakyBucket *bkt = &cfg->buckets[i];
+        if (bkt->avg > THROTTLE_VALUE_MAX || bkt->max > THROTTLE_VALUE_MAX) {
+            error_setg(errp, "bps/iops/max values must be within [0, %lld]",
+                       THROTTLE_VALUE_MAX);
+            return false;
+        }
+
+        if (!bkt->burst_length) {
+            error_setg(errp, "the burst length cannot be 0");
+            return false;
+        }
+
+        if (bkt->burst_length > 1 && !bkt->max) {
+            error_setg(errp, "burst length set without burst rate");
+            return false;
+        }
+
+        if (bkt->max && bkt->burst_length > THROTTLE_VALUE_MAX / bkt->max) {
+            error_setg(errp, "burst length too high for this burst rate");
+            return false;
+        }
+
+        if (bkt->max && !bkt->avg) {
+            error_setg(errp, "bps_max/iops_max require corresponding"
+                       " bps/iops values");
+            return false;
+        }
+
+        if (bkt->max && bkt->max < bkt->avg) {
+            error_setg(errp, "bps_max/iops_max cannot be lower than bps/iops");
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/* Used to configure the throttle
+ *
+ * @ts: the throttle state we are working on
+ * @clock_type: the group's clock_type
+ * @cfg: the config to set
+ */
+void throttle_config(ThrottleState *ts,
+                     QEMUClockType clock_type,
+                     ThrottleConfig *cfg)
+{
+    int i;
+
+    ts->cfg = *cfg;
+
+    /* Zero bucket level */
+    for (i =
0; i < BUCKETS_COUNT; i++) {
+        ts->cfg.buckets[i].level = 0;
+        ts->cfg.buckets[i].burst_level = 0;
+    }
+
+    ts->previous_leak = qemu_clock_get_ns(clock_type);
+}
+
+/* Used to get the current config
+ *
+ * @ts: the throttle state we are working on
+ * @cfg: the config to write
+ */
+void throttle_get_config(ThrottleState *ts, ThrottleConfig *cfg)
+{
+    *cfg = ts->cfg;
+}
+
+
+/* Schedule the read or write timer if needed
+ *
+ * NOTE: this function is not unit tested due to its use of timer_mod
+ *
+ * @tt: the timers structure
+ * @is_write: the type of operation (read/write)
+ * @ret: true if the timer has been scheduled, else false
+ */
+bool throttle_schedule_timer(ThrottleState *ts,
+                             ThrottleTimers *tt,
+                             bool is_write)
+{
+    int64_t now = qemu_clock_get_ns(tt->clock_type);
+    int64_t next_timestamp;
+    bool must_wait;
+
+    must_wait = throttle_compute_timer(ts,
+                                       is_write,
+                                       now,
+                                       &next_timestamp);
+
+    /* request not throttled */
+    if (!must_wait) {
+        return false;
+    }
+
+    /* request throttled and timer pending -> do nothing */
+    if (timer_pending(tt->timers[is_write])) {
+        return true;
+    }
+
+    /* request throttled and timer not pending -> arm timer */
+    timer_mod(tt->timers[is_write], next_timestamp);
+    return true;
+}
+
+/* Do the accounting for this operation
+ *
+ * @is_write: the type of operation (read/write)
+ * @size: the size of the operation
+ */
+void throttle_account(ThrottleState *ts, bool is_write, uint64_t size)
+{
+    const BucketType bucket_types_size[2][2] = {
+        { THROTTLE_BPS_TOTAL, THROTTLE_BPS_READ },
+        { THROTTLE_BPS_TOTAL, THROTTLE_BPS_WRITE }
+    };
+    const BucketType bucket_types_units[2][2] = {
+        { THROTTLE_OPS_TOTAL, THROTTLE_OPS_READ },
+        { THROTTLE_OPS_TOTAL, THROTTLE_OPS_WRITE }
+    };
+    double units = 1.0;
+    unsigned i;
+
+    /* if cfg.op_size is defined and smaller than size we compute the unit
+     * count */
+    if (ts->cfg.op_size && size > ts->cfg.op_size) {
+        units = (double) size / ts->cfg.op_size;
+    }
+
+    for (i = 0; i < 2; i++) {
+        LeakyBucket *bkt;
+
+        bkt = &ts->cfg.buckets[bucket_types_size[is_write][i]];
+        bkt->level += size;
+        if (bkt->burst_length > 1) {
+            bkt->burst_level += size;
+        }
+
+        bkt = &ts->cfg.buckets[bucket_types_units[is_write][i]];
+        bkt->level += units;
+        if (bkt->burst_length > 1) {
+            bkt->burst_level += units;
+        }
+    }
+}
+
+/* return a ThrottleConfig based on the options in a ThrottleLimits
+ *
+ * @arg: the ThrottleLimits object to read from
+ * @cfg: the ThrottleConfig to edit
+ * @errp: error object
+ */
+void throttle_limits_to_config(ThrottleLimits *arg, ThrottleConfig *cfg,
+                               Error **errp)
+{
+    if (arg->has_bps_total) {
+        cfg->buckets[THROTTLE_BPS_TOTAL].avg = arg->bps_total;
+    }
+    if (arg->has_bps_read) {
+        cfg->buckets[THROTTLE_BPS_READ].avg = arg->bps_read;
+    }
+    if (arg->has_bps_write) {
+        cfg->buckets[THROTTLE_BPS_WRITE].avg = arg->bps_write;
+    }
+
+    if (arg->has_iops_total) {
+        cfg->buckets[THROTTLE_OPS_TOTAL].avg = arg->iops_total;
+    }
+    if (arg->has_iops_read) {
+        cfg->buckets[THROTTLE_OPS_READ].avg = arg->iops_read;
+    }
+    if (arg->has_iops_write) {
+        cfg->buckets[THROTTLE_OPS_WRITE].avg = arg->iops_write;
+    }
+
+    if (arg->has_bps_total_max) {
+        cfg->buckets[THROTTLE_BPS_TOTAL].max = arg->bps_total_max;
+    }
+    if (arg->has_bps_read_max) {
+        cfg->buckets[THROTTLE_BPS_READ].max = arg->bps_read_max;
+    }
+    if (arg->has_bps_write_max) {
+        cfg->buckets[THROTTLE_BPS_WRITE].max = arg->bps_write_max;
+    }
+    if (arg->has_iops_total_max) {
+        cfg->buckets[THROTTLE_OPS_TOTAL].max = arg->iops_total_max;
+    }
+    if (arg->has_iops_read_max) {
+
cfg->buckets[THROTTLE_OPS_READ].max = arg->iops_read_max; + } + if (arg->has_iops_write_max) { + cfg->buckets[THROTTLE_OPS_WRITE].max = arg->iops_write_max; + } + + if (arg->has_bps_total_max_length) { + if (arg->bps_total_max_length > UINT_MAX) { + error_setg(errp, "bps-total-max-length value must be in" + " the range [0, %u]", UINT_MAX); + return; + } + cfg->buckets[THROTTLE_BPS_TOTAL].burst_length = arg->bps_total_max_length; + } + if (arg->has_bps_read_max_length) { + if (arg->bps_read_max_length > UINT_MAX) { + error_setg(errp, "bps-read-max-length value must be in" + " the range [0, %u]", UINT_MAX); + return; + } + cfg->buckets[THROTTLE_BPS_READ].burst_length = arg->bps_read_max_length; + } + if (arg->has_bps_write_max_length) { + if (arg->bps_write_max_length > UINT_MAX) { + error_setg(errp, "bps-write-max-length value must be in" + " the range [0, %u]", UINT_MAX); + return; + } + cfg->buckets[THROTTLE_BPS_WRITE].burst_length = arg->bps_write_max_length; + } + if (arg->has_iops_total_max_length) { + if (arg->iops_total_max_length > UINT_MAX) { + error_setg(errp, "iops-total-max-length value must be in" + " the range [0, %u]", UINT_MAX); + return; + } + cfg->buckets[THROTTLE_OPS_TOTAL].burst_length = arg->iops_total_max_length; + } + if (arg->has_iops_read_max_length) { + if (arg->iops_read_max_length > UINT_MAX) { + error_setg(errp, "iops-read-max-length value must be in" + " the range [0, %u]", UINT_MAX); + return; + } + cfg->buckets[THROTTLE_OPS_READ].burst_length = arg->iops_read_max_length; + } + if (arg->has_iops_write_max_length) { + if (arg->iops_write_max_length > UINT_MAX) { + error_setg(errp, "iops-write-max-length value must be in" + " the range [0, %u]", UINT_MAX); + return; + } + cfg->buckets[THROTTLE_OPS_WRITE].burst_length = arg->iops_write_max_length; + } + + if (arg->has_iops_size) { + cfg->op_size = arg->iops_size; + } + + throttle_is_valid(cfg, errp); +} + +/* write the options of a ThrottleConfig to a ThrottleLimits + * + * @cfg: the ThrottleConfig to read from + * @var: the ThrottleLimits to write to + */ +void throttle_config_to_limits(ThrottleConfig *cfg, ThrottleLimits *var) +{ + var->bps_total = cfg->buckets[THROTTLE_BPS_TOTAL].avg; + var->bps_read = cfg->buckets[THROTTLE_BPS_READ].avg; + var->bps_write = cfg->buckets[THROTTLE_BPS_WRITE].avg; + var->iops_total = cfg->buckets[THROTTLE_OPS_TOTAL].avg; + var->iops_read = cfg->buckets[THROTTLE_OPS_READ].avg; + var->iops_write = cfg->buckets[THROTTLE_OPS_WRITE].avg; + var->bps_total_max = cfg->buckets[THROTTLE_BPS_TOTAL].max; + var->bps_read_max = cfg->buckets[THROTTLE_BPS_READ].max; + var->bps_write_max = cfg->buckets[THROTTLE_BPS_WRITE].max; + var->iops_total_max = cfg->buckets[THROTTLE_OPS_TOTAL].max; + var->iops_read_max = cfg->buckets[THROTTLE_OPS_READ].max; + var->iops_write_max = cfg->buckets[THROTTLE_OPS_WRITE].max; + var->bps_total_max_length = cfg->buckets[THROTTLE_BPS_TOTAL].burst_length; + var->bps_read_max_length = cfg->buckets[THROTTLE_BPS_READ].burst_length; + var->bps_write_max_length = cfg->buckets[THROTTLE_BPS_WRITE].burst_length; + var->iops_total_max_length = cfg->buckets[THROTTLE_OPS_TOTAL].burst_length; + var->iops_read_max_length = cfg->buckets[THROTTLE_OPS_READ].burst_length; + var->iops_write_max_length = cfg->buckets[THROTTLE_OPS_WRITE].burst_length; + var->iops_size = cfg->op_size; + + var->has_bps_total = true; + var->has_bps_read = true; + var->has_bps_write = true; + var->has_iops_total = true; + var->has_iops_read = true; + var->has_iops_write = true; + var->has_bps_total_max = 
true;
+    var->has_bps_read_max = true;
+    var->has_bps_write_max = true;
+    var->has_iops_total_max = true;
+    var->has_iops_read_max = true;
+    var->has_iops_write_max = true;
+    var->has_bps_read_max_length = true;
+    var->has_bps_total_max_length = true;
+    var->has_bps_write_max_length = true;
+    var->has_iops_total_max_length = true;
+    var->has_iops_read_max_length = true;
+    var->has_iops_write_max_length = true;
+    var->has_iops_size = true;
+}
diff --git a/util/timed-average.c b/util/timed-average.c
new file mode 100644
index 000000000..2b49d532c
--- /dev/null
+++ b/util/timed-average.c
@@ -0,0 +1,231 @@
+/*
+ * QEMU timed average computation
+ *
+ * Copyright (C) Nodalink, EURL. 2014
+ * Copyright (C) Igalia, S.L. 2015
+ *
+ * Authors:
+ *   Benoît Canet <benoit.canet@nodalink.com>
+ *   Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) version 3 or any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qemu/timed-average.h"
+
+/* This module computes an average of a set of values within a time
+ * window.
+ *
+ * Algorithm:
+ *
+ * - Create two windows with a certain expiration period, offset
+ *   from each other by period / 2.
+ * - Each time you want to account a new value, do it in both windows.
+ * - The minimum / maximum / average values are always returned from
+ *   the oldest window.
+ *
+ * Example:
+ *
+ *   t=0           |t=0.5           |t=1         |t=1.5           |t=2
+ *   wnd0: [0,0.5) |wnd0: [0.5,1.5) |            |wnd0: [1.5,2.5) |
+ *   wnd1: [0,1)   |                |wnd1: [1,2) |                |
+ *
+ * Values are returned from:
+ *
+ *   wnd0---------|wnd1------------|wnd0---------|wnd1-------------|
+ */
+
+/* Update the expiration of a time window
+ *
+ * @w:      the window used
+ * @now:    the current time in nanoseconds
+ * @period: the expiration period in nanoseconds
+ */
+static void update_expiration(TimedAverageWindow *w, int64_t now,
+                              int64_t period)
+{
+    /* time elapsed since the last theoretical expiration */
+    int64_t elapsed = (now - w->expiration) % period;
+    /* time remaining until the next expiration */
+    int64_t remaining = period - elapsed;
+    /* compute expiration */
+    w->expiration = now + remaining;
+}
+
+/* Reset a window
+ *
+ * @w: the window to reset
+ */
+static void window_reset(TimedAverageWindow *w)
+{
+    w->min = UINT64_MAX;
+    w->max = 0;
+    w->sum = 0;
+    w->count = 0;
+}
+
+/* Get the current window (that is, the one with the earliest
+ * expiration time).
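+ * Because the two windows are offset by half a period, the window
+ * returned here always covers at least half of the nominal period
+ * (see the comment in timed_average_init() below).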
+ *
+ * @ta:  the TimedAverage structure
+ * @ret: a pointer to the current window
+ */
+static TimedAverageWindow *current_window(TimedAverage *ta)
+{
+    return &ta->windows[ta->current];
+}
+
+/* Initialize a TimedAverage structure
+ *
+ * @ta:         the TimedAverage structure
+ * @clock_type: the type of clock to use
+ * @period:     the time window period in nanoseconds
+ */
+void timed_average_init(TimedAverage *ta, QEMUClockType clock_type,
+                        uint64_t period)
+{
+    int64_t now = qemu_clock_get_ns(clock_type);
+
+    /* Returned values are from the oldest window, so they belong to
+     * the interval [ta->period/2, ta->period). By adjusting the
+     * requested period by 4/3, we guarantee that they're in the
+     * interval [2/3 period, 4/3 period), closer to the requested
+     * period on average */
+    ta->period = (uint64_t) period * 4 / 3;
+    ta->clock_type = clock_type;
+    ta->current = 0;
+
+    window_reset(&ta->windows[0]);
+    window_reset(&ta->windows[1]);
+
+    /* The two windows are offset by half a period */
+    ta->windows[0].expiration = now + ta->period / 2;
+    ta->windows[1].expiration = now + ta->period;
+}
+
+/* Check if the time windows have expired, updating their counters and
+ * expiration time if that's the case.
+ *
+ * @ta:      the TimedAverage structure
+ * @elapsed: if non-NULL, the elapsed time (in ns) within the current
+ *           window will be stored here
+ */
+static void check_expirations(TimedAverage *ta, uint64_t *elapsed)
+{
+    int64_t now = qemu_clock_get_ns(ta->clock_type);
+    int i;
+
+    assert(ta->period != 0);
+
+    /* Check if the windows have expired */
+    for (i = 0; i < 2; i++) {
+        TimedAverageWindow *w = &ta->windows[i];
+        if (w->expiration <= now) {
+            window_reset(w);
+            update_expiration(w, now, ta->period);
+        }
+    }
+
+    /* Make ta->current point to the oldest window */
+    if (ta->windows[0].expiration < ta->windows[1].expiration) {
+        ta->current = 0;
+    } else {
+        ta->current = 1;
+    }
+
+    /* Calculate the elapsed time within the current window */
+    if (elapsed) {
+        int64_t remaining = ta->windows[ta->current].expiration - now;
+        *elapsed = ta->period - remaining;
+    }
+}
+
+/* Account a value
+ *
+ * @ta:    the TimedAverage structure
+ * @value: the value to account
+ */
+void timed_average_account(TimedAverage *ta, uint64_t value)
+{
+    int i;
+    check_expirations(ta, NULL);
+
+    /* Do the accounting in both windows at the same time */
+    for (i = 0; i < 2; i++) {
+        TimedAverageWindow *w = &ta->windows[i];
+
+        w->sum += value;
+        w->count++;
+
+        if (value < w->min) {
+            w->min = value;
+        }
+
+        if (value > w->max) {
+            w->max = value;
+        }
+    }
+}
+
+/* Get the minimum value
+ *
+ * @ta:  the TimedAverage structure
+ * @ret: the minimum value
+ */
+uint64_t timed_average_min(TimedAverage *ta)
+{
+    TimedAverageWindow *w;
+    check_expirations(ta, NULL);
+    w = current_window(ta);
+    return w->min < UINT64_MAX ? w->min : 0;
+}
+
+/* Get the average value
+ *
+ * @ta:  the TimedAverage structure
+ * @ret: the average value
+ */
+uint64_t timed_average_avg(TimedAverage *ta)
+{
+    TimedAverageWindow *w;
+    check_expirations(ta, NULL);
+    w = current_window(ta);
+    return w->count > 0 ?
w->sum / w->count : 0; +} + +/* Get the maximum value + * + * @ta: the TimedAverage structure + * @ret: the maximum value + */ +uint64_t timed_average_max(TimedAverage *ta) +{ + check_expirations(ta, NULL); + return current_window(ta)->max; +} + +/* Get the sum of all accounted values + * @ta: the TimedAverage structure + * @elapsed: if non-NULL, the elapsed time (in ns) will be stored here + * @ret: the sum of all accounted values + */ +uint64_t timed_average_sum(TimedAverage *ta, uint64_t *elapsed) +{ + TimedAverageWindow *w; + check_expirations(ta, elapsed); + w = current_window(ta); + return w->sum; +} diff --git a/util/trace-events b/util/trace-events new file mode 100644 index 000000000..c8f53d7d9 --- /dev/null +++ b/util/trace-events @@ -0,0 +1,106 @@ +# See docs/devel/tracing.rst for syntax documentation. + +# aio-posix.c +run_poll_handlers_begin(void *ctx, int64_t max_ns, int64_t timeout) "ctx %p max_ns %"PRId64 " timeout %"PRId64 +run_poll_handlers_end(void *ctx, bool progress, int64_t timeout) "ctx %p progress %d new timeout %"PRId64 +poll_shrink(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 +poll_grow(void *ctx, int64_t old, int64_t new) "ctx %p old %"PRId64" new %"PRId64 +poll_add(void *ctx, void *node, int fd, unsigned revents) "ctx %p node %p fd %d revents 0x%x" +poll_remove(void *ctx, void *node, int fd) "ctx %p node %p fd %d" + +# async.c +aio_co_schedule(void *ctx, void *co) "ctx %p co %p" +aio_co_schedule_bh_cb(void *ctx, void *co) "ctx %p co %p" + +# thread-pool.c +thread_pool_submit(void *pool, void *req, void *opaque) "pool %p req %p opaque %p" +thread_pool_complete(void *pool, void *req, void *opaque, int ret) "pool %p req %p opaque %p ret %d" +thread_pool_cancel(void *req, void *opaque) "req %p opaque %p" + +# buffer.c +buffer_resize(const char *buf, size_t olen, size_t len) "%s: old %zd, new %zd" +buffer_move_empty(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s" +buffer_move(const char *buf, size_t len, const char *from) "%s: %zd bytes from %s" +buffer_free(const char *buf, size_t len) "%s: capacity %zd" + +# filemonitor-inotify.c +qemu_file_monitor_add_watch(void *mon, const char *dirpath, const char *filename, void *cb, void *opaque, int64_t id) "File monitor %p add watch dir='%s' file='%s' cb=%p opaque=%p id=%" PRId64 +qemu_file_monitor_remove_watch(void *mon, const char *dirpath, int64_t id) "File monitor %p remove watch dir='%s' id=%" PRId64 +qemu_file_monitor_new(void *mon, int fd) "File monitor %p created fd=%d" +qemu_file_monitor_enable_watch(void *mon, const char *dirpath, int id) "File monitor %p enable watch dir='%s' id=%u" +qemu_file_monitor_disable_watch(void *mon, const char *dirpath, int id) "File monitor %p disable watch dir='%s' id=%u" +qemu_file_monitor_event(void *mon, const char *dirpath, const char *filename, int mask, unsigned int id) "File monitor %p event dir='%s' file='%s' mask=0x%x id=%u" +qemu_file_monitor_dispatch(void *mon, const char *dirpath, const char *filename, int ev, void *cb, void *opaque, int64_t id) "File monitor %p dispatch dir='%s' file='%s' ev=%d cb=%p opaque=%p id=%" PRId64 + +# qemu-coroutine.c +qemu_aio_coroutine_enter(void *ctx, void *from, void *to, void *opaque) "ctx %p from %p to %p opaque %p" +qemu_coroutine_yield(void *from, void *to) "from %p to %p" +qemu_coroutine_terminate(void *co) "self %p" + +# qemu-coroutine-lock.c +qemu_co_mutex_lock_uncontended(void *mutex, void *self) "mutex %p self %p" +qemu_co_mutex_lock_entry(void *mutex, void *self) "mutex %p self %p" 
+qemu_co_mutex_lock_return(void *mutex, void *self) "mutex %p self %p" +qemu_co_mutex_unlock_entry(void *mutex, void *self) "mutex %p self %p" +qemu_co_mutex_unlock_return(void *mutex, void *self) "mutex %p self %p" + +# oslib-posix.c +# oslib-win32.c +qemu_memalign(size_t alignment, size_t size, void *ptr) "alignment %zu size %zu ptr %p" +qemu_anon_ram_alloc(size_t size, void *ptr) "size %zu ptr %p" +qemu_vfree(void *ptr) "ptr %p" +qemu_anon_ram_free(void *ptr, size_t size) "ptr %p size %zu" + +# hbitmap.c +hbitmap_iter_skip_words(const void *hb, void *hbi, uint64_t pos, unsigned long cur) "hb %p hbi %p pos %"PRId64" cur 0x%lx" +hbitmap_reset(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64 +hbitmap_set(void *hb, uint64_t start, uint64_t count, uint64_t sbit, uint64_t ebit) "hb %p items %"PRIu64",%"PRIu64" bits %"PRIu64"..%"PRIu64 + +# lockcnt.c +lockcnt_fast_path_attempt(const void *lockcnt, int expected, int new) "lockcnt %p fast path %d->%d" +lockcnt_fast_path_success(const void *lockcnt, int expected, int new) "lockcnt %p fast path %d->%d succeeded" +lockcnt_unlock_attempt(const void *lockcnt, int expected, int new) "lockcnt %p unlock %d->%d" +lockcnt_unlock_success(const void *lockcnt, int expected, int new) "lockcnt %p unlock %d->%d succeeded" +lockcnt_futex_wait_prepare(const void *lockcnt, int expected, int new) "lockcnt %p preparing slow path %d->%d" +lockcnt_futex_wait(const void *lockcnt, int val) "lockcnt %p waiting on %d" +lockcnt_futex_wait_resume(const void *lockcnt, int new) "lockcnt %p after wait: %d" +lockcnt_futex_wake(const void *lockcnt) "lockcnt %p waking up one waiter" + +# qemu-sockets.c +socket_listen(int num) "backlog: %d" + +# qemu-thread-common.h +# qemu-thread-posix.c +# qemu-thread-win32.c +qemu_mutex_lock(void *mutex, const char *file, const int line) "waiting on mutex %p (%s:%d)" +qemu_mutex_locked(void *mutex, const char *file, const int line) "taken mutex %p (%s:%d)" +qemu_mutex_unlock(void *mutex, const char *file, const int line) "released mutex %p (%s:%d)" + +# vfio-helpers.c +qemu_vfio_dma_reset_temporary(void *s) "s %p" +qemu_vfio_ram_block_added(void *s, void *p, size_t size) "s %p host %p size 0x%zx" +qemu_vfio_ram_block_removed(void *s, void *p, size_t size) "s %p host %p size 0x%zx" +qemu_vfio_dump_mapping(void *host, uint64_t iova, size_t size) "vfio mapping %p to iova 0x%08" PRIx64 " size 0x%zx" +qemu_vfio_find_mapping(void *s, void *p) "s %p host %p" +qemu_vfio_new_mapping(void *s, void *host, size_t size, int index, uint64_t iova) "s %p host %p size 0x%zx index %d iova 0x%"PRIx64 +qemu_vfio_do_mapping(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64 " size 0x%zx" +qemu_vfio_dma_map(void *s, void *host, size_t size, bool temporary, uint64_t *iova) "s %p host %p size 0x%zx temporary %d &iova %p" +qemu_vfio_dma_mapped(void *s, void *host, uint64_t iova, size_t size) "s %p host %p <-> iova 0x%"PRIx64" size 0x%zx" +qemu_vfio_dma_unmap(void *s, void *host) "s %p host %p" +qemu_vfio_pci_read_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "read cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" +qemu_vfio_pci_write_config(void *buf, int ofs, int size, uint64_t region_ofs, uint64_t region_size) "write cfg ptr %p ofs 0x%x size 0x%x (region addr 0x%"PRIx64" size 0x%"PRIx64")" +qemu_vfio_region_info(const char *desc, uint64_t region_ofs, uint64_t region_size, uint32_t cap_offset) 
"region '%s' addr 0x%"PRIx64" size 0x%"PRIx64" cap_ofs 0x%"PRIx32 +qemu_vfio_pci_map_bar(int index, uint64_t region_ofs, uint64_t region_size, int ofs, void *host) "map region bar#%d addr 0x%"PRIx64" size 0x%"PRIx64" ofs 0x%x host %p" + +#userfaultfd.c +uffd_query_features_nosys(int err) "errno: %i" +uffd_query_features_api_failed(int err) "errno: %i" +uffd_create_fd_nosys(int err) "errno: %i" +uffd_create_fd_api_failed(int err) "errno: %i" +uffd_create_fd_api_noioctl(uint64_t ioctl_req, uint64_t ioctl_supp) "ioctl_req: 0x%" PRIx64 "ioctl_supp: 0x%" PRIx64 +uffd_register_memory_failed(void *addr, uint64_t length, uint64_t mode, int err) "addr: %p length: %" PRIu64 " mode: 0x%" PRIx64 " errno: %i" +uffd_unregister_memory_failed(void *addr, uint64_t length, int err) "addr: %p length: %" PRIu64 " errno: %i" + +# module.c +module_load_module(const char *name) "file %s" +module_lookup_object_type(const char *name) "name %s" diff --git a/util/trace.h b/util/trace.h new file mode 100644 index 000000000..86ff7a390 --- /dev/null +++ b/util/trace.h @@ -0,0 +1 @@ +#include "trace/trace-util.h" diff --git a/util/transactions.c b/util/transactions.c new file mode 100644 index 000000000..2dbdedce9 --- /dev/null +++ b/util/transactions.c @@ -0,0 +1,100 @@ +/* + * Simple transactions API + * + * Copyright (c) 2021 Virtuozzo International GmbH. + * + * Author: + * Sementsov-Ogievskiy Vladimir <vsementsov@virtuozzo.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ */ + +#include "qemu/osdep.h" + +#include "qemu/transactions.h" +#include "qemu/queue.h" + +typedef struct TransactionAction { + TransactionActionDrv *drv; + void *opaque; + QSLIST_ENTRY(TransactionAction) entry; +} TransactionAction; + +struct Transaction { + QSLIST_HEAD(, TransactionAction) actions; +}; + +Transaction *tran_new(void) +{ + Transaction *tran = g_new(Transaction, 1); + + QSLIST_INIT(&tran->actions); + + return tran; +} + +void tran_add(Transaction *tran, TransactionActionDrv *drv, void *opaque) +{ + TransactionAction *act; + + act = g_new(TransactionAction, 1); + *act = (TransactionAction) { + .drv = drv, + .opaque = opaque + }; + + QSLIST_INSERT_HEAD(&tran->actions, act, entry); +} + +void tran_abort(Transaction *tran) +{ + TransactionAction *act, *next; + + QSLIST_FOREACH(act, &tran->actions, entry) { + if (act->drv->abort) { + act->drv->abort(act->opaque); + } + } + + QSLIST_FOREACH_SAFE(act, &tran->actions, entry, next) { + if (act->drv->clean) { + act->drv->clean(act->opaque); + } + + g_free(act); + } + + g_free(tran); +} + +void tran_commit(Transaction *tran) +{ + TransactionAction *act, *next; + + QSLIST_FOREACH(act, &tran->actions, entry) { + if (act->drv->commit) { + act->drv->commit(act->opaque); + } + } + + QSLIST_FOREACH_SAFE(act, &tran->actions, entry, next) { + if (act->drv->clean) { + act->drv->clean(act->opaque); + } + + g_free(act); + } + + g_free(tran); +} diff --git a/util/unicode.c b/util/unicode.c new file mode 100644 index 000000000..8580bc598 --- /dev/null +++ b/util/unicode.c @@ -0,0 +1,156 @@ +/* + * Dealing with Unicode + * + * Copyright (C) 2013 Red Hat, Inc. + * + * Authors: + * Markus Armbruster <armbru@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/unicode.h" + +static bool is_valid_codepoint(int codepoint) +{ + if (codepoint > 0x10FFFFu) { + return false; /* beyond Unicode range */ + } + if ((codepoint >= 0xFDD0 && codepoint <= 0xFDEF) + || (codepoint & 0xFFFE) == 0xFFFE) { + return false; /* noncharacter */ + } + if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { + return false; /* surrogate code point */ + } + return true; +} + +/** + * mod_utf8_codepoint: + * @s: string encoded in modified UTF-8 + * @n: maximum number of bytes to read from @s, if less than 6 + * @end: set to end of sequence on return + * + * Convert the modified UTF-8 sequence at the start of @s. Modified + * UTF-8 is exactly like UTF-8, except U+0000 is encoded as + * "\xC0\x80". + * + * If @n is zero or @s points to a zero byte, the sequence is invalid, + * and @end is set to @s. + * + * If @s points to an impossible byte (0xFE or 0xFF) or a continuation + * byte, the sequence is invalid, and @end is set to @s + 1 + * + * Else, the first byte determines how many continuation bytes are + * expected. If there are fewer, the sequence is invalid, and @end is + * set to @s + 1 + actual number of continuation bytes. Else, the + * sequence is well-formed, and @end is set to @s + 1 + expected + * number of continuation bytes. + * + * A well-formed sequence is valid unless it encodes a codepoint + * outside the Unicode range U+0000..U+10FFFF, one of Unicode's 66 + * noncharacters, a surrogate codepoint, or is overlong. Except the + * overlong sequence "\xC0\x80" is valid. + * + * Conversion succeeds if and only if the sequence is valid. + * + * Returns: the Unicode codepoint on success, -1 on failure. 
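Stepping back to util/transactions.c above: the API keeps a LIFO list of actions; tran_commit() runs each action's .commit and tran_abort() each action's .abort, and in both cases .clean then runs and the action is freed. A minimal sketch of one undoable action (SetIntState and the set_int* names are illustrative, not part of the source):

    #include "qemu/osdep.h"
    #include "qemu/transactions.h"

    typedef struct SetIntState {
        int *var;
        int old;
    } SetIntState;

    static void set_int_abort(void *opaque)
    {
        SetIntState *s = opaque;
        *s->var = s->old;               /* roll the value back */
    }

    static TransactionActionDrv set_int_drv = {
        .abort = set_int_abort,
        .clean = g_free,                /* runs after either commit or abort */
    };

    /* Apply immediately, register the undo. */
    static void set_int(Transaction *tran, int *var, int val)
    {
        SetIntState *s = g_new(SetIntState, 1);

        *s = (SetIntState) { .var = var, .old = *var };
        *var = val;
        tran_add(tran, &set_int_drv, s);
    }

    /* Caller side:
     *   Transaction *tran = tran_new();
     *   set_int(tran, &x, 5);
     *   ok ? tran_commit(tran) : tran_abort(tran);
     */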
+ */ +int mod_utf8_codepoint(const char *s, size_t n, char **end) +{ + static int min_cp[5] = { 0x80, 0x800, 0x10000, 0x200000, 0x4000000 }; + const unsigned char *p; + unsigned byte, mask, len, i; + int cp; + + if (n == 0 || *s == 0) { + /* empty sequence */ + *end = (char *)s; + return -1; + } + + p = (const unsigned char *)s; + byte = *p++; + if (byte < 0x80) { + cp = byte; /* one byte sequence */ + } else if (byte >= 0xFE) { + cp = -1; /* impossible bytes 0xFE, 0xFF */ + } else if ((byte & 0x40) == 0) { + cp = -1; /* unexpected continuation byte */ + } else { + /* multi-byte sequence */ + len = 0; + for (mask = 0x80; byte & mask; mask >>= 1) { + len++; + } + assert(len > 1 && len < 7); + cp = byte & (mask - 1); + for (i = 1; i < len; i++) { + byte = i < n ? *p : 0; + if ((byte & 0xC0) != 0x80) { + cp = -1; /* continuation byte missing */ + goto out; + } + p++; + cp <<= 6; + cp |= byte & 0x3F; + } + if (!is_valid_codepoint(cp)) { + cp = -1; + } else if (cp < min_cp[len - 2] && !(cp == 0 && len == 2)) { + cp = -1; /* overlong, not \xC0\x80 */ + } + } + +out: + *end = (char *)p; + return cp; +} + +/** + * mod_utf8_encode: + * @buf: Destination buffer + * @bufsz: size of @buf, at least 5. + * @codepoint: Unicode codepoint to encode + * + * Convert Unicode codepoint @codepoint to modified UTF-8. + * + * Returns: the length of the UTF-8 sequence on success, -1 when + * @codepoint is invalid. + */ +ssize_t mod_utf8_encode(char buf[], size_t bufsz, int codepoint) +{ + assert(bufsz >= 5); + + if (!is_valid_codepoint(codepoint)) { + return -1; + } + + if (codepoint > 0 && codepoint <= 0x7F) { + buf[0] = codepoint & 0x7F; + buf[1] = 0; + return 1; + } + if (codepoint <= 0x7FF) { + buf[0] = 0xC0 | ((codepoint >> 6) & 0x1F); + buf[1] = 0x80 | (codepoint & 0x3F); + buf[2] = 0; + return 2; + } + if (codepoint <= 0xFFFF) { + buf[0] = 0xE0 | ((codepoint >> 12) & 0x0F); + buf[1] = 0x80 | ((codepoint >> 6) & 0x3F); + buf[2] = 0x80 | (codepoint & 0x3F); + buf[3] = 0; + return 3; + } + buf[0] = 0xF0 | ((codepoint >> 18) & 0x07); + buf[1] = 0x80 | ((codepoint >> 12) & 0x3F); + buf[2] = 0x80 | ((codepoint >> 6) & 0x3F); + buf[3] = 0x80 | (codepoint & 0x3F); + buf[4] = 0; + return 4; +} diff --git a/util/uri.c b/util/uri.c new file mode 100644 index 000000000..ff72c6005 --- /dev/null +++ b/util/uri.c @@ -0,0 +1,2314 @@ +/** + * uri.c: set of generic URI related routines + * + * Reference: RFCs 3986, 2732 and 2373 + * + * Copyright (C) 1998-2003 Daniel Veillard. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
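Before uri.c below, a quick illustration of the two unicode.c helpers above, including the one overlong form that modified UTF-8 permits, U+0000 as "\xC0\x80" (a hypothetical test fragment; the expected values follow from the code above):

    char buf[5];
    char *end;

    g_assert(mod_utf8_encode(buf, sizeof(buf), 0x20AC) == 3);   /* U+20AC -> "\xE2\x82\xAC" */
    g_assert(mod_utf8_codepoint(buf, 3, &end) == 0x20AC);
    g_assert(end == buf + 3);

    g_assert(mod_utf8_encode(buf, sizeof(buf), 0) == 2);        /* U+0000 -> "\xC0\x80" */
    g_assert(mod_utf8_codepoint(buf, 2, &end) == 0);            /* the one legal overlong form */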
IN NO EVENT SHALL THE + * DANIEL VEILLARD BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * Except as contained in this notice, the name of Daniel Veillard shall not + * be used in advertising or otherwise to promote the sale, use or other + * dealings in this Software without prior written authorization from him. + * + * daniel@veillard.com + * + ** + * + * Copyright (C) 2007, 2009-2010 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors: + * Richard W.M. Jones <rjones@redhat.com> + * + */ + +#include "qemu/osdep.h" +#include "qemu/cutils.h" + +#include "qemu/uri.h" + +static void uri_clean(URI *uri); + +/* + * Old rule from 2396 used in legacy handling code + * alpha = lowalpha | upalpha + */ +#define IS_ALPHA(x) (IS_LOWALPHA(x) || IS_UPALPHA(x)) + +/* + * lowalpha = "a" | "b" | "c" | "d" | "e" | "f" | "g" | "h" | "i" | "j" | + * "k" | "l" | "m" | "n" | "o" | "p" | "q" | "r" | "s" | "t" | + * "u" | "v" | "w" | "x" | "y" | "z" + */ + +#define IS_LOWALPHA(x) (((x) >= 'a') && ((x) <= 'z')) + +/* + * upalpha = "A" | "B" | "C" | "D" | "E" | "F" | "G" | "H" | "I" | "J" | + * "K" | "L" | "M" | "N" | "O" | "P" | "Q" | "R" | "S" | "T" | + * "U" | "V" | "W" | "X" | "Y" | "Z" + */ +#define IS_UPALPHA(x) (((x) >= 'A') && ((x) <= 'Z')) + +#ifdef IS_DIGIT +#undef IS_DIGIT +#endif +/* + * digit = "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" + */ +#define IS_DIGIT(x) (((x) >= '0') && ((x) <= '9')) + +/* + * alphanum = alpha | digit + */ + +#define IS_ALPHANUM(x) (IS_ALPHA(x) || IS_DIGIT(x)) + +/* + * mark = "-" | "_" | "." | "!" | "~" | "*" | "'" | "(" | ")" + */ + +#define IS_MARK(x) (((x) == '-') || ((x) == '_') || ((x) == '.') || \ + ((x) == '!') || ((x) == '~') || ((x) == '*') || ((x) == '\'') || \ + ((x) == '(') || ((x) == ')')) + +/* + * unwise = "{" | "}" | "|" | "\" | "^" | "`" + */ + +#define IS_UNWISE(p) \ + (((*(p) == '{')) || ((*(p) == '}')) || ((*(p) == '|')) || \ + ((*(p) == '\\')) || ((*(p) == '^')) || ((*(p) == '[')) || \ + ((*(p) == ']')) || ((*(p) == '`'))) +/* + * reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | "$" | "," | + * "[" | "]" + */ + +#define IS_RESERVED(x) (((x) == ';') || ((x) == '/') || ((x) == '?') || \ + ((x) == ':') || ((x) == '@') || ((x) == '&') || ((x) == '=') || \ + ((x) == '+') || ((x) == '$') || ((x) == ',') || ((x) == '[') || \ + ((x) == ']')) + +/* + * unreserved = alphanum | mark + */ + +#define IS_UNRESERVED(x) (IS_ALPHANUM(x) || IS_MARK(x)) + +/* + * Skip to next pointer char, handle escaped sequences + */ + +#define NEXT(p) ((*p == '%') ? p += 3 : p++) + +/* + * Productions from the spec. 
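Worth noting before the parsers: the NEXT(p) macro above advances by one logical character, consuming a full %XX escape in a single step, and every scanner below relies on that. A two-line illustration (hypothetical snippet, not from the file):

    const char *p = "a%2Fb";

    NEXT(p);    /* ordinary byte: p now points at "%2Fb" */
    NEXT(p);    /* escaped byte:  p now points at "b"    */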
+ * + * authority = server | reg_name + * reg_name = 1*( unreserved | escaped | "$" | "," | + * ";" | ":" | "@" | "&" | "=" | "+" ) + * + * path = [ abs_path | opaque_part ] + */ + +/************************************************************************ + * * + * RFC 3986 parser * + * * + ************************************************************************/ + +#define ISA_DIGIT(p) ((*(p) >= '0') && (*(p) <= '9')) +#define ISA_ALPHA(p) (((*(p) >= 'a') && (*(p) <= 'z')) || \ + ((*(p) >= 'A') && (*(p) <= 'Z'))) +#define ISA_HEXDIG(p) \ + (ISA_DIGIT(p) || ((*(p) >= 'a') && (*(p) <= 'f')) || \ + ((*(p) >= 'A') && (*(p) <= 'F'))) + +/* + * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" + * / "*" / "+" / "," / ";" / "=" + */ +#define ISA_SUB_DELIM(p) \ + (((*(p) == '!')) || ((*(p) == '$')) || ((*(p) == '&')) || \ + ((*(p) == '(')) || ((*(p) == ')')) || ((*(p) == '*')) || \ + ((*(p) == '+')) || ((*(p) == ',')) || ((*(p) == ';')) || \ + ((*(p) == '=')) || ((*(p) == '\''))) + +/* + * gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" + */ +#define ISA_GEN_DELIM(p) \ + (((*(p) == ':')) || ((*(p) == '/')) || ((*(p) == '?')) || \ + ((*(p) == '#')) || ((*(p) == '[')) || ((*(p) == ']')) || \ + ((*(p) == '@'))) + +/* + * reserved = gen-delims / sub-delims + */ +#define ISA_RESERVED(p) (ISA_GEN_DELIM(p) || (ISA_SUB_DELIM(p))) + +/* + * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" + */ +#define ISA_UNRESERVED(p) \ + ((ISA_ALPHA(p)) || (ISA_DIGIT(p)) || ((*(p) == '-')) || \ + ((*(p) == '.')) || ((*(p) == '_')) || ((*(p) == '~'))) + +/* + * pct-encoded = "%" HEXDIG HEXDIG + */ +#define ISA_PCT_ENCODED(p) \ + ((*(p) == '%') && (ISA_HEXDIG(p + 1)) && (ISA_HEXDIG(p + 2))) + +/* + * pchar = unreserved / pct-encoded / sub-delims / ":" / "@" + */ +#define ISA_PCHAR(p) \ + (ISA_UNRESERVED(p) || ISA_PCT_ENCODED(p) || ISA_SUB_DELIM(p) || \ + ((*(p) == ':')) || ((*(p) == '@'))) + +/** + * rfc3986_parse_scheme: + * @uri: pointer to an URI structure + * @str: pointer to the string to analyze + * + * Parse an URI scheme + * + * ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) + * + * Returns 0 or the error code + */ +static int rfc3986_parse_scheme(URI *uri, const char **str) +{ + const char *cur; + + if (str == NULL) { + return -1; + } + + cur = *str; + if (!ISA_ALPHA(cur)) { + return 2; + } + cur++; + while (ISA_ALPHA(cur) || ISA_DIGIT(cur) || (*cur == '+') || (*cur == '-') || + (*cur == '.')) { + cur++; + } + if (uri != NULL) { + g_free(uri->scheme); + uri->scheme = g_strndup(*str, cur - *str); + } + *str = cur; + return 0; +} + +/** + * rfc3986_parse_fragment: + * @uri: pointer to an URI structure + * @str: pointer to the string to analyze + * + * Parse the query part of an URI + * + * fragment = *( pchar / "/" / "?" ) + * NOTE: the strict syntax as defined by 3986 does not allow '[' and ']' + * in the fragment identifier but this is used very broadly for + * xpointer scheme selection, so we are allowing it here to not break + * for example all the DocBook processing chains. 
+ * + * Returns 0 or the error code + */ +static int rfc3986_parse_fragment(URI *uri, const char **str) +{ + const char *cur; + + if (str == NULL) { + return -1; + } + + cur = *str; + + while ((ISA_PCHAR(cur)) || (*cur == '/') || (*cur == '?') || + (*cur == '[') || (*cur == ']') || + ((uri != NULL) && (uri->cleanup & 1) && (IS_UNWISE(cur)))) { + NEXT(cur); + } + if (uri != NULL) { + g_free(uri->fragment); + if (uri->cleanup & 2) { + uri->fragment = g_strndup(*str, cur - *str); + } else { + uri->fragment = uri_string_unescape(*str, cur - *str, NULL); + } + } + *str = cur; + return 0; +} + +/** + * rfc3986_parse_query: + * @uri: pointer to an URI structure + * @str: pointer to the string to analyze + * + * Parse the query part of an URI + * + * query = *uric + * + * Returns 0 or the error code + */ +static int rfc3986_parse_query(URI *uri, const char **str) +{ + const char *cur; + + if (str == NULL) { + return -1; + } + + cur = *str; + + while ((ISA_PCHAR(cur)) || (*cur == '/') || (*cur == '?') || + ((uri != NULL) && (uri->cleanup & 1) && (IS_UNWISE(cur)))) { + NEXT(cur); + } + if (uri != NULL) { + g_free(uri->query); + uri->query = g_strndup(*str, cur - *str); + } + *str = cur; + return 0; +} + +/** + * rfc3986_parse_port: + * @uri: pointer to an URI structure + * @str: the string to analyze + * + * Parse a port part and fills in the appropriate fields + * of the @uri structure + * + * port = *DIGIT + * + * Returns 0 or the error code + */ +static int rfc3986_parse_port(URI *uri, const char **str) +{ + const char *cur = *str; + int port = 0; + + if (ISA_DIGIT(cur)) { + while (ISA_DIGIT(cur)) { + port = port * 10 + (*cur - '0'); + if (port > 65535) { + return 1; + } + cur++; + } + if (uri) { + uri->port = port; + } + *str = cur; + return 0; + } + return 1; +} + +/** + * rfc3986_parse_user_info: + * @uri: pointer to an URI structure + * @str: the string to analyze + * + * Parse a user information part and fill in the appropriate fields + * of the @uri structure + * + * userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) + * + * Returns 0 or the error code + */ +static int rfc3986_parse_user_info(URI *uri, const char **str) +{ + const char *cur; + + cur = *str; + while (ISA_UNRESERVED(cur) || ISA_PCT_ENCODED(cur) || ISA_SUB_DELIM(cur) || + (*cur == ':')) { + NEXT(cur); + } + if (*cur == '@') { + if (uri != NULL) { + g_free(uri->user); + if (uri->cleanup & 2) { + uri->user = g_strndup(*str, cur - *str); + } else { + uri->user = uri_string_unescape(*str, cur - *str, NULL); + } + } + *str = cur; + return 0; + } + return 1; +} + +/** + * rfc3986_parse_dec_octet: + * @str: the string to analyze + * + * dec-octet = DIGIT ; 0-9 + * / %x31-39 DIGIT ; 10-99 + * / "1" 2DIGIT ; 100-199 + * / "2" %x30-34 DIGIT ; 200-249 + * / "25" %x30-35 ; 250-255 + * + * Skip a dec-octet. 
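An illustrative acceptance table for the dec-octet rule quoted above (derived from the grammar, not taken from the source):

    /*  "7"     DIGIT               consumes 1 byte
     *  "42"    %x31-39 DIGIT       consumes 2 bytes
     *  "142"   "1" 2DIGIT          consumes 3 bytes
     *  "237"   "2" %x30-34 DIGIT   consumes 3 bytes
     *  "253"   "25" %x30-35        consumes 3 bytes
     *  "256"   no alternative matches; the helper below reports failure
     *          rather than falling back to the shorter "25" match
     */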
+ *
+ * Returns 0 if found and skipped, 1 otherwise
+ */
+static int rfc3986_parse_dec_octet(const char **str)
+{
+    const char *cur = *str;
+
+    if (!(ISA_DIGIT(cur))) {
+        return 1;
+    }
+    if (!ISA_DIGIT(cur + 1)) {
+        cur++;
+    } else if ((*cur != '0') && (ISA_DIGIT(cur + 1)) && (!ISA_DIGIT(cur + 2))) {
+        cur += 2;
+    } else if ((*cur == '1') && (ISA_DIGIT(cur + 1)) && (ISA_DIGIT(cur + 2))) {
+        cur += 3;
+    } else if ((*cur == '2') && (*(cur + 1) >= '0') && (*(cur + 1) <= '4') &&
+               (ISA_DIGIT(cur + 2))) {
+        cur += 3;
+    } else if ((*cur == '2') && (*(cur + 1) == '5') && (*(cur + 2) >= '0') &&
+               (*(cur + 2) <= '5')) {
+        cur += 3;
+    } else {
+        return 1;
+    }
+    *str = cur;
+    return 0;
+}
+/**
+ * rfc3986_parse_host:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse an host part and fills in the appropriate fields
+ * of the @uri structure
+ *
+ * host = IP-literal / IPv4address / reg-name
+ * IP-literal = "[" ( IPv6address / IPvFuture ) "]"
+ * IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
+ * reg-name = *( unreserved / pct-encoded / sub-delims )
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_host(URI *uri, const char **str)
+{
+    const char *cur = *str;
+    const char *host;
+
+    host = cur;
+    /*
+     * IPv6 and future addressing scheme are enclosed between brackets
+     */
+    if (*cur == '[') {
+        cur++;
+        while ((*cur != ']') && (*cur != 0)) {
+            cur++;
+        }
+        if (*cur != ']') {
+            return 1;
+        }
+        cur++;
+        goto found;
+    }
+    /*
+     * try to parse an IPv4
+     */
+    if (ISA_DIGIT(cur)) {
+        if (rfc3986_parse_dec_octet(&cur) != 0) {
+            goto not_ipv4;
+        }
+        if (*cur != '.') {
+            goto not_ipv4;
+        }
+        cur++;
+        if (rfc3986_parse_dec_octet(&cur) != 0) {
+            goto not_ipv4;
+        }
+        if (*cur != '.') {
+            goto not_ipv4;
+        }
+        cur++;
+        if (rfc3986_parse_dec_octet(&cur) != 0) {
+            goto not_ipv4;
+        }
+        if (*cur != '.') {
+            goto not_ipv4;
+        }
+        cur++;
+        if (rfc3986_parse_dec_octet(&cur) != 0) {
+            goto not_ipv4;
+        }
+        goto found;
+    not_ipv4:
+        cur = *str;
+    }
+    /*
+     * then this should be a hostname which can be empty
+     */
+    while (ISA_UNRESERVED(cur) || ISA_PCT_ENCODED(cur) || ISA_SUB_DELIM(cur)) {
+        NEXT(cur);
+    }
+found:
+    if (uri != NULL) {
+        g_free(uri->authority);
+        uri->authority = NULL;
+        g_free(uri->server);
+        if (cur != host) {
+            if (uri->cleanup & 2) {
+                uri->server = g_strndup(host, cur - host);
+            } else {
+                uri->server = uri_string_unescape(host, cur - host, NULL);
+            }
+        } else {
+            uri->server = NULL;
+        }
+    }
+    *str = cur;
+    return 0;
+}
+
+/**
+ * rfc3986_parse_authority:
+ * @uri: pointer to an URI structure
+ * @str: the string to analyze
+ *
+ * Parse an authority part and fills in the appropriate fields
+ * of the @uri structure
+ *
+ * authority = [ userinfo "@" ] host [ ":" port ]
+ *
+ * Returns 0 or the error code
+ */
+static int rfc3986_parse_authority(URI *uri, const char **str)
+{
+    const char *cur;
+    int ret;
+
+    cur = *str;
+    /*
+     * try to parse a userinfo and check for the trailing @
+     */
+    ret = rfc3986_parse_user_info(uri, &cur);
+    if ((ret != 0) || (*cur != '@')) {
+        cur = *str;
+    } else {
+        cur++;
+    }
+    ret = rfc3986_parse_host(uri, &cur);
+    if (ret != 0) {
+        return ret;
+    }
+    if (*cur == ':') {
+        cur++;
+        ret = rfc3986_parse_port(uri, &cur);
+        if (ret != 0) {
+            return ret;
+        }
+    }
+    *str = cur;
+    return 0;
+}
+
+/**
+ * rfc3986_parse_segment:
+ * @str: the string to analyze
+ * @forbid: an optional forbidden character
+ * @empty: allow an empty segment
+ *
+ * Parse a segment and fills in the appropriate fields
+ * of the @uri structure
+ *
+ * segment =
*pchar + * segment-nz = 1*pchar + * segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) + * ; non-zero-length segment without any colon ":" + * + * Returns 0 or the error code + */ +static int rfc3986_parse_segment(const char **str, char forbid, int empty) +{ + const char *cur; + + cur = *str; + if (!ISA_PCHAR(cur)) { + if (empty) { + return 0; + } + return 1; + } + while (ISA_PCHAR(cur) && (*cur != forbid)) { + NEXT(cur); + } + *str = cur; + return 0; +} + +/** + * rfc3986_parse_path_ab_empty: + * @uri: pointer to an URI structure + * @str: the string to analyze + * + * Parse an path absolute or empty and fills in the appropriate fields + * of the @uri structure + * + * path-abempty = *( "/" segment ) + * + * Returns 0 or the error code + */ +static int rfc3986_parse_path_ab_empty(URI *uri, const char **str) +{ + const char *cur; + int ret; + + cur = *str; + + while (*cur == '/') { + cur++; + ret = rfc3986_parse_segment(&cur, 0, 1); + if (ret != 0) { + return ret; + } + } + if (uri != NULL) { + g_free(uri->path); + if (*str != cur) { + if (uri->cleanup & 2) { + uri->path = g_strndup(*str, cur - *str); + } else { + uri->path = uri_string_unescape(*str, cur - *str, NULL); + } + } else { + uri->path = NULL; + } + } + *str = cur; + return 0; +} + +/** + * rfc3986_parse_path_absolute: + * @uri: pointer to an URI structure + * @str: the string to analyze + * + * Parse an path absolute and fills in the appropriate fields + * of the @uri structure + * + * path-absolute = "/" [ segment-nz *( "/" segment ) ] + * + * Returns 0 or the error code + */ +static int rfc3986_parse_path_absolute(URI *uri, const char **str) +{ + const char *cur; + int ret; + + cur = *str; + + if (*cur != '/') { + return 1; + } + cur++; + ret = rfc3986_parse_segment(&cur, 0, 0); + if (ret == 0) { + while (*cur == '/') { + cur++; + ret = rfc3986_parse_segment(&cur, 0, 1); + if (ret != 0) { + return ret; + } + } + } + if (uri != NULL) { + g_free(uri->path); + if (cur != *str) { + if (uri->cleanup & 2) { + uri->path = g_strndup(*str, cur - *str); + } else { + uri->path = uri_string_unescape(*str, cur - *str, NULL); + } + } else { + uri->path = NULL; + } + } + *str = cur; + return 0; +} + +/** + * rfc3986_parse_path_rootless: + * @uri: pointer to an URI structure + * @str: the string to analyze + * + * Parse an path without root and fills in the appropriate fields + * of the @uri structure + * + * path-rootless = segment-nz *( "/" segment ) + * + * Returns 0 or the error code + */ +static int rfc3986_parse_path_rootless(URI *uri, const char **str) +{ + const char *cur; + int ret; + + cur = *str; + + ret = rfc3986_parse_segment(&cur, 0, 0); + if (ret != 0) { + return ret; + } + while (*cur == '/') { + cur++; + ret = rfc3986_parse_segment(&cur, 0, 1); + if (ret != 0) { + return ret; + } + } + if (uri != NULL) { + g_free(uri->path); + if (cur != *str) { + if (uri->cleanup & 2) { + uri->path = g_strndup(*str, cur - *str); + } else { + uri->path = uri_string_unescape(*str, cur - *str, NULL); + } + } else { + uri->path = NULL; + } + } + *str = cur; + return 0; +} + +/** + * rfc3986_parse_path_no_scheme: + * @uri: pointer to an URI structure + * @str: the string to analyze + * + * Parse an path which is not a scheme and fills in the appropriate fields + * of the @uri structure + * + * path-noscheme = segment-nz-nc *( "/" segment ) + * + * Returns 0 or the error code + */ +static int rfc3986_parse_path_no_scheme(URI *uri, const char **str) +{ + const char *cur; + int ret; + + cur = *str; + + ret = 
rfc3986_parse_segment(&cur, ':', 0); + if (ret != 0) { + return ret; + } + while (*cur == '/') { + cur++; + ret = rfc3986_parse_segment(&cur, 0, 1); + if (ret != 0) { + return ret; + } + } + if (uri != NULL) { + g_free(uri->path); + if (cur != *str) { + if (uri->cleanup & 2) { + uri->path = g_strndup(*str, cur - *str); + } else { + uri->path = uri_string_unescape(*str, cur - *str, NULL); + } + } else { + uri->path = NULL; + } + } + *str = cur; + return 0; +} + +/** + * rfc3986_parse_hier_part: + * @uri: pointer to an URI structure + * @str: the string to analyze + * + * Parse an hierarchical part and fills in the appropriate fields + * of the @uri structure + * + * hier-part = "//" authority path-abempty + * / path-absolute + * / path-rootless + * / path-empty + * + * Returns 0 or the error code + */ +static int rfc3986_parse_hier_part(URI *uri, const char **str) +{ + const char *cur; + int ret; + + cur = *str; + + if ((*cur == '/') && (*(cur + 1) == '/')) { + cur += 2; + ret = rfc3986_parse_authority(uri, &cur); + if (ret != 0) { + return ret; + } + ret = rfc3986_parse_path_ab_empty(uri, &cur); + if (ret != 0) { + return ret; + } + *str = cur; + return 0; + } else if (*cur == '/') { + ret = rfc3986_parse_path_absolute(uri, &cur); + if (ret != 0) { + return ret; + } + } else if (ISA_PCHAR(cur)) { + ret = rfc3986_parse_path_rootless(uri, &cur); + if (ret != 0) { + return ret; + } + } else { + /* path-empty is effectively empty */ + if (uri != NULL) { + g_free(uri->path); + uri->path = NULL; + } + } + *str = cur; + return 0; +} + +/** + * rfc3986_parse_relative_ref: + * @uri: pointer to an URI structure + * @str: the string to analyze + * + * Parse an URI string and fills in the appropriate fields + * of the @uri structure + * + * relative-ref = relative-part [ "?" query ] [ "#" fragment ] + * relative-part = "//" authority path-abempty + * / path-absolute + * / path-noscheme + * / path-empty + * + * Returns 0 or the error code + */ +static int rfc3986_parse_relative_ref(URI *uri, const char *str) +{ + int ret; + + if ((*str == '/') && (*(str + 1) == '/')) { + str += 2; + ret = rfc3986_parse_authority(uri, &str); + if (ret != 0) { + return ret; + } + ret = rfc3986_parse_path_ab_empty(uri, &str); + if (ret != 0) { + return ret; + } + } else if (*str == '/') { + ret = rfc3986_parse_path_absolute(uri, &str); + if (ret != 0) { + return ret; + } + } else if (ISA_PCHAR(str)) { + ret = rfc3986_parse_path_no_scheme(uri, &str); + if (ret != 0) { + return ret; + } + } else { + /* path-empty is effectively empty */ + if (uri != NULL) { + g_free(uri->path); + uri->path = NULL; + } + } + + if (*str == '?') { + str++; + ret = rfc3986_parse_query(uri, &str); + if (ret != 0) { + return ret; + } + } + if (*str == '#') { + str++; + ret = rfc3986_parse_fragment(uri, &str); + if (ret != 0) { + return ret; + } + } + if (*str != 0) { + uri_clean(uri); + return 1; + } + return 0; +} + +/** + * rfc3986_parse: + * @uri: pointer to an URI structure + * @str: the string to analyze + * + * Parse an URI string and fills in the appropriate fields + * of the @uri structure + * + * scheme ":" hier-part [ "?" 
query ] [ "#" fragment ] + * + * Returns 0 or the error code + */ +static int rfc3986_parse(URI *uri, const char *str) +{ + int ret; + + ret = rfc3986_parse_scheme(uri, &str); + if (ret != 0) { + return ret; + } + if (*str != ':') { + return 1; + } + str++; + ret = rfc3986_parse_hier_part(uri, &str); + if (ret != 0) { + return ret; + } + if (*str == '?') { + str++; + ret = rfc3986_parse_query(uri, &str); + if (ret != 0) { + return ret; + } + } + if (*str == '#') { + str++; + ret = rfc3986_parse_fragment(uri, &str); + if (ret != 0) { + return ret; + } + } + if (*str != 0) { + uri_clean(uri); + return 1; + } + return 0; +} + +/** + * rfc3986_parse_uri_reference: + * @uri: pointer to an URI structure + * @str: the string to analyze + * + * Parse an URI reference string and fills in the appropriate fields + * of the @uri structure + * + * URI-reference = URI / relative-ref + * + * Returns 0 or the error code + */ +static int rfc3986_parse_uri_reference(URI *uri, const char *str) +{ + int ret; + + if (str == NULL) { + return -1; + } + uri_clean(uri); + + /* + * Try first to parse absolute refs, then fallback to relative if + * it fails. + */ + ret = rfc3986_parse(uri, str); + if (ret != 0) { + uri_clean(uri); + ret = rfc3986_parse_relative_ref(uri, str); + if (ret != 0) { + uri_clean(uri); + return ret; + } + } + return 0; +} + +/** + * uri_parse: + * @str: the URI string to analyze + * + * Parse an URI based on RFC 3986 + * + * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] + * + * Returns a newly built URI or NULL in case of error + */ +URI *uri_parse(const char *str) +{ + URI *uri; + int ret; + + if (str == NULL) { + return NULL; + } + uri = uri_new(); + ret = rfc3986_parse_uri_reference(uri, str); + if (ret) { + uri_free(uri); + return NULL; + } + return uri; +} + +/** + * uri_parse_into: + * @uri: pointer to an URI structure + * @str: the string to analyze + * + * Parse an URI reference string based on RFC 3986 and fills in the + * appropriate fields of the @uri structure + * + * URI-reference = URI / relative-ref + * + * Returns 0 or the error code + */ +int uri_parse_into(URI *uri, const char *str) +{ + return rfc3986_parse_uri_reference(uri, str); +} + +/** + * uri_parse_raw: + * @str: the URI string to analyze + * @raw: if 1 unescaping of URI pieces are disabled + * + * Parse an URI but allows to keep intact the original fragments. 
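A minimal usage sketch of the public entry points defined above, with field names as declared in qemu/uri.h and the serializer uri_to_string() that follows below (the example URI and its field values are illustrative):

    URI *uri = uri_parse("http://alice@example.org:8080/a/b?x=1#top");

    if (uri) {
        /* uri->scheme == "http", uri->user == "alice",
         * uri->server == "example.org", uri->port == 8080,
         * uri->path == "/a/b", uri->query == "x=1", uri->fragment == "top" */
        char *s = uri_to_string(uri);   /* re-escapes and reassembles the URI */
        g_free(s);
        uri_free(uri);
    }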
+ * + * URI-reference = URI / relative-ref + * + * Returns a newly built URI or NULL in case of error + */ +URI *uri_parse_raw(const char *str, int raw) +{ + URI *uri; + int ret; + + if (str == NULL) { + return NULL; + } + uri = uri_new(); + if (raw) { + uri->cleanup |= 2; + } + ret = uri_parse_into(uri, str); + if (ret) { + uri_free(uri); + return NULL; + } + return uri; +} + +/************************************************************************ + * * + * Generic URI structure functions * + * * + ************************************************************************/ + +/** + * uri_new: + * + * Simply creates an empty URI + * + * Returns the new structure or NULL in case of error + */ +URI *uri_new(void) +{ + return g_new0(URI, 1); +} + +/** + * realloc2n: + * + * Function to handle properly a reallocation when saving an URI + * Also imposes some limit on the length of an URI string output + */ +static char *realloc2n(char *ret, int *max) +{ + char *temp; + int tmp; + + tmp = *max * 2; + temp = g_realloc(ret, (tmp + 1)); + *max = tmp; + return temp; +} + +/** + * uri_to_string: + * @uri: pointer to an URI + * + * Save the URI as an escaped string + * + * Returns a new string (to be deallocated by caller) + */ +char *uri_to_string(URI *uri) +{ + char *ret = NULL; + char *temp; + const char *p; + int len; + int max; + + if (uri == NULL) { + return NULL; + } + + max = 80; + ret = g_malloc(max + 1); + len = 0; + + if (uri->scheme != NULL) { + p = uri->scheme; + while (*p != 0) { + if (len >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + ret[len++] = *p++; + } + if (len >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + ret[len++] = ':'; + } + if (uri->opaque != NULL) { + p = uri->opaque; + while (*p != 0) { + if (len + 3 >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + if (IS_RESERVED(*(p)) || IS_UNRESERVED(*(p))) { + ret[len++] = *p++; + } else { + int val = *(unsigned char *)p++; + int hi = val / 0x10, lo = val % 0x10; + ret[len++] = '%'; + ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0'); + ret[len++] = lo + (lo > 9 ? 'A' - 10 : '0'); + } + } + } else { + if (uri->server != NULL) { + if (len + 3 >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + ret[len++] = '/'; + ret[len++] = '/'; + if (uri->user != NULL) { + p = uri->user; + while (*p != 0) { + if (len + 3 >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + if ((IS_UNRESERVED(*(p))) || ((*(p) == ';')) || + ((*(p) == ':')) || ((*(p) == '&')) || ((*(p) == '=')) || + ((*(p) == '+')) || ((*(p) == '$')) || ((*(p) == ','))) { + ret[len++] = *p++; + } else { + int val = *(unsigned char *)p++; + int hi = val / 0x10, lo = val % 0x10; + ret[len++] = '%'; + ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0'); + ret[len++] = lo + (lo > 9 ? 
'A' - 10 : '0'); + } + } + if (len + 3 >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + ret[len++] = '@'; + } + p = uri->server; + while (*p != 0) { + if (len >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + ret[len++] = *p++; + } + if (uri->port > 0) { + if (len + 10 >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + len += snprintf(&ret[len], max - len, ":%d", uri->port); + } + } else if (uri->authority != NULL) { + if (len + 3 >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + ret[len++] = '/'; + ret[len++] = '/'; + p = uri->authority; + while (*p != 0) { + if (len + 3 >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + if ((IS_UNRESERVED(*(p))) || ((*(p) == '$')) || + ((*(p) == ',')) || ((*(p) == ';')) || ((*(p) == ':')) || + ((*(p) == '@')) || ((*(p) == '&')) || ((*(p) == '=')) || + ((*(p) == '+'))) { + ret[len++] = *p++; + } else { + int val = *(unsigned char *)p++; + int hi = val / 0x10, lo = val % 0x10; + ret[len++] = '%'; + ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0'); + ret[len++] = lo + (lo > 9 ? 'A' - 10 : '0'); + } + } + } else if (uri->scheme != NULL) { + if (len + 3 >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + ret[len++] = '/'; + ret[len++] = '/'; + } + if (uri->path != NULL) { + p = uri->path; + /* + * the colon in file:///d: should not be escaped or + * Windows accesses fail later. + */ + if ((uri->scheme != NULL) && (p[0] == '/') && + (((p[1] >= 'a') && (p[1] <= 'z')) || + ((p[1] >= 'A') && (p[1] <= 'Z'))) && + (p[2] == ':') && (!strcmp(uri->scheme, "file"))) { + if (len + 3 >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + ret[len++] = *p++; + ret[len++] = *p++; + ret[len++] = *p++; + } + while (*p != 0) { + if (len + 3 >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + if ((IS_UNRESERVED(*(p))) || ((*(p) == '/')) || + ((*(p) == ';')) || ((*(p) == '@')) || ((*(p) == '&')) || + ((*(p) == '=')) || ((*(p) == '+')) || ((*(p) == '$')) || + ((*(p) == ','))) { + ret[len++] = *p++; + } else { + int val = *(unsigned char *)p++; + int hi = val / 0x10, lo = val % 0x10; + ret[len++] = '%'; + ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0'); + ret[len++] = lo + (lo > 9 ? 'A' - 10 : '0'); + } + } + } + if (uri->query != NULL) { + if (len + 1 >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + ret[len++] = '?'; + p = uri->query; + while (*p != 0) { + if (len + 1 >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + ret[len++] = *p++; + } + } + } + if (uri->fragment != NULL) { + if (len + 3 >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + ret[len++] = '#'; + p = uri->fragment; + while (*p != 0) { + if (len + 3 >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + if ((IS_UNRESERVED(*(p))) || (IS_RESERVED(*(p)))) { + ret[len++] = *p++; + } else { + int val = *(unsigned char *)p++; + int hi = val / 0x10, lo = val % 0x10; + ret[len++] = '%'; + ret[len++] = hi + (hi > 9 ? 'A' - 10 : '0'); + ret[len++] = lo + (lo > 9 ? 
'A' - 10 : '0'); + } + } + } + if (len >= max) { + temp = realloc2n(ret, &max); + ret = temp; + } + ret[len] = 0; + return ret; +} + +/** + * uri_clean: + * @uri: pointer to an URI + * + * Make sure the URI struct is free of content + */ +static void uri_clean(URI *uri) +{ + if (uri == NULL) { + return; + } + + g_free(uri->scheme); + uri->scheme = NULL; + g_free(uri->server); + uri->server = NULL; + g_free(uri->user); + uri->user = NULL; + g_free(uri->path); + uri->path = NULL; + g_free(uri->fragment); + uri->fragment = NULL; + g_free(uri->opaque); + uri->opaque = NULL; + g_free(uri->authority); + uri->authority = NULL; + g_free(uri->query); + uri->query = NULL; +} + +/** + * uri_free: + * @uri: pointer to an URI, NULL is ignored + * + * Free up the URI struct + */ +void uri_free(URI *uri) +{ + uri_clean(uri); + g_free(uri); +} + +/************************************************************************ + * * + * Helper functions * + * * + ************************************************************************/ + +/** + * normalize_uri_path: + * @path: pointer to the path string + * + * Applies the 5 normalization steps to a path string--that is, RFC 2396 + * Section 5.2, steps 6.c through 6.g. + * + * Normalization occurs directly on the string, no new allocation is done + * + * Returns 0 or an error code + */ +static int normalize_uri_path(char *path) +{ + char *cur, *out; + + if (path == NULL) { + return -1; + } + + /* Skip all initial "/" chars. We want to get to the beginning of the + * first non-empty segment. + */ + cur = path; + while (cur[0] == '/') { + ++cur; + } + if (cur[0] == '\0') { + return 0; + } + + /* Keep everything we've seen so far. */ + out = cur; + + /* + * Analyze each segment in sequence for cases (c) and (d). + */ + while (cur[0] != '\0') { + /* + * c) All occurrences of "./", where "." is a complete path segment, + * are removed from the buffer string. + */ + if ((cur[0] == '.') && (cur[1] == '/')) { + cur += 2; + /* '//' normalization should be done at this point too */ + while (cur[0] == '/') { + cur++; + } + continue; + } + + /* + * d) If the buffer string ends with "." as a complete path segment, + * that "." is removed. + */ + if ((cur[0] == '.') && (cur[1] == '\0')) { + break; + } + + /* Otherwise keep the segment. */ + while (cur[0] != '/') { + if (cur[0] == '\0') { + goto done_cd; + } + (out++)[0] = (cur++)[0]; + } + /* nomalize // */ + while ((cur[0] == '/') && (cur[1] == '/')) { + cur++; + } + + (out++)[0] = (cur++)[0]; + } +done_cd: + out[0] = '\0'; + + /* Reset to the beginning of the first segment for the next sequence. */ + cur = path; + while (cur[0] == '/') { + ++cur; + } + if (cur[0] == '\0') { + return 0; + } + + /* + * Analyze each segment in sequence for cases (e) and (f). + * + * e) All occurrences of "<segment>/../", where <segment> is a + * complete path segment not equal to "..", are removed from the + * buffer string. Removal of these path segments is performed + * iteratively, removing the leftmost matching pattern on each + * iteration, until no matching pattern remains. + * + * f) If the buffer string ends with "<segment>/..", where <segment> + * is a complete path segment not equal to "..", that + * "<segment>/.." is removed. + * + * To satisfy the "iterative" clause in (e), we need to collapse the + * string every time we find something that needs to be removed. Thus, + * we don't need to keep two pointers into the string: we only need a + * "current position" pointer. 
+ */ + while (1) { + char *segp, *tmp; + + /* At the beginning of each iteration of this loop, "cur" points to + * the first character of the segment we want to examine. + */ + + /* Find the end of the current segment. */ + segp = cur; + while ((segp[0] != '/') && (segp[0] != '\0')) { + ++segp; + } + + /* If this is the last segment, we're done (we need at least two + * segments to meet the criteria for the (e) and (f) cases). + */ + if (segp[0] == '\0') { + break; + } + + /* If the first segment is "..", or if the next segment _isn't_ "..", + * keep this segment and try the next one. + */ + ++segp; + if (((cur[0] == '.') && (cur[1] == '.') && (segp == cur + 3)) || + ((segp[0] != '.') || (segp[1] != '.') || + ((segp[2] != '/') && (segp[2] != '\0')))) { + cur = segp; + continue; + } + + /* If we get here, remove this segment and the next one and back up + * to the previous segment (if there is one), to implement the + * "iteratively" clause. It's pretty much impossible to back up + * while maintaining two pointers into the buffer, so just compact + * the whole buffer now. + */ + + /* If this is the end of the buffer, we're done. */ + if (segp[2] == '\0') { + cur[0] = '\0'; + break; + } + /* Valgrind complained, strcpy(cur, segp + 3); */ + /* string will overlap, do not use strcpy */ + tmp = cur; + segp += 3; + while ((*tmp++ = *segp++) != 0) { + /* No further work */ + } + + /* If there are no previous segments, then keep going from here. */ + segp = cur; + while ((segp > path) && ((--segp)[0] == '/')) { + /* No further work */ + } + if (segp == path) { + continue; + } + + /* "segp" is pointing to the end of a previous segment; find it's + * start. We need to back up to the previous segment and start + * over with that to handle things like "foo/bar/../..". If we + * don't do this, then on the first pass we'll remove the "bar/..", + * but be pointing at the second ".." so we won't realize we can also + * remove the "foo/..". + */ + cur = segp; + while ((cur > path) && (cur[-1] != '/')) { + --cur; + } + } + out[0] = '\0'; + + /* + * g) If the resulting buffer string still begins with one or more + * complete path segments of "..", then the reference is + * considered to be in error. Implementations may handle this + * error by retaining these components in the resolved path (i.e., + * treating them as part of the final URI), by removing them from + * the resolved path (i.e., discarding relative levels above the + * root), or by avoiding traversal of the reference. + * + * We discard them from the final path. + */ + if (path[0] == '/') { + cur = path; + while ((cur[0] == '/') && (cur[1] == '.') && (cur[2] == '.') && + ((cur[3] == '/') || (cur[3] == '\0'))) { + cur += 3; + } + + if (cur != path) { + out = path; + while (cur[0] != '\0') { + (out++)[0] = (cur++)[0]; + } + out[0] = 0; + } + } + + return 0; +} + +static int is_hex(char c) +{ + if (((c >= '0') && (c <= '9')) || ((c >= 'a') && (c <= 'f')) || + ((c >= 'A') && (c <= 'F'))) { + return 1; + } + return 0; +} + +/** + * uri_string_unescape: + * @str: the string to unescape + * @len: the length in bytes to unescape (or <= 0 to indicate full string) + * @target: optional destination buffer + * + * Unescaping routine, but does not check that the string is an URI. The + * output is a direct unsigned char translation of %XX values (no encoding) + * Note that the length of the result can only be smaller or same size as + * the input string. 
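To make steps (c) through (g) concrete, a few illustrative inputs and what the file-local normalize_uri_path() above reduces them to (worked out from the rules, not taken from the source):

    char a[] = "/a/./b//c/../d";    /* -> "/a/b/d": "./" and "//" dropped, "c/.." collapsed */
    char b[] = "foo/bar/../baz";    /* -> "foo/baz": step (e) */
    char c[] = "/../a";             /* -> "/a": leading "/.." discarded, step (g) */

    normalize_uri_path(a);
    normalize_uri_path(b);
    normalize_uri_path(c);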
+ *
+ * Returns a copy of the string, but unescaped; returns NULL only in case
+ * of error
+ */
+char *uri_string_unescape(const char *str, int len, char *target)
+{
+    char *ret, *out;
+    const char *in;
+
+    if (str == NULL) {
+        return NULL;
+    }
+    if (len <= 0) {
+        len = strlen(str);
+    }
+    if (len < 0) {
+        return NULL;
+    }
+
+    if (target == NULL) {
+        ret = g_malloc(len + 1);
+    } else {
+        ret = target;
+    }
+    in = str;
+    out = ret;
+    while (len > 0) {
+        if ((len > 2) && (*in == '%') && (is_hex(in[1])) && (is_hex(in[2]))) {
+            in++;
+            if ((*in >= '0') && (*in <= '9')) {
+                *out = (*in - '0');
+            } else if ((*in >= 'a') && (*in <= 'f')) {
+                *out = (*in - 'a') + 10;
+            } else if ((*in >= 'A') && (*in <= 'F')) {
+                *out = (*in - 'A') + 10;
+            }
+            in++;
+            if ((*in >= '0') && (*in <= '9')) {
+                *out = *out * 16 + (*in - '0');
+            } else if ((*in >= 'a') && (*in <= 'f')) {
+                *out = *out * 16 + (*in - 'a') + 10;
+            } else if ((*in >= 'A') && (*in <= 'F')) {
+                *out = *out * 16 + (*in - 'A') + 10;
+            }
+            in++;
+            len -= 3;
+            out++;
+        } else {
+            *out++ = *in++;
+            len--;
+        }
+    }
+    *out = 0;
+    return ret;
+}
+
+/**
+ * uri_string_escape:
+ * @str: string to escape
+ * @list: exception list string of chars not to escape
+ *
+ * This routine escapes a string to hex, ignoring unreserved characters
+ * and the characters in the exception list.
+ *
+ * Returns a new escaped string or NULL in case of error.
+ */
+char *uri_string_escape(const char *str, const char *list)
+{
+    char *ret, ch;
+    char *temp;
+    const char *in;
+    int len, out;
+
+    if (str == NULL) {
+        return NULL;
+    }
+    if (str[0] == 0) {
+        return g_strdup(str);
+    }
+    len = strlen(str);
+    if (!(len > 0)) {
+        return NULL;
+    }
+
+    len += 20;
+    ret = g_malloc(len);
+    in = str;
+    out = 0;
+    while (*in != 0) {
+        if (len - out <= 3) {
+            temp = realloc2n(ret, &len);
+            ret = temp;
+        }
+
+        ch = *in;
+
+        if ((ch != '@') && (!IS_UNRESERVED(ch)) && (!strchr(list, ch))) {
+            unsigned char val;
+            ret[out++] = '%';
+            val = ch >> 4;
+            if (val <= 9) {
+                ret[out++] = '0' + val;
+            } else {
+                ret[out++] = 'A' + val - 0xA;
+            }
+            val = ch & 0xF;
+            if (val <= 9) {
+                ret[out++] = '0' + val;
+            } else {
+                ret[out++] = 'A' + val - 0xA;
+            }
+            in++;
+        } else {
+            ret[out++] = *in++;
+        }
+    }
+    ret[out] = 0;
+    return ret;
+}
+
+/************************************************************************
+ *                                                                      *
+ *                           Public functions                           *
+ *                                                                      *
+ ************************************************************************/
+
+/**
+ * uri_resolve:
+ * @URI: the URI instance found in the document
+ * @base: the base value
+ *
+ * Computes the final URI of the reference by checking that
+ * the given URI is valid, and building the final URI using the
+ * base URI. This is processed according to section 5.2 of the
+ * RFC 2396
+ *
+ * 5.2. Resolving Relative References to Absolute Form
+ *
+ * Returns a new URI string (to be freed by the caller) or NULL in case
+ * of error.
+ */
+char *uri_resolve(const char *uri, const char *base)
+{
+    char *val = NULL;
+    int ret, len, indx, cur, out;
+    URI *ref = NULL;
+    URI *bas = NULL;
+    URI *res = NULL;
+
+    /*
+     * 1) The URI reference is parsed into the potential four components and
+     *    fragment identifier, as described in Section 4.3.
+     *
+     *    NOTE that a completely empty URI is treated by modern browsers
+     *    as a reference to "." rather than as a synonym for the current
+     *    URI. Should we do that here?
+ */ + if (uri == NULL) { + ret = -1; + } else { + if (*uri) { + ref = uri_new(); + ret = uri_parse_into(ref, uri); + } else { + ret = 0; + } + } + if (ret != 0) { + goto done; + } + if ((ref != NULL) && (ref->scheme != NULL)) { + /* + * The URI is absolute don't modify. + */ + val = g_strdup(uri); + goto done; + } + if (base == NULL) { + ret = -1; + } else { + bas = uri_new(); + ret = uri_parse_into(bas, base); + } + if (ret != 0) { + if (ref) { + val = uri_to_string(ref); + } + goto done; + } + if (ref == NULL) { + /* + * the base fragment must be ignored + */ + g_free(bas->fragment); + bas->fragment = NULL; + val = uri_to_string(bas); + goto done; + } + + /* + * 2) If the path component is empty and the scheme, authority, and + * query components are undefined, then it is a reference to the + * current document and we are done. Otherwise, the reference URI's + * query and fragment components are defined as found (or not found) + * within the URI reference and not inherited from the base URI. + * + * NOTE that in modern browsers, the parsing differs from the above + * in the following aspect: the query component is allowed to be + * defined while still treating this as a reference to the current + * document. + */ + res = uri_new(); + if ((ref->scheme == NULL) && (ref->path == NULL) && + ((ref->authority == NULL) && (ref->server == NULL))) { + res->scheme = g_strdup(bas->scheme); + if (bas->authority != NULL) { + res->authority = g_strdup(bas->authority); + } else if (bas->server != NULL) { + res->server = g_strdup(bas->server); + res->user = g_strdup(bas->user); + res->port = bas->port; + } + res->path = g_strdup(bas->path); + if (ref->query != NULL) { + res->query = g_strdup(ref->query); + } else { + res->query = g_strdup(bas->query); + } + res->fragment = g_strdup(ref->fragment); + goto step_7; + } + + /* + * 3) If the scheme component is defined, indicating that the reference + * starts with a scheme name, then the reference is interpreted as an + * absolute URI and we are done. Otherwise, the reference URI's + * scheme is inherited from the base URI's scheme component. + */ + if (ref->scheme != NULL) { + val = uri_to_string(ref); + goto done; + } + res->scheme = g_strdup(bas->scheme); + + res->query = g_strdup(ref->query); + res->fragment = g_strdup(ref->fragment); + + /* + * 4) If the authority component is defined, then the reference is a + * network-path and we skip to step 7. Otherwise, the reference + * URI's authority is inherited from the base URI's authority + * component, which will also be undefined if the URI scheme does not + * use an authority component. + */ + if ((ref->authority != NULL) || (ref->server != NULL)) { + if (ref->authority != NULL) { + res->authority = g_strdup(ref->authority); + } else { + res->server = g_strdup(ref->server); + res->user = g_strdup(ref->user); + res->port = ref->port; + } + res->path = g_strdup(ref->path); + goto step_7; + } + if (bas->authority != NULL) { + res->authority = g_strdup(bas->authority); + } else if (bas->server != NULL) { + res->server = g_strdup(bas->server); + res->user = g_strdup(bas->user); + res->port = bas->port; + } + + /* + * 5) If the path component begins with a slash character ("/"), then + * the reference is an absolute-path and we skip to step 7. + */ + if ((ref->path != NULL) && (ref->path[0] == '/')) { + res->path = g_strdup(ref->path); + goto step_7; + } + + /* + * 6) If this step is reached, then we are resolving a relative-path + * reference. 
The relative path needs to be merged with the base + * URI's path. Although there are many ways to do this, we will + * describe a simple method using a separate string buffer. + * + * Allocate a buffer large enough for the result string. + */ + len = 2; /* extra / and 0 */ + if (ref->path != NULL) { + len += strlen(ref->path); + } + if (bas->path != NULL) { + len += strlen(bas->path); + } + res->path = g_malloc(len); + res->path[0] = 0; + + /* + * a) All but the last segment of the base URI's path component is + * copied to the buffer. In other words, any characters after the + * last (right-most) slash character, if any, are excluded. + */ + cur = 0; + out = 0; + if (bas->path != NULL) { + while (bas->path[cur] != 0) { + while ((bas->path[cur] != 0) && (bas->path[cur] != '/')) { + cur++; + } + if (bas->path[cur] == 0) { + break; + } + + cur++; + while (out < cur) { + res->path[out] = bas->path[out]; + out++; + } + } + } + res->path[out] = 0; + + /* + * b) The reference's path component is appended to the buffer + * string. + */ + if (ref->path != NULL && ref->path[0] != 0) { + indx = 0; + /* + * Ensure the path includes a '/' + */ + if ((out == 0) && (bas->server != NULL)) { + res->path[out++] = '/'; + } + while (ref->path[indx] != 0) { + res->path[out++] = ref->path[indx++]; + } + } + res->path[out] = 0; + + /* + * Steps c) to h) are really path normalization steps + */ + normalize_uri_path(res->path); + +step_7: + + /* + * 7) The resulting URI components, including any inherited from the + * base URI, are recombined to give the absolute form of the URI + * reference. + */ + val = uri_to_string(res); + +done: + uri_free(ref); + uri_free(bas); + uri_free(res); + return val; +} + +/** + * uri_resolve_relative: + * @URI: the URI reference under consideration + * @base: the base value + * + * Expresses the URI of the reference in terms relative to the + * base. Some examples of this operation include: + * base = "http://site1.com/docs/book1.html" + * URI input URI returned + * docs/pic1.gif pic1.gif + * docs/img/pic1.gif img/pic1.gif + * img/pic1.gif ../img/pic1.gif + * http://site1.com/docs/pic1.gif pic1.gif + * http://site2.com/docs/pic1.gif http://site2.com/docs/pic1.gif + * + * base = "docs/book1.html" + * URI input URI returned + * docs/pic1.gif pic1.gif + * docs/img/pic1.gif img/pic1.gif + * img/pic1.gif ../img/pic1.gif + * http://site1.com/docs/pic1.gif http://site1.com/docs/pic1.gif + * + * + * Note: if the URI reference is really weird or complicated, it may be + * worthwhile to first convert it into a "nice" one by calling + * uri_resolve (using 'base') before calling this routine, + * since this routine (for reasonable efficiency) assumes URI has + * already been through some validation. + * + * Returns a new URI string (to be freed by the caller) or NULL in case + * error. 
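Putting the public resolution and escaping helpers together, a short illustrative sequence (the expected strings are worked out from the rules and from the table above, not taken from the source):

    char *abs = uri_resolve("../img/pic1.gif", "http://site1.com/docs/book1.html");
    /* abs == "http://site1.com/img/pic1.gif" */

    char *rel = uri_resolve_relative("http://site1.com/docs/pic1.gif",
                                     "http://site1.com/docs/book1.html");
    /* rel == "pic1.gif", per the table above */

    char *esc = uri_string_escape("a b/c", "/");    /* "a%20b/c": '/' is exempted */
    char *raw = uri_string_unescape(esc, 0, NULL);  /* back to "a b/c" */

    g_free(abs);
    g_free(rel);
    g_free(esc);
    g_free(raw);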
+ */ +char *uri_resolve_relative(const char *uri, const char *base) +{ + char *val = NULL; + int ret; + int ix; + int pos = 0; + int nbslash = 0; + int len; + URI *ref = NULL; + URI *bas = NULL; + char *bptr, *uptr, *vptr; + int remove_path = 0; + + if ((uri == NULL) || (*uri == 0)) { + return NULL; + } + + /* + * First parse URI into a standard form + */ + ref = uri_new(); + /* If URI not already in "relative" form */ + if (uri[0] != '.') { + ret = uri_parse_into(ref, uri); + if (ret != 0) { + goto done; /* Error in URI, return NULL */ + } + } else { + ref->path = g_strdup(uri); + } + + /* + * Next parse base into the same standard form + */ + if ((base == NULL) || (*base == 0)) { + val = g_strdup(uri); + goto done; + } + bas = uri_new(); + if (base[0] != '.') { + ret = uri_parse_into(bas, base); + if (ret != 0) { + goto done; /* Error in base, return NULL */ + } + } else { + bas->path = g_strdup(base); + } + + /* + * If the scheme / server on the URI differs from the base, + * just return the URI + */ + if ((ref->scheme != NULL) && + ((bas->scheme == NULL) || (strcmp(bas->scheme, ref->scheme)) || + (strcmp(bas->server, ref->server)))) { + val = g_strdup(uri); + goto done; + } + if (bas->path == ref->path || + (bas->path && ref->path && !strcmp(bas->path, ref->path))) { + val = g_strdup(""); + goto done; + } + if (bas->path == NULL) { + val = g_strdup(ref->path); + goto done; + } + if (ref->path == NULL) { + ref->path = (char *)"/"; + remove_path = 1; + } + + /* + * At this point (at last!) we can compare the two paths + * + * First we take care of the special case where either of the + * two path components may be missing (bug 316224) + */ + if (bas->path == NULL) { + if (ref->path != NULL) { + uptr = ref->path; + if (*uptr == '/') { + uptr++; + } + /* exception characters from uri_to_string */ + val = uri_string_escape(uptr, "/;&=+$,"); + } + goto done; + } + bptr = bas->path; + if (ref->path == NULL) { + for (ix = 0; bptr[ix] != 0; ix++) { + if (bptr[ix] == '/') { + nbslash++; + } + } + uptr = NULL; + len = 1; /* this is for a string terminator only */ + } else { + /* + * Next we compare the two strings and find where they first differ + */ + if ((ref->path[pos] == '.') && (ref->path[pos + 1] == '/')) { + pos += 2; + } + if ((*bptr == '.') && (bptr[1] == '/')) { + bptr += 2; + } else if ((*bptr == '/') && (ref->path[pos] != '/')) { + bptr++; + } + while ((bptr[pos] == ref->path[pos]) && (bptr[pos] != 0)) { + pos++; + } + + if (bptr[pos] == ref->path[pos]) { + val = g_strdup(""); + goto done; /* (I can't imagine why anyone would do this) */ + } + + /* + * In URI, "back up" to the last '/' encountered. 
This will be the + * beginning of the "unique" suffix of URI + */ + ix = pos; + if ((ref->path[ix] == '/') && (ix > 0)) { + ix--; + } else if ((ref->path[ix] == 0) && (ix > 1) + && (ref->path[ix - 1] == '/')) { + ix -= 2; + } + for (; ix > 0; ix--) { + if (ref->path[ix] == '/') { + break; + } + } + if (ix == 0) { + uptr = ref->path; + } else { + ix++; + uptr = &ref->path[ix]; + } + + /* + * In base, count the number of '/' from the differing point + */ + if (bptr[pos] != ref->path[pos]) { /* check for trivial URI == base */ + for (; bptr[ix] != 0; ix++) { + if (bptr[ix] == '/') { + nbslash++; + } + } + } + len = strlen(uptr) + 1; + } + + if (nbslash == 0) { + if (uptr != NULL) { + /* exception characters from uri_to_string */ + val = uri_string_escape(uptr, "/;&=+$,"); + } + goto done; + } + + /* + * Allocate just enough space for the returned string - + * length of the remainder of the URI, plus enough space + * for the "../" groups, plus one for the terminator + */ + val = g_malloc(len + 3 * nbslash); + vptr = val; + /* + * Put in as many "../" as needed + */ + for (; nbslash > 0; nbslash--) { + *vptr++ = '.'; + *vptr++ = '.'; + *vptr++ = '/'; + } + /* + * Finish up with the end of the URI + */ + if (uptr != NULL) { + if ((vptr > val) && (len > 0) && (uptr[0] == '/') && + (vptr[-1] == '/')) { + memcpy(vptr, uptr + 1, len - 1); + vptr[len - 2] = 0; + } else { + memcpy(vptr, uptr, len); + vptr[len - 1] = 0; + } + } else { + vptr[len - 1] = 0; + } + + /* escape the freshly-built path */ + vptr = val; + /* exception characters from uri_to_string */ + val = uri_string_escape(vptr, "/;&=+$,"); + g_free(vptr); + +done: + /* + * Free the working variables + */ + if (remove_path != 0) { + ref->path = NULL; + } + uri_free(ref); + uri_free(bas); + + return val; +} + +/* + * Utility functions to help parse and assemble query strings. + */ + +struct QueryParams *query_params_new(int init_alloc) +{ + struct QueryParams *ps; + + if (init_alloc <= 0) { + init_alloc = 1; + } + + ps = g_new(QueryParams, 1); + ps->n = 0; + ps->alloc = init_alloc; + ps->p = g_new(QueryParam, ps->alloc); + + return ps; +} + +/* Ensure there is space to store at least one more parameter + * at the end of the set. + */ +static int query_params_append(struct QueryParams *ps, const char *name, + const char *value) +{ + if (ps->n >= ps->alloc) { + ps->p = g_renew(QueryParam, ps->p, ps->alloc * 2); + ps->alloc *= 2; + } + + ps->p[ps->n].name = g_strdup(name); + ps->p[ps->n].value = g_strdup(value); + ps->p[ps->n].ignore = 0; + ps->n++; + + return 0; +} + +void query_params_free(struct QueryParams *ps) +{ + int i; + + for (i = 0; i < ps->n; ++i) { + g_free(ps->p[i].name); + g_free(ps->p[i].value); + } + g_free(ps->p); + g_free(ps); +} + +struct QueryParams *query_params_parse(const char *query) +{ + struct QueryParams *ps; + const char *end, *eq; + + ps = query_params_new(0); + if (!query || query[0] == '\0') { + return ps; + } + + while (*query) { + char *name = NULL, *value = NULL; + + /* Find the next separator, or end of the string. */ + end = strchr(query, '&'); + if (!end) { + end = qemu_strchrnul(query, ';'); + } + + /* Find the first '=' character between here and end. */ + eq = strchr(query, '='); + if (eq && eq >= end) { + eq = NULL; + } + + /* Empty section (eg. "&&"). */ + if (end == query) { + goto next; + } + + /* If there is no '=' character, then we have just "name" + * and consistent with CGI.pm we assume value is "". 
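An illustrative call, with the expected contents worked out from the four section shapes handled in the cases that follow:

    QueryParams *qp = query_params_parse("a=1&b=&c");

    /* qp->n == 3:
     *   qp->p[0] = { .name = "a", .value = "1"  }
     *   qp->p[1] = { .name = "b", .value = ""   }
     *   qp->p[2] = { .name = "c", .value = NULL }
     */
    query_params_free(qp);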
+ */ + else if (!eq) { + name = uri_string_unescape(query, end - query, NULL); + value = NULL; + } + /* Or if we have "name=" here (works around annoying + * problem when calling uri_string_unescape with len = 0). + */ + else if (eq + 1 == end) { + name = uri_string_unescape(query, eq - query, NULL); + value = g_new0(char, 1); + } + /* If the '=' character is at the beginning then we have + * "=value" and consistent with CGI.pm we _ignore_ this. + */ + else if (query == eq) { + goto next; + } + + /* Otherwise it's "name=value". */ + else { + name = uri_string_unescape(query, eq - query, NULL); + value = uri_string_unescape(eq + 1, end - (eq + 1), NULL); + } + + /* Append to the parameter set. */ + query_params_append(ps, name, value); + g_free(name); + g_free(value); + + next: + query = end; + if (*query) { + query++; /* skip '&' separator */ + } + } + + return ps; +} diff --git a/util/userfaultfd.c b/util/userfaultfd.c new file mode 100644 index 000000000..f1cd6af2b --- /dev/null +++ b/util/userfaultfd.c @@ -0,0 +1,345 @@ +/* + * Linux UFFD-WP support + * + * Copyright Virtuozzo GmbH, 2020 + * + * Authors: + * Andrey Gruzdev <andrey.gruzdev@virtuozzo.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include "qemu/bitops.h" +#include "qemu/error-report.h" +#include "qemu/userfaultfd.h" +#include "trace.h" +#include <poll.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> + +/** + * uffd_query_features: query UFFD features + * + * Returns: 0 on success, negative value in case of an error + * + * @features: parameter to receive 'uffdio_api.features' + */ +int uffd_query_features(uint64_t *features) +{ + int uffd_fd; + struct uffdio_api api_struct = { 0 }; + int ret = -1; + + uffd_fd = syscall(__NR_userfaultfd, O_CLOEXEC); + if (uffd_fd < 0) { + trace_uffd_query_features_nosys(errno); + return -1; + } + + api_struct.api = UFFD_API; + api_struct.features = 0; + + if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { + trace_uffd_query_features_api_failed(errno); + goto out; + } + *features = api_struct.features; + ret = 0; + +out: + close(uffd_fd); + return ret; +} + +/** + * uffd_create_fd: create UFFD file descriptor + * + * Returns non-negative file descriptor or negative value in case of an error + * + * @features: UFFD features to request + * @non_blocking: create UFFD file descriptor for non-blocking operation + */ +int uffd_create_fd(uint64_t features, bool non_blocking) +{ + int uffd_fd; + int flags; + struct uffdio_api api_struct = { 0 }; + uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER); + + flags = O_CLOEXEC | (non_blocking ? 
O_NONBLOCK : 0); + uffd_fd = syscall(__NR_userfaultfd, flags); + if (uffd_fd < 0) { + trace_uffd_create_fd_nosys(errno); + return -1; + } + + api_struct.api = UFFD_API; + api_struct.features = features; + if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) { + trace_uffd_create_fd_api_failed(errno); + goto fail; + } + if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) { + trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls); + goto fail; + } + + return uffd_fd; + +fail: + close(uffd_fd); + return -1; +} + +/** + * uffd_close_fd: close UFFD file descriptor + * + * @uffd_fd: UFFD file descriptor + */ +void uffd_close_fd(int uffd_fd) +{ + assert(uffd_fd >= 0); + close(uffd_fd); +} + +/** + * uffd_register_memory: register memory range via UFFD-IO + * + * Returns 0 in case of success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address of memory range + * @length: length of memory range + * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...) + * @ioctls: optional pointer to receive supported IOCTL mask + */ +int uffd_register_memory(int uffd_fd, void *addr, uint64_t length, + uint64_t mode, uint64_t *ioctls) +{ + struct uffdio_register uffd_register; + + uffd_register.range.start = (uintptr_t) addr; + uffd_register.range.len = length; + uffd_register.mode = mode; + + if (ioctl(uffd_fd, UFFDIO_REGISTER, &uffd_register)) { + trace_uffd_register_memory_failed(addr, length, mode, errno); + return -1; + } + if (ioctls) { + *ioctls = uffd_register.ioctls; + } + + return 0; +} + +/** + * uffd_unregister_memory: un-register memory range with UFFD-IO + * + * Returns 0 in case of success, negative value in case of an error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address of memory range + * @length: length of memory range + */ +int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length) +{ + struct uffdio_range uffd_range; + + uffd_range.start = (uintptr_t) addr; + uffd_range.len = length; + + if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &uffd_range)) { + trace_uffd_unregister_memory_failed(addr, length, errno); + return -1; + } + + return 0; +} + +/** + * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO + * + * Returns 0 on success, negative value in case of error + * + * @uffd_fd: UFFD file descriptor + * @addr: base address of memory range + * @length: length of memory range + * @wp: write-protect/unprotect + * @dont_wake: do not wake threads waiting on wr-protected page + */ +int uffd_change_protection(int uffd_fd, void *addr, uint64_t length, + bool wp, bool dont_wake) +{ + struct uffdio_writeprotect uffd_writeprotect; + + uffd_writeprotect.range.start = (uintptr_t) addr; + uffd_writeprotect.range.len = length; + if (!wp && dont_wake) { + /* DONTWAKE is meaningful only on protection release */ + uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE; + } else { + uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0); + } + + if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { + error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64 + " mode=%" PRIx64 " errno=%i", addr, length, + (uint64_t) uffd_writeprotect.mode, errno); + return -1; + } + + return 0; +} + +/** + * uffd_copy_page: copy range of pages to destination via UFFD-IO + * + * Copy range of source pages to the destination to resolve + * missing page fault somewhere in the destination range. 
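+ *
+ * Illustrative use from a fault-handling thread (sketch only; fault_page,
+ * staging_page and page_size are hypothetical caller state):
+ *
+ *     struct uffd_msg msg;
+ *     if (uffd_read_events(uffd_fd, &msg, 1) == 1 &&
+ *         msg.event == UFFD_EVENT_PAGEFAULT) {
+ *         (derive fault_page from msg.arg.pagefault.address)
+ *         uffd_copy_page(uffd_fd, fault_page, staging_page, page_size, false);
+ *     }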
+ *
+ * Returns 0 on success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @dst_addr: destination base address
+ * @src_addr: source base address
+ * @length: length of the range to copy
+ * @dont_wake: do not wake threads waiting on missing page
+ */
+int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
+                   uint64_t length, bool dont_wake)
+{
+    struct uffdio_copy uffd_copy;
+
+    uffd_copy.dst = (uintptr_t) dst_addr;
+    uffd_copy.src = (uintptr_t) src_addr;
+    uffd_copy.len = length;
+    uffd_copy.mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0;
+
+    if (ioctl(uffd_fd, UFFDIO_COPY, &uffd_copy)) {
+        error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
+                     " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
+                     length, (uint64_t) uffd_copy.mode, errno);
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
+ *
+ * Fill a range of pages with zeroes to resolve a missing page fault
+ * within the range.
+ *
+ * Returns 0 on success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address
+ * @length: length of the range to fill with zeroes
+ * @dont_wake: do not wake threads waiting on missing page
+ */
+int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
+{
+    struct uffdio_zeropage uffd_zeropage;
+
+    uffd_zeropage.range.start = (uintptr_t) addr;
+    uffd_zeropage.range.len = length;
+    uffd_zeropage.mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0;
+
+    if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &uffd_zeropage)) {
+        error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
+                     " mode=%" PRIx64 " errno=%i", addr, length,
+                     (uint64_t) uffd_zeropage.mode, errno);
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * uffd_wakeup: wake up threads waiting on UFFD-managed page fault resolution
+ *
+ * Wake up threads waiting on any page/pages from the designated range.
+ * The main use case is a period during which page faults are resolved
+ * via UFFD-IO IOCTLs with the MODE_DONTWAKE flag set; afterwards, all
+ * waits on the whole memory range are satisfied in a single call to
+ * uffd_wakeup().
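+ *
+ * Sketch of that pattern (hypothetical addresses, illustration only):
+ *
+ *     uffd_copy_page(uffd_fd, dst, src, page_size, true);  (dont_wake set)
+ *     (resolve further faults with dont_wake=true)
+ *     uffd_wakeup(uffd_fd, range_base, range_len);         (one wake-up)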
+ *
+ * Returns 0 on success, negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @addr: base address
+ * @length: length of the range
+ */
+int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
+{
+    struct uffdio_range uffd_range;
+
+    uffd_range.start = (uintptr_t) addr;
+    uffd_range.len = length;
+
+    if (ioctl(uffd_fd, UFFDIO_WAKE, &uffd_range)) {
+        error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
+                     addr, length, errno);
+        return -1;
+    }
+
+    return 0;
+}
+
+/**
+ * uffd_read_events: read pending UFFD events
+ *
+ * Returns the number of fetched messages, 0 if none are available, or
+ * a negative value in case of an error
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @msgs: pointer to message buffer
+ * @count: number of messages that can fit in the buffer
+ */
+int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
+{
+    ssize_t res;
+    do {
+        res = read(uffd_fd, msgs, count * sizeof(struct uffd_msg));
+    } while (res < 0 && errno == EINTR);
+
+    if ((res < 0 && errno == EAGAIN)) {
+        return 0;
+    }
+    if (res < 0) {
+        error_report("uffd_read_events() failed: errno=%i", errno);
+        return -1;
+    }
+
+    return (int) (res / sizeof(struct uffd_msg));
+}
+
+/**
+ * uffd_poll_events: poll UFFD file descriptor for read
+ *
+ * Returns true if events are available for read, false otherwise
+ *
+ * @uffd_fd: UFFD file descriptor
+ * @tmo: timeout in milliseconds, as for poll(2)
+ */
+bool uffd_poll_events(int uffd_fd, int tmo)
+{
+    int res;
+    struct pollfd poll_fd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
+
+    do {
+        res = poll(&poll_fd, 1, tmo);
+    } while (res < 0 && errno == EINTR);
+
+    if (res == 0) {
+        return false;
+    }
+    if (res < 0) {
+        error_report("uffd_poll_events() failed: errno=%i", errno);
+        return false;
+    }
+
+    return (poll_fd.revents & POLLIN) != 0;
+}
diff --git a/util/uuid.c b/util/uuid.c
new file mode 100644
index 000000000..b1108dde7
--- /dev/null
+++ b/util/uuid.c
@@ -0,0 +1,118 @@
+/*
+ * QEMU UUID functions
+ *
+ * Copyright 2016 Red Hat, Inc.
+ *
+ * Authors:
+ *  Fam Zheng <famz@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/uuid.h"
+#include "qemu/bswap.h"
+
+void qemu_uuid_generate(QemuUUID *uuid)
+{
+    int i;
+    uint32_t tmp[4];
+
+    QEMU_BUILD_BUG_ON(sizeof(QemuUUID) != 16);
+
+    for (i = 0; i < 4; ++i) {
+        tmp[i] = g_random_int();
+    }
+    memcpy(uuid, tmp, sizeof(tmp));
+    /* Set the two most significant bits (bits 6 and 7) of the
+       clock_seq_hi_and_reserved to zero and one, respectively. */
+    uuid->data[8] = (uuid->data[8] & 0x3f) | 0x80;
+    /* Set the four most significant bits (bits 12 through 15) of the
+       time_hi_and_version field to the 4-bit version number.
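+       The result is a random (version 4) UUID; in string form it looks
+       like xxxxxxxx-xxxx-4xxx-Nxxx-xxxxxxxxxxxx, where N is one of
+       8, 9, a or b.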
+ */ + uuid->data[6] = (uuid->data[6] & 0xf) | 0x40; +} + +int qemu_uuid_is_null(const QemuUUID *uu) +{ + static QemuUUID null_uuid; + return qemu_uuid_is_equal(uu, &null_uuid); +} + +int qemu_uuid_is_equal(const QemuUUID *lhv, const QemuUUID *rhv) +{ + return memcmp(lhv, rhv, sizeof(QemuUUID)) == 0; +} + +void qemu_uuid_unparse(const QemuUUID *uuid, char *out) +{ + const unsigned char *uu = &uuid->data[0]; + snprintf(out, UUID_FMT_LEN + 1, UUID_FMT, + uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7], + uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]); +} + +char *qemu_uuid_unparse_strdup(const QemuUUID *uuid) +{ + const unsigned char *uu = &uuid->data[0]; + return g_strdup_printf(UUID_FMT, + uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], + uu[7], uu[8], uu[9], uu[10], uu[11], uu[12], + uu[13], uu[14], uu[15]); +} + +static bool qemu_uuid_is_valid(const char *str) +{ + int i; + + for (i = 0; i < strlen(str); i++) { + const char c = str[i]; + if (i == 8 || i == 13 || i == 18 || i == 23) { + if (str[i] != '-') { + return false; + } + } else { + if ((c >= '0' && c <= '9') || + (c >= 'A' && c <= 'F') || + (c >= 'a' && c <= 'f')) { + continue; + } + return false; + } + } + return i == 36; +} + +int qemu_uuid_parse(const char *str, QemuUUID *uuid) +{ + unsigned char *uu = &uuid->data[0]; + int ret; + + if (!qemu_uuid_is_valid(str)) { + return -1; + } + + ret = sscanf(str, UUID_FMT, &uu[0], &uu[1], &uu[2], &uu[3], + &uu[4], &uu[5], &uu[6], &uu[7], &uu[8], &uu[9], + &uu[10], &uu[11], &uu[12], &uu[13], &uu[14], + &uu[15]); + + if (ret != 16) { + return -1; + } + return 0; +} + +/* Swap from UUID format endian (BE) to the opposite or vice versa. + */ +QemuUUID qemu_uuid_bswap(QemuUUID uuid) +{ + bswap32s(&uuid.fields.time_low); + bswap16s(&uuid.fields.time_mid); + bswap16s(&uuid.fields.time_high_and_version); + return uuid; +} diff --git a/util/vfio-helpers.c b/util/vfio-helpers.c new file mode 100644 index 000000000..00a80431a --- /dev/null +++ b/util/vfio-helpers.c @@ -0,0 +1,861 @@ +/* + * VFIO utility + * + * Copyright 2016 - 2018 Red Hat, Inc. + * + * Authors: + * Fam Zheng <famz@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" +#include <sys/ioctl.h> +#include <linux/vfio.h> +#include "qapi/error.h" +#include "exec/ramlist.h" +#include "exec/cpu-common.h" +#include "exec/memory.h" +#include "trace.h" +#include "qemu/error-report.h" +#include "standard-headers/linux/pci_regs.h" +#include "qemu/event_notifier.h" +#include "qemu/vfio-helpers.h" +#include "qemu/lockable.h" +#include "trace.h" + +#define QEMU_VFIO_DEBUG 0 + +#define QEMU_VFIO_IOVA_MIN 0x10000ULL +/* XXX: Once VFIO exposes the iova bit width in the IOMMU capability interface, + * we can use a runtime limit; alternatively it's also possible to do platform + * specific detection by reading sysfs entries. Until then, 39 is a safe bet. + **/ +#define QEMU_VFIO_IOVA_MAX (1ULL << 39) + +typedef struct { + /* Page aligned addr. 
*/ + void *host; + size_t size; + uint64_t iova; +} IOVAMapping; + +struct IOVARange { + uint64_t start; + uint64_t end; +}; + +struct QEMUVFIOState { + QemuMutex lock; + + /* These fields are protected by BQL */ + int container; + int group; + int device; + RAMBlockNotifier ram_notifier; + struct vfio_region_info config_region_info, bar_region_info[6]; + struct IOVARange *usable_iova_ranges; + uint8_t nb_iova_ranges; + + /* These fields are protected by @lock */ + /* VFIO's IO virtual address space is managed by splitting into a few + * sections: + * + * --------------- <= 0 + * |xxxxxxxxxxxxx| + * |-------------| <= QEMU_VFIO_IOVA_MIN + * | | + * | Fixed | + * | | + * |-------------| <= low_water_mark + * | | + * | Free | + * | | + * |-------------| <= high_water_mark + * | | + * | Temp | + * | | + * |-------------| <= QEMU_VFIO_IOVA_MAX + * |xxxxxxxxxxxxx| + * |xxxxxxxxxxxxx| + * --------------- + * + * - Addresses lower than QEMU_VFIO_IOVA_MIN are reserved as invalid; + * + * - Fixed mappings of HVAs are assigned "low" IOVAs in the range of + * [QEMU_VFIO_IOVA_MIN, low_water_mark). Once allocated they will not be + * reclaimed - low_water_mark never shrinks; + * + * - IOVAs in range [low_water_mark, high_water_mark) are free; + * + * - IOVAs in range [high_water_mark, QEMU_VFIO_IOVA_MAX) are volatile + * mappings. At each qemu_vfio_dma_reset_temporary() call, the whole area + * is recycled. The caller should make sure I/O's depending on these + * mappings are completed before calling. + **/ + uint64_t low_water_mark; + uint64_t high_water_mark; + IOVAMapping *mappings; + int nr_mappings; +}; + +/** + * Find group file by PCI device address as specified @device, and return the + * path. The returned string is owned by caller and should be g_free'ed later. + */ +static char *sysfs_find_group_file(const char *device, Error **errp) +{ + char *sysfs_link; + char *sysfs_group; + char *p; + char *path = NULL; + + sysfs_link = g_strdup_printf("/sys/bus/pci/devices/%s/iommu_group", device); + sysfs_group = g_malloc0(PATH_MAX); + if (readlink(sysfs_link, sysfs_group, PATH_MAX - 1) == -1) { + error_setg_errno(errp, errno, "Failed to find iommu group sysfs path"); + goto out; + } + p = strrchr(sysfs_group, '/'); + if (!p) { + error_setg(errp, "Failed to find iommu group number"); + goto out; + } + + path = g_strdup_printf("/dev/vfio/%s", p + 1); +out: + g_free(sysfs_link); + g_free(sysfs_group); + return path; +} + +static inline void assert_bar_index_valid(QEMUVFIOState *s, int index) +{ + assert(index >= 0 && index < ARRAY_SIZE(s->bar_region_info)); +} + +static int qemu_vfio_pci_init_bar(QEMUVFIOState *s, int index, Error **errp) +{ + g_autofree char *barname = NULL; + assert_bar_index_valid(s, index); + s->bar_region_info[index] = (struct vfio_region_info) { + .index = VFIO_PCI_BAR0_REGION_INDEX + index, + .argsz = sizeof(struct vfio_region_info), + }; + if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->bar_region_info[index])) { + error_setg_errno(errp, errno, "Failed to get BAR region info"); + return -errno; + } + barname = g_strdup_printf("bar[%d]", index); + trace_qemu_vfio_region_info(barname, s->bar_region_info[index].offset, + s->bar_region_info[index].size, + s->bar_region_info[index].cap_offset); + + return 0; +} + +/** + * Map a PCI bar area. 
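+ *
+ * Illustrative usage (sketch only; "s" and "errp" come from the caller):
+ *
+ *     void *regs = qemu_vfio_pci_map_bar(s, 0, 0, 0x1000,
+ *                                        PROT_READ | PROT_WRITE, errp);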
+ */ +void *qemu_vfio_pci_map_bar(QEMUVFIOState *s, int index, + uint64_t offset, uint64_t size, int prot, + Error **errp) +{ + void *p; + assert(QEMU_IS_ALIGNED(offset, qemu_real_host_page_size)); + assert_bar_index_valid(s, index); + p = mmap(NULL, MIN(size, s->bar_region_info[index].size - offset), + prot, MAP_SHARED, + s->device, s->bar_region_info[index].offset + offset); + trace_qemu_vfio_pci_map_bar(index, s->bar_region_info[index].offset , + size, offset, p); + if (p == MAP_FAILED) { + error_setg_errno(errp, errno, "Failed to map BAR region"); + p = NULL; + } + return p; +} + +/** + * Unmap a PCI bar area. + */ +void qemu_vfio_pci_unmap_bar(QEMUVFIOState *s, int index, void *bar, + uint64_t offset, uint64_t size) +{ + if (bar) { + munmap(bar, MIN(size, s->bar_region_info[index].size - offset)); + } +} + +/** + * Initialize device IRQ with @irq_type and register an event notifier. + */ +int qemu_vfio_pci_init_irq(QEMUVFIOState *s, EventNotifier *e, + int irq_type, Error **errp) +{ + int r; + struct vfio_irq_set *irq_set; + size_t irq_set_size; + struct vfio_irq_info irq_info = { .argsz = sizeof(irq_info) }; + + irq_info.index = irq_type; + if (ioctl(s->device, VFIO_DEVICE_GET_IRQ_INFO, &irq_info)) { + error_setg_errno(errp, errno, "Failed to get device interrupt info"); + return -errno; + } + if (!(irq_info.flags & VFIO_IRQ_INFO_EVENTFD)) { + error_setg(errp, "Device interrupt doesn't support eventfd"); + return -EINVAL; + } + + irq_set_size = sizeof(*irq_set) + sizeof(int); + irq_set = g_malloc0(irq_set_size); + + /* Get to a known IRQ state */ + *irq_set = (struct vfio_irq_set) { + .argsz = irq_set_size, + .flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER, + .index = irq_info.index, + .start = 0, + .count = 1, + }; + + *(int *)&irq_set->data = event_notifier_get_fd(e); + r = ioctl(s->device, VFIO_DEVICE_SET_IRQS, irq_set); + g_free(irq_set); + if (r) { + error_setg_errno(errp, errno, "Failed to setup device interrupt"); + return -errno; + } + return 0; +} + +static int qemu_vfio_pci_read_config(QEMUVFIOState *s, void *buf, + int size, int ofs) +{ + int ret; + + trace_qemu_vfio_pci_read_config(buf, ofs, size, + s->config_region_info.offset, + s->config_region_info.size); + assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size)); + do { + ret = pread(s->device, buf, size, s->config_region_info.offset + ofs); + } while (ret == -1 && errno == EINTR); + return ret == size ? 0 : -errno; +} + +static int qemu_vfio_pci_write_config(QEMUVFIOState *s, void *buf, int size, int ofs) +{ + int ret; + + trace_qemu_vfio_pci_write_config(buf, ofs, size, + s->config_region_info.offset, + s->config_region_info.size); + assert(QEMU_IS_ALIGNED(s->config_region_info.offset + ofs, size)); + do { + ret = pwrite(s->device, buf, size, s->config_region_info.offset + ofs); + } while (ret == -1 && errno == EINTR); + return ret == size ? 
0 : -errno; +} + +static void collect_usable_iova_ranges(QEMUVFIOState *s, void *buf) +{ + struct vfio_iommu_type1_info *info = (struct vfio_iommu_type1_info *)buf; + struct vfio_info_cap_header *cap = (void *)buf + info->cap_offset; + struct vfio_iommu_type1_info_cap_iova_range *cap_iova_range; + int i; + + while (cap->id != VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) { + if (!cap->next) { + return; + } + cap = (struct vfio_info_cap_header *)(buf + cap->next); + } + + cap_iova_range = (struct vfio_iommu_type1_info_cap_iova_range *)cap; + + s->nb_iova_ranges = cap_iova_range->nr_iovas; + if (s->nb_iova_ranges > 1) { + s->usable_iova_ranges = + g_realloc(s->usable_iova_ranges, + s->nb_iova_ranges * sizeof(struct IOVARange)); + } + + for (i = 0; i < s->nb_iova_ranges; i++) { + s->usable_iova_ranges[i].start = cap_iova_range->iova_ranges[i].start; + s->usable_iova_ranges[i].end = cap_iova_range->iova_ranges[i].end; + } +} + +static int qemu_vfio_init_pci(QEMUVFIOState *s, const char *device, + Error **errp) +{ + int ret; + int i; + uint16_t pci_cmd; + struct vfio_group_status group_status = { .argsz = sizeof(group_status) }; + struct vfio_iommu_type1_info *iommu_info = NULL; + size_t iommu_info_size = sizeof(*iommu_info); + struct vfio_device_info device_info = { .argsz = sizeof(device_info) }; + char *group_file = NULL; + + s->usable_iova_ranges = NULL; + + /* Create a new container */ + s->container = open("/dev/vfio/vfio", O_RDWR); + + if (s->container == -1) { + error_setg_errno(errp, errno, "Failed to open /dev/vfio/vfio"); + return -errno; + } + if (ioctl(s->container, VFIO_GET_API_VERSION) != VFIO_API_VERSION) { + error_setg(errp, "Invalid VFIO version"); + ret = -EINVAL; + goto fail_container; + } + + if (!ioctl(s->container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU)) { + error_setg_errno(errp, errno, "VFIO IOMMU Type1 is not supported"); + ret = -EINVAL; + goto fail_container; + } + + /* Open the group */ + group_file = sysfs_find_group_file(device, errp); + if (!group_file) { + ret = -EINVAL; + goto fail_container; + } + + s->group = open(group_file, O_RDWR); + if (s->group == -1) { + error_setg_errno(errp, errno, "Failed to open VFIO group file: %s", + group_file); + g_free(group_file); + ret = -errno; + goto fail_container; + } + g_free(group_file); + + /* Test the group is viable and available */ + if (ioctl(s->group, VFIO_GROUP_GET_STATUS, &group_status)) { + error_setg_errno(errp, errno, "Failed to get VFIO group status"); + ret = -errno; + goto fail; + } + + if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) { + error_setg(errp, "VFIO group is not viable"); + ret = -EINVAL; + goto fail; + } + + /* Add the group to the container */ + if (ioctl(s->group, VFIO_GROUP_SET_CONTAINER, &s->container)) { + error_setg_errno(errp, errno, "Failed to add group to VFIO container"); + ret = -errno; + goto fail; + } + + /* Enable the IOMMU model we want */ + if (ioctl(s->container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU)) { + error_setg_errno(errp, errno, "Failed to set VFIO IOMMU type"); + ret = -errno; + goto fail; + } + + iommu_info = g_malloc0(iommu_info_size); + iommu_info->argsz = iommu_info_size; + + /* Get additional IOMMU info */ + if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) { + error_setg_errno(errp, errno, "Failed to get IOMMU info"); + ret = -errno; + goto fail; + } + + /* + * if the kernel does not report usable IOVA regions, choose + * the legacy [QEMU_VFIO_IOVA_MIN, QEMU_VFIO_IOVA_MAX -1] region + */ + s->nb_iova_ranges = 1; + s->usable_iova_ranges = g_new0(struct IOVARange, 
1);
+    s->usable_iova_ranges[0].start = QEMU_VFIO_IOVA_MIN;
+    s->usable_iova_ranges[0].end = QEMU_VFIO_IOVA_MAX - 1;
+
+    if (iommu_info->argsz > iommu_info_size) {
+        iommu_info_size = iommu_info->argsz;
+        iommu_info = g_realloc(iommu_info, iommu_info_size);
+        if (ioctl(s->container, VFIO_IOMMU_GET_INFO, iommu_info)) {
+            ret = -errno;
+            goto fail;
+        }
+        collect_usable_iova_ranges(s, iommu_info);
+    }
+
+    s->device = ioctl(s->group, VFIO_GROUP_GET_DEVICE_FD, device);
+
+    if (s->device < 0) {
+        error_setg_errno(errp, errno, "Failed to get device fd");
+        ret = -errno;
+        goto fail;
+    }
+
+    /* Test and setup the device */
+    if (ioctl(s->device, VFIO_DEVICE_GET_INFO, &device_info)) {
+        error_setg_errno(errp, errno, "Failed to get device info");
+        ret = -errno;
+        goto fail;
+    }
+
+    if (device_info.num_regions < VFIO_PCI_CONFIG_REGION_INDEX) {
+        error_setg(errp, "Invalid device regions");
+        ret = -EINVAL;
+        goto fail;
+    }
+
+    s->config_region_info = (struct vfio_region_info) {
+        .index = VFIO_PCI_CONFIG_REGION_INDEX,
+        .argsz = sizeof(struct vfio_region_info),
+    };
+    if (ioctl(s->device, VFIO_DEVICE_GET_REGION_INFO, &s->config_region_info)) {
+        error_setg_errno(errp, errno, "Failed to get config region info");
+        ret = -errno;
+        goto fail;
+    }
+    trace_qemu_vfio_region_info("config", s->config_region_info.offset,
+                                s->config_region_info.size,
+                                s->config_region_info.cap_offset);
+
+    for (i = 0; i < ARRAY_SIZE(s->bar_region_info); i++) {
+        ret = qemu_vfio_pci_init_bar(s, i, errp);
+        if (ret) {
+            goto fail;
+        }
+    }
+
+    /* Enable bus master */
+    ret = qemu_vfio_pci_read_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
+    if (ret) {
+        goto fail;
+    }
+    pci_cmd |= PCI_COMMAND_MASTER;
+    ret = qemu_vfio_pci_write_config(s, &pci_cmd, sizeof(pci_cmd), PCI_COMMAND);
+    if (ret) {
+        goto fail;
+    }
+    g_free(iommu_info);
+    return 0;
+fail:
+    g_free(s->usable_iova_ranges);
+    s->usable_iova_ranges = NULL;
+    s->nb_iova_ranges = 0;
+    g_free(iommu_info);
+    close(s->group);
+fail_container:
+    close(s->container);
+    return ret;
+}
+
+static void qemu_vfio_ram_block_added(RAMBlockNotifier *n, void *host,
+                                      size_t size, size_t max_size)
+{
+    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
+    Error *local_err = NULL;
+    int ret;
+
+    trace_qemu_vfio_ram_block_added(s, host, max_size);
+    ret = qemu_vfio_dma_map(s, host, max_size, false, NULL, &local_err);
+    if (ret) {
+        error_reportf_err(local_err,
+                          "qemu_vfio_dma_map(%p, %zu) failed: ",
+                          host, max_size);
+    }
+}
+
+static void qemu_vfio_ram_block_removed(RAMBlockNotifier *n, void *host,
+                                        size_t size, size_t max_size)
+{
+    QEMUVFIOState *s = container_of(n, QEMUVFIOState, ram_notifier);
+    if (host) {
+        trace_qemu_vfio_ram_block_removed(s, host, max_size);
+        qemu_vfio_dma_unmap(s, host);
+    }
+}
+
+static void qemu_vfio_open_common(QEMUVFIOState *s)
+{
+    qemu_mutex_init(&s->lock);
+    s->ram_notifier.ram_block_added = qemu_vfio_ram_block_added;
+    s->ram_notifier.ram_block_removed = qemu_vfio_ram_block_removed;
+    s->low_water_mark = QEMU_VFIO_IOVA_MIN;
+    s->high_water_mark = QEMU_VFIO_IOVA_MAX;
+    ram_block_notifier_add(&s->ram_notifier);
+}
+
+/**
+ * Open a PCI device, e.g. "0000:00:01.0".
+ */
+QEMUVFIOState *qemu_vfio_open_pci(const char *device, Error **errp)
+{
+    int r;
+    QEMUVFIOState *s = g_new0(QEMUVFIOState, 1);
+
+    /*
+     * VFIO may pin all memory inside mappings, resulting in it pinning
+     * all memory inside RAM blocks unconditionally.
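+     * Because pinned pages must stay resident, RAM block discard (used
+     * e.g. by virtio-balloon) is disabled below; discarding a pinned
+     * page would leave the IOMMU mapping pointing at stale memory.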
+ */ + r = ram_block_discard_disable(true); + if (r) { + error_setg_errno(errp, -r, "Cannot set discarding of RAM broken"); + g_free(s); + return NULL; + } + + r = qemu_vfio_init_pci(s, device, errp); + if (r) { + ram_block_discard_disable(false); + g_free(s); + return NULL; + } + qemu_vfio_open_common(s); + return s; +} + +static void qemu_vfio_dump_mappings(QEMUVFIOState *s) +{ + for (int i = 0; i < s->nr_mappings; ++i) { + trace_qemu_vfio_dump_mapping(s->mappings[i].host, + s->mappings[i].iova, + s->mappings[i].size); + } +} + +/** + * Find the mapping entry that contains [host, host + size) and set @index to + * the position. If no entry contains it, @index is the position _after_ which + * to insert the new mapping. IOW, it is the index of the largest element that + * is smaller than @host, or -1 if no entry is. + */ +static IOVAMapping *qemu_vfio_find_mapping(QEMUVFIOState *s, void *host, + int *index) +{ + IOVAMapping *p = s->mappings; + IOVAMapping *q = p ? p + s->nr_mappings - 1 : NULL; + IOVAMapping *mid; + trace_qemu_vfio_find_mapping(s, host); + if (!p) { + *index = -1; + return NULL; + } + while (true) { + mid = p + (q - p) / 2; + if (mid == p) { + break; + } + if (mid->host > host) { + q = mid; + } else if (mid->host < host) { + p = mid; + } else { + break; + } + } + if (mid->host > host) { + mid--; + } else if (mid < &s->mappings[s->nr_mappings - 1] + && (mid + 1)->host <= host) { + mid++; + } + *index = mid - &s->mappings[0]; + if (mid >= &s->mappings[0] && + mid->host <= host && mid->host + mid->size > host) { + assert(mid < &s->mappings[s->nr_mappings]); + return mid; + } + /* At this point *index + 1 is the right position to insert the new + * mapping.*/ + return NULL; +} + +/** + * Allocate IOVA and create a new mapping record and insert it in @s. + */ +static IOVAMapping *qemu_vfio_add_mapping(QEMUVFIOState *s, + void *host, size_t size, + int index, uint64_t iova) +{ + int shift; + IOVAMapping m = {.host = host, .size = size, .iova = iova}; + IOVAMapping *insert; + + assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size)); + assert(QEMU_IS_ALIGNED(s->low_water_mark, qemu_real_host_page_size)); + assert(QEMU_IS_ALIGNED(s->high_water_mark, qemu_real_host_page_size)); + trace_qemu_vfio_new_mapping(s, host, size, index, iova); + + assert(index >= 0); + s->nr_mappings++; + s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings); + insert = &s->mappings[index]; + shift = s->nr_mappings - index - 1; + if (shift) { + memmove(insert + 1, insert, shift * sizeof(s->mappings[0])); + } + *insert = m; + return insert; +} + +/* Do the DMA mapping with VFIO. */ +static int qemu_vfio_do_mapping(QEMUVFIOState *s, void *host, size_t size, + uint64_t iova, Error **errp) +{ + struct vfio_iommu_type1_dma_map dma_map = { + .argsz = sizeof(dma_map), + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, + .iova = iova, + .vaddr = (uintptr_t)host, + .size = size, + }; + trace_qemu_vfio_do_mapping(s, host, iova, size); + + if (ioctl(s->container, VFIO_IOMMU_MAP_DMA, &dma_map)) { + error_setg_errno(errp, errno, "VFIO_MAP_DMA failed"); + return -errno; + } + return 0; +} + +/** + * Undo the DMA mapping from @s with VFIO, and remove from mapping list. 
+ */
+static void qemu_vfio_undo_mapping(QEMUVFIOState *s, IOVAMapping *mapping,
+                                   Error **errp)
+{
+    int index;
+    struct vfio_iommu_type1_dma_unmap unmap = {
+        .argsz = sizeof(unmap),
+        .flags = 0,
+        .iova = mapping->iova,
+        .size = mapping->size,
+    };
+
+    index = mapping - s->mappings;
+    assert(mapping->size > 0);
+    assert(QEMU_IS_ALIGNED(mapping->size, qemu_real_host_page_size));
+    assert(index >= 0 && index < s->nr_mappings);
+    if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
+        error_setg_errno(errp, errno, "VFIO_UNMAP_DMA failed");
+    }
+    memmove(mapping, &s->mappings[index + 1],
+            sizeof(s->mappings[0]) * (s->nr_mappings - index - 1));
+    s->nr_mappings--;
+    s->mappings = g_renew(IOVAMapping, s->mappings, s->nr_mappings);
+}
+
+/* Check if the mapping list is (ascending) ordered. */
+static bool qemu_vfio_verify_mappings(QEMUVFIOState *s)
+{
+    int i;
+    if (QEMU_VFIO_DEBUG) {
+        for (i = 0; i < s->nr_mappings - 1; ++i) {
+            if (!(s->mappings[i].host < s->mappings[i + 1].host)) {
+                error_report("item %d not sorted!", i);
+                qemu_vfio_dump_mappings(s);
+                return false;
+            }
+            if (!(s->mappings[i].host + s->mappings[i].size <=
+                  s->mappings[i + 1].host)) {
+                error_report("item %d overlap with next!", i);
+                qemu_vfio_dump_mappings(s);
+                return false;
+            }
+        }
+    }
+    return true;
+}
+
+static bool qemu_vfio_find_fixed_iova(QEMUVFIOState *s, size_t size,
+                                      uint64_t *iova, Error **errp)
+{
+    int i;
+
+    for (i = 0; i < s->nb_iova_ranges; i++) {
+        if (s->usable_iova_ranges[i].end < s->low_water_mark) {
+            continue;
+        }
+        s->low_water_mark =
+            MAX(s->low_water_mark, s->usable_iova_ranges[i].start);
+
+        if (s->usable_iova_ranges[i].end - s->low_water_mark + 1 >= size ||
+            s->usable_iova_ranges[i].end - s->low_water_mark + 1 == 0) {
+            *iova = s->low_water_mark;
+            s->low_water_mark += size;
+            return true;
+        }
+    }
+    error_setg(errp, "fixed iova range not found");
+
+    return false;
+}
+
+static bool qemu_vfio_find_temp_iova(QEMUVFIOState *s, size_t size,
+                                     uint64_t *iova, Error **errp)
+{
+    int i;
+
+    for (i = s->nb_iova_ranges - 1; i >= 0; i--) {
+        if (s->usable_iova_ranges[i].start > s->high_water_mark) {
+            continue;
+        }
+        s->high_water_mark =
+            MIN(s->high_water_mark, s->usable_iova_ranges[i].end + 1);
+
+        if (s->high_water_mark - s->usable_iova_ranges[i].start + 1 >= size ||
+            s->high_water_mark - s->usable_iova_ranges[i].start + 1 == 0) {
+            *iova = s->high_water_mark - size;
+            s->high_water_mark = *iova;
+            return true;
+        }
+    }
+    error_setg(errp, "temporary iova range not found");
+
+    return false;
+}
+
+/**
+ * qemu_vfio_water_mark_reached:
+ *
+ * Returns %true if high watermark has been reached, %false otherwise.
+ */
+static bool qemu_vfio_water_mark_reached(QEMUVFIOState *s, size_t size,
+                                         Error **errp)
+{
+    if (s->high_water_mark - s->low_water_mark + 1 < size) {
+        error_setg(errp, "iova exhausted (water mark reached)");
+        return true;
+    }
+    return false;
+}
+
+/* Map [host, host + size) area into a contiguous IOVA address space, and store
+ * the result in @iova if not NULL. The caller needs to make sure the area is
+ * aligned to page size and does not overlap existing mapping areas (split
+ * mapping status within this area is not allowed).
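+ *
+ * Illustrative call (sketch only; "buf" and "aligned_len" are
+ * hypothetical caller-owned values):
+ *
+ *     uint64_t iova;
+ *     if (qemu_vfio_dma_map(s, buf, aligned_len, false, &iova, errp) == 0) {
+ *         (program the device to DMA to/from @iova)
+ *     }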
+ */ +int qemu_vfio_dma_map(QEMUVFIOState *s, void *host, size_t size, + bool temporary, uint64_t *iova, Error **errp) +{ + int index; + IOVAMapping *mapping; + uint64_t iova0; + + assert(QEMU_PTR_IS_ALIGNED(host, qemu_real_host_page_size)); + assert(QEMU_IS_ALIGNED(size, qemu_real_host_page_size)); + trace_qemu_vfio_dma_map(s, host, size, temporary, iova); + QEMU_LOCK_GUARD(&s->lock); + mapping = qemu_vfio_find_mapping(s, host, &index); + if (mapping) { + iova0 = mapping->iova + ((uint8_t *)host - (uint8_t *)mapping->host); + } else { + int ret; + + if (qemu_vfio_water_mark_reached(s, size, errp)) { + return -ENOMEM; + } + if (!temporary) { + if (!qemu_vfio_find_fixed_iova(s, size, &iova0, errp)) { + return -ENOMEM; + } + + mapping = qemu_vfio_add_mapping(s, host, size, index + 1, iova0); + assert(qemu_vfio_verify_mappings(s)); + ret = qemu_vfio_do_mapping(s, host, size, iova0, errp); + if (ret < 0) { + qemu_vfio_undo_mapping(s, mapping, NULL); + return ret; + } + qemu_vfio_dump_mappings(s); + } else { + if (!qemu_vfio_find_temp_iova(s, size, &iova0, errp)) { + return -ENOMEM; + } + ret = qemu_vfio_do_mapping(s, host, size, iova0, errp); + if (ret < 0) { + return ret; + } + } + } + trace_qemu_vfio_dma_mapped(s, host, iova0, size); + if (iova) { + *iova = iova0; + } + return 0; +} + +/* Reset the high watermark and free all "temporary" mappings. */ +int qemu_vfio_dma_reset_temporary(QEMUVFIOState *s) +{ + struct vfio_iommu_type1_dma_unmap unmap = { + .argsz = sizeof(unmap), + .flags = 0, + .iova = s->high_water_mark, + .size = QEMU_VFIO_IOVA_MAX - s->high_water_mark, + }; + trace_qemu_vfio_dma_reset_temporary(s); + QEMU_LOCK_GUARD(&s->lock); + if (ioctl(s->container, VFIO_IOMMU_UNMAP_DMA, &unmap)) { + error_report("VFIO_UNMAP_DMA failed: %s", strerror(errno)); + return -errno; + } + s->high_water_mark = QEMU_VFIO_IOVA_MAX; + return 0; +} + +/* Unmapping the whole area that was previously mapped with + * qemu_vfio_dma_map(). */ +void qemu_vfio_dma_unmap(QEMUVFIOState *s, void *host) +{ + int index = 0; + IOVAMapping *m; + + if (!host) { + return; + } + + trace_qemu_vfio_dma_unmap(s, host); + QEMU_LOCK_GUARD(&s->lock); + m = qemu_vfio_find_mapping(s, host, &index); + if (!m) { + return; + } + qemu_vfio_undo_mapping(s, m, NULL); +} + +static void qemu_vfio_reset(QEMUVFIOState *s) +{ + ioctl(s->device, VFIO_DEVICE_RESET); +} + +/* Close and free the VFIO resources. */ +void qemu_vfio_close(QEMUVFIOState *s) +{ + int i; + + if (!s) { + return; + } + for (i = 0; i < s->nr_mappings; ++i) { + qemu_vfio_undo_mapping(s, &s->mappings[i], NULL); + } + ram_block_notifier_remove(&s->ram_notifier); + g_free(s->usable_iova_ranges); + s->nb_iova_ranges = 0; + qemu_vfio_reset(s); + close(s->device); + close(s->group); + close(s->container); + ram_block_discard_disable(false); +} diff --git a/util/vhost-user-server.c b/util/vhost-user-server.c new file mode 100644 index 000000000..783d847a6 --- /dev/null +++ b/util/vhost-user-server.c @@ -0,0 +1,446 @@ +/* + * Sharing QEMU devices via vhost-user protocol + * + * Copyright (c) Coiby Xu <coiby.xu@gmail.com>. + * Copyright (c) 2020 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "qemu/main-loop.h" +#include "qemu/vhost-user-server.h" +#include "block/aio-wait.h" + +/* + * Theory of operation: + * + * VuServer is started and stopped by vhost_user_server_start() and + * vhost_user_server_stop() from the main loop thread. 
Starting the server + * opens a vhost-user UNIX domain socket and listens for incoming connections. + * Only one connection is allowed at a time. + * + * The connection is handled by the vu_client_trip() coroutine in the + * VuServer->ctx AioContext. The coroutine consists of a vu_dispatch() loop + * where libvhost-user calls vu_message_read() to receive the next vhost-user + * protocol messages over the UNIX domain socket. + * + * When virtqueues are set up libvhost-user calls set_watch() to monitor kick + * fds. These fds are also handled in the VuServer->ctx AioContext. + * + * Both vu_client_trip() and kick fd monitoring can be stopped by shutting down + * the socket connection. Shutting down the socket connection causes + * vu_message_read() to fail since no more data can be received from the socket. + * After vu_dispatch() fails, vu_client_trip() calls vu_deinit() to stop + * libvhost-user before terminating the coroutine. vu_deinit() calls + * remove_watch() to stop monitoring kick fds and this stops virtqueue + * processing. + * + * When vu_client_trip() has finished cleaning up it schedules a BH in the main + * loop thread to accept the next client connection. + * + * When libvhost-user detects an error it calls panic_cb() and sets the + * dev->broken flag. Both vu_client_trip() and kick fd processing stop when + * the dev->broken flag is set. + * + * It is possible to switch AioContexts using + * vhost_user_server_detach_aio_context() and + * vhost_user_server_attach_aio_context(). They stop monitoring fds in the old + * AioContext and resume monitoring in the new AioContext. The vu_client_trip() + * coroutine remains in a yielded state during the switch. This is made + * possible by QIOChannel's support for spurious coroutine re-entry in + * qio_channel_yield(). The coroutine will restart I/O when re-entered from the + * new AioContext. 
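+ *
+ * A minimal lifecycle sketch, as seen from the main loop thread
+ * (hypothetical arguments, not part of this file):
+ *
+ *     VuServer server;
+ *     vhost_user_server_start(&server, saddr, ctx, n_queues, &iface, errp);
+ *     (serve one client at a time)
+ *     vhost_user_server_stop(&server);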
+ */
+
+static void vmsg_close_fds(VhostUserMsg *vmsg)
+{
+    int i;
+    for (i = 0; i < vmsg->fd_num; i++) {
+        close(vmsg->fds[i]);
+    }
+}
+
+static void vmsg_unblock_fds(VhostUserMsg *vmsg)
+{
+    int i;
+    for (i = 0; i < vmsg->fd_num; i++) {
+        qemu_set_nonblock(vmsg->fds[i]);
+    }
+}
+
+static void panic_cb(VuDev *vu_dev, const char *buf)
+{
+    error_report("vu_panic: %s", buf);
+}
+
+static bool coroutine_fn
+vu_message_read(VuDev *vu_dev, int conn_fd, VhostUserMsg *vmsg)
+{
+    struct iovec iov = {
+        .iov_base = (char *)vmsg,
+        .iov_len = VHOST_USER_HDR_SIZE,
+    };
+    int rc, read_bytes = 0;
+    Error *local_err = NULL;
+    const size_t max_fds = G_N_ELEMENTS(vmsg->fds);
+    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+    QIOChannel *ioc = server->ioc;
+
+    vmsg->fd_num = 0;
+    if (!ioc) {
+        error_report_err(local_err);
+        goto fail;
+    }
+
+    assert(qemu_in_coroutine());
+    do {
+        size_t nfds = 0;
+        int *fds = NULL;
+
+        /*
+         * qio_channel_readv_full() may have short reads; keep calling it
+         * until we get VHOST_USER_HDR_SIZE or 0 bytes in total
+         */
+        rc = qio_channel_readv_full(ioc, &iov, 1, &fds, &nfds, &local_err);
+        if (rc < 0) {
+            if (rc == QIO_CHANNEL_ERR_BLOCK) {
+                assert(local_err == NULL);
+                qio_channel_yield(ioc, G_IO_IN);
+                continue;
+            } else {
+                error_report_err(local_err);
+                goto fail;
+            }
+        }
+
+        if (nfds > 0) {
+            if (vmsg->fd_num + nfds > max_fds) {
+                error_report("A maximum of %zu fds are allowed, "
+                             "however got %zu fds now",
+                             max_fds, vmsg->fd_num + nfds);
+                g_free(fds);
+                goto fail;
+            }
+            memcpy(vmsg->fds + vmsg->fd_num, fds, nfds * sizeof(vmsg->fds[0]));
+            vmsg->fd_num += nfds;
+            g_free(fds);
+        }
+
+        if (rc == 0) { /* socket closed */
+            goto fail;
+        }
+
+        iov.iov_base += rc;
+        iov.iov_len -= rc;
+        read_bytes += rc;
+    } while (read_bytes != VHOST_USER_HDR_SIZE);
+
+    /* qio_channel_readv_full() will make the socket fds blocking, unblock them */
+    vmsg_unblock_fds(vmsg);
+    if (vmsg->size > sizeof(vmsg->payload)) {
+        error_report("Error: too big message request: %d, "
+                     "size: vmsg->size: %u, "
+                     "while sizeof(vmsg->payload) = %zu",
+                     vmsg->request, vmsg->size, sizeof(vmsg->payload));
+        goto fail;
+    }
+
+    struct iovec iov_payload = {
+        .iov_base = (char *)&vmsg->payload,
+        .iov_len = vmsg->size,
+    };
+    if (vmsg->size) {
+        rc = qio_channel_readv_all_eof(ioc, &iov_payload, 1, &local_err);
+        if (rc != 1) {
+            if (local_err) {
+                error_report_err(local_err);
+            }
+            goto fail;
+        }
+    }
+
+    return true;
+
+fail:
+    vmsg_close_fds(vmsg);
+
+    return false;
+}
+
+static coroutine_fn void vu_client_trip(void *opaque)
+{
+    VuServer *server = opaque;
+    VuDev *vu_dev = &server->vu_dev;
+
+    while (!vu_dev->broken && vu_dispatch(vu_dev)) {
+        /* Keep running */
+    }
+
+    vu_deinit(vu_dev);
+
+    /* vu_deinit() should have called remove_watch() */
+    assert(QTAILQ_EMPTY(&server->vu_fd_watches));
+
+    object_unref(OBJECT(server->sioc));
+    server->sioc = NULL;
+
+    object_unref(OBJECT(server->ioc));
+    server->ioc = NULL;
+
+    server->co_trip = NULL;
+    if (server->restart_listener_bh) {
+        qemu_bh_schedule(server->restart_listener_bh);
+    }
+    aio_wait_kick();
+}
+
+/*
+ * A wrapper for vu_kick_cb.
+ *
+ * Since aio_dispatch() can only pass one user data pointer to the
+ * callback function, pack VuDev and pvt into a struct.
Then unpack it
+ * and pass them to vu_kick_cb.
+ */
+static void kick_handler(void *opaque)
+{
+    VuFdWatch *vu_fd_watch = opaque;
+    VuDev *vu_dev = vu_fd_watch->vu_dev;
+
+    vu_fd_watch->cb(vu_dev, 0, vu_fd_watch->pvt);
+
+    /* Stop vu_client_trip() if an error occurred in vu_fd_watch->cb() */
+    if (vu_dev->broken) {
+        VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+
+        qio_channel_shutdown(server->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
+    }
+}
+
+static VuFdWatch *find_vu_fd_watch(VuServer *server, int fd)
+{
+
+    VuFdWatch *vu_fd_watch, *next;
+    QTAILQ_FOREACH_SAFE(vu_fd_watch, &server->vu_fd_watches, next, next) {
+        if (vu_fd_watch->fd == fd) {
+            return vu_fd_watch;
+        }
+    }
+    return NULL;
+}
+
+static void
+set_watch(VuDev *vu_dev, int fd, int vu_evt,
+          vu_watch_cb cb, void *pvt)
+{
+
+    VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+    g_assert(vu_dev);
+    g_assert(fd >= 0);
+    g_assert(cb);
+
+    VuFdWatch *vu_fd_watch = find_vu_fd_watch(server, fd);
+
+    if (!vu_fd_watch) {
+        VuFdWatch *vu_fd_watch = g_new0(VuFdWatch, 1);
+
+        QTAILQ_INSERT_TAIL(&server->vu_fd_watches, vu_fd_watch, next);
+
+        vu_fd_watch->fd = fd;
+        vu_fd_watch->cb = cb;
+        qemu_set_nonblock(fd);
+        aio_set_fd_handler(server->ioc->ctx, fd, true, kick_handler,
+                           NULL, NULL, vu_fd_watch);
+        vu_fd_watch->vu_dev = vu_dev;
+        vu_fd_watch->pvt = pvt;
+    }
+}
+
+
+static void remove_watch(VuDev *vu_dev, int fd)
+{
+    VuServer *server;
+    g_assert(vu_dev);
+    g_assert(fd >= 0);
+
+    server = container_of(vu_dev, VuServer, vu_dev);
+
+    VuFdWatch *vu_fd_watch = find_vu_fd_watch(server, fd);
+
+    if (!vu_fd_watch) {
+        return;
+    }
+    aio_set_fd_handler(server->ioc->ctx, fd, true, NULL, NULL, NULL, NULL);
+
+    QTAILQ_REMOVE(&server->vu_fd_watches, vu_fd_watch, next);
+    g_free(vu_fd_watch);
+}
+
+
+static void vu_accept(QIONetListener *listener, QIOChannelSocket *sioc,
+                      gpointer opaque)
+{
+    VuServer *server = opaque;
+
+    if (server->sioc) {
+        warn_report("Only one vhost-user client is allowed to "
+                    "connect to the server at a time");
+        return;
+    }
+
+    if (!vu_init(&server->vu_dev, server->max_queues, sioc->fd, panic_cb,
+                 vu_message_read, set_watch, remove_watch, server->vu_iface)) {
+        error_report("Failed to initialize libvhost-user");
+        return;
+    }
+
+    /*
+     * Unset the callback function for the network listener so that other
+     * vhost-user clients keep waiting until this client disconnects
+     */
+    qio_net_listener_set_client_func(server->listener,
+                                     NULL,
+                                     NULL,
+                                     NULL);
+    server->sioc = sioc;
+    /*
+     * Increase the object reference count so sioc will not be freed by
+     * qio_net_listener_channel_func(), which calls object_unref(OBJECT(sioc))
+     */
+    object_ref(OBJECT(server->sioc));
+    qio_channel_set_name(QIO_CHANNEL(sioc), "vhost-user client");
+    server->ioc = QIO_CHANNEL(sioc);
+    object_ref(OBJECT(server->ioc));
+
+    /* TODO vu_message_write() spins if non-blocking!
*/ + qio_channel_set_blocking(server->ioc, false, NULL); + + server->co_trip = qemu_coroutine_create(vu_client_trip, server); + + aio_context_acquire(server->ctx); + vhost_user_server_attach_aio_context(server, server->ctx); + aio_context_release(server->ctx); +} + +void vhost_user_server_stop(VuServer *server) +{ + aio_context_acquire(server->ctx); + + qemu_bh_delete(server->restart_listener_bh); + server->restart_listener_bh = NULL; + + if (server->sioc) { + VuFdWatch *vu_fd_watch; + + QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) { + aio_set_fd_handler(server->ctx, vu_fd_watch->fd, true, + NULL, NULL, NULL, vu_fd_watch); + } + + qio_channel_shutdown(server->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL); + + AIO_WAIT_WHILE(server->ctx, server->co_trip); + } + + aio_context_release(server->ctx); + + if (server->listener) { + qio_net_listener_disconnect(server->listener); + object_unref(OBJECT(server->listener)); + } +} + +/* + * Allow the next client to connect to the server. Called from a BH in the main + * loop. + */ +static void restart_listener_bh(void *opaque) +{ + VuServer *server = opaque; + + qio_net_listener_set_client_func(server->listener, vu_accept, server, + NULL); +} + +/* Called with ctx acquired */ +void vhost_user_server_attach_aio_context(VuServer *server, AioContext *ctx) +{ + VuFdWatch *vu_fd_watch; + + server->ctx = ctx; + + if (!server->sioc) { + return; + } + + qio_channel_attach_aio_context(server->ioc, ctx); + + QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) { + aio_set_fd_handler(ctx, vu_fd_watch->fd, true, kick_handler, NULL, + NULL, vu_fd_watch); + } + + aio_co_schedule(ctx, server->co_trip); +} + +/* Called with server->ctx acquired */ +void vhost_user_server_detach_aio_context(VuServer *server) +{ + if (server->sioc) { + VuFdWatch *vu_fd_watch; + + QTAILQ_FOREACH(vu_fd_watch, &server->vu_fd_watches, next) { + aio_set_fd_handler(server->ctx, vu_fd_watch->fd, true, + NULL, NULL, NULL, vu_fd_watch); + } + + qio_channel_detach_aio_context(server->ioc); + } + + server->ctx = NULL; +} + +bool vhost_user_server_start(VuServer *server, + SocketAddress *socket_addr, + AioContext *ctx, + uint16_t max_queues, + const VuDevIface *vu_iface, + Error **errp) +{ + QEMUBH *bh; + QIONetListener *listener; + + if (socket_addr->type != SOCKET_ADDRESS_TYPE_UNIX && + socket_addr->type != SOCKET_ADDRESS_TYPE_FD) { + error_setg(errp, "Only socket address types 'unix' and 'fd' are supported"); + return false; + } + + listener = qio_net_listener_new(); + if (qio_net_listener_open_sync(listener, socket_addr, 1, + errp) < 0) { + object_unref(OBJECT(listener)); + return false; + } + + bh = qemu_bh_new(restart_listener_bh, server); + + /* zero out unspecified fields */ + *server = (VuServer) { + .listener = listener, + .restart_listener_bh = bh, + .vu_iface = vu_iface, + .max_queues = max_queues, + .ctx = ctx, + }; + + qio_net_listener_set_name(server->listener, "vhost-user-backend-listener"); + + qio_net_listener_set_client_func(server->listener, + vu_accept, + server, + NULL); + + QTAILQ_INIT(&server->vu_fd_watches); + return true; +} diff --git a/util/yank.c b/util/yank.c new file mode 100644 index 000000000..abf47c346 --- /dev/null +++ b/util/yank.c @@ -0,0 +1,199 @@ +/* + * QEMU yank feature + * + * Copyright (c) Lukas Straub <lukasstraub2@web.de> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. 
+ */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/thread.h" +#include "qemu/queue.h" +#include "qemu/lockable.h" +#include "qapi/qapi-commands-yank.h" +#include "qapi/qapi-visit-yank.h" +#include "qapi/clone-visitor.h" +#include "qemu/yank.h" + +struct YankFuncAndParam { + YankFn *func; + void *opaque; + QLIST_ENTRY(YankFuncAndParam) next; +}; + +struct YankInstanceEntry { + YankInstance *instance; + QLIST_HEAD(, YankFuncAndParam) yankfns; + QLIST_ENTRY(YankInstanceEntry) next; +}; + +typedef struct YankFuncAndParam YankFuncAndParam; +typedef struct YankInstanceEntry YankInstanceEntry; + +/* + * This lock protects the yank_instance_list below. Because it's taken by + * OOB-capable commands, it must be "fast", i.e. it may only be held for a + * bounded, short time. See docs/devel/qapi-code-gen.txt for additional + * information. + */ +static QemuMutex yank_lock; + +static QLIST_HEAD(, YankInstanceEntry) yank_instance_list + = QLIST_HEAD_INITIALIZER(yank_instance_list); + +static bool yank_instance_equal(const YankInstance *a, const YankInstance *b) +{ + if (a->type != b->type) { + return false; + } + + switch (a->type) { + case YANK_INSTANCE_TYPE_BLOCK_NODE: + return g_str_equal(a->u.block_node.node_name, + b->u.block_node.node_name); + + case YANK_INSTANCE_TYPE_CHARDEV: + return g_str_equal(a->u.chardev.id, b->u.chardev.id); + + case YANK_INSTANCE_TYPE_MIGRATION: + return true; + + default: + abort(); + } +} + +static YankInstanceEntry *yank_find_entry(const YankInstance *instance) +{ + YankInstanceEntry *entry; + + QLIST_FOREACH(entry, &yank_instance_list, next) { + if (yank_instance_equal(entry->instance, instance)) { + return entry; + } + } + return NULL; +} + +bool yank_register_instance(const YankInstance *instance, Error **errp) +{ + YankInstanceEntry *entry; + + QEMU_LOCK_GUARD(&yank_lock); + + if (yank_find_entry(instance)) { + error_setg(errp, "duplicate yank instance"); + return false; + } + + entry = g_new0(YankInstanceEntry, 1); + entry->instance = QAPI_CLONE(YankInstance, instance); + QLIST_INIT(&entry->yankfns); + QLIST_INSERT_HEAD(&yank_instance_list, entry, next); + + return true; +} + +void yank_unregister_instance(const YankInstance *instance) +{ + YankInstanceEntry *entry; + + QEMU_LOCK_GUARD(&yank_lock); + entry = yank_find_entry(instance); + assert(entry); + + assert(QLIST_EMPTY(&entry->yankfns)); + QLIST_REMOVE(entry, next); + qapi_free_YankInstance(entry->instance); + g_free(entry); +} + +void yank_register_function(const YankInstance *instance, + YankFn *func, + void *opaque) +{ + YankInstanceEntry *entry; + YankFuncAndParam *func_entry; + + QEMU_LOCK_GUARD(&yank_lock); + entry = yank_find_entry(instance); + assert(entry); + + func_entry = g_new0(YankFuncAndParam, 1); + func_entry->func = func; + func_entry->opaque = opaque; + + QLIST_INSERT_HEAD(&entry->yankfns, func_entry, next); +} + +void yank_unregister_function(const YankInstance *instance, + YankFn *func, + void *opaque) +{ + YankInstanceEntry *entry; + YankFuncAndParam *func_entry; + + QEMU_LOCK_GUARD(&yank_lock); + entry = yank_find_entry(instance); + assert(entry); + + QLIST_FOREACH(func_entry, &entry->yankfns, next) { + if (func_entry->func == func && func_entry->opaque == opaque) { + QLIST_REMOVE(func_entry, next); + g_free(func_entry); + return; + } + } + + abort(); +} + +void qmp_yank(YankInstanceList *instances, + Error **errp) +{ + YankInstanceList *tail; + YankInstanceEntry *entry; + YankFuncAndParam *func_entry; + + QEMU_LOCK_GUARD(&yank_lock); + for (tail = instances; tail; 
tail = tail->next) {
+        entry = yank_find_entry(tail->value);
+        if (!entry) {
+            error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND, "Instance not found");
+            return;
+        }
+    }
+    for (tail = instances; tail; tail = tail->next) {
+        entry = yank_find_entry(tail->value);
+        assert(entry);
+        QLIST_FOREACH(func_entry, &entry->yankfns, next) {
+            func_entry->func(func_entry->opaque);
+        }
+    }
+}
+
+YankInstanceList *qmp_query_yank(Error **errp)
+{
+    YankInstanceEntry *entry;
+    YankInstanceList *ret;
+
+    ret = NULL;
+
+    QEMU_LOCK_GUARD(&yank_lock);
+    QLIST_FOREACH(entry, &yank_instance_list, next) {
+        YankInstanceList *new_entry;
+        new_entry = g_new0(YankInstanceList, 1);
+        new_entry->value = QAPI_CLONE(YankInstance, entry->instance);
+        new_entry->next = ret;
+        ret = new_entry;
+    }
+
+    return ret;
+}
+
+static void __attribute__((__constructor__)) yank_init(void)
+{
+    qemu_mutex_init(&yank_lock);
+}