Diffstat (limited to 'include/block')
-rw-r--r-- | include/block/accounting.h | 123
-rw-r--r-- | include/block/aio-wait.h | 149
-rw-r--r-- | include/block/aio.h | 770
-rw-r--r-- | include/block/aio_task.h | 54
-rw-r--r-- | include/block/block-copy.h | 96
-rw-r--r-- | include/block/block-hmp-cmds.h | 54
-rw-r--r-- | include/block/block.h | 864
-rw-r--r-- | include/block/block_backup.h | 25
-rw-r--r-- | include/block/block_int.h | 1504
-rw-r--r-- | include/block/blockjob.h | 176
-rw-r--r-- | include/block/blockjob_int.h | 122
-rw-r--r-- | include/block/dirty-bitmap.h | 121
-rw-r--r-- | include/block/export.h | 89
-rw-r--r-- | include/block/fuse.h | 30
-rw-r--r-- | include/block/nbd.h | 427
-rw-r--r-- | include/block/nvme.h | 1501
-rw-r--r-- | include/block/qapi.h | 45
-rw-r--r-- | include/block/qdict.h | 32
-rw-r--r-- | include/block/raw-aio.h | 90
-rw-r--r-- | include/block/replication.h | 175
-rw-r--r-- | include/block/snapshot.h | 102
-rw-r--r-- | include/block/thread-pool.h | 37
-rw-r--r-- | include/block/throttle-groups.h | 91
-rw-r--r-- | include/block/write-threshold.h | 47
24 files changed, 6724 insertions, 0 deletions
diff --git a/include/block/accounting.h b/include/block/accounting.h new file mode 100644 index 000000000..878b4c358 --- /dev/null +++ b/include/block/accounting.h @@ -0,0 +1,123 @@ +/* + * QEMU System Emulator block accounting + * + * Copyright (c) 2011 Christoph Hellwig + * Copyright (c) 2015 Igalia, S.L. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef BLOCK_ACCOUNTING_H +#define BLOCK_ACCOUNTING_H + +#include "qemu/timed-average.h" +#include "qemu/thread.h" +#include "qapi/qapi-builtin-types.h" + +typedef struct BlockAcctTimedStats BlockAcctTimedStats; +typedef struct BlockAcctStats BlockAcctStats; + +enum BlockAcctType { + BLOCK_ACCT_NONE = 0, + BLOCK_ACCT_READ, + BLOCK_ACCT_WRITE, + BLOCK_ACCT_FLUSH, + BLOCK_ACCT_UNMAP, + BLOCK_MAX_IOTYPE, +}; + +struct BlockAcctTimedStats { + BlockAcctStats *stats; + TimedAverage latency[BLOCK_MAX_IOTYPE]; + unsigned interval_length; /* in seconds */ + QSLIST_ENTRY(BlockAcctTimedStats) entries; +}; + +typedef struct BlockLatencyHistogram { + /* The following histogram is represented like this: + * + * 5| * + * 4| * + * 3| * * + * 2| * * * + * 1| * * * * + * +------------------ + * 10 50 100 + * + * BlockLatencyHistogram histogram = { + * .nbins = 4, + * .boundaries = {10, 50, 100}, + * .bins = {3, 1, 5, 2}, + * }; + * + * @boundaries array define histogram intervals as follows: + * [0, boundaries[0]), [boundaries[0], boundaries[1]), ... 
+ * [boundaries[nbins-2], +inf) + * + * So, for example above, histogram intervals are: + * [0, 10), [10, 50), [50, 100), [100, +inf) + */ + int nbins; + uint64_t *boundaries; /* @nbins-1 numbers here + (all boundaries, except 0 and +inf) */ + uint64_t *bins; +} BlockLatencyHistogram; + +struct BlockAcctStats { + QemuMutex lock; + uint64_t nr_bytes[BLOCK_MAX_IOTYPE]; + uint64_t nr_ops[BLOCK_MAX_IOTYPE]; + uint64_t invalid_ops[BLOCK_MAX_IOTYPE]; + uint64_t failed_ops[BLOCK_MAX_IOTYPE]; + uint64_t total_time_ns[BLOCK_MAX_IOTYPE]; + uint64_t merged[BLOCK_MAX_IOTYPE]; + int64_t last_access_time_ns; + QSLIST_HEAD(, BlockAcctTimedStats) intervals; + bool account_invalid; + bool account_failed; + BlockLatencyHistogram latency_histogram[BLOCK_MAX_IOTYPE]; +}; + +typedef struct BlockAcctCookie { + int64_t bytes; + int64_t start_time_ns; + enum BlockAcctType type; +} BlockAcctCookie; + +void block_acct_init(BlockAcctStats *stats); +void block_acct_setup(BlockAcctStats *stats, bool account_invalid, + bool account_failed); +void block_acct_cleanup(BlockAcctStats *stats); +void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length); +BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats, + BlockAcctTimedStats *s); +void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie, + int64_t bytes, enum BlockAcctType type); +void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie); +void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie); +void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type); +void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type, + int num_requests); +int64_t block_acct_idle_time_ns(BlockAcctStats *stats); +double block_acct_queue_depth(BlockAcctTimedStats *stats, + enum BlockAcctType type); +int block_latency_histogram_set(BlockAcctStats *stats, enum BlockAcctType type, + uint64List *boundaries); +void block_latency_histograms_clear(BlockAcctStats *stats); + +#endif diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h new file mode 100644 index 000000000..b39eefb38 --- /dev/null +++ b/include/block/aio-wait.h @@ -0,0 +1,149 @@ +/* + * AioContext wait support + * + * Copyright (C) 2018 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef QEMU_AIO_WAIT_H +#define QEMU_AIO_WAIT_H + +#include "block/aio.h" +#include "qemu/main-loop.h" + +/** + * AioWait: + * + * An object that facilitates synchronous waiting on a condition. 
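The accounting functions declared above are used in start/finish pairs around each request. A minimal sketch of the intended pattern (not part of the patch; `stats`, `req`, and `submit_write()` are assumed names):

    BlockAcctCookie cookie;

    block_acct_start(stats, &cookie, req->bytes, BLOCK_ACCT_WRITE);
    ret = submit_write(req);                  /* hypothetical I/O helper */
    if (ret < 0) {
        block_acct_failed(stats, &cookie);    /* counted in failed_ops */
    } else {
        block_acct_done(stats, &cookie);      /* latency measured from start */
    }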
A single + * global AioWait object (global_aio_wait) is used internally. + * + * The main loop can wait on an operation running in an IOThread as follows: + * + * AioContext *ctx = ...; + * MyWork work = { .done = false }; + * schedule_my_work_in_iothread(ctx, &work); + * AIO_WAIT_WHILE(ctx, !work.done); + * + * The IOThread must call aio_wait_kick() to notify the main loop when + * work.done changes: + * + * static void do_work(...) + * { + * ... + * work.done = true; + * aio_wait_kick(); + * } + */ +typedef struct { + /* Number of waiting AIO_WAIT_WHILE() callers. Accessed with atomic ops. */ + unsigned num_waiters; +} AioWait; + +extern AioWait global_aio_wait; + +/** + * AIO_WAIT_WHILE: + * @ctx: the aio context, or NULL if multiple aio contexts (for which the + * caller does not hold a lock) are involved in the polling condition. + * @cond: wait while this conditional expression is true + * + * Wait while a condition is true. Use this to implement synchronous + * operations that require event loop activity. + * + * The caller must be sure that something calls aio_wait_kick() when the value + * of @cond might have changed. + * + * The caller's thread must be the IOThread that owns @ctx or the main loop + * thread (with @ctx acquired exactly once). This function cannot be used to + * wait on conditions between two IOThreads since that could lead to deadlock, + * go via the main loop instead. + */ +#define AIO_WAIT_WHILE(ctx, cond) ({ \ + bool waited_ = false; \ + AioWait *wait_ = &global_aio_wait; \ + AioContext *ctx_ = (ctx); \ + /* Increment wait_->num_waiters before evaluating cond. */ \ + qatomic_inc(&wait_->num_waiters); \ + if (ctx_ && in_aio_context_home_thread(ctx_)) { \ + while ((cond)) { \ + aio_poll(ctx_, true); \ + waited_ = true; \ + } \ + } else { \ + assert(qemu_get_current_aio_context() == \ + qemu_get_aio_context()); \ + while ((cond)) { \ + if (ctx_) { \ + aio_context_release(ctx_); \ + } \ + aio_poll(qemu_get_aio_context(), true); \ + if (ctx_) { \ + aio_context_acquire(ctx_); \ + } \ + waited_ = true; \ + } \ + } \ + qatomic_dec(&wait_->num_waiters); \ + waited_; }) + +/** + * aio_wait_kick: + * Wake up the main thread if it is waiting on AIO_WAIT_WHILE(). During + * synchronous operations performed in an IOThread, the main thread lets the + * IOThread's event loop run, waiting for the operation to complete. A + * aio_wait_kick() call will wake up the main thread. + */ +void aio_wait_kick(void); + +/** + * aio_wait_bh_oneshot: + * @ctx: the aio context + * @cb: the BH callback function + * @opaque: user data for the BH callback function + * + * Run a BH in @ctx and wait for it to complete. + * + * Must be called from the main loop thread with @ctx acquired exactly once. + * Note that main loop event processing may occur. + */ +void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque); + +/** + * in_aio_context_home_thread: + * @ctx: the aio context + * + * Return whether we are running in the thread that normally runs @ctx. Note + * that acquiring/releasing ctx does not affect the outcome, each AioContext + * still only has one home thread that is responsible for running it. 
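aio_wait_bh_oneshot(), declared above, bundles this wait pattern for the common case of running one function in another context and waiting for it to finish. A hedged sketch; MyDevice and do_attach_bh() are invented for illustration:

    static void do_attach_bh(void *opaque)
    {
        MyDevice *s = opaque;
        s->attached = true;    /* runs in ctx's home thread */
    }

    /* from the main loop thread, with ctx acquired exactly once */
    aio_wait_bh_oneshot(ctx, do_attach_bh, s);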
+ */ +static inline bool in_aio_context_home_thread(AioContext *ctx) +{ + if (ctx == qemu_get_current_aio_context()) { + return true; + } + + if (ctx == qemu_get_aio_context()) { + return qemu_mutex_iothread_locked(); + } else { + return false; + } +} + +#endif /* QEMU_AIO_WAIT_H */ diff --git a/include/block/aio.h b/include/block/aio.h new file mode 100644 index 000000000..47fbe9d81 --- /dev/null +++ b/include/block/aio.h @@ -0,0 +1,770 @@ +/* + * QEMU aio implementation + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_AIO_H +#define QEMU_AIO_H + +#ifdef CONFIG_LINUX_IO_URING +#include <liburing.h> +#endif +#include "qemu/coroutine.h" +#include "qemu/queue.h" +#include "qemu/event_notifier.h" +#include "qemu/thread.h" +#include "qemu/timer.h" + +typedef struct BlockAIOCB BlockAIOCB; +typedef void BlockCompletionFunc(void *opaque, int ret); + +typedef struct AIOCBInfo { + void (*cancel_async)(BlockAIOCB *acb); + AioContext *(*get_aio_context)(BlockAIOCB *acb); + size_t aiocb_size; +} AIOCBInfo; + +struct BlockAIOCB { + const AIOCBInfo *aiocb_info; + BlockDriverState *bs; + BlockCompletionFunc *cb; + void *opaque; + int refcnt; +}; + +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque); +void qemu_aio_unref(void *p); +void qemu_aio_ref(void *p); + +typedef struct AioHandler AioHandler; +typedef QLIST_HEAD(, AioHandler) AioHandlerList; +typedef void QEMUBHFunc(void *opaque); +typedef bool AioPollFn(void *opaque); +typedef void IOHandler(void *opaque); + +struct Coroutine; +struct ThreadPool; +struct LinuxAioState; +struct LuringState; + +/* Is polling disabled? */ +bool aio_poll_disabled(AioContext *ctx); + +/* Callbacks for file descriptor monitoring implementations */ +typedef struct { + /* + * update: + * @ctx: the AioContext + * @old_node: the existing handler or NULL if this file descriptor is being + * monitored for the first time + * @new_node: the new handler or NULL if this file descriptor is being + * removed + * + * Add/remove/modify a monitored file descriptor. + * + * Called with ctx->list_lock acquired. + */ + void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node); + + /* + * wait: + * @ctx: the AioContext + * @ready_list: list for handlers that become ready + * @timeout: maximum duration to wait, in nanoseconds + * + * Wait for file descriptors to become ready and place them on ready_list. + * + * Called with ctx->list_lock incremented but not locked. + * + * Returns: number of ready file descriptors. + */ + int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout); + + /* + * need_wait: + * @ctx: the AioContext + * + * Tell aio_poll() when to stop userspace polling early because ->wait() + * has fds ready. + * + * File descriptor monitoring implementations that cannot poll fd readiness + * from userspace should use aio_poll_disabled() here. This ensures that + * file descriptors are not starved by handlers that frequently make + * progress via userspace polling. + * + * Returns: true if ->wait() should be called, false otherwise. + */ + bool (*need_wait)(AioContext *ctx); +} FDMonOps; + +/* + * Each aio_bh_poll() call carves off a slice of the BH list, so that newly + * scheduled BHs are not processed until the next aio_bh_poll() call. 
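Drivers typically define a static AIOCBInfo and allocate their per-request AIOCB with qemu_aio_get(), embedding BlockAIOCB as the first member so the allocation can be cast both ways. A sketch under assumed names (MyAIOCB is invented):

    typedef struct MyAIOCB {
        BlockAIOCB common;     /* must be first: qemu_aio_get() returns it */
        int my_state;
    } MyAIOCB;

    static const AIOCBInfo my_aiocb_info = {
        .aiocb_size = sizeof(MyAIOCB),
    };

    MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);
    /* ... submit the request; on completion: */
    acb->common.cb(acb->common.opaque, 0);
    qemu_aio_unref(acb);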
All + * active aio_bh_poll() calls chain their slices together in a list, so that + * nested aio_bh_poll() calls process all scheduled bottom halves. + */ +typedef QSLIST_HEAD(, QEMUBH) BHList; +typedef struct BHListSlice BHListSlice; +struct BHListSlice { + BHList bh_list; + QSIMPLEQ_ENTRY(BHListSlice) next; +}; + +typedef QSLIST_HEAD(, AioHandler) AioHandlerSList; + +struct AioContext { + GSource source; + + /* Used by AioContext users to protect from multi-threaded access. */ + QemuRecMutex lock; + + /* The list of registered AIO handlers. Protected by ctx->list_lock. */ + AioHandlerList aio_handlers; + + /* The list of AIO handlers to be deleted. Protected by ctx->list_lock. */ + AioHandlerList deleted_aio_handlers; + + /* Used to avoid unnecessary event_notifier_set calls in aio_notify; + * only written from the AioContext home thread, or under the BQL in + * the case of the main AioContext. However, it is read from any + * thread so it is still accessed with atomic primitives. + * + * If this field is 0, everything (file descriptors, bottom halves, + * timers) will be re-evaluated before the next blocking poll() or + * io_uring wait; therefore, the event_notifier_set call can be + * skipped. If it is non-zero, you may need to wake up a concurrent + * aio_poll or the glib main event loop, making event_notifier_set + * necessary. + * + * Bit 0 is reserved for GSource usage of the AioContext, and is 1 + * between a call to aio_ctx_prepare and the next call to aio_ctx_check. + * Bits 1-31 simply count the number of active calls to aio_poll + * that are in the prepare or poll phase. + * + * The GSource and aio_poll must use a different mechanism because + * there is no certainty that a call to GSource's prepare callback + * (via g_main_context_prepare) is indeed followed by check and + * dispatch. It's not clear whether this would be a bug, but let's + * play safe and allow it---it will just cause extra calls to + * event_notifier_set until the next call to dispatch. + * + * Instead, the aio_poll calls include both the prepare and the + * dispatch phase, hence a simple counter is enough for them. + */ + uint32_t notify_me; + + /* A lock to protect between QEMUBH and AioHandler adders and deleter, + * and to ensure that no callbacks are removed while we're walking and + * dispatching them. + */ + QemuLockCnt list_lock; + + /* Bottom Halves pending aio_bh_poll() processing */ + BHList bh_list; + + /* Chained BH list slices for each nested aio_bh_poll() call */ + QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list; + + /* Used by aio_notify. + * + * "notified" is used to avoid expensive event_notifier_test_and_clear + * calls. When it is clear, the EventNotifier is clear, or one thread + * is going to clear "notified" before processing more events. False + * positives are possible, i.e. "notified" could be set even though the + * EventNotifier is clear. + * + * Note that event_notifier_set *cannot* be optimized the same way. For + * more information on the problem that would result, see "#ifdef BUG2" + * in the docs/aio_notify_accept.promela formal model. + */ + bool notified; + EventNotifier notifier; + + QSLIST_HEAD(, Coroutine) scheduled_coroutines; + QEMUBH *co_schedule_bh; + + /* Thread pool for performing work and receiving completion callbacks. + * Has its own locking. + */ + struct ThreadPool *thread_pool; + +#ifdef CONFIG_LINUX_AIO + /* + * State for native Linux AIO. Uses aio_context_acquire/release for + * locking. 
+ */ + struct LinuxAioState *linux_aio; +#endif +#ifdef CONFIG_LINUX_IO_URING + /* + * State for Linux io_uring. Uses aio_context_acquire/release for + * locking. + */ + struct LuringState *linux_io_uring; + + /* State for file descriptor monitoring using Linux io_uring */ + struct io_uring fdmon_io_uring; + AioHandlerSList submit_list; +#endif + + /* TimerLists for calling timers - one per clock type. Has its own + * locking. + */ + QEMUTimerListGroup tlg; + + int external_disable_cnt; + + /* Number of AioHandlers without .io_poll() */ + int poll_disable_cnt; + + /* Polling mode parameters */ + int64_t poll_ns; /* current polling time in nanoseconds */ + int64_t poll_max_ns; /* maximum polling time in nanoseconds */ + int64_t poll_grow; /* polling time growth factor */ + int64_t poll_shrink; /* polling time shrink factor */ + + /* AIO engine parameters */ + int64_t aio_max_batch; /* maximum number of requests in a batch */ + + /* + * List of handlers participating in userspace polling. Protected by + * ctx->list_lock. Iterated and modified mostly by the event loop thread + * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler() + * only touches the list to delete nodes if ctx->list_lock's count is zero. + */ + AioHandlerList poll_aio_handlers; + + /* Are we in polling mode or monitoring file descriptors? */ + bool poll_started; + + /* epoll(7) state used when built with CONFIG_EPOLL */ + int epollfd; + + const FDMonOps *fdmon_ops; +}; + +/** + * aio_context_new: Allocate a new AioContext. + * + * AioContext provide a mini event-loop that can be waited on synchronously. + * They also provide bottom halves, a service to execute a piece of code + * as soon as possible. + */ +AioContext *aio_context_new(Error **errp); + +/** + * aio_context_ref: + * @ctx: The AioContext to operate on. + * + * Add a reference to an AioContext. + */ +void aio_context_ref(AioContext *ctx); + +/** + * aio_context_unref: + * @ctx: The AioContext to operate on. + * + * Drop a reference to an AioContext. + */ +void aio_context_unref(AioContext *ctx); + +/* Take ownership of the AioContext. If the AioContext will be shared between + * threads, and a thread does not want to be interrupted, it will have to + * take ownership around calls to aio_poll(). Otherwise, aio_poll() + * automatically takes care of calling aio_context_acquire and + * aio_context_release. + * + * Note that this is separate from bdrv_drained_begin/bdrv_drained_end. A + * thread still has to call those to avoid being interrupted by the guest. + * + * Bottom halves, timers and callbacks can be created or removed without + * acquiring the AioContext. + */ +void aio_context_acquire(AioContext *ctx); + +/* Relinquish ownership of the AioContext. */ +void aio_context_release(AioContext *ctx); + +/** + * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will + * run only once and as soon as possible. + * + * @name: A human-readable identifier for debugging purposes. + */ +void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque, + const char *name); + +/** + * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run + * only once and as soon as possible. + * + * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as the + * name string. + */ +#define aio_bh_schedule_oneshot(ctx, cb, opaque) \ + aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb))) + +/** + * aio_bh_new_full: Allocate a new bottom half structure. 
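The aio_context_acquire()/aio_context_release() pair documented above behaves like a recursive mutex around the context. A minimal hedged sketch; bdrv_get_aio_context() is declared later in block.h, outside the portion shown here:

    AioContext *ctx = bdrv_get_aio_context(bs);

    aio_context_acquire(ctx);
    /* safe to operate on bs here, even from another thread */
    ret = bdrv_flush(bs);
    aio_context_release(ctx);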
+ * + * Bottom halves are lightweight callbacks whose invocation is guaranteed + * to be wait-free, thread-safe and signal-safe. The #QEMUBH structure + * is opaque and must be allocated prior to its use. + * + * @name: A human-readable identifier for debugging purposes. + */ +QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque, + const char *name); + +/** + * aio_bh_new: Allocate a new bottom half structure + * + * A convenience wrapper for aio_bh_new_full() that uses the cb as the name + * string. + */ +#define aio_bh_new(ctx, cb, opaque) \ + aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb))) + +/** + * aio_notify: Force processing of pending events. + * + * Similar to signaling a condition variable, aio_notify forces + * aio_poll to exit, so that the next call will re-examine pending events. + * The caller of aio_notify will usually call aio_poll again very soon, + * or go through another iteration of the GLib main loop. Hence, aio_notify + * also has the side effect of recalculating the sets of file descriptors + * that the main loop waits for. + * + * Calling aio_notify is rarely necessary, because for example scheduling + * a bottom half calls it already. + */ +void aio_notify(AioContext *ctx); + +/** + * aio_notify_accept: Acknowledge receiving an aio_notify. + * + * aio_notify() uses an EventNotifier in order to wake up a sleeping + * aio_poll() or g_main_context_iteration(). Calls to aio_notify() are + * usually rare, but the AioContext has to clear the EventNotifier on + * every aio_poll() or g_main_context_iteration() in order to avoid + * busy waiting. This event_notifier_test_and_clear() cannot be done + * using the usual aio_context_set_event_notifier(), because it must + * be done before processing all events (file descriptors, bottom halves, + * timers). + * + * aio_notify_accept() is an optimized event_notifier_test_and_clear() + * that is specific to an AioContext's notifier; it is used internally + * to clear the EventNotifier only if aio_notify() had been called. + */ +void aio_notify_accept(AioContext *ctx); + +/** + * aio_bh_call: Executes callback function of the specified BH. + */ +void aio_bh_call(QEMUBH *bh); + +/** + * aio_bh_poll: Poll bottom halves for an AioContext. + * + * These are internal functions used by the QEMU main loop. + * And notice that multiple occurrences of aio_bh_poll cannot + * be called concurrently + */ +int aio_bh_poll(AioContext *ctx); + +/** + * qemu_bh_schedule: Schedule a bottom half. + * + * Scheduling a bottom half interrupts the main loop and causes the + * execution of the callback that was passed to qemu_bh_new. + * + * Bottom halves that are scheduled from a bottom half handler are instantly + * invoked. This can create an infinite loop if a bottom half handler + * schedules itself. + * + * @bh: The bottom half to be scheduled. + */ +void qemu_bh_schedule(QEMUBH *bh); + +/** + * qemu_bh_cancel: Cancel execution of a bottom half. + * + * Canceling execution of a bottom half undoes the effect of calls to + * qemu_bh_schedule without freeing its resources yet. While cancellation + * itself is also wait-free and thread-safe, it can of course race with the + * loop that executes bottom halves unless you are holding the iothread + * mutex. This makes it mostly useless if you are not holding the mutex. + * + * @bh: The bottom half to be canceled. + */ +void qemu_bh_cancel(QEMUBH *bh); + +/** + *qemu_bh_delete: Cancel execution of a bottom half and free its resources. 
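Taken together, the BH functions above (aio_bh_new(), qemu_bh_schedule(), qemu_bh_cancel(), qemu_bh_delete()) follow a create/schedule/delete lifecycle. A sketch with invented names:

    static void my_bh_cb(void *opaque)
    {
        MyState *s = opaque;
        s->pending = false;
    }

    QEMUBH *bh = aio_bh_new(ctx, my_bh_cb, s);
    qemu_bh_schedule(bh);   /* wakes the event loop; cb runs in ctx's thread */
    /* ... */
    qemu_bh_delete(bh);     /* cancels if still pending, then frees */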
+ *
+ * Deleting a bottom half frees the memory that was allocated for it by
+ * qemu_bh_new. It also implies canceling the bottom half if it was
+ * scheduled.
+ * This function is asynchronous: the bottom half performs the actual
+ * deletion at the very end of its processing.
+ *
+ * @bh: The bottom half to be deleted.
+ */
+void qemu_bh_delete(QEMUBH *bh);
+
+/* Return whether there are any pending callbacks from the GSource
+ * attached to the AioContext, before g_poll is invoked.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_prepare(AioContext *ctx);
+
+/* Return whether there are any pending callbacks from the GSource
+ * attached to the AioContext, after g_poll is invoked.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_pending(AioContext *ctx);
+
+/* Dispatch any pending callbacks from the GSource attached to the AioContext.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+void aio_dispatch(AioContext *ctx);
+
+/* Make progress in completing AIO work. This can issue new pending
+ * aio as a result of executing I/O completion or bh callbacks.
+ *
+ * Return whether any progress was made by executing AIO or bottom half
+ * handlers. If @blocking == true, this should always be true except
+ * if someone called aio_notify.
+ *
+ * If there are no pending bottom halves, but there are pending AIO
+ * operations, it may not be possible to make any progress without
+ * blocking. If @blocking is true, this function will wait until one
+ * or more AIO events have completed, to ensure something has moved
+ * before returning.
+ */
+bool aio_poll(AioContext *ctx, bool blocking);
+
+/* Register a file descriptor and associated callbacks. Behaves very similarly
+ * to qemu_set_fd_handler. Unlike qemu_set_fd_handler, these callbacks will
+ * be invoked when using aio_poll().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of qemu_set_fd_handler[2].
+ */
+void aio_set_fd_handler(AioContext *ctx,
+                        int fd,
+                        bool is_external,
+                        IOHandler *io_read,
+                        IOHandler *io_write,
+                        AioPollFn *io_poll,
+                        void *opaque);
+
+/* Set polling begin/end callbacks for a file descriptor that has already been
+ * registered with aio_set_fd_handler. Do nothing if the file descriptor is
+ * not registered.
+ */
+void aio_set_fd_poll(AioContext *ctx, int fd,
+                     IOHandler *io_poll_begin,
+                     IOHandler *io_poll_end);
+
+/* Register an event notifier and associated callbacks. Behaves very similarly
+ * to event_notifier_set_handler. Unlike event_notifier_set_handler, these
+ * callbacks will be invoked when using aio_poll().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of event_notifier_set_handler.
+ */
+void aio_set_event_notifier(AioContext *ctx,
+                            EventNotifier *notifier,
+                            bool is_external,
+                            EventNotifierHandler *io_read,
+                            AioPollFn *io_poll);
+
+/* Set polling begin/end callbacks for an event notifier that has already been
+ * registered with aio_set_event_notifier. Do nothing if the event notifier is
+ * not registered.
+ */
+void aio_set_event_notifier_poll(AioContext *ctx,
+                                 EventNotifier *notifier,
+                                 EventNotifierHandler *io_poll_begin,
+                                 EventNotifierHandler *io_poll_end);
+
+/* Return a GSource that lets the main loop poll the file descriptors attached
+ * to this AioContext.
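aio_set_fd_handler() above is the AioContext-aware replacement for qemu_set_fd_handler(). A hedged sketch that registers a read handler for a socket and later removes it; MyConn and my_read_ready() are invented:

    static void my_read_ready(void *opaque)
    {
        MyConn *conn = opaque;
        /* read from conn->fd; must not block */
    }

    aio_set_fd_handler(ctx, conn->fd, true /* is_external */,
                       my_read_ready, NULL, NULL, conn);
    /* ... later, passing NULL for all handlers unregisters the fd */
    aio_set_fd_handler(ctx, conn->fd, true, NULL, NULL, NULL, NULL);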
+ */ +GSource *aio_get_g_source(AioContext *ctx); + +/* Return the ThreadPool bound to this AioContext */ +struct ThreadPool *aio_get_thread_pool(AioContext *ctx); + +/* Setup the LinuxAioState bound to this AioContext */ +struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp); + +/* Return the LinuxAioState bound to this AioContext */ +struct LinuxAioState *aio_get_linux_aio(AioContext *ctx); + +/* Setup the LuringState bound to this AioContext */ +struct LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp); + +/* Return the LuringState bound to this AioContext */ +struct LuringState *aio_get_linux_io_uring(AioContext *ctx); +/** + * aio_timer_new_with_attrs: + * @ctx: the aio context + * @type: the clock type + * @scale: the scale + * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values + * to assign + * @cb: the callback to call on timer expiry + * @opaque: the opaque pointer to pass to the callback + * + * Allocate a new timer (with attributes) attached to the context @ctx. + * The function is responsible for memory allocation. + * + * The preferred interface is aio_timer_init or aio_timer_init_with_attrs. + * Use that unless you really need dynamic memory allocation. + * + * Returns: a pointer to the new timer + */ +static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx, + QEMUClockType type, + int scale, int attributes, + QEMUTimerCB *cb, void *opaque) +{ + return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque); +} + +/** + * aio_timer_new: + * @ctx: the aio context + * @type: the clock type + * @scale: the scale + * @cb: the callback to call on timer expiry + * @opaque: the opaque pointer to pass to the callback + * + * Allocate a new timer attached to the context @ctx. + * See aio_timer_new_with_attrs for details. + * + * Returns: a pointer to the new timer + */ +static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type, + int scale, + QEMUTimerCB *cb, void *opaque) +{ + return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque); +} + +/** + * aio_timer_init_with_attrs: + * @ctx: the aio context + * @ts: the timer + * @type: the clock type + * @scale: the scale + * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values + * to assign + * @cb: the callback to call on timer expiry + * @opaque: the opaque pointer to pass to the callback + * + * Initialise a new timer (with attributes) attached to the context @ctx. + * The caller is responsible for memory allocation. + */ +static inline void aio_timer_init_with_attrs(AioContext *ctx, + QEMUTimer *ts, QEMUClockType type, + int scale, int attributes, + QEMUTimerCB *cb, void *opaque) +{ + timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque); +} + +/** + * aio_timer_init: + * @ctx: the aio context + * @ts: the timer + * @type: the clock type + * @scale: the scale + * @cb: the callback to call on timer expiry + * @opaque: the opaque pointer to pass to the callback + * + * Initialise a new timer attached to the context @ctx. + * See aio_timer_init_with_attrs for details. + */ +static inline void aio_timer_init(AioContext *ctx, + QEMUTimer *ts, QEMUClockType type, + int scale, + QEMUTimerCB *cb, void *opaque) +{ + timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque); +} + +/** + * aio_compute_timeout: + * @ctx: the aio context + * + * Compute the timeout that a blocking aio_poll should use. 
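The timer helpers above attach a QEMUTimer to @ctx's timer list group. A sketch of the preferred caller-allocated form; timer_mod() and qemu_clock_get_ms() come from qemu/timer.h, and the other names are illustrative:

    static void my_timeout_cb(void *opaque)
    {
        MyState *s = opaque;
        s->expired = true;
    }

    QEMUTimer timer;    /* caller-allocated, per the aio_timer_init contract */

    aio_timer_init(ctx, &timer, QEMU_CLOCK_REALTIME, SCALE_MS,
                   my_timeout_cb, s);
    timer_mod(&timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 100);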
+ */ +int64_t aio_compute_timeout(AioContext *ctx); + +/** + * aio_disable_external: + * @ctx: the aio context + * + * Disable the further processing of external clients. + */ +static inline void aio_disable_external(AioContext *ctx) +{ + qatomic_inc(&ctx->external_disable_cnt); +} + +/** + * aio_enable_external: + * @ctx: the aio context + * + * Enable the processing of external clients. + */ +static inline void aio_enable_external(AioContext *ctx) +{ + int old; + + old = qatomic_fetch_dec(&ctx->external_disable_cnt); + assert(old > 0); + if (old == 1) { + /* Kick event loop so it re-arms file descriptors */ + aio_notify(ctx); + } +} + +/** + * aio_external_disabled: + * @ctx: the aio context + * + * Return true if the external clients are disabled. + */ +static inline bool aio_external_disabled(AioContext *ctx) +{ + return qatomic_read(&ctx->external_disable_cnt); +} + +/** + * aio_node_check: + * @ctx: the aio context + * @is_external: Whether or not the checked node is an external event source. + * + * Check if the node's is_external flag is okay to be polled by the ctx at this + * moment. True means green light. + */ +static inline bool aio_node_check(AioContext *ctx, bool is_external) +{ + return !is_external || !qatomic_read(&ctx->external_disable_cnt); +} + +/** + * aio_co_schedule: + * @ctx: the aio context + * @co: the coroutine + * + * Start a coroutine on a remote AioContext. + * + * The coroutine must not be entered by anyone else while aio_co_schedule() + * is active. In addition the coroutine must have yielded unless ctx + * is the context in which the coroutine is running (i.e. the value of + * qemu_get_current_aio_context() from the coroutine itself). + */ +void aio_co_schedule(AioContext *ctx, struct Coroutine *co); + +/** + * aio_co_reschedule_self: + * @new_ctx: the new context + * + * Move the currently running coroutine to new_ctx. If the coroutine is already + * running in new_ctx, do nothing. + */ +void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx); + +/** + * aio_co_wake: + * @co: the coroutine + * + * Restart a coroutine on the AioContext where it was running last, thus + * preventing coroutines from jumping from one context to another when they + * go to sleep. + * + * aio_co_wake may be executed either in coroutine or non-coroutine + * context. The coroutine must not be entered by anyone else while + * aio_co_wake() is active. + */ +void aio_co_wake(struct Coroutine *co); + +/** + * aio_co_enter: + * @ctx: the context to run the coroutine + * @co: the coroutine to run + * + * Enter a coroutine in the specified AioContext. + */ +void aio_co_enter(AioContext *ctx, struct Coroutine *co); + +/** + * Return the AioContext whose event loop runs in the current thread. + * + * If called from an IOThread this will be the IOThread's AioContext. If + * called from the main thread or with the "big QEMU lock" taken it + * will be the main loop AioContext. + */ +AioContext *qemu_get_current_aio_context(void); + +void qemu_set_current_aio_context(AioContext *ctx); + +/** + * aio_context_setup: + * @ctx: the aio context + * + * Initialize the aio context. + */ +void aio_context_setup(AioContext *ctx); + +/** + * aio_context_destroy: + * @ctx: the aio context + * + * Destroy the aio context. 
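aio_co_wake() and aio_co_schedule() above are the standard way to resume a coroutine from another thread. A hedged sketch of a coroutine parking itself until woken; s->co is an assumed Coroutine pointer field:

    /* in the coroutine */
    s->co = qemu_coroutine_self();
    qemu_coroutine_yield();          /* sleep until the wake below */

    /* elsewhere, once s->co has yielded */
    aio_co_wake(s->co);              /* resumes co in its last AioContext */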
+ */ +void aio_context_destroy(AioContext *ctx); + +/* Used internally, do not call outside AioContext code */ +void aio_context_use_g_source(AioContext *ctx); + +/** + * aio_context_set_poll_params: + * @ctx: the aio context + * @max_ns: how long to busy poll for, in nanoseconds + * @grow: polling time growth factor + * @shrink: polling time shrink factor + * + * Poll mode can be disabled by setting poll_max_ns to 0. + */ +void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, + int64_t grow, int64_t shrink, + Error **errp); + +/** + * aio_context_set_aio_params: + * @ctx: the aio context + * @max_batch: maximum number of requests in a batch, 0 means that the + * engine will use its default + */ +void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch, + Error **errp); + +#endif diff --git a/include/block/aio_task.h b/include/block/aio_task.h new file mode 100644 index 000000000..50bc1e181 --- /dev/null +++ b/include/block/aio_task.h @@ -0,0 +1,54 @@ +/* + * Aio tasks loops + * + * Copyright (c) 2019 Virtuozzo International GmbH. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef BLOCK_AIO_TASK_H +#define BLOCK_AIO_TASK_H + +#include "qemu/coroutine.h" + +typedef struct AioTaskPool AioTaskPool; +typedef struct AioTask AioTask; +typedef int coroutine_fn (*AioTaskFunc)(AioTask *task); +struct AioTask { + AioTaskPool *pool; + AioTaskFunc func; + int ret; +}; + +AioTaskPool *coroutine_fn aio_task_pool_new(int max_busy_tasks); +void aio_task_pool_free(AioTaskPool *); + +/* error code of failed task or 0 if all is OK */ +int aio_task_pool_status(AioTaskPool *pool); + +bool aio_task_pool_empty(AioTaskPool *pool); + +/* User provides filled @task, however task->pool will be set automatically */ +void coroutine_fn aio_task_pool_start_task(AioTaskPool *pool, AioTask *task); + +void coroutine_fn aio_task_pool_wait_slot(AioTaskPool *pool); +void coroutine_fn aio_task_pool_wait_one(AioTaskPool *pool); +void coroutine_fn aio_task_pool_wait_all(AioTaskPool *pool); + +#endif /* BLOCK_AIO_TASK_H */ diff --git a/include/block/block-copy.h b/include/block/block-copy.h new file mode 100644 index 000000000..99370fa38 --- /dev/null +++ b/include/block/block-copy.h @@ -0,0 +1,96 @@ +/* + * block_copy API + * + * Copyright (C) 2013 Proxmox Server Solutions + * Copyright (c) 2019 Virtuozzo International GmbH. 
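The aio_task_pool API in aio_task.h above caps the number of concurrently running coroutine tasks. A sketch of the intended producer loop, running in coroutine context; MyTask, make_task(), have_work(), and do_one_chunk() are invented:

    typedef struct MyTask {
        AioTask base;                /* base.func filled in by the caller */
        int64_t offset;
    } MyTask;

    static int coroutine_fn my_task_func(AioTask *task)
    {
        MyTask *t = container_of(task, MyTask, base);
        return do_one_chunk(t->offset);
    }

    AioTaskPool *pool = aio_task_pool_new(16);

    while (have_work() && aio_task_pool_status(pool) == 0) {
        MyTask *t = make_task(my_task_func);
        aio_task_pool_wait_slot(pool);           /* throttle to 16 busy tasks */
        aio_task_pool_start_task(pool, &t->base);
    }
    aio_task_pool_wait_all(pool);
    ret = aio_task_pool_status(pool);            /* 0 or first task error */
    aio_task_pool_free(pool);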
+ *
+ * Authors:
+ *  Dietmar Maurer (dietmar@proxmox.com)
+ *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef BLOCK_COPY_H
+#define BLOCK_COPY_H
+
+#include "block/block.h"
+#include "qemu/co-shared-resource.h"
+
+/* All APIs are thread-safe */
+
+typedef void (*BlockCopyAsyncCallbackFunc)(void *opaque);
+typedef struct BlockCopyState BlockCopyState;
+typedef struct BlockCopyCallState BlockCopyCallState;
+
+BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+                                     Error **errp);
+
+/* This function must be called prior to any actual copy request */
+void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range,
+                              bool compress);
+void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm);
+
+void block_copy_state_free(BlockCopyState *s);
+
+int64_t block_copy_reset_unallocated(BlockCopyState *s,
+                                     int64_t offset, int64_t *count);
+
+int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
+                            bool ignore_ratelimit);
+
+/*
+ * Run block-copy in a coroutine, create a corresponding BlockCopyCallState
+ * object and return a pointer to it. Never returns NULL.
+ *
+ * The caller is responsible for calling block_copy_call_free() to free the
+ * BlockCopyCallState object.
+ *
+ * @max_workers is the maximum number of parallel coroutines used to execute
+ * sub-requests; it must be > 0.
+ *
+ * @max_chunk is the maximum length of one I/O operation. Zero means unlimited.
+ */
+BlockCopyCallState *block_copy_async(BlockCopyState *s,
+                                     int64_t offset, int64_t bytes,
+                                     int max_workers, int64_t max_chunk,
+                                     BlockCopyAsyncCallbackFunc cb,
+                                     void *cb_opaque);
+
+/*
+ * Free a finished BlockCopyCallState. Trying to free a running block-copy
+ * call will crash.
+ */
+void block_copy_call_free(BlockCopyCallState *call_state);
+
+/*
+ * Note that the block-copy call is marked finished before the callback is
+ * invoked.
+ */
+bool block_copy_call_finished(BlockCopyCallState *call_state);
+bool block_copy_call_succeeded(BlockCopyCallState *call_state);
+bool block_copy_call_failed(BlockCopyCallState *call_state);
+bool block_copy_call_cancelled(BlockCopyCallState *call_state);
+int block_copy_call_status(BlockCopyCallState *call_state, bool *error_is_read);
+
+void block_copy_set_speed(BlockCopyState *s, uint64_t speed);
+void block_copy_kick(BlockCopyCallState *call_state);
+
+/*
+ * Cancel a running block-copy call.
+ *
+ * Cancelling leaves the block-copy state valid: dirty bits are correct, so
+ * you may use cancel + <run block_copy with the same parameters> to emulate
+ * pause/resume.
+ *
+ * Note also that cancellation is asynchronous: it only marks the block-copy
+ * call as cancelled. The call may therefore be cancelled
+ * (block_copy_call_cancelled() reports true) but not yet finished
+ * (block_copy_call_finished() reports false).
+ */
+void block_copy_call_cancel(BlockCopyCallState *call_state);
+
+BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s);
+int64_t block_copy_cluster_size(BlockCopyState *s);
+void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip);
+
+#endif /* BLOCK_COPY_H */
diff --git a/include/block/block-hmp-cmds.h b/include/block/block-hmp-cmds.h
new file mode 100644
index 000000000..3412e108c
--- /dev/null
+++ b/include/block/block-hmp-cmds.h
@@ -0,0 +1,54 @@
+/*
+ * HMP commands related to the block layer
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2020 Red Hat, Inc.
+ * Copyright IBM, Corp.
2011 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef BLOCK_HMP_COMMANDS_H +#define BLOCK_HMP_COMMANDS_H + +void hmp_drive_add(Monitor *mon, const QDict *qdict); + +void hmp_commit(Monitor *mon, const QDict *qdict); +void hmp_drive_del(Monitor *mon, const QDict *qdict); + +void hmp_drive_mirror(Monitor *mon, const QDict *qdict); +void hmp_drive_backup(Monitor *mon, const QDict *qdict); + +void hmp_block_job_set_speed(Monitor *mon, const QDict *qdict); +void hmp_block_job_cancel(Monitor *mon, const QDict *qdict); +void hmp_block_job_pause(Monitor *mon, const QDict *qdict); +void hmp_block_job_resume(Monitor *mon, const QDict *qdict); +void hmp_block_job_complete(Monitor *mon, const QDict *qdict); + +void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict); +void hmp_snapshot_blkdev_internal(Monitor *mon, const QDict *qdict); +void hmp_snapshot_delete_blkdev_internal(Monitor *mon, const QDict *qdict); + +void hmp_nbd_server_start(Monitor *mon, const QDict *qdict); +void hmp_nbd_server_add(Monitor *mon, const QDict *qdict); +void hmp_nbd_server_remove(Monitor *mon, const QDict *qdict); +void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict); + +void hmp_block_resize(Monitor *mon, const QDict *qdict); +void hmp_block_stream(Monitor *mon, const QDict *qdict); +void hmp_block_passwd(Monitor *mon, const QDict *qdict); +void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict); +void hmp_eject(Monitor *mon, const QDict *qdict); + +void hmp_qemu_io(Monitor *mon, const QDict *qdict); + +void hmp_info_block(Monitor *mon, const QDict *qdict); +void hmp_info_blockstats(Monitor *mon, const QDict *qdict); +void hmp_info_block_jobs(Monitor *mon, const QDict *qdict); +void hmp_info_snapshots(Monitor *mon, const QDict *qdict); + +#endif diff --git a/include/block/block.h b/include/block/block.h new file mode 100644 index 000000000..e5dd22b03 --- /dev/null +++ b/include/block/block.h @@ -0,0 +1,864 @@ +#ifndef BLOCK_H +#define BLOCK_H + +#include "block/aio.h" +#include "block/aio-wait.h" +#include "qemu/iov.h" +#include "qemu/coroutine.h" +#include "block/accounting.h" +#include "block/dirty-bitmap.h" +#include "block/blockjob.h" +#include "qemu/hbitmap.h" +#include "qemu/transactions.h" + +/* + * generated_co_wrapper + * + * Function specifier, which does nothing but mark functions to be + * generated by scripts/block-coroutine-wrapper.py + * + * Read more in docs/devel/block-coroutine-wrapper.rst + */ +#define generated_co_wrapper + +/* block.c */ +typedef struct BlockDriver BlockDriver; +typedef struct BdrvChild BdrvChild; +typedef struct BdrvChildClass BdrvChildClass; + +typedef struct BlockDriverInfo { + /* in bytes, 0 if irrelevant */ + int cluster_size; + /* offset at which the VM state can be saved (0 if not possible) */ + int64_t vm_state_offset; + bool is_dirty; + /* + * True if this block driver only supports compressed writes + */ + bool needs_compressed_writes; +} BlockDriverInfo; + +typedef struct BlockFragInfo { + uint64_t allocated_clusters; + uint64_t total_clusters; + uint64_t fragmented_clusters; + uint64_t compressed_clusters; +} BlockFragInfo; + +typedef enum { + BDRV_REQ_COPY_ON_READ = 0x1, + BDRV_REQ_ZERO_WRITE = 0x2, + + /* + * The BDRV_REQ_MAY_UNMAP flag is used in write_zeroes requests to indicate + * that the block driver should unmap (discard) blocks if it is guaranteed + * that the result will read back as zeroes. 
+ * The flag is only passed to the driver if the block device is opened
+ * with BDRV_O_UNMAP.
+ */
+    BDRV_REQ_MAY_UNMAP          = 0x4,
+
+    BDRV_REQ_FUA                = 0x10,
+    BDRV_REQ_WRITE_COMPRESSED   = 0x20,
+
+    /* Signifies that this write request will not change the visible disk
+     * content. */
+    BDRV_REQ_WRITE_UNCHANGED    = 0x40,
+
+    /* Forces request serialisation. Use only with write requests. */
+    BDRV_REQ_SERIALISING        = 0x80,
+
+    /* Execute the request only if the operation can be offloaded or otherwise
+     * be executed efficiently, but return an error instead of using a slow
+     * fallback. */
+    BDRV_REQ_NO_FALLBACK        = 0x100,
+
+    /*
+     * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read
+     * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR
+     * filter is involved), in which case it signals that the COR operation
+     * need not read the data into memory (qiov) but only ensure it is
+     * copied to the top layer (i.e., that the COR operation is done).
+     */
+    BDRV_REQ_PREFETCH           = 0x200,
+
+    /*
+     * If we need to wait for other requests, fail immediately instead. Used
+     * only together with BDRV_REQ_SERIALISING.
+     */
+    BDRV_REQ_NO_WAIT            = 0x400,
+
+    /* Mask of valid flags */
+    BDRV_REQ_MASK               = 0x7ff,
+} BdrvRequestFlags;
+
+typedef struct BlockSizes {
+    uint32_t phys;
+    uint32_t log;
+} BlockSizes;
+
+typedef struct HDGeometry {
+    uint32_t heads;
+    uint32_t sectors;
+    uint32_t cylinders;
+} HDGeometry;
+
+#define BDRV_O_NO_SHARE    0x0001 /* don't share permissions */
+#define BDRV_O_RDWR        0x0002
+#define BDRV_O_RESIZE      0x0004 /* request permission for resizing the node */
+#define BDRV_O_SNAPSHOT    0x0008 /* open the file read only and save writes in a snapshot */
+#define BDRV_O_TEMPORARY   0x0010 /* delete the file after use */
+#define BDRV_O_NOCACHE     0x0020 /* do not use the host page cache */
+#define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the thread pool */
+#define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
+#define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
+#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
+#define BDRV_O_INACTIVE    0x0800 /* consistency hint for migration handoff */
+#define BDRV_O_CHECK       0x1000 /* open solely for consistency check */
+#define BDRV_O_ALLOW_RDWR  0x2000 /* allow reopen to change from r/o to r/w */
+#define BDRV_O_UNMAP       0x4000 /* execute guest UNMAP/TRIM operations */
+#define BDRV_O_PROTOCOL    0x8000 /* if no block driver is explicitly given:
+                                     select an appropriate protocol driver,
+                                     ignoring the format layer */
+#define BDRV_O_NO_IO       0x10000 /* don't initialize for I/O */
+#define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening read-write fails */
+#define BDRV_O_IO_URING    0x40000 /* use io_uring instead of the thread pool */
+
+#define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
+
+
+/* Option names of options parsed by the block layer */
+
+#define BDRV_OPT_CACHE_WB       "cache.writeback"
+#define BDRV_OPT_CACHE_DIRECT   "cache.direct"
+#define BDRV_OPT_CACHE_NO_FLUSH "cache.no-flush"
+#define BDRV_OPT_READ_ONLY      "read-only"
+#define BDRV_OPT_AUTO_READ_ONLY "auto-read-only"
+#define BDRV_OPT_DISCARD        "discard"
+#define BDRV_OPT_FORCE_SHARE    "force-share"
+
+
+#define BDRV_SECTOR_BITS   9
+#define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
+
+#define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
+                                           INT_MAX >> BDRV_SECTOR_BITS)
+#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
+
+/*
+ * We want to allow aligning requests and disk lengths up to any 32-bit
+ * alignment without fear of overflow. To achieve this, while still using
+ * a nicely rounded number as the maximum disk size, define the maximum
+ * "length" (a limit for any offset/bytes request and for disk size) to be
+ * the greatest multiple of BDRV_MAX_ALIGNMENT below INT64_MAX.
+ */
+#define BDRV_MAX_ALIGNMENT (1L << 30)
+#define BDRV_MAX_LENGTH (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT))
+
+/*
+ * Allocation status flags for bdrv_block_status() and friends.
+ *
+ * Public flags:
+ * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer
+ * BDRV_BLOCK_ZERO: offset reads as zero
+ * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
+ * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
+ *                       layer rather than any backing, set by block layer
+ * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
+ *                 layer, set by block layer
+ *
+ * Internal flags:
+ * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
+ *                 that the block layer recompute the answer from the returned
+ *                 BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID.
+ * BDRV_BLOCK_RECURSE: request that the block layer recursively search for
+ *                     zeroes in the file child of the current block node
+ *                     inside the returned region. Only valid together with
+ *                     both BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID.
+ *                     Should not appear with BDRV_BLOCK_ZERO.
+ *
+ * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the
+ * host offset within the returned BDS that is allocated for the
+ * corresponding raw guest data. However, whether that offset
+ * actually contains data also depends on BDRV_BLOCK_DATA, as follows:
+ *
+ * DATA ZERO OFFSET_VALID
+ *  t    t        t       sectors read as zero, returned file is zero at offset
+ *  t    f        t       sectors read as valid from file at offset
+ *  f    t        t       sectors preallocated, read as zero, returned file not
+ *                        necessarily zero at offset
+ *  f    f        t       sectors preallocated but read from backing_hd,
+ *                        returned file contains garbage at offset
+ *  t    t        f       sectors preallocated, read as zero, unknown offset
+ *  t    f        f       sectors read from unknown file or offset
+ *  f    t        f       not allocated or unknown offset, read as zero
+ *  f    f        f       not allocated or unknown offset, read from backing_hd
+ */
+#define BDRV_BLOCK_DATA         0x01
+#define BDRV_BLOCK_ZERO         0x02
+#define BDRV_BLOCK_OFFSET_VALID 0x04
+#define BDRV_BLOCK_RAW          0x08
+#define BDRV_BLOCK_ALLOCATED    0x10
+#define BDRV_BLOCK_EOF          0x20
+#define BDRV_BLOCK_RECURSE      0x40
+
+typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
+
+typedef struct BDRVReopenState {
+    BlockDriverState *bs;
+    int flags;
+    BlockdevDetectZeroesOptions detect_zeroes;
+    bool backing_missing;
+    BlockDriverState *old_backing_bs; /* keep pointer for permissions update */
+    BlockDriverState *old_file_bs;    /* keep pointer for permissions update */
+    QDict *options;
+    QDict *explicit_options;
+    void *opaque;
+} BDRVReopenState;
+
+/*
+ * Block operation types
+ */
+typedef enum BlockOpType {
+    BLOCK_OP_TYPE_BACKUP_SOURCE,
+    BLOCK_OP_TYPE_BACKUP_TARGET,
+    BLOCK_OP_TYPE_CHANGE,
+    BLOCK_OP_TYPE_COMMIT_SOURCE,
+    BLOCK_OP_TYPE_COMMIT_TARGET,
+    BLOCK_OP_TYPE_DATAPLANE,
+    BLOCK_OP_TYPE_DRIVE_DEL,
+    BLOCK_OP_TYPE_EJECT,
+    BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT,
+    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT,
+    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
+    BLOCK_OP_TYPE_MIRROR_SOURCE,
+    BLOCK_OP_TYPE_MIRROR_TARGET,
+    BLOCK_OP_TYPE_RESIZE,
+    BLOCK_OP_TYPE_STREAM,
+    BLOCK_OP_TYPE_REPLACE,
+    BLOCK_OP_TYPE_MAX,
+} BlockOpType;
+
+/*
Block node permission constants */ +enum { + /** + * A user that has the "permission" of consistent reads is guaranteed that + * their view of the contents of the block device is complete and + * self-consistent, representing the contents of a disk at a specific + * point. + * + * For most block devices (including their backing files) this is true, but + * the property cannot be maintained in a few situations like for + * intermediate nodes of a commit block job. + */ + BLK_PERM_CONSISTENT_READ = 0x01, + + /** This permission is required to change the visible disk contents. */ + BLK_PERM_WRITE = 0x02, + + /** + * This permission (which is weaker than BLK_PERM_WRITE) is both enough and + * required for writes to the block node when the caller promises that + * the visible disk content doesn't change. + * + * As the BLK_PERM_WRITE permission is strictly stronger, either is + * sufficient to perform an unchanging write. + */ + BLK_PERM_WRITE_UNCHANGED = 0x04, + + /** This permission is required to change the size of a block node. */ + BLK_PERM_RESIZE = 0x08, + + /** + * This permission is required to change the node that this BdrvChild + * points to. + */ + BLK_PERM_GRAPH_MOD = 0x10, + + BLK_PERM_ALL = 0x1f, + + DEFAULT_PERM_PASSTHROUGH = BLK_PERM_CONSISTENT_READ + | BLK_PERM_WRITE + | BLK_PERM_WRITE_UNCHANGED + | BLK_PERM_RESIZE, + + DEFAULT_PERM_UNCHANGED = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH, +}; + +/* + * Flags that parent nodes assign to child nodes to specify what kind of + * role(s) they take. + * + * At least one of DATA, METADATA, FILTERED, or COW must be set for + * every child. + */ +enum BdrvChildRoleBits { + /* + * This child stores data. + * Any node may have an arbitrary number of such children. + */ + BDRV_CHILD_DATA = (1 << 0), + + /* + * This child stores metadata. + * Any node may have an arbitrary number of metadata-storing + * children. + */ + BDRV_CHILD_METADATA = (1 << 1), + + /* + * A child that always presents exactly the same visible data as + * the parent, e.g. by virtue of the parent forwarding all reads + * and writes. + * This flag is mutually exclusive with DATA, METADATA, and COW. + * Any node may have at most one filtered child at a time. + */ + BDRV_CHILD_FILTERED = (1 << 2), + + /* + * Child from which to read all data that isn't allocated in the + * parent (i.e., the backing child); such data is copied to the + * parent through COW (and optionally COR). + * This field is mutually exclusive with DATA, METADATA, and + * FILTERED. + * Any node may have at most one such backing child at a time. + */ + BDRV_CHILD_COW = (1 << 3), + + /* + * The primary child. For most drivers, this is the child whose + * filename applies best to the parent node. + * Any node may have at most one primary child at a time. 
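One illustrative combination of the permission and role bits above, as a parent might compute them when attaching a child (values are invented for illustration, not prescriptive; BDRV_CHILD_IMAGE is defined just below):

    BdrvChildRole role = BDRV_CHILD_IMAGE;   /* DATA | METADATA | PRIMARY */
    uint64_t perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
    uint64_t shared = BLK_PERM_ALL & ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);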
+ */ + BDRV_CHILD_PRIMARY = (1 << 4), + + /* Useful combination of flags */ + BDRV_CHILD_IMAGE = BDRV_CHILD_DATA + | BDRV_CHILD_METADATA + | BDRV_CHILD_PRIMARY, +}; + +/* Mask of BdrvChildRoleBits values */ +typedef unsigned int BdrvChildRole; + +char *bdrv_perm_names(uint64_t perm); +uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm); + +/* disk I/O throttling */ +void bdrv_init(void); +void bdrv_init_with_whitelist(void); +bool bdrv_uses_whitelist(void); +int bdrv_is_whitelisted(BlockDriver *drv, bool read_only); +BlockDriver *bdrv_find_protocol(const char *filename, + bool allow_protocol_prefix, + Error **errp); +BlockDriver *bdrv_find_format(const char *format_name); +int bdrv_create(BlockDriver *drv, const char* filename, + QemuOpts *opts, Error **errp); +int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp); + +BlockDriverState *bdrv_new(void); +int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, + Error **errp); +int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to, + Error **errp); +int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs, + Error **errp); +BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options, + int flags, Error **errp); +int bdrv_drop_filter(BlockDriverState *bs, Error **errp); + +int bdrv_parse_aio(const char *mode, int *flags); +int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough); +int bdrv_parse_discard_flags(const char *mode, int *flags); +BdrvChild *bdrv_open_child(const char *filename, + QDict *options, const char *bdref_key, + BlockDriverState* parent, + const BdrvChildClass *child_class, + BdrvChildRole child_role, + bool allow_none, Error **errp); +BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp); +int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, + Error **errp); +int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options, + const char *bdref_key, Error **errp); +BlockDriverState *bdrv_open(const char *filename, const char *reference, + QDict *options, int flags, Error **errp); +BlockDriverState *bdrv_new_open_driver_opts(BlockDriver *drv, + const char *node_name, + QDict *options, int flags, + Error **errp); +BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name, + int flags, Error **errp); +BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, + BlockDriverState *bs, QDict *options, + bool keep_old_opts); +void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue); +int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp); +int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts, + Error **errp); +int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, + Error **errp); +int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, + int64_t bytes, BdrvRequestFlags flags); +int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags); +int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int64_t bytes); +int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, + int64_t bytes); +int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, + const void *buf, int64_t bytes); +/* + * Efficiently zero a region of the disk image. Note that this is a regular + * I/O request like read or write and should have a reasonable size. This + * function is not suitable for zeroing the entire image in a single request + * because it may allocate memory for the entire region. 
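A hedged sketch of the zeroing helper documented above (declared just below), using the request flags from earlier in this header; zero_region() is an invented wrapper:

    /* zero one region, unmapping it if the driver can guarantee zero reads */
    int coroutine_fn zero_region(BdrvChild *child, int64_t offset,
                                 int64_t bytes)
    {
        return bdrv_co_pwrite_zeroes(child, offset, bytes,
                                     BDRV_REQ_MAY_UNMAP);
    }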
+ */ +int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, + int64_t bytes, BdrvRequestFlags flags); +BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, + const char *backing_file); +void bdrv_refresh_filename(BlockDriverState *bs); + +int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, + PreallocMode prealloc, BdrvRequestFlags flags, + Error **errp); +int generated_co_wrapper +bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, + PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); + +int64_t bdrv_nb_sectors(BlockDriverState *bs); +int64_t bdrv_getlength(BlockDriverState *bs); +int64_t bdrv_get_allocated_file_size(BlockDriverState *bs); +BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts, + BlockDriverState *in_bs, Error **errp); +void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr); +void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp); +int bdrv_commit(BlockDriverState *bs); +int bdrv_make_empty(BdrvChild *c, Error **errp); +int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file, + const char *backing_fmt, bool warn); +void bdrv_register(BlockDriver *bdrv); +int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base, + const char *backing_file_str); +BlockDriverState *bdrv_find_overlay(BlockDriverState *active, + BlockDriverState *bs); +BlockDriverState *bdrv_find_base(BlockDriverState *bs); +bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base, + Error **errp); +int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base, + Error **errp); +void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base); +int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp); +void coroutine_fn bdrv_co_delete_file_noerr(BlockDriverState *bs); + + +typedef struct BdrvCheckResult { + int corruptions; + int leaks; + int check_errors; + int corruptions_fixed; + int leaks_fixed; + int64_t image_end_offset; + BlockFragInfo bfi; +} BdrvCheckResult; + +typedef enum { + BDRV_FIX_LEAKS = 1, + BDRV_FIX_ERRORS = 2, +} BdrvCheckMode; + +int generated_co_wrapper bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix); + +/* The units of offset and total_work_size may be chosen arbitrarily by the + * block driver; total_work_size may change during the course of the amendment + * operation */ +typedef void BlockDriverAmendStatusCB(BlockDriverState *bs, int64_t offset, + int64_t total_work_size, void *opaque); +int bdrv_amend_options(BlockDriverState *bs_new, QemuOpts *opts, + BlockDriverAmendStatusCB *status_cb, void *cb_opaque, + bool force, + Error **errp); + +/* check if a named node can be replaced when doing drive-mirror */ +BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, + const char *node_name, Error **errp); + +/* async block I/O */ +void bdrv_aio_cancel(BlockAIOCB *acb); +void bdrv_aio_cancel_async(BlockAIOCB *acb); + +/* sg packet commands */ +int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); + +/* Invalidate any cached metadata used by image formats */ +int generated_co_wrapper bdrv_invalidate_cache(BlockDriverState *bs, + Error **errp); +void bdrv_invalidate_cache_all(Error **errp); +int bdrv_inactivate_all(void); + +/* Ensure contents are flushed to disk. 
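+ *
+ * For example, a caller that must guarantee durability of a just-written
+ * region might pair the write with an explicit flush (a sketch):
+ *
+ *     ret = bdrv_pwrite(child, offset, buf, bytes);
+ *     if (ret >= 0) {
+ *         ret = bdrv_flush(child->bs);
+ *     }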
*/ +int generated_co_wrapper bdrv_flush(BlockDriverState *bs); +int coroutine_fn bdrv_co_flush(BlockDriverState *bs); +int bdrv_flush_all(void); +void bdrv_close_all(void); +void bdrv_drain(BlockDriverState *bs); +void coroutine_fn bdrv_co_drain(BlockDriverState *bs); +void bdrv_drain_all_begin(void); +void bdrv_drain_all_end(void); +void bdrv_drain_all(void); + +#define BDRV_POLL_WHILE(bs, cond) ({ \ + BlockDriverState *bs_ = (bs); \ + AIO_WAIT_WHILE(bdrv_get_aio_context(bs_), \ + cond); }) + +int generated_co_wrapper bdrv_pdiscard(BdrvChild *child, int64_t offset, + int64_t bytes); +int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); +int bdrv_has_zero_init_1(BlockDriverState *bs); +int bdrv_has_zero_init(BlockDriverState *bs); +bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); +int bdrv_block_status(BlockDriverState *bs, int64_t offset, + int64_t bytes, int64_t *pnum, int64_t *map, + BlockDriverState **file); +int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, + int64_t offset, int64_t bytes, int64_t *pnum, + int64_t *map, BlockDriverState **file); +int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes, + int64_t *pnum); +int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, + bool include_base, int64_t offset, int64_t bytes, + int64_t *pnum); +int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, + int64_t bytes); + +bool bdrv_is_read_only(BlockDriverState *bs); +int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only, + bool ignore_allow_rdw, Error **errp); +int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg, + Error **errp); +bool bdrv_is_writable(BlockDriverState *bs); +bool bdrv_is_sg(BlockDriverState *bs); +bool bdrv_is_inserted(BlockDriverState *bs); +void bdrv_lock_medium(BlockDriverState *bs, bool locked); +void bdrv_eject(BlockDriverState *bs, bool eject_flag); +const char *bdrv_get_format_name(BlockDriverState *bs); +BlockDriverState *bdrv_find_node(const char *node_name); +BlockDeviceInfoList *bdrv_named_nodes_list(bool flat, Error **errp); +XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp); +BlockDriverState *bdrv_lookup_bs(const char *device, + const char *node_name, + Error **errp); +bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base); +BlockDriverState *bdrv_next_node(BlockDriverState *bs); +BlockDriverState *bdrv_next_all_states(BlockDriverState *bs); + +typedef struct BdrvNextIterator { + enum { + BDRV_NEXT_BACKEND_ROOTS, + BDRV_NEXT_MONITOR_OWNED, + } phase; + BlockBackend *blk; + BlockDriverState *bs; +} BdrvNextIterator; + +BlockDriverState *bdrv_first(BdrvNextIterator *it); +BlockDriverState *bdrv_next(BdrvNextIterator *it); +void bdrv_next_cleanup(BdrvNextIterator *it); + +BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs); +bool bdrv_supports_compressed_writes(BlockDriverState *bs); +void bdrv_iterate_format(void (*it)(void *opaque, const char *name), + void *opaque, bool read_only); +const char *bdrv_get_node_name(const BlockDriverState *bs); +const char *bdrv_get_device_name(const BlockDriverState *bs); +const char *bdrv_get_device_or_node_name(const BlockDriverState *bs); +int bdrv_get_flags(BlockDriverState *bs); +int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi); +ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs, + Error **errp); +BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs); +void bdrv_round_to_clusters(BlockDriverState *bs, + 
int64_t offset, int64_t bytes, + int64_t *cluster_offset, + int64_t *cluster_bytes); + +void bdrv_get_backing_filename(BlockDriverState *bs, + char *filename, int filename_size); +char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp); +char *bdrv_get_full_backing_filename_from_filename(const char *backed, + const char *backing, + Error **errp); +char *bdrv_dirname(BlockDriverState *bs, Error **errp); + +int path_has_protocol(const char *path); +int path_is_absolute(const char *path); +char *path_combine(const char *base_path, const char *filename); + +int generated_co_wrapper +bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); +int generated_co_wrapper +bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size); + +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size); + +void bdrv_img_create(const char *filename, const char *fmt, + const char *base_filename, const char *base_fmt, + char *options, uint64_t img_size, int flags, + bool quiet, Error **errp); + +/* Returns the alignment in bytes that is required so that no bounce buffer + * is required throughout the stack */ +size_t bdrv_min_mem_align(BlockDriverState *bs); +/* Returns optimal alignment in bytes for bounce buffer */ +size_t bdrv_opt_mem_align(BlockDriverState *bs); +void *qemu_blockalign(BlockDriverState *bs, size_t size); +void *qemu_blockalign0(BlockDriverState *bs, size_t size); +void *qemu_try_blockalign(BlockDriverState *bs, size_t size); +void *qemu_try_blockalign0(BlockDriverState *bs, size_t size); +bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); + +void bdrv_enable_copy_on_read(BlockDriverState *bs); +void bdrv_disable_copy_on_read(BlockDriverState *bs); + +void bdrv_ref(BlockDriverState *bs); +void bdrv_unref(BlockDriverState *bs); +void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child); +BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, + BlockDriverState *child_bs, + const char *child_name, + const BdrvChildClass *child_class, + BdrvChildRole child_role, + Error **errp); + +bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp); +void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason); +void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason); +void bdrv_op_block_all(BlockDriverState *bs, Error *reason); +void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason); +bool bdrv_op_blocker_is_empty(BlockDriverState *bs); + +#define BLKDBG_EVENT(child, evt) \ + do { \ + if (child) { \ + bdrv_debug_event(child->bs, evt); \ + } \ + } while (0) + +void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event); + +int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, + const char *tag); +int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag); +int bdrv_debug_resume(BlockDriverState *bs, const char *tag); +bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag); + +/** + * bdrv_get_aio_context: + * + * Returns: the currently bound #AioContext + */ +AioContext *bdrv_get_aio_context(BlockDriverState *bs); + +/** + * Move the current coroutine to the AioContext of @bs and return the old + * AioContext of the coroutine. Increase bs->in_flight so that draining @bs + * will wait for the operation to proceed until the corresponding + * bdrv_co_leave(). 
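+ *
+ * A typical pairing looks like this (a sketch):
+ *
+ *     old_ctx = bdrv_co_enter(bs);
+ *     ... issue requests against bs ...
+ *     bdrv_co_leave(bs, old_ctx);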
+ * + * Consequently, you can't call drain inside a bdrv_co_enter/leave() section as + * this will deadlock. + */ +AioContext *coroutine_fn bdrv_co_enter(BlockDriverState *bs); + +/** + * Ends a section started by bdrv_co_enter(). Move the current coroutine back + * to old_ctx and decrease bs->in_flight again. + */ +void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx); + +/** + * Locks the AioContext of @bs if it's not the current AioContext. This avoids + * double locking which could lead to deadlocks: This is a coroutine_fn, so we + * know we already own the lock of the current AioContext. + * + * May only be called in the main thread. + */ +void coroutine_fn bdrv_co_lock(BlockDriverState *bs); + +/** + * Unlocks the AioContext of @bs if it's not the current AioContext. + */ +void coroutine_fn bdrv_co_unlock(BlockDriverState *bs); + +/** + * Transfer control to @co in the aio context of @bs + */ +void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co); + +void bdrv_set_aio_context_ignore(BlockDriverState *bs, + AioContext *new_context, GSList **ignore); +int bdrv_try_set_aio_context(BlockDriverState *bs, AioContext *ctx, + Error **errp); +int bdrv_child_try_set_aio_context(BlockDriverState *bs, AioContext *ctx, + BdrvChild *ignore_child, Error **errp); +bool bdrv_child_can_set_aio_context(BdrvChild *c, AioContext *ctx, + GSList **ignore, Error **errp); +bool bdrv_can_set_aio_context(BlockDriverState *bs, AioContext *ctx, + GSList **ignore, Error **errp); +AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c); +AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c); + +int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz); +int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo); + +void bdrv_io_plug(BlockDriverState *bs); +void bdrv_io_unplug(BlockDriverState *bs); + +/** + * bdrv_parent_drained_begin_single: + * + * Begin a quiesced section for the parent of @c. If @poll is true, wait for + * any pending activity to cease. + */ +void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll); + +/** + * bdrv_parent_drained_end_single: + * + * End a quiesced section for the parent of @c. + * + * This polls @bs's AioContext until all scheduled sub-drained_ends + * have settled, which may result in graph changes. + */ +void bdrv_parent_drained_end_single(BdrvChild *c); + +/** + * bdrv_drain_poll: + * + * Poll for pending requests in @bs, its parents (except for @ignore_parent), + * and if @recursive is true its children as well (used for subtree drain). + * + * If @ignore_bds_parents is true, parents that are BlockDriverStates must + * ignore the drain request because they will be drained separately (used for + * drain_all). + * + * This is part of bdrv_drained_begin. + */ +bool bdrv_drain_poll(BlockDriverState *bs, bool recursive, + BdrvChild *ignore_parent, bool ignore_bds_parents); + +/** + * bdrv_drained_begin: + * + * Begin a quiesced section for exclusive access to the BDS, by disabling + * external request sources including NBD server, block jobs, and device model. + * + * This function can be recursive. + */ +void bdrv_drained_begin(BlockDriverState *bs); + +/** + * bdrv_do_drained_begin_quiesce: + * + * Quiesces a BDS like bdrv_drained_begin(), but does not wait for already + * running requests to complete. 
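+ *
+ * For most callers, the fully polling variants are used as a pair around
+ * the code that needs exclusive access (a sketch):
+ *
+ *     bdrv_drained_begin(bs);
+ *     ... modify the graph or otherwise rely on no new I/O arriving ...
+ *     bdrv_drained_end(bs);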
+ */ +void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, + BdrvChild *parent, bool ignore_bds_parents); + +/** + * Like bdrv_drained_begin, but recursively begins a quiesced section for + * exclusive access to all child nodes as well. + */ +void bdrv_subtree_drained_begin(BlockDriverState *bs); + +/** + * bdrv_drained_end: + * + * End a quiescent section started by bdrv_drained_begin(). + * + * This polls @bs's AioContext until all scheduled sub-drained_ends + * have settled. On one hand, that may result in graph changes. On + * the other, this requires that the caller either runs in the main + * loop; or that all involved nodes (@bs and all of its parents) are + * in the caller's AioContext. + */ +void bdrv_drained_end(BlockDriverState *bs); + +/** + * bdrv_drained_end_no_poll: + * + * Same as bdrv_drained_end(), but do not poll for the subgraph to + * actually become unquiesced. Therefore, no graph changes will occur + * with this function. + * + * *drained_end_counter is incremented for every background operation + * that is scheduled, and will be decremented for every operation once + * it settles. The caller must poll until it reaches 0. The counter + * should be accessed using atomic operations only. + */ +void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter); + +/** + * End a quiescent section started by bdrv_subtree_drained_begin(). + */ +void bdrv_subtree_drained_end(BlockDriverState *bs); + +void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child, + Error **errp); +void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp); + +bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, + uint32_t granularity, Error **errp); +/** + * + * bdrv_register_buf/bdrv_unregister_buf: + * + * Register/unregister a buffer for I/O. For example, VFIO drivers are + * interested to know the memory areas that would later be used for I/O, so + * that they can prepare IOMMU mapping etc., to get better performance. + */ +void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size); +void bdrv_unregister_buf(BlockDriverState *bs, void *host); + +/** + * + * bdrv_co_copy_range: + * + * Do offloaded copy between two children. If the operation is not implemented + * by the driver, or if the backend storage doesn't support it, a negative + * error code will be returned. + * + * Note: block layer doesn't emulate or fallback to a bounce buffer approach + * because usually the caller shouldn't attempt offloaded copy any more (e.g. + * calling copy_file_range(2)) after the first error, thus it should fall back + * to a read+write path in the caller level. + * + * @src: Source child to copy data from + * @src_offset: offset in @src image to read data + * @dst: Destination child to copy data to + * @dst_offset: offset in @dst image to write data + * @bytes: number of bytes to copy + * @flags: request flags. Supported flags: + * BDRV_REQ_ZERO_WRITE - treat the @src range as zero data and do zero + * write on @dst as if bdrv_co_pwrite_zeroes is + * called. Used to simplify caller code, or + * during BlockDriver.bdrv_co_copy_range_from() + * recursion. + * BDRV_REQ_NO_SERIALISING - do not serialize with other overlapping + * requests currently in flight. + * + * Returns: 0 if succeeded; negative error code if failed. 
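+ *
+ * A caller implementing the read+write fallback described above might
+ * look like this (a sketch; buffer allocation omitted, and src_off,
+ * dst_off and buf are hypothetical names):
+ *
+ *     ret = bdrv_co_copy_range(src, src_off, dst, dst_off, bytes, 0, 0);
+ *     if (ret < 0) {
+ *         ret = bdrv_co_pread(src, src_off, bytes, buf, 0);
+ *         if (ret >= 0) {
+ *             ret = bdrv_co_pwrite(dst, dst_off, bytes, buf, 0);
+ *         }
+ *     }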
+ **/ +int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); + +void bdrv_cancel_in_flight(BlockDriverState *bs); + +#endif diff --git a/include/block/block_backup.h b/include/block/block_backup.h new file mode 100644 index 000000000..157596c29 --- /dev/null +++ b/include/block/block_backup.h @@ -0,0 +1,25 @@ +/* + * QEMU backup + * + * Copyright (c) 2013 Proxmox Server Solutions + * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. + * Copyright (c) 2016 Intel Corporation + * Copyright (c) 2016 FUJITSU LIMITED + * + * Authors: + * Dietmar Maurer <dietmar@proxmox.com> + * Changlong Xie <xiecl.fnst@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef BLOCK_BACKUP_H +#define BLOCK_BACKUP_H + +#include "block/block_int.h" + +void backup_do_checkpoint(BlockJob *job, Error **errp); + +#endif diff --git a/include/block/block_int.h b/include/block/block_int.h new file mode 100644 index 000000000..f4c75e8ba --- /dev/null +++ b/include/block/block_int.h @@ -0,0 +1,1504 @@ +/* + * QEMU System Emulator block driver + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+#ifndef BLOCK_INT_H
+#define BLOCK_INT_H
+
+#include "block/accounting.h"
+#include "block/block.h"
+#include "block/aio-wait.h"
+#include "qemu/queue.h"
+#include "qemu/coroutine.h"
+#include "qemu/stats64.h"
+#include "qemu/timer.h"
+#include "qemu/hbitmap.h"
+#include "block/snapshot.h"
+#include "qemu/throttle.h"
+#include "qemu/rcu.h"
+
+#define BLOCK_FLAG_LAZY_REFCOUNTS 8
+
+#define BLOCK_OPT_SIZE "size"
+#define BLOCK_OPT_ENCRYPT "encryption"
+#define BLOCK_OPT_ENCRYPT_FORMAT "encrypt.format"
+#define BLOCK_OPT_COMPAT6 "compat6"
+#define BLOCK_OPT_HWVERSION "hwversion"
+#define BLOCK_OPT_BACKING_FILE "backing_file"
+#define BLOCK_OPT_BACKING_FMT "backing_fmt"
+#define BLOCK_OPT_CLUSTER_SIZE "cluster_size"
+#define BLOCK_OPT_TABLE_SIZE "table_size"
+#define BLOCK_OPT_PREALLOC "preallocation"
+#define BLOCK_OPT_SUBFMT "subformat"
+#define BLOCK_OPT_COMPAT_LEVEL "compat"
+#define BLOCK_OPT_LAZY_REFCOUNTS "lazy_refcounts"
+#define BLOCK_OPT_ADAPTER_TYPE "adapter_type"
+#define BLOCK_OPT_REDUNDANCY "redundancy"
+#define BLOCK_OPT_NOCOW "nocow"
+#define BLOCK_OPT_EXTENT_SIZE_HINT "extent_size_hint"
+#define BLOCK_OPT_OBJECT_SIZE "object_size"
+#define BLOCK_OPT_REFCOUNT_BITS "refcount_bits"
+#define BLOCK_OPT_DATA_FILE "data_file"
+#define BLOCK_OPT_DATA_FILE_RAW "data_file_raw"
+#define BLOCK_OPT_COMPRESSION_TYPE "compression_type"
+#define BLOCK_OPT_EXTL2 "extended_l2"
+
+#define BLOCK_PROBE_BUF_SIZE 512
+
+enum BdrvTrackedRequestType {
+    BDRV_TRACKED_READ,
+    BDRV_TRACKED_WRITE,
+    BDRV_TRACKED_DISCARD,
+    BDRV_TRACKED_TRUNCATE,
+};
+
+/*
+ * It is unfortunate that the BdrvTrackedRequest structure is public,
+ * as block/io.c is very careful about incoming offset/bytes being
+ * correct. Be sure to assert that bdrv_check_request() succeeds after
+ * any modification of a BdrvTrackedRequest object outside of block/io.c.
+ */
+typedef struct BdrvTrackedRequest {
+    BlockDriverState *bs;
+    int64_t offset;
+    int64_t bytes;
+    enum BdrvTrackedRequestType type;
+
+    bool serialising;
+    int64_t overlap_offset;
+    int64_t overlap_bytes;
+
+    QLIST_ENTRY(BdrvTrackedRequest) list;
+    Coroutine *co; /* owner, used for deadlock detection */
+    CoQueue wait_queue; /* coroutines blocked on this request */
+
+    struct BdrvTrackedRequest *waiting_for;
+} BdrvTrackedRequest;
+
+int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
+                            QEMUIOVector *qiov, size_t qiov_offset,
+                            Error **errp);
+int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp);
+
+struct BlockDriver {
+    const char *format_name;
+    int instance_size;
+
+    /* Set to true if the BlockDriver is a block filter. Block filters pass
+     * certain callbacks that refer to data (see block.c) to their bs->file
+     * or bs->backing (whichever one exists) if the driver doesn't implement
+     * them. Drivers that do not wish to forward must implement them and return
+     * -ENOTSUP.
+     * Note that filters are not allowed to modify data.
+     *
+     * Filters generally cannot have more than a single filtered child,
+     * because the data they present must at all times be the same as
+     * that on their filtered child. That would be impossible to
+     * achieve for multiple filtered children.
+     * (And this filtered child must then be bs->file or bs->backing.)
+     */
+    bool is_filter;
+    /*
+     * Set to true if the BlockDriver is a format driver. Format nodes
+     * generally do not expect their children to be other format nodes
+     * (except for backing files), and so format probing is disabled
+     * on those children.
+     */
+    bool is_format;
+    /*
+     * Return true if @to_replace can be replaced by a BDS with the
+     * same data as @bs without it affecting @bs's behavior (that is,
+     * without it being visible to @bs's parents).
+     */
+    bool (*bdrv_recurse_can_replace)(BlockDriverState *bs,
+                                     BlockDriverState *to_replace);
+
+    int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
+    int (*bdrv_probe_device)(const char *filename);
+
+    /* Any driver implementing this callback is expected to be able to handle
+     * NULL file names in its .bdrv_open() implementation */
+    void (*bdrv_parse_filename)(const char *filename, QDict *options, Error **errp);
+    /* Drivers implementing neither bdrv_parse_filename nor bdrv_open should
+     * have this field set to true, except ones that are defined only by
+     * their child's bs.
+     * An example of the latter type is the quorum block driver.
+     */
+    bool bdrv_needs_filename;
+
+    /*
+     * Set if a driver can support backing files. This also implies the
+     * following semantics:
+     *
+     * - A return status of 0 from .bdrv_co_block_status means that the
+     *   corresponding blocks are not allocated in this layer of the
+     *   backing chain
+     * - For such (unallocated) blocks, read will:
+     *   - fill buffer with zeros if there is no backing file
+     *   - read from the backing file otherwise, where the block layer
+     *     takes care of reading zeros beyond EOF if backing file is short
+     */
+    bool supports_backing;
+
+    /* For handling image reopen for split or non-split files */
+    int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state,
+                               BlockReopenQueue *queue, Error **errp);
+    void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state);
+    void (*bdrv_reopen_commit_post)(BDRVReopenState *reopen_state);
+    void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state);
+    void (*bdrv_join_options)(QDict *options, QDict *old_options);
+
+    int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags,
+                     Error **errp);
+
+    /* Protocol drivers should implement this instead of bdrv_open */
+    int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags,
+                          Error **errp);
+    void (*bdrv_close)(BlockDriverState *bs);
+
+
+    int coroutine_fn (*bdrv_co_create)(BlockdevCreateOptions *opts,
+                                       Error **errp);
+    int coroutine_fn (*bdrv_co_create_opts)(BlockDriver *drv,
+                                            const char *filename,
+                                            QemuOpts *opts,
+                                            Error **errp);
+
+    int coroutine_fn (*bdrv_co_amend)(BlockDriverState *bs,
+                                      BlockdevAmendOptions *opts,
+                                      bool force,
+                                      Error **errp);
+
+    int (*bdrv_amend_options)(BlockDriverState *bs,
+                              QemuOpts *opts,
+                              BlockDriverAmendStatusCB *status_cb,
+                              void *cb_opaque,
+                              bool force,
+                              Error **errp);
+
+    int (*bdrv_make_empty)(BlockDriverState *bs);
+
+    /*
+     * Refreshes the bs->exact_filename field. If that is impossible,
+     * bs->exact_filename has to be left empty.
+     */
+    void (*bdrv_refresh_filename)(BlockDriverState *bs);
+
+    /*
+     * Gathers the open options for all children into @target.
+     * A simple format driver (without backing file support) might
+     * implement this function like this:
+     *
+     *     QINCREF(bs->file->bs->full_open_options);
+     *     qdict_put(target, "file", bs->file->bs->full_open_options);
+     *
+     * If not specified, the generic implementation will simply put
+     * all children's options under their respective name.
+     *
+     * @backing_overridden is true when bs->backing seems not to be
+     * the child that would result from opening bs->backing_file.
+     * Therefore, if it is true, the backing child's options should be
+     * gathered; otherwise, there is no need since the backing child
+     * is the one implied by the image header.
+     *
+     * Note that ideally this function would not be needed. Every
+     * block driver which implements it is probably doing something
+     * shady regarding its runtime option structure.
+     */
+    void (*bdrv_gather_child_options)(BlockDriverState *bs, QDict *target,
+                                      bool backing_overridden);
+
+    /*
+     * Returns an allocated string which is the directory name of this BDS;
+     * it will be used to make relative filenames absolute by prepending
+     * this function's return value to them.
+     */
+    char *(*bdrv_dirname)(BlockDriverState *bs, Error **errp);
+
+    /* aio */
+    BlockAIOCB *(*bdrv_aio_preadv)(BlockDriverState *bs,
+        int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+        BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque);
+    BlockAIOCB *(*bdrv_aio_pwritev)(BlockDriverState *bs,
+        int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+        BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque);
+    BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
+        BlockCompletionFunc *cb, void *opaque);
+    BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs,
+        int64_t offset, int bytes,
+        BlockCompletionFunc *cb, void *opaque);
+
+    int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+
+    /**
+     * @offset: position in bytes to read at
+     * @bytes: number of bytes to read
+     * @qiov: the buffers to fill with read data
+     * @flags: currently unused, always 0
+     *
+     * @offset and @bytes will be a multiple of 'request_alignment',
+     * but the length of individual @qiov elements does not have to
+     * be a multiple.
+     *
+     * @bytes will always equal the total size of @qiov, and will be
+     * no larger than 'max_transfer'.
+     *
+     * The buffer in @qiov may point directly to guest memory.
+     */
+    int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs,
+        int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+        BdrvRequestFlags flags);
+    int coroutine_fn (*bdrv_co_preadv_part)(BlockDriverState *bs,
+        int64_t offset, int64_t bytes,
+        QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags);
+    int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags);
+    /**
+     * @offset: position in bytes to write at
+     * @bytes: number of bytes to write
+     * @qiov: the buffers containing data to write
+     * @flags: zero or more bits allowed by 'supported_write_flags'
+     *
+     * @offset and @bytes will be a multiple of 'request_alignment',
+     * but the length of individual @qiov elements does not have to
+     * be a multiple.
+     *
+     * @bytes will always equal the total size of @qiov, and will be
+     * no larger than 'max_transfer'.
+     *
+     * The buffer in @qiov may point directly to guest memory.
+     */
+    int coroutine_fn (*bdrv_co_pwritev)(BlockDriverState *bs,
+        int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+        BdrvRequestFlags flags);
+    int coroutine_fn (*bdrv_co_pwritev_part)(BlockDriverState *bs,
+        int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset,
+        BdrvRequestFlags flags);
+
+    /*
+     * Efficiently zero a region of the disk image. Typically an image format
+     * would use a compact metadata representation to implement this. This
+     * function pointer may be NULL or return -ENOTSUP and .bdrv_co_writev()
+     * will be called instead.
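+     *
+     * A minimal implementation might look like this (a sketch; the my_*
+     * names are hypothetical). Returning -ENOTSUP makes the block layer
+     * fall back to explicit writes through .bdrv_co_writev():
+     *
+     *     static int coroutine_fn my_co_pwrite_zeroes(BlockDriverState *bs,
+     *         int64_t offset, int64_t bytes, BdrvRequestFlags flags)
+     *     {
+     *         BDRVMyState *s = bs->opaque;
+     *
+     *         if (!my_can_zero_efficiently(s, offset, bytes)) {
+     *             return -ENOTSUP;
+     *         }
+     *         return my_mark_zeroed(s, offset, bytes);
+     *     }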
+ */ + int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs, + int64_t offset, int64_t bytes, BdrvRequestFlags flags); + int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs, + int64_t offset, int64_t bytes); + + /* Map [offset, offset + nbytes) range onto a child of @bs to copy from, + * and invoke bdrv_co_copy_range_from(child, ...), or invoke + * bdrv_co_copy_range_to() if @bs is the leaf child to copy data from. + * + * See the comment of bdrv_co_copy_range for the parameter and return value + * semantics. + */ + int coroutine_fn (*bdrv_co_copy_range_from)(BlockDriverState *bs, + BdrvChild *src, + int64_t offset, + BdrvChild *dst, + int64_t dst_offset, + int64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); + + /* Map [offset, offset + nbytes) range onto a child of bs to copy data to, + * and invoke bdrv_co_copy_range_to(child, src, ...), or perform the copy + * operation if @bs is the leaf and @src has the same BlockDriver. Return + * -ENOTSUP if @bs is the leaf but @src has a different BlockDriver. + * + * See the comment of bdrv_co_copy_range for the parameter and return value + * semantics. + */ + int coroutine_fn (*bdrv_co_copy_range_to)(BlockDriverState *bs, + BdrvChild *src, + int64_t src_offset, + BdrvChild *dst, + int64_t dst_offset, + int64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); + + /* + * Building block for bdrv_block_status[_above] and + * bdrv_is_allocated[_above]. The driver should answer only + * according to the current layer, and should only need to set + * BDRV_BLOCK_DATA, BDRV_BLOCK_ZERO, BDRV_BLOCK_OFFSET_VALID, + * and/or BDRV_BLOCK_RAW; if the current layer defers to a backing + * layer, the result should be 0 (and not BDRV_BLOCK_ZERO). See + * block.h for the overall meaning of the bits. As a hint, the + * flag want_zero is true if the caller cares more about precise + * mappings (favor accurate _OFFSET_VALID/_ZERO) or false for + * overall allocation (favor larger *pnum, perhaps by reporting + * _DATA instead of _ZERO). The block layer guarantees input + * clamped to bdrv_getlength() and aligned to request_alignment, + * as well as non-NULL pnum, map, and file; in turn, the driver + * must return an error or set pnum to an aligned non-zero value. + * + * Note that @bytes is just a hint on how big of a region the + * caller wants to inspect. It is not a limit on *pnum. + * Implementations are free to return larger values of *pnum if + * doing so does not incur a performance penalty. + * + * block/io.c's bdrv_co_block_status() will utilize an unclamped + * *pnum value for the block-status cache on protocol nodes, prior + * to clamping *pnum for return to its caller. + */ + int coroutine_fn (*bdrv_co_block_status)(BlockDriverState *bs, + bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum, + int64_t *map, BlockDriverState **file); + + /* + * This informs the driver that we are no longer interested in the result + * of in-flight requests, so don't waste the time if possible. + * + * One example usage is to avoid waiting for an nbd target node reconnect + * timeout during job-cancel with force=true. + */ + void (*bdrv_cancel_in_flight)(BlockDriverState *bs); + + /* + * Invalidate any cached meta-data. + */ + void coroutine_fn (*bdrv_co_invalidate_cache)(BlockDriverState *bs, + Error **errp); + int (*bdrv_inactivate)(BlockDriverState *bs); + + /* + * Flushes all data for all layers by calling bdrv_co_flush for underlying + * layers, if needed. 
This function is needed for deterministic + * synchronization of the flush finishing callback. + */ + int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs); + + /* Delete a created file. */ + int coroutine_fn (*bdrv_co_delete_file)(BlockDriverState *bs, + Error **errp); + + /* + * Flushes all data that was already written to the OS all the way down to + * the disk (for example file-posix.c calls fsync()). + */ + int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs); + + /* + * Flushes all internal caches to the OS. The data may still sit in a + * writeback cache of the host OS, but it will survive a crash of the qemu + * process. + */ + int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs); + + /* + * Drivers setting this field must be able to work with just a plain + * filename with '<protocol_name>:' as a prefix, and no other options. + * Options may be extracted from the filename by implementing + * bdrv_parse_filename. + */ + const char *protocol_name; + + /* + * Truncate @bs to @offset bytes using the given @prealloc mode + * when growing. Modes other than PREALLOC_MODE_OFF should be + * rejected when shrinking @bs. + * + * If @exact is true, @bs must be resized to exactly @offset. + * Otherwise, it is sufficient for @bs (if it is a host block + * device and thus there is no way to resize it) to be at least + * @offset bytes in length. + * + * If @exact is true and this function fails but would succeed + * with @exact = false, it should return -ENOTSUP. + */ + int coroutine_fn (*bdrv_co_truncate)(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, + BdrvRequestFlags flags, Error **errp); + + int64_t (*bdrv_getlength)(BlockDriverState *bs); + bool has_variable_length; + int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs); + BlockMeasureInfo *(*bdrv_measure)(QemuOpts *opts, BlockDriverState *in_bs, + Error **errp); + + int coroutine_fn (*bdrv_co_pwritev_compressed)(BlockDriverState *bs, + int64_t offset, int64_t bytes, QEMUIOVector *qiov); + int coroutine_fn (*bdrv_co_pwritev_compressed_part)(BlockDriverState *bs, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset); + + int (*bdrv_snapshot_create)(BlockDriverState *bs, + QEMUSnapshotInfo *sn_info); + int (*bdrv_snapshot_goto)(BlockDriverState *bs, + const char *snapshot_id); + int (*bdrv_snapshot_delete)(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp); + int (*bdrv_snapshot_list)(BlockDriverState *bs, + QEMUSnapshotInfo **psn_info); + int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp); + int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi); + ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs, + Error **errp); + BlockStatsSpecific *(*bdrv_get_specific_stats)(BlockDriverState *bs); + + int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs, + QEMUIOVector *qiov, + int64_t pos); + int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs, + QEMUIOVector *qiov, + int64_t pos); + + int (*bdrv_change_backing_file)(BlockDriverState *bs, + const char *backing_file, const char *backing_fmt); + + /* removable device specific */ + bool (*bdrv_is_inserted)(BlockDriverState *bs); + void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); + void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked); + + /* to control generic scsi devices */ + BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs, + unsigned long int req, void *buf, + 
BlockCompletionFunc *cb, void *opaque);
+    int coroutine_fn (*bdrv_co_ioctl)(BlockDriverState *bs,
+                                      unsigned long int req, void *buf);
+
+    /* List of options for creating images, terminated by name == NULL */
+    QemuOptsList *create_opts;
+
+    /* List of options for image amend */
+    QemuOptsList *amend_opts;
+
+    /*
+     * If this driver supports reopening images this contains a
+     * NULL-terminated list of the runtime options that can be
+     * modified. If an option in this list is unspecified during
+     * reopen then it _must_ be reset to its default value or return
+     * an error.
+     */
+    const char *const *mutable_opts;
+
+    /*
+     * Returns 0 for completed check, -errno for internal errors.
+     * The check results are stored in @result.
+     */
+    int coroutine_fn (*bdrv_co_check)(BlockDriverState *bs,
+                                      BdrvCheckResult *result,
+                                      BdrvCheckMode fix);
+
+    void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event);
+
+    /* TODO Better to pass an option string/QDict/QemuOpts to add any rule? */
+    int (*bdrv_debug_breakpoint)(BlockDriverState *bs, const char *event,
+                                 const char *tag);
+    int (*bdrv_debug_remove_breakpoint)(BlockDriverState *bs,
+                                        const char *tag);
+    int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
+    bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);
+
+    void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp);
+
+    /*
+     * Returns 1 if newly created images are guaranteed to contain only
+     * zeros, 0 otherwise.
+     */
+    int (*bdrv_has_zero_init)(BlockDriverState *bs);
+
+    /* Remove fd handlers, timers, and other event loop callbacks so the event
+     * loop is no longer in use. Called with no in-flight requests and in
+     * depth-first traversal order with parents before child nodes.
+     */
+    void (*bdrv_detach_aio_context)(BlockDriverState *bs);
+
+    /* Add fd handlers, timers, and other event loop callbacks so I/O requests
+     * can be processed again. Called with no in-flight requests and in
+     * depth-first traversal order with child nodes before parent nodes.
+     */
+    void (*bdrv_attach_aio_context)(BlockDriverState *bs,
+                                    AioContext *new_context);
+
+    /* io queue for linux-aio */
+    void (*bdrv_io_plug)(BlockDriverState *bs);
+    void (*bdrv_io_unplug)(BlockDriverState *bs);
+
+    /**
+     * Try to get @bs's logical and physical block size.
+     * On success, store them in @bsz and return zero.
+     * On failure, return negative errno.
+     */
+    int (*bdrv_probe_blocksizes)(BlockDriverState *bs, BlockSizes *bsz);
+    /**
+     * Try to get @bs's geometry (cyls, heads, sectors).
+     * On success, store them in @geo and return 0.
+     * On failure, return -errno.
+     * Only drivers that want to override guest geometry implement this
+     * callback; see hd_geometry_guess().
+     */
+    int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);
+
+    /**
+     * bdrv_co_drain_begin is called, if implemented, at the beginning of a
+     * drain operation to drain and stop any internal sources of requests in
+     * the driver.
+     * bdrv_co_drain_end is called, if implemented, at the end of the drain.
+     *
+     * They should be used by the driver to e.g. manage scheduled I/O
+     * requests, or toggle an internal state. After the end of the drain new
+     * requests will continue normally.
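+     *
+     * A driver with an internal request source might implement the pair
+     * like this (a sketch; the my_* and s-> names are hypothetical):
+     *
+     *     static void coroutine_fn my_co_drain_begin(BlockDriverState *bs)
+     *     {
+     *         BDRVMyState *s = bs->opaque;
+     *
+     *         s->quiesced = true;
+     *         while (s->internal_requests > 0) {
+     *             qemu_co_queue_wait(&s->drain_queue, NULL);
+     *         }
+     *     }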
+     */
+    void coroutine_fn (*bdrv_co_drain_begin)(BlockDriverState *bs);
+    void coroutine_fn (*bdrv_co_drain_end)(BlockDriverState *bs);
+
+    void (*bdrv_add_child)(BlockDriverState *parent, BlockDriverState *child,
+                           Error **errp);
+    void (*bdrv_del_child)(BlockDriverState *parent, BdrvChild *child,
+                           Error **errp);
+
+    /**
+     * Informs the block driver that a permission change is intended. The
+     * driver checks whether the change is permissible and may take other
+     * preparations for the change (e.g. get file system locks). This operation
+     * is always followed by a call to either .bdrv_set_perm or
+     * .bdrv_abort_perm_update.
+     *
+     * Checks whether the requested set of cumulative permissions in @perm
+     * can be granted for accessing @bs and whether no other users are using
+     * permissions other than those given in @shared (both arguments take
+     * BLK_PERM_* bitmasks).
+     *
+     * If both conditions are met, 0 is returned. Otherwise, -errno is returned
+     * and errp is set to an error describing the conflict.
+     */
+    int (*bdrv_check_perm)(BlockDriverState *bs, uint64_t perm,
+                           uint64_t shared, Error **errp);
+
+    /**
+     * Called to inform the driver that the cumulative set of used
+     * permissions for @bs has changed to @perm, and the set of sharable
+     * permissions to @shared. The driver can use this to propagate changes to
+     * its children (i.e. request permissions only if a parent actually needs
+     * them).
+     *
+     * This function is only invoked after bdrv_check_perm(), so block drivers
+     * may rely on preparations made in their .bdrv_check_perm implementation.
+     */
+    void (*bdrv_set_perm)(BlockDriverState *bs, uint64_t perm, uint64_t shared);
+
+    /*
+     * Called to inform the driver that after a previous bdrv_check_perm()
+     * call, the permission update is not performed and any preparations made
+     * for it (e.g. taken file locks) need to be undone.
+     *
+     * This function can be called even for nodes that never saw a
+     * bdrv_check_perm() call. It is a no-op then.
+     */
+    void (*bdrv_abort_perm_update)(BlockDriverState *bs);
+
+    /**
+     * Returns in @nperm and @nshared the permissions that the driver for @bs
+     * needs on its child @c, based on the cumulative permissions requested by
+     * the parents in @parent_perm and @parent_shared.
+     *
+     * If @c is NULL, return the permissions for attaching a new child for the
+     * given @child_class and @role.
+     *
+     * If @reopen_queue is non-NULL, don't return the currently needed
+     * permissions, but those that will be needed after applying the
+     * @reopen_queue.
+     */
+    void (*bdrv_child_perm)(BlockDriverState *bs, BdrvChild *c,
+                            BdrvChildRole role,
+                            BlockReopenQueue *reopen_queue,
+                            uint64_t parent_perm, uint64_t parent_shared,
+                            uint64_t *nperm, uint64_t *nshared);
+
+    bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs);
+    bool (*bdrv_co_can_store_new_dirty_bitmap)(BlockDriverState *bs,
+                                               const char *name,
+                                               uint32_t granularity,
+                                               Error **errp);
+    int (*bdrv_co_remove_persistent_dirty_bitmap)(BlockDriverState *bs,
+                                                  const char *name,
+                                                  Error **errp);
+
+    /**
+     * Register/unregister a buffer for I/O. Useful for example when the
+     * driver is interested in knowing the memory areas that will later be
+     * used in iovs, so that it can do IOMMU mapping with VFIO etc. in
+     * order to get better performance. In the case of VFIO drivers, this
+     * callback is used to do DMA mapping for hot buffers.
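+     *
+     * A sketch of such a pair (my_iommu_map/my_iommu_unmap stand in for
+     * whatever mapping primitive the driver uses; hypothetical names):
+     *
+     *     static void my_register_buf(BlockDriverState *bs, void *host,
+     *                                 size_t size)
+     *     {
+     *         my_iommu_map(bs->opaque, host, size);
+     *     }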
+     */
+    void (*bdrv_register_buf)(BlockDriverState *bs, void *host, size_t size);
+    void (*bdrv_unregister_buf)(BlockDriverState *bs, void *host);
+    QLIST_ENTRY(BlockDriver) list;
+
+    /* Pointer to a NULL-terminated array of names of strong options
+     * that can be specified for bdrv_open(). A strong option is one
+     * that changes the data of a BDS.
+     * If this pointer is NULL, the array is considered empty.
+     * "filename" and "driver" are always considered strong. */
+    const char *const *strong_runtime_opts;
+};
+
+static inline bool block_driver_can_compress(BlockDriver *drv)
+{
+    return drv->bdrv_co_pwritev_compressed ||
+           drv->bdrv_co_pwritev_compressed_part;
+}
+
+typedef struct BlockLimits {
+    /* Alignment requirement, in bytes, for offset/length of I/O
+     * requests. Must be a power of 2 less than INT_MAX; defaults to
+     * 1 for drivers with modern byte interfaces, and to 512
+     * otherwise. */
+    uint32_t request_alignment;
+
+    /*
+     * Maximum number of bytes that can be discarded at once. Must be a
+     * multiple of pdiscard_alignment, but need not be a power of 2. May
+     * be 0 if there is no inherent 64-bit limit.
+     */
+    int64_t max_pdiscard;
+
+    /* Optimal alignment for discard requests in bytes. A power of 2
+     * is best but not mandatory. Must be a multiple of
+     * bl.request_alignment, and must be less than max_pdiscard if
+     * that is set. May be 0 if bl.request_alignment is good enough */
+    uint32_t pdiscard_alignment;
+
+    /*
+     * Maximum number of bytes that can be zeroed at once. Must be a
+     * multiple of pwrite_zeroes_alignment. 0 means no limit.
+     */
+    int64_t max_pwrite_zeroes;
+
+    /* Optimal alignment for write zeroes requests in bytes. A power
+     * of 2 is best but not mandatory. Must be a multiple of
+     * bl.request_alignment, and must be less than max_pwrite_zeroes
+     * if that is set. May be 0 if bl.request_alignment is good
+     * enough */
+    uint32_t pwrite_zeroes_alignment;
+
+    /* Optimal transfer length in bytes. A power of 2 is best but not
+     * mandatory. Must be a multiple of bl.request_alignment, or 0 if
+     * no preferred size */
+    uint32_t opt_transfer;
+
+    /* Maximal transfer length in bytes. Need not be a power of 2, but
+     * must be a multiple of opt_transfer and bl.request_alignment, or 0
+     * for no 32-bit limit. For now, anything larger than INT_MAX is
+     * clamped down. */
+    uint32_t max_transfer;
+
+    /* Maximal hardware transfer length in bytes. Applies whenever
+     * transfers to the device bypass the kernel I/O scheduler, for
+     * example with SG_IO. If larger than max_transfer or if zero,
+     * blk_get_max_hw_transfer will fall back to max_transfer.
+     */
+    uint64_t max_hw_transfer;
+
+    /* Maximal number of scatter/gather elements allowed by the hardware.
+     * Applies whenever transfers to the device bypass the kernel I/O
+     * scheduler, for example with SG_IO. If larger than max_iov
+     * or if zero, blk_get_max_hw_iov will fall back to max_iov.
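+     *
+     * Limits such as these are typically filled in by the driver's
+     * .bdrv_refresh_limits callback, e.g. (a sketch with arbitrary
+     * illustrative values; my_refresh_limits is a hypothetical name):
+     *
+     *     static void my_refresh_limits(BlockDriverState *bs, Error **errp)
+     *     {
+     *         bs->bl.request_alignment = 512;
+     *         bs->bl.max_transfer = 2 * MiB;
+     *     }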
+ */ + int max_hw_iov; + + /* memory alignment, in bytes so that no bounce buffer is needed */ + size_t min_mem_alignment; + + /* memory alignment, in bytes, for bounce buffer */ + size_t opt_mem_alignment; + + /* maximum number of iovec elements */ + int max_iov; +} BlockLimits; + +typedef struct BdrvOpBlocker BdrvOpBlocker; + +typedef struct BdrvAioNotifier { + void (*attached_aio_context)(AioContext *new_context, void *opaque); + void (*detach_aio_context)(void *opaque); + + void *opaque; + bool deleted; + + QLIST_ENTRY(BdrvAioNotifier) list; +} BdrvAioNotifier; + +struct BdrvChildClass { + /* If true, bdrv_replace_node() doesn't change the node this BdrvChild + * points to. */ + bool stay_at_node; + + /* If true, the parent is a BlockDriverState and bdrv_next_all_states() + * will return it. This information is used for drain_all, where every node + * will be drained separately, so the drain only needs to be propagated to + * non-BDS parents. */ + bool parent_is_bds; + + void (*inherit_options)(BdrvChildRole role, bool parent_is_format, + int *child_flags, QDict *child_options, + int parent_flags, QDict *parent_options); + + void (*change_media)(BdrvChild *child, bool load); + void (*resize)(BdrvChild *child); + + /* Returns a name that is supposedly more useful for human users than the + * node name for identifying the node in question (in particular, a BB + * name), or NULL if the parent can't provide a better name. */ + const char *(*get_name)(BdrvChild *child); + + /* Returns a malloced string that describes the parent of the child for a + * human reader. This could be a node-name, BlockBackend name, qdev ID or + * QOM path of the device owning the BlockBackend, job type and ID etc. The + * caller is responsible for freeing the memory. */ + char *(*get_parent_desc)(BdrvChild *child); + + /* + * If this pair of functions is implemented, the parent doesn't issue new + * requests after returning from .drained_begin() until .drained_end() is + * called. + * + * These functions must not change the graph (and therefore also must not + * call aio_poll(), which could change the graph indirectly). + * + * If drained_end() schedules background operations, it must atomically + * increment *drained_end_counter for each such operation and atomically + * decrement it once the operation has settled. + * + * Note that this can be nested. If drained_begin() was called twice, new + * I/O is allowed only after drained_end() was called twice, too. + */ + void (*drained_begin)(BdrvChild *child); + void (*drained_end)(BdrvChild *child, int *drained_end_counter); + + /* + * Returns whether the parent has pending requests for the child. This + * callback is polled after .drained_begin() has been called until all + * activity on the child has stopped. + */ + bool (*drained_poll)(BdrvChild *child); + + /* Notifies the parent that the child has been activated/inactivated (e.g. + * when migration is completing) and it can start/stop requesting + * permissions and doing I/O on it. */ + void (*activate)(BdrvChild *child, Error **errp); + int (*inactivate)(BdrvChild *child); + + void (*attach)(BdrvChild *child); + void (*detach)(BdrvChild *child); + + /* Notifies the parent that the filename of its child has changed (e.g. + * because the direct child was removed from the backing chain), so that it + * can update its reference. 
*/ + int (*update_filename)(BdrvChild *child, BlockDriverState *new_base, + const char *filename, Error **errp); + + bool (*can_set_aio_ctx)(BdrvChild *child, AioContext *ctx, + GSList **ignore, Error **errp); + void (*set_aio_ctx)(BdrvChild *child, AioContext *ctx, GSList **ignore); + + AioContext *(*get_parent_aio_context)(BdrvChild *child); +}; + +extern const BdrvChildClass child_of_bds; + +struct BdrvChild { + BlockDriverState *bs; + char *name; + const BdrvChildClass *klass; + BdrvChildRole role; + void *opaque; + + /** + * Granted permissions for operating on this BdrvChild (BLK_PERM_* bitmask) + */ + uint64_t perm; + + /** + * Permissions that can still be granted to other users of @bs while this + * BdrvChild is still attached to it. (BLK_PERM_* bitmask) + */ + uint64_t shared_perm; + + /* + * This link is frozen: the child can neither be replaced nor + * detached from the parent. + */ + bool frozen; + + /* + * How many times the parent of this child has been drained + * (through klass->drained_*). + * Usually, this is equal to bs->quiesce_counter (potentially + * reduced by bdrv_drain_all_count). It may differ while the + * child is entering or leaving a drained section. + */ + int parent_quiesce_counter; + + QLIST_ENTRY(BdrvChild) next; + QLIST_ENTRY(BdrvChild) next_parent; +}; + +/* + * Allows bdrv_co_block_status() to cache one data region for a + * protocol node. + * + * @valid: Whether the cache is valid (should be accessed with atomic + * functions so this can be reset by RCU readers) + * @data_start: Offset where we know (or strongly assume) is data + * @data_end: Offset where the data region ends (which is not necessarily + * the start of a zeroed region) + */ +typedef struct BdrvBlockStatusCache { + struct rcu_head rcu; + + bool valid; + int64_t data_start; + int64_t data_end; +} BdrvBlockStatusCache; + +struct BlockDriverState { + /* Protected by big QEMU lock or read-only after opening. No special + * locking needed during I/O... + */ + int open_flags; /* flags used to open the file, re-used for re-open */ + bool encrypted; /* if true, the media is encrypted */ + bool sg; /* if true, the device is a /dev/sg* */ + bool probed; /* if true, format was probed rather than specified */ + bool force_share; /* if true, always allow all shared permissions */ + bool implicit; /* if true, this filter node was automatically inserted */ + + BlockDriver *drv; /* NULL means no media */ + void *opaque; + + AioContext *aio_context; /* event loop used for fd handlers, timers, etc */ + /* long-running tasks intended to always use the same AioContext as this + * BDS may register themselves in this list to be notified of changes + * regarding this BDS's context */ + QLIST_HEAD(, BdrvAioNotifier) aio_notifiers; + bool walking_aio_notifiers; /* to make removal during iteration safe */ + + char filename[PATH_MAX]; + /* + * If not empty, this image is a diff in relation to backing_file. + * Note that this is the name given in the image header and + * therefore may or may not be equal to .backing->bs->filename. + * If this field contains a relative path, it is to be resolved + * relatively to the overlay's location. + */ + char backing_file[PATH_MAX]; + /* + * The backing filename indicated by the image header. Contrary + * to backing_file, if we ever open this file, auto_backing_file + * is replaced by the resulting BDS's filename (i.e. after a + * bdrv_refresh_filename() run). 
+     */
+    char auto_backing_file[PATH_MAX];
+    char backing_format[16]; /* if non-zero and backing_file exists */
+
+    QDict *full_open_options;
+    char exact_filename[PATH_MAX];
+
+    BdrvChild *backing;
+    BdrvChild *file;
+
+    /* I/O Limits */
+    BlockLimits bl;
+
+    /*
+     * Flags honored during pread
+     */
+    unsigned int supported_read_flags;
+    /* Flags honored during pwrite (so far: BDRV_REQ_FUA,
+     * BDRV_REQ_WRITE_UNCHANGED).
+     * If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those
+     * writes will be issued as normal writes without the flag set.
+     * This is important to note for drivers that do not explicitly
+     * request a WRITE permission for their children and instead take
+     * the same permissions as their parent did (this is commonly what
+     * block filters do). Such drivers have to be aware that the
+     * parent may have taken a WRITE_UNCHANGED permission only and is
+     * issuing such requests. Drivers either must make sure that
+     * these requests do not result in plain WRITE accesses (usually
+     * by supporting BDRV_REQ_WRITE_UNCHANGED, and then forwarding
+     * every incoming write request as-is, including potentially that
+     * flag), or they have to explicitly take the WRITE permission for
+     * their children. */
+    unsigned int supported_write_flags;
+    /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
+     * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) */
+    unsigned int supported_zero_flags;
+    /*
+     * Flags honored during truncate (so far: BDRV_REQ_ZERO_WRITE).
+     *
+     * If BDRV_REQ_ZERO_WRITE is given, the truncate operation must make sure
+     * that any added space reads as all zeros. If this can't be guaranteed,
+     * the operation must fail.
+     */
+    unsigned int supported_truncate_flags;
+
+    /* the following member gives a name to every node on the bs graph. */
+    char node_name[32];
+    /* element of the list of named nodes building the graph */
+    QTAILQ_ENTRY(BlockDriverState) node_list;
+    /* element of the list of all BlockDriverStates (all_bdrv_states) */
+    QTAILQ_ENTRY(BlockDriverState) bs_list;
+    /* element of the list of monitor-owned BDS */
+    QTAILQ_ENTRY(BlockDriverState) monitor_list;
+    int refcnt;
+
+    /* operation blockers */
+    QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];
+
+    /* The node that this node inherited default options from (and a reopen on
+     * which can affect this node by changing these defaults). This is always a
+     * parent node of this node. */
+    BlockDriverState *inherits_from;
+    QLIST_HEAD(, BdrvChild) children;
+    QLIST_HEAD(, BdrvChild) parents;
+
+    QDict *options;
+    QDict *explicit_options;
+    BlockdevDetectZeroesOptions detect_zeroes;
+
+    /* The error object in use for blocking operations on backing_hd */
+    Error *backing_blocker;
+
+    /* Protected by AioContext lock */
+
+    /* If we are reading a disk image, give its size in sectors.
+     * Generally read-only; it is written to by load_snapshot and
+     * save_snapshot, but the block layer is quiescent during those.
+     */
+    int64_t total_sectors;
+
+    /* threshold limit for writes, in bytes. "High water mark". */
+    uint64_t write_threshold_offset;
+
+    /* Writing to the list requires the BQL _and_ the dirty_bitmap_mutex.
+     * Reading from the list can be done with either the BQL or the
+     * dirty_bitmap_mutex. Modifying a bitmap only requires
+     * dirty_bitmap_mutex. */
+    QemuMutex dirty_bitmap_mutex;
+    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
+
+    /* Offset after the highest byte written to */
+    Stat64 wr_highest_offset;
+
+    /* If true, copy read backing sectors into image. Can be >1 if more
+     * than one client has requested copy-on-read. Accessed with atomic
+     * ops.
+     */
+    int copy_on_read;
+
+    /* number of in-flight requests; overall and serialising.
+     * Accessed with atomic ops.
+     */
+    unsigned int in_flight;
+    unsigned int serialising_in_flight;
+
+    /* counter for nested bdrv_io_plug.
+     * Accessed with atomic ops.
+     */
+    unsigned io_plugged;
+
+    /* do we need to tell the guest if we have a volatile write cache? */
+    int enable_write_cache;
+
+    /* Accessed with atomic ops. */
+    int quiesce_counter;
+    int recursive_quiesce_counter;
+
+    unsigned int write_gen; /* Current data generation */
+
+    /* Protected by reqs_lock. */
+    CoMutex reqs_lock;
+    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
+    CoQueue flush_queue; /* Serializing flush queue */
+    bool active_flush_req; /* Flush request in flight? */
+
+    /* Only read/written by whoever has set active_flush_req to true. */
+    unsigned int flushed_gen; /* Flushed write generation */
+
+    /* BdrvChild links to this node may never be frozen */
+    bool never_freeze;
+
+    /* Lock for block-status cache RCU writers */
+    CoMutex bsc_modify_lock;
+    /* Always non-NULL, but must only be dereferenced under an RCU read guard */
+    BdrvBlockStatusCache *block_status_cache;
+};
+
+struct BlockBackendRootState {
+    int open_flags;
+    BlockdevDetectZeroesOptions detect_zeroes;
+};
+
+typedef enum BlockMirrorBackingMode {
+    /* Reuse the existing backing chain from the source for the target.
+     * - sync=full: Set backing BDS to NULL.
+     * - sync=top: Use source's backing BDS.
+     * - sync=none: Use source as the backing BDS. */
+    MIRROR_SOURCE_BACKING_CHAIN,
+
+    /* Open the target's backing chain completely anew */
+    MIRROR_OPEN_BACKING_CHAIN,
+
+    /* Do not change the target's backing BDS after job completion */
+    MIRROR_LEAVE_BACKING_CHAIN,
+} BlockMirrorBackingMode;
+
+
+/* Essential block drivers which must always be statically linked into qemu, and
+ * which therefore can be accessed without using bdrv_find_format() */
+extern BlockDriver bdrv_file;
+extern BlockDriver bdrv_raw;
+extern BlockDriver bdrv_qcow2;
+
+int coroutine_fn bdrv_co_preadv(BdrvChild *child,
+    int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+    BdrvRequestFlags flags);
+int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
+    int64_t offset, int64_t bytes,
+    QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags);
+int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
+    int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+    BdrvRequestFlags flags);
+int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
+    int64_t offset, int64_t bytes,
+    QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags);
+
+static inline int coroutine_fn bdrv_co_pread(BdrvChild *child,
+    int64_t offset, unsigned int bytes, void *buf, BdrvRequestFlags flags)
+{
+    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
+
+    return bdrv_co_preadv(child, offset, bytes, &qiov, flags);
+}
+
+static inline int coroutine_fn bdrv_co_pwrite(BdrvChild *child,
+    int64_t offset, unsigned int bytes, void *buf, BdrvRequestFlags flags)
+{
+    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
+
+    return bdrv_co_pwritev(child, offset, bytes, &qiov, flags);
+}
+
+extern unsigned int bdrv_drain_all_count;
+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
+
+bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
+                                                uint64_t align);
+BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs);
+
+int get_tmp_filename(char *filename, int size);
+BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
+ const char *filename);
+
+void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
+ QDict *options);
+
+bool bdrv_backing_overridden(BlockDriverState *bs);
+
+
+/**
+ * bdrv_add_aio_context_notifier:
+ *
+ * If a long-running job intends to be always run in the same AioContext as a
+ * certain BDS, it may use this function to be notified of changes regarding the
+ * association of the BDS to an AioContext.
+ *
+ * attached_aio_context() is called after the target BDS has been attached to a
+ * new AioContext; detach_aio_context() is called before the target BDS is being
+ * detached from its old AioContext.
+ */
+void bdrv_add_aio_context_notifier(BlockDriverState *bs,
+ void (*attached_aio_context)(AioContext *new_context, void *opaque),
+ void (*detach_aio_context)(void *opaque), void *opaque);
+
+/**
+ * bdrv_remove_aio_context_notifier:
+ *
+ * Unsubscribe from change notifications regarding the BDS's AioContext. The
+ * parameters given here have to be the same as those given to
+ * bdrv_add_aio_context_notifier().
+ */
+void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
+ void (*aio_context_attached)(AioContext *,
+ void *),
+ void (*aio_context_detached)(void *),
+ void *opaque);
+
+/**
+ * bdrv_wakeup:
+ * @bs: The BlockDriverState for which an I/O operation has been completed.
+ *
+ * Wake up the main thread if it is waiting on BDRV_POLL_WHILE. During
+ * synchronous I/O on a BlockDriverState that is attached to another
+ * I/O thread, the main thread lets the I/O thread's event loop run,
+ * waiting for the I/O operation to complete. A bdrv_wakeup will wake
+ * up the main thread if necessary.
+ *
+ * Manual calls to bdrv_wakeup are rarely necessary, because
+ * bdrv_dec_in_flight already calls it.
+ */
+void bdrv_wakeup(BlockDriverState *bs);
+
+#ifdef _WIN32
+int is_windows_drive(const char *filename);
+#endif
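A hedged sketch of the AioContext notifier pair above in action: a long-running user tracks which context its BDS currently runs in and re-arms its work there. MyWorker and its fields are hypothetical placeholders:

    typedef struct MyWorker {
        AioContext *ctx;    /* context the BDS is currently attached to */
    } MyWorker;

    static void my_attached_aio_context(AioContext *new_context, void *opaque)
    {
        MyWorker *w = opaque;

        w->ctx = new_context;   /* resume scheduling work in the new context */
    }

    static void my_detach_aio_context(void *opaque)
    {
        MyWorker *w = opaque;

        w->ctx = NULL;          /* quiesce until re-attached */
    }

    /* Registration and the later symmetric teardown use identical arguments: */
    bdrv_add_aio_context_notifier(bs, my_attached_aio_context,
                                  my_detach_aio_context, w);
    bdrv_remove_aio_context_notifier(bs, my_attached_aio_context,
                                     my_detach_aio_context, w);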
+
+/**
+ * stream_start:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Block device to operate on.
+ * @base: Block device that will become the new base, or %NULL to
+ * flatten the whole backing file chain onto @bs.
+ * @backing_file_str: The file name that will be written to @bs as
+ * the new backing file if the job completes. Ignored if @base is %NULL.
+ * @bottom: The last node in the chain that should be streamed into
+ * @bs; an alternative to specifying @base.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_error: The action to take upon error.
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the stream job inserts into the graph above
+ * @bs. NULL means that a node name should be autogenerated.
+ * @errp: Error object.
+ *
+ * Start a streaming operation on @bs. Clusters that are unallocated
+ * in @bs, but allocated in any image between @base and @bs (both
+ * exclusive) will be written to @bs. At the end of a successful
+ * streaming job, the backing file of @bs will be changed to
+ * @backing_file_str in the written image and to @base in the live
+ * BlockDriverState.
+ */
+void stream_start(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *base, const char *backing_file_str,
+ BlockDriverState *bottom,
+ int creation_flags, int64_t speed,
+ BlockdevOnError on_error,
+ const char *filter_node_name,
+ Error **errp);
+
+/**
+ * commit_start:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Active block device.
+ * @top: Top block device to be committed.
+ * @base: Block device that will be written into, and become the new top.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_error: The action to take upon error.
+ * @backing_file_str: String to use as the backing file in @top's overlay
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the commit job inserts into the graph above @top. NULL means
+ * that a node name should be autogenerated.
+ * @errp: Error object.
+ *
+ */
+void commit_start(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *base, BlockDriverState *top,
+ int creation_flags, int64_t speed,
+ BlockdevOnError on_error, const char *backing_file_str,
+ const char *filter_node_name, Error **errp);
+/**
+ * commit_active_start:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Active block device to be committed.
+ * @base: Block device that will be written into, and become the new top.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_error: The action to take upon error.
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the commit job inserts into the graph above @bs. NULL means that
+ * a node name should be autogenerated.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @auto_complete: Automatically complete the job.
+ * @errp: Error object.
+ *
+ */
+BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *base, int creation_flags,
+ int64_t speed, BlockdevOnError on_error,
+ const char *filter_node_name,
+ BlockCompletionFunc *cb, void *opaque,
+ bool auto_complete, Error **errp);
+/*
+ * mirror_start:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Block device to operate on.
+ * @target: Block device to write to.
+ * @replaces: Block graph node name to replace once the mirror is done. Can
+ * only be used when full mirroring is selected.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @granularity: The chosen granularity for the dirty bitmap.
+ * @buf_size: The amount of data that can be in flight at one time.
+ * @mode: Whether to collapse all images in the chain to the target.
+ * @backing_mode: How to establish the target's backing chain after completion.
+ * @zero_target: Whether the target should be explicitly zero-initialized
+ * @on_source_error: The action to take upon error reading from the source.
+ * @on_target_error: The action to take upon error writing to the target.
+ * @unmap: Whether to unmap target where source sectors only contain zeroes.
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the mirror job inserts into the graph above @bs. NULL means that
+ * a node name should be autogenerated.
+ * @copy_mode: When to trigger writes to the target.
+ * @errp: Error object.
+ *
+ * Start a mirroring operation on @bs. Clusters that are allocated
+ * in @bs will be written to @target until the job is cancelled or
+ * manually completed. At the end of a successful mirroring job,
+ * @bs will be switched to read from @target.
+ */
+void mirror_start(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *target, const char *replaces,
+ int creation_flags, int64_t speed,
+ uint32_t granularity, int64_t buf_size,
+ MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
+ bool zero_target,
+ BlockdevOnError on_source_error,
+ BlockdevOnError on_target_error,
+ bool unmap, const char *filter_node_name,
+ MirrorCopyMode copy_mode, Error **errp);
+
+/*
+ * backup_job_create:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Block device to operate on.
+ * @target: Block device to write to.
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @sync_mode: What parts of the disk image should be copied to the destination.
+ * @sync_bitmap: The dirty bitmap if sync_mode is 'bitmap' or 'incremental'
+ * @bitmap_mode: The bitmap synchronization policy to use.
+ * @compress: Whether to compress data written to @target.
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the backup job inserts into the graph above @bs. NULL means
+ * that a node name should be autogenerated.
+ * @perf: Performance options. All fields are assumed to be valid;
+ * the ".has_*" members are ignored.
+ * @on_source_error: The action to take upon error reading from the source.
+ * @on_target_error: The action to take upon error writing to the target.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @txn: Transaction that this job is part of (may be NULL).
+ * @errp: Error object.
+ *
+ * Create a backup operation on @bs. Clusters in @bs are written to @target
+ * until the job is cancelled or manually completed.
+ */
+BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *target, int64_t speed,
+ MirrorSyncMode sync_mode,
+ BdrvDirtyBitmap *sync_bitmap,
+ BitmapSyncMode bitmap_mode,
+ bool compress,
+ const char *filter_node_name,
+ BackupPerf *perf,
+ BlockdevOnError on_source_error,
+ BlockdevOnError on_target_error,
+ int creation_flags,
+ BlockCompletionFunc *cb, void *opaque,
+ JobTxn *txn, Error **errp);
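To make the long parameter list concrete, here is a hedged sketch of starting a full backup with defaults; the job id, BackupPerf settings and error policies are illustrative values, not requirements:

    static BlockJob *start_full_backup(BlockDriverState *bs,
                                       BlockDriverState *target, Error **errp)
    {
        BackupPerf perf = { .max_workers = 64 };    /* illustrative tuning */

        /* A full backup copies the whole device, so no bitmap is passed;
         * bitmap_mode is then irrelevant. */
        return backup_job_create("backup0", bs, target, 0 /* speed: unlimited */,
                                 MIRROR_SYNC_MODE_FULL,
                                 NULL /* sync_bitmap */,
                                 BITMAP_SYNC_MODE_ON_SUCCESS /* unused */,
                                 false /* compress */,
                                 NULL /* filter_node_name: autogenerate */,
                                 &perf,
                                 BLOCKDEV_ON_ERROR_REPORT,
                                 BLOCKDEV_ON_ERROR_REPORT,
                                 JOB_DEFAULT /* creation_flags */,
                                 NULL /* cb */, NULL /* opaque */,
                                 NULL /* txn */, errp);
    }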
+
+BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
+ const char *child_name,
+ const BdrvChildClass *child_class,
+ BdrvChildRole child_role,
+ uint64_t perm, uint64_t shared_perm,
+ void *opaque, Error **errp);
+void bdrv_root_unref_child(BdrvChild *child);
+
+void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
+ uint64_t *shared_perm);
+
+/**
+ * Sets a BdrvChild's permissions. Avoid if the parent is a BDS; use
+ * bdrv_child_refresh_perms() instead and make the parent's
+ * .bdrv_child_perm() implementation return the correct values.
+ */
+int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
+ Error **errp);
+
+/**
+ * Calls bs->drv->bdrv_child_perm() and updates the child's permission
+ * masks with the result.
+ * Drivers should invoke this function whenever an event occurs that
+ * makes their .bdrv_child_perm() implementation return different
+ * values than before, but which will not result in the block layer
+ * automatically refreshing the permissions.
+ */
+int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp);
+
+bool bdrv_recurse_can_replace(BlockDriverState *bs,
+ BlockDriverState *to_replace);
+
+/*
+ * Default implementation for BlockDriver.bdrv_child_perm() that can
+ * be used by block filters and image formats, as long as they use the
+ * child_of_bds child class and set an appropriate BdrvChildRole.
+ */
+void bdrv_default_perms(BlockDriverState *bs, BdrvChild *c,
+ BdrvChildRole role, BlockReopenQueue *reopen_queue,
+ uint64_t perm, uint64_t shared,
+ uint64_t *nperm, uint64_t *nshared);
+
+const char *bdrv_get_parent_name(const BlockDriverState *bs);
+void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp);
+bool blk_dev_has_removable_media(BlockBackend *blk);
+bool blk_dev_has_tray(BlockBackend *blk);
+void blk_dev_eject_request(BlockBackend *blk, bool force);
+bool blk_dev_is_tray_open(BlockBackend *blk);
+bool blk_dev_is_medium_locked(BlockBackend *blk);
+
+void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);
+
+void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
+void bdrv_restore_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *backup);
+bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
+ const BdrvDirtyBitmap *src,
+ HBitmap **backup, bool lock);
+
+void bdrv_inc_in_flight(BlockDriverState *bs);
+void bdrv_dec_in_flight(BlockDriverState *bs);
+
+void blockdev_close_all_bdrv_states(void);
+
+int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset,
+ BdrvChild *dst, int64_t dst_offset,
+ int64_t bytes,
+ BdrvRequestFlags read_flags,
+ BdrvRequestFlags write_flags);
+int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset,
+ BdrvChild *dst, int64_t dst_offset,
+ int64_t bytes,
+ BdrvRequestFlags read_flags,
+ BdrvRequestFlags write_flags);
+
+int refresh_total_sectors(BlockDriverState *bs, int64_t hint);
+
+void bdrv_set_monitor_owned(BlockDriverState *bs);
+BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp);
+
+/**
+ * Simple implementation of bdrv_co_create_opts for protocol drivers
+ * which only support creation via opening a file
+ * (usually an existing raw storage device)
+ */
+int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,
+ const char *filename,
+ QemuOpts *opts,
+ Error **errp);
+extern QemuOptsList bdrv_create_opts_simple;
+
+BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node,
+ const char *name,
+ BlockDriverState **pbs,
+ Error **errp);
+BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target,
+ BlockDirtyBitmapMergeSourceList *bms,
+ HBitmap **backup, Error **errp);
+BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name,
+ bool release,
+ BlockDriverState **bitmap_bs,
+ Error **errp);
+
+BdrvChild *bdrv_cow_child(BlockDriverState *bs);
+BdrvChild *bdrv_filter_child(BlockDriverState *bs);
+BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs);
+BdrvChild *bdrv_primary_child(BlockDriverState *bs);
+BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs);
+BlockDriverState *bdrv_skip_filters(BlockDriverState *bs);
+BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs);
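The accessors above compose into simple graph walks. A hedged sketch (assuming the usual invariant that bdrv_backing_chain_next() steps to the next COW node while skipping filters) that locates the base image below @bs:

    static BlockDriverState *find_base(BlockDriverState *bs)
    {
        BlockDriverState *base = bdrv_skip_filters(bs);

        /* bdrv_cow_bs(), defined just below, returns NULL once there is
         * no backing (COW) child any more. */
        while (bdrv_cow_bs(base)) {
            base = bdrv_backing_chain_next(base);
        }
        return base;
    }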
+
+static inline BlockDriverState *child_bs(BdrvChild *child)
+{
+ return child ? child->bs : NULL;
+}
+
+static inline BlockDriverState *bdrv_cow_bs(BlockDriverState *bs)
+{
+ return child_bs(bdrv_cow_child(bs));
+}
+
+static inline BlockDriverState *bdrv_filter_bs(BlockDriverState *bs)
+{
+ return child_bs(bdrv_filter_child(bs));
+}
+
+static inline BlockDriverState *bdrv_filter_or_cow_bs(BlockDriverState *bs)
+{
+ return child_bs(bdrv_filter_or_cow_child(bs));
+}
+
+static inline BlockDriverState *bdrv_primary_bs(BlockDriverState *bs)
+{
+ return child_bs(bdrv_primary_child(bs));
+}
+
+/**
+ * End all quiescent sections started by bdrv_drain_all_begin(). This is
+ * needed when deleting a BDS before bdrv_drain_all_end() is called.
+ *
+ * NOTE: this is an internal helper for bdrv_close() *only*. No one else
+ * should call it.
+ */
+void bdrv_drain_all_end_quiesce(BlockDriverState *bs);
+
+/**
+ * Check whether the given offset is in the cached block-status data
+ * region.
+ *
+ * If it is, and @pnum is not NULL, *pnum is set to
+ * `bsc.data_end - offset`, i.e. how many bytes, starting from
+ * @offset, are data (according to the cache).
+ * Otherwise, *pnum is not touched.
+ */
+bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum);
+
+/**
+ * If [offset, offset + bytes) overlaps with the currently cached
+ * block-status region, invalidate the cache.
+ *
+ * (To be used by I/O paths that cause data regions to be zero or
+ * holes.)
+ */
+void bdrv_bsc_invalidate_range(BlockDriverState *bs,
+ int64_t offset, int64_t bytes);
+
+/**
+ * Mark the range [offset, offset + bytes) as a data region.
+ */
+void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes);
+
+#endif /* BLOCK_INT_H */
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
new file mode 100644
index 000000000..d200f33c1
--- /dev/null
+++ b/include/block/blockjob.h
@@ -0,0 +1,176 @@
+/*
+ * Declarations for long-running block device operations
+ *
+ * Copyright (c) 2011 IBM Corp.
+ * Copyright (c) 2012 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCKJOB_H
+#define BLOCKJOB_H
+
+#include "qemu/job.h"
+#include "block/block.h"
+#include "qemu/ratelimit.h"
+
+#define BLOCK_JOB_SLICE_TIME 100000000ULL /* ns */
+
+typedef struct BlockJobDriver BlockJobDriver;
+
+/**
+ * BlockJob:
+ *
+ * Long-running operation on a BlockDriverState.
+ */
+typedef struct BlockJob {
+ /** Data belonging to the generic Job infrastructure */
+ Job job;
+
+ /** The block device on which the job is operating. */
+ BlockBackend *blk;
+
+ /** Status that is published by the query-block-jobs QMP API */
+ BlockDeviceIoStatus iostatus;
+
+ /** Speed that was set with @block_job_set_speed. */
+ int64_t speed;
+
+ /** Rate limiting data structure for implementing @speed. */
+ RateLimit limit;
+
+ /** Block other operations when block job is running */
+ Error *blocker;
+
+ /** Called when a cancelled job is finalised. */
+ Notifier finalize_cancelled_notifier;
+
+ /** Called when a successfully completed job is finalised. */
+ Notifier finalize_completed_notifier;
+
+ /** Called when the job transitions to PENDING */
+ Notifier pending_notifier;
+
+ /** Called when the job transitions to READY */
+ Notifier ready_notifier;
+
+ /** Called when the job coroutine yields or terminates */
+ Notifier idle_notifier;
+
+ /** BlockDriverStates that are involved in this block job */
+ GSList *nodes;
+} BlockJob;
+
+/**
+ * block_job_next:
+ * @job: A block job, or %NULL.
+ *
+ * Get the next element from the list of block jobs after @job, or the
+ * first one if @job is %NULL.
+ *
+ * Returns the requested job, or %NULL if there are no more jobs left.
+ */
+BlockJob *block_job_next(BlockJob *job);
+
+/**
+ * block_job_get:
+ * @id: The id of the block job.
+ *
+ * Get the block job identified by @id (which must not be %NULL).
+ *
+ * Returns the requested job, or %NULL if it doesn't exist.
+ */
+BlockJob *block_job_get(const char *id);
+
+/**
+ * block_job_add_bdrv:
+ * @job: A block job
+ * @name: The name to assign to the new BdrvChild
+ * @bs: A BlockDriverState that is involved in @job
+ * @perm, @shared_perm: Permissions to request on the node
+ *
+ * Add @bs to the list of BlockDriverStates that are involved in
+ * @job. This means that all operations will be blocked on @bs while
+ * @job exists.
+ */
+int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs,
+ uint64_t perm, uint64_t shared_perm, Error **errp);
+
+/**
+ * block_job_remove_all_bdrv:
+ * @job: The block job
+ *
+ * Remove all BlockDriverStates from the list of nodes that are involved in the
+ * job. This removes the blockers added with block_job_add_bdrv().
+ */
+void block_job_remove_all_bdrv(BlockJob *job);
+
+/**
+ * block_job_has_bdrv:
+ * @job: The block job
+ * @bs: The node to search for
+ *
+ * Searches for @bs in the list of nodes that are involved in the
+ * job.
+ */
+bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs);
+
+/**
+ * block_job_set_speed:
+ * @job: The job to set the speed for.
+ * @speed: The new value
+ * @errp: Error object.
+ *
+ * Set a rate-limiting parameter for the job; the actual meaning may
+ * vary depending on the job type.
+ */
+bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp);
+
+/**
+ * block_job_query:
+ * @job: The job to get information about.
+ *
+ * Return information about a job.
+ */
+BlockJobInfo *block_job_query(BlockJob *job, Error **errp);
+
+/**
+ * block_job_iostatus_reset:
+ * @job: The job whose I/O status should be reset.
+ *
+ * Reset I/O status on @job and on BlockDriverState objects it uses,
+ * other than job->blk.
+ */
+void block_job_iostatus_reset(BlockJob *job);
+
+/**
+ * block_job_is_internal:
+ * @job: The job to determine if it is user-visible or not.
+ *
+ * Returns true if the job should not be visible to the management layer.
+ */
+bool block_job_is_internal(BlockJob *job);
+
+/**
+ * block_job_driver:
+ *
+ * Returns the driver associated with a block job.
+ */
+const BlockJobDriver *block_job_driver(BlockJob *job);
+
+#endif
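As a usage note, the iteration API above supports monitor-style listings. A hedged sketch; the output format is illustrative and error handling is elided:

    #include <inttypes.h>
    #include <stdio.h>

    static void dump_block_jobs(void)
    {
        BlockJob *job;

        /* block_job_next(NULL) yields the first job in the global list. */
        for (job = block_job_next(NULL); job; job = block_job_next(job)) {
            if (block_job_is_internal(job)) {
                continue;   /* hidden from the management layer */
            }
            printf("job %s: speed %" PRId64 " bytes/s\n",
                   job->job.id, job->speed);
        }
    }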
diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
new file mode 100644
index 000000000..6633d83da
--- /dev/null
+++ b/include/block/blockjob_int.h
@@ -0,0 +1,122 @@
+/*
+ * Declarations for long-running block device operations
+ *
+ * Copyright (c) 2011 IBM Corp.
+ * Copyright (c) 2012 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCKJOB_INT_H
+#define BLOCKJOB_INT_H
+
+#include "block/blockjob.h"
+#include "block/block.h"
+
+/**
+ * BlockJobDriver:
+ *
+ * A class type for block job drivers.
+ */
+struct BlockJobDriver {
+ /** Generic JobDriver callbacks and settings */
+ JobDriver job_driver;
+
+ /*
+ * Returns whether the job has pending requests for the child or will
+ * submit new requests before the next pause point. This callback is polled
+ * in the context of draining a job node after requesting that the job be
+ * paused, until all activity on the child has stopped.
+ */
+ bool (*drained_poll)(BlockJob *job);
+
+ /*
+ * If the callback is not NULL, it will be invoked before the job is
+ * resumed in a new AioContext. This is the place to move any resources
+ * besides job->blk to the new AioContext.
+ */
+ void (*attached_aio_context)(BlockJob *job, AioContext *new_context);
+
+ void (*set_speed)(BlockJob *job, int64_t speed);
+};
+
+/**
+ * block_job_create:
+ * @job_id: The id of the newly-created job, or %NULL to have one
+ * generated automatically.
+ * @driver: The class object for the newly-created job.
+ * @txn: The transaction this job belongs to, if any. %NULL otherwise.
+ * @bs: The block device on which the job will operate.
+ * @perm, @shared_perm: Permissions to request for @bs
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @flags: Creation flags for the Block Job. See @JobCreateFlags.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @errp: Error object.
+ *
+ * Create a new long-running block device job and return it. The job
+ * will call @cb asynchronously when the job completes. Note that
+ * @bs may have been closed by the time @cb is called. If
+ * this is the case, the job may be reported as either cancelled or
+ * completed.
+ *
+ * This function is not part of the public job interface; it should be
+ * called from a wrapper that is specific to the job type.
+ */
+void *block_job_create(const char *job_id, const BlockJobDriver *driver,
+ JobTxn *txn, BlockDriverState *bs, uint64_t perm,
+ uint64_t shared_perm, int64_t speed, int flags,
+ BlockCompletionFunc *cb, void *opaque, Error **errp);
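For orientation, a hedged sketch of a concrete driver built on this interface; MyJob, its in_flight counter and the choice of job type are hypothetical stand-ins, and block_job_free()/block_job_user_resume() are the helpers declared just below:

    typedef struct MyJob {
        BlockJob common;
        unsigned in_flight;     /* requests this job has outstanding */
    } MyJob;

    static bool my_job_drained_poll(BlockJob *job)
    {
        MyJob *s = container_of(job, MyJob, common);

        /* Report in-flight activity so the drain loop keeps polling. */
        return s->in_flight > 0;
    }

    static const BlockJobDriver my_job_driver = {
        .job_driver = {
            .instance_size = sizeof(MyJob),
            .job_type      = JOB_TYPE_COMMIT,   /* any existing job type */
            .free          = block_job_free,
            .user_resume   = block_job_user_resume,
        },
        .drained_poll = my_job_drained_poll,
    };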
+
+/**
+ * block_job_free:
+ * Callback to be used for JobDriver.free in all block jobs. Frees
+ * block-job-specific resources in @job.
+ */
+void block_job_free(Job *job);
+
+/**
+ * block_job_user_resume:
+ * Callback to be used for JobDriver.user_resume in all block jobs. Resets the
+ * iostatus when the user resumes @job.
+ */
+void block_job_user_resume(Job *job);
+
+/**
+ * block_job_ratelimit_get_delay:
+ *
+ * Calculate and return delay for the next request in ns. See the documentation
+ * of ratelimit_calculate_delay() for details.
+ */
+int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n);
+
+/**
+ * block_job_error_action:
+ * @job: The job to signal an error for.
+ * @on_err: The error action setting.
+ * @is_read: Whether the operation was a read.
+ * @error: The error that was reported.
+ *
+ * Report an I/O error for a block job and possibly stop the VM. Return the
+ * action that was selected based on @on_err and @error.
+ */
+BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
+ int is_read, int error);
+
+#endif
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
new file mode 100644
index 000000000..40950ae3d
--- /dev/null
+++ b/include/block/dirty-bitmap.h
@@ -0,0 +1,121 @@
+#ifndef BLOCK_DIRTY_BITMAP_H
+#define BLOCK_DIRTY_BITMAP_H
+
+#include "qapi/qapi-types-block-core.h"
+#include "qemu/hbitmap.h"
+
+typedef enum BitmapCheckFlags {
+ BDRV_BITMAP_BUSY = 1,
+ BDRV_BITMAP_RO = 2,
+ BDRV_BITMAP_INCONSISTENT = 4,
+} BitmapCheckFlags;
+
+#define BDRV_BITMAP_DEFAULT (BDRV_BITMAP_BUSY | BDRV_BITMAP_RO | \
+ BDRV_BITMAP_INCONSISTENT)
+#define BDRV_BITMAP_ALLOW_RO (BDRV_BITMAP_BUSY | BDRV_BITMAP_INCONSISTENT)
+
+#define BDRV_BITMAP_MAX_NAME_SIZE 1023
+
+bool bdrv_supports_persistent_dirty_bitmap(BlockDriverState *bs);
+BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
+ uint32_t granularity,
+ const char *name,
+ Error **errp);
+int bdrv_dirty_bitmap_create_successor(BdrvDirtyBitmap *bitmap,
+ Error **errp);
+BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BdrvDirtyBitmap *bitmap,
+ Error **errp);
+BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+ Error **errp);
+void bdrv_dirty_bitmap_enable_successor(BdrvDirtyBitmap *bitmap);
+BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs,
+ const char *name);
+int bdrv_dirty_bitmap_check(const BdrvDirtyBitmap *bitmap, uint32_t flags,
+ Error **errp);
+void bdrv_release_dirty_bitmap(BdrvDirtyBitmap *bitmap);
+void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs);
+int bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name,
+ Error **errp);
+void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
+void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
+void bdrv_enable_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap);
+BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs);
+uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs);
+uint32_t bdrv_dirty_bitmap_granularity(const BdrvDirtyBitmap *bitmap);
+bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap);
+bool bdrv_dirty_bitmap_has_successor(BdrvDirtyBitmap *bitmap);
+const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap);
+int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap);
+void
bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int64_t offset, int64_t bytes); +void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int64_t offset, int64_t bytes); +BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter); + +uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap, + uint64_t offset, uint64_t bytes); +uint64_t bdrv_dirty_bitmap_serialization_align(const BdrvDirtyBitmap *bitmap); +uint64_t bdrv_dirty_bitmap_serialization_coverage(int serialized_chunk_size, + const BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_serialize_part(const BdrvDirtyBitmap *bitmap, + uint8_t *buf, uint64_t offset, + uint64_t bytes); +void bdrv_dirty_bitmap_deserialize_part(BdrvDirtyBitmap *bitmap, + uint8_t *buf, uint64_t offset, + uint64_t bytes, bool finish); +void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap, + uint64_t offset, uint64_t bytes, + bool finish); +void bdrv_dirty_bitmap_deserialize_ones(BdrvDirtyBitmap *bitmap, + uint64_t offset, uint64_t bytes, + bool finish); +void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap); + +void bdrv_dirty_bitmap_set_readonly(BdrvDirtyBitmap *bitmap, bool value); +void bdrv_dirty_bitmap_set_persistence(BdrvDirtyBitmap *bitmap, + bool persistent); +void bdrv_dirty_bitmap_set_inconsistent(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_set_busy(BdrvDirtyBitmap *bitmap, bool busy); +void bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const BdrvDirtyBitmap *src, + HBitmap **backup, Error **errp); +void bdrv_dirty_bitmap_skip_store(BdrvDirtyBitmap *bitmap, bool skip); +bool bdrv_dirty_bitmap_get(BdrvDirtyBitmap *bitmap, int64_t offset); + +/* Functions that require manual locking. */ +void bdrv_dirty_bitmap_lock(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_unlock(BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_get_locked(BdrvDirtyBitmap *bitmap, int64_t offset); +void bdrv_set_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + int64_t offset, int64_t bytes); +void bdrv_reset_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + int64_t offset, int64_t bytes); +int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter); +void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t offset); +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_truncate(BlockDriverState *bs, int64_t bytes); +bool bdrv_dirty_bitmap_readonly(const BdrvDirtyBitmap *bitmap); +bool bdrv_has_readonly_bitmaps(BlockDriverState *bs); +bool bdrv_has_named_bitmaps(BlockDriverState *bs); +bool bdrv_dirty_bitmap_get_autoload(const BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_get_persistence(BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_inconsistent(const BdrvDirtyBitmap *bitmap); + +BdrvDirtyBitmap *bdrv_dirty_bitmap_first(BlockDriverState *bs); +BdrvDirtyBitmap *bdrv_dirty_bitmap_next(BdrvDirtyBitmap *bitmap); +#define FOR_EACH_DIRTY_BITMAP(bs, bitmap) \ +for (bitmap = bdrv_dirty_bitmap_first(bs); bitmap; \ + bitmap = bdrv_dirty_bitmap_next(bitmap)) + +char *bdrv_dirty_bitmap_sha256(const BdrvDirtyBitmap *bitmap, Error **errp); +int64_t bdrv_dirty_bitmap_next_dirty(BdrvDirtyBitmap *bitmap, int64_t offset, + int64_t bytes); +int64_t bdrv_dirty_bitmap_next_zero(BdrvDirtyBitmap *bitmap, int64_t offset, + int64_t bytes); +bool bdrv_dirty_bitmap_next_dirty_area(BdrvDirtyBitmap *bitmap, + int64_t start, int64_t end, int64_t max_dirty_count, + int64_t *dirty_start, int64_t *dirty_count); +BdrvDirtyBitmap 
*bdrv_reclaim_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + Error **errp); + +#endif diff --git a/include/block/export.h b/include/block/export.h new file mode 100644 index 000000000..7feb02e10 --- /dev/null +++ b/include/block/export.h @@ -0,0 +1,89 @@ +/* + * Declarations for block exports + * + * Copyright (c) 2012, 2020 Red Hat, Inc. + * + * Authors: + * Paolo Bonzini <pbonzini@redhat.com> + * Kevin Wolf <kwolf@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#ifndef BLOCK_EXPORT_H +#define BLOCK_EXPORT_H + +#include "qapi/qapi-types-block-export.h" +#include "qemu/queue.h" + +typedef struct BlockExport BlockExport; + +typedef struct BlockExportDriver { + /* The export type that this driver services */ + BlockExportType type; + + /* + * The size of the driver-specific state that contains BlockExport as its + * first field. + */ + size_t instance_size; + + /* Creates and starts a new block export */ + int (*create)(BlockExport *, BlockExportOptions *, Error **); + + /* + * Frees a removed block export. This function is only called after all + * references have been dropped. + */ + void (*delete)(BlockExport *); + + /* + * Start to disconnect all clients and drop other references held + * internally by the export driver. When the function returns, there may + * still be active references while the export is in the process of + * shutting down. + */ + void (*request_shutdown)(BlockExport *); +} BlockExportDriver; + +struct BlockExport { + const BlockExportDriver *drv; + + /* Unique identifier for the export */ + char *id; + + /* + * Reference count for this block export. This includes strong references + * both from the owner (qemu-nbd or the monitor) and clients connected to + * the export. + */ + int refcount; + + /* + * True if one of the references in refcount belongs to the user. After the + * user has dropped their reference, they may not e.g. remove the same + * export a second time (which would decrease the refcount without having + * it incremented first). + */ + bool user_owned; + + /* The AioContext whose lock protects this BlockExport object. */ + AioContext *ctx; + + /* The block device to export */ + BlockBackend *blk; + + /* List entry for block_exports */ + QLIST_ENTRY(BlockExport) next; +}; + +BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp); +BlockExport *blk_exp_find(const char *id); +void blk_exp_ref(BlockExport *exp); +void blk_exp_unref(BlockExport *exp); +void blk_exp_request_shutdown(BlockExport *exp); +void blk_exp_close_all(void); +void blk_exp_close_all_type(BlockExportType type); + +#endif diff --git a/include/block/fuse.h b/include/block/fuse.h new file mode 100644 index 000000000..ffa91fe36 --- /dev/null +++ b/include/block/fuse.h @@ -0,0 +1,30 @@ +/* + * Present a block device as a raw image through FUSE + * + * Copyright (c) 2020 Max Reitz <mreitz@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 or later of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef BLOCK_FUSE_H +#define BLOCK_FUSE_H + +#ifdef CONFIG_FUSE + +#include "block/export.h" + +extern const BlockExportDriver blk_exp_fuse; + +#endif /* CONFIG_FUSE */ + +#endif diff --git a/include/block/nbd.h b/include/block/nbd.h new file mode 100644 index 000000000..78d101b77 --- /dev/null +++ b/include/block/nbd.h @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2016-2020 Red Hat, Inc. + * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws> + * + * Network Block Device + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef NBD_H +#define NBD_H + +#include "block/export.h" +#include "io/channel-socket.h" +#include "crypto/tlscreds.h" +#include "qapi/error.h" + +extern const BlockExportDriver blk_exp_nbd; + +/* Handshake phase structs - this struct is passed on the wire */ + +struct NBDOption { + uint64_t magic; /* NBD_OPTS_MAGIC */ + uint32_t option; /* NBD_OPT_* */ + uint32_t length; +} QEMU_PACKED; +typedef struct NBDOption NBDOption; + +struct NBDOptionReply { + uint64_t magic; /* NBD_REP_MAGIC */ + uint32_t option; /* NBD_OPT_* */ + uint32_t type; /* NBD_REP_* */ + uint32_t length; +} QEMU_PACKED; +typedef struct NBDOptionReply NBDOptionReply; + +typedef struct NBDOptionReplyMetaContext { + NBDOptionReply h; /* h.type = NBD_REP_META_CONTEXT, h.length > 4 */ + uint32_t context_id; + /* metadata context name follows */ +} QEMU_PACKED NBDOptionReplyMetaContext; + +/* Transmission phase structs + * + * Note: these are _NOT_ the same as the network representation of an NBD + * request and reply! + */ +struct NBDRequest { + uint64_t handle; + uint64_t from; + uint32_t len; + uint16_t flags; /* NBD_CMD_FLAG_* */ + uint16_t type; /* NBD_CMD_* */ +}; +typedef struct NBDRequest NBDRequest; + +typedef struct NBDSimpleReply { + uint32_t magic; /* NBD_SIMPLE_REPLY_MAGIC */ + uint32_t error; + uint64_t handle; +} QEMU_PACKED NBDSimpleReply; + +/* Header of all structured replies */ +typedef struct NBDStructuredReplyChunk { + uint32_t magic; /* NBD_STRUCTURED_REPLY_MAGIC */ + uint16_t flags; /* combination of NBD_REPLY_FLAG_* */ + uint16_t type; /* NBD_REPLY_TYPE_* */ + uint64_t handle; /* request handle */ + uint32_t length; /* length of payload */ +} QEMU_PACKED NBDStructuredReplyChunk; + +typedef union NBDReply { + NBDSimpleReply simple; + NBDStructuredReplyChunk structured; + struct { + /* @magic and @handle fields have the same offset and size both in + * simple reply and structured reply chunk, so let them be accessible + * without ".simple." or ".structured." 
specification
+ */
+ uint32_t magic;
+ uint32_t _skip;
+ uint64_t handle;
+ } QEMU_PACKED;
+} NBDReply;
+
+/* Header of chunk for NBD_REPLY_TYPE_OFFSET_DATA */
+typedef struct NBDStructuredReadData {
+ NBDStructuredReplyChunk h; /* h.length >= 9 */
+ uint64_t offset;
+ /* At least one byte of data payload follows, calculated from h.length */
+} QEMU_PACKED NBDStructuredReadData;
+
+/* Complete chunk for NBD_REPLY_TYPE_OFFSET_HOLE */
+typedef struct NBDStructuredReadHole {
+ NBDStructuredReplyChunk h; /* h.length == 12 */
+ uint64_t offset;
+ uint32_t length;
+} QEMU_PACKED NBDStructuredReadHole;
+
+/* Header of all NBD_REPLY_TYPE_ERROR* errors */
+typedef struct NBDStructuredError {
+ NBDStructuredReplyChunk h; /* h.length >= 6 */
+ uint32_t error;
+ uint16_t message_length;
+} QEMU_PACKED NBDStructuredError;
+
+/* Header of NBD_REPLY_TYPE_BLOCK_STATUS */
+typedef struct NBDStructuredMeta {
+ NBDStructuredReplyChunk h; /* h.length >= 12 (at least one extent) */
+ uint32_t context_id;
+ /* extents follow */
+} QEMU_PACKED NBDStructuredMeta;
+
+/* Extent chunk for NBD_REPLY_TYPE_BLOCK_STATUS */
+typedef struct NBDExtent {
+ uint32_t length;
+ uint32_t flags; /* NBD_STATE_* */
+} QEMU_PACKED NBDExtent;
+
+/* Transmission (export) flags: sent from server to client during handshake,
+ but describe what will happen during transmission */
+enum {
+ NBD_FLAG_HAS_FLAGS_BIT = 0, /* Flags are there */
+ NBD_FLAG_READ_ONLY_BIT = 1, /* Device is read-only */
+ NBD_FLAG_SEND_FLUSH_BIT = 2, /* Send FLUSH */
+ NBD_FLAG_SEND_FUA_BIT = 3, /* Send FUA (Force Unit Access) */
+ NBD_FLAG_ROTATIONAL_BIT = 4, /* Use elevator algorithm -
+ rotational media */
+ NBD_FLAG_SEND_TRIM_BIT = 5, /* Send TRIM (discard) */
+ NBD_FLAG_SEND_WRITE_ZEROES_BIT = 6, /* Send WRITE_ZEROES */
+ NBD_FLAG_SEND_DF_BIT = 7, /* Send DF (Do not Fragment) */
+ NBD_FLAG_CAN_MULTI_CONN_BIT = 8, /* Multi-client cache consistent */
+ NBD_FLAG_SEND_RESIZE_BIT = 9, /* Send resize */
+ NBD_FLAG_SEND_CACHE_BIT = 10, /* Send CACHE (prefetch) */
+ NBD_FLAG_SEND_FAST_ZERO_BIT = 11, /* FAST_ZERO flag for WRITE_ZEROES */
+};
+
+#define NBD_FLAG_HAS_FLAGS (1 << NBD_FLAG_HAS_FLAGS_BIT)
+#define NBD_FLAG_READ_ONLY (1 << NBD_FLAG_READ_ONLY_BIT)
+#define NBD_FLAG_SEND_FLUSH (1 << NBD_FLAG_SEND_FLUSH_BIT)
+#define NBD_FLAG_SEND_FUA (1 << NBD_FLAG_SEND_FUA_BIT)
+#define NBD_FLAG_ROTATIONAL (1 << NBD_FLAG_ROTATIONAL_BIT)
+#define NBD_FLAG_SEND_TRIM (1 << NBD_FLAG_SEND_TRIM_BIT)
+#define NBD_FLAG_SEND_WRITE_ZEROES (1 << NBD_FLAG_SEND_WRITE_ZEROES_BIT)
+#define NBD_FLAG_SEND_DF (1 << NBD_FLAG_SEND_DF_BIT)
+#define NBD_FLAG_CAN_MULTI_CONN (1 << NBD_FLAG_CAN_MULTI_CONN_BIT)
+#define NBD_FLAG_SEND_RESIZE (1 << NBD_FLAG_SEND_RESIZE_BIT)
+#define NBD_FLAG_SEND_CACHE (1 << NBD_FLAG_SEND_CACHE_BIT)
+#define NBD_FLAG_SEND_FAST_ZERO (1 << NBD_FLAG_SEND_FAST_ZERO_BIT)
+
+/* New-style handshake (global) flags, sent from server to client, and
+ control what will happen during handshake phase. */
+#define NBD_FLAG_FIXED_NEWSTYLE (1 << 0) /* Fixed newstyle protocol. */
+#define NBD_FLAG_NO_ZEROES (1 << 1) /* End handshake without zeroes. */
+
+/* New-style client flags, sent from client to server to control what happens
+ during handshake phase. */
+#define NBD_FLAG_C_FIXED_NEWSTYLE (1 << 0) /* Fixed newstyle protocol. */
+#define NBD_FLAG_C_NO_ZEROES (1 << 1) /* End handshake without zeroes. */
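A hedged sketch of how a client consumes the flag words above; @eflags stands for the 16-bit transmission flags received during the handshake, and NBD_CMD_FLAG_FUA is defined further below:

    static bool nbd_can_trim(uint16_t eflags)
    {
        /* A server that cannot TRIM simply leaves the bit clear. */
        return (eflags & NBD_FLAG_HAS_FLAGS) && (eflags & NBD_FLAG_SEND_TRIM);
    }

    static uint16_t nbd_write_request_flags(uint16_t eflags, bool want_fua)
    {
        /* FUA may only be requested if the server advertised it. */
        return (want_fua && (eflags & NBD_FLAG_SEND_FUA)) ? NBD_CMD_FLAG_FUA : 0;
    }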
+
+/* Option requests. */
+#define NBD_OPT_EXPORT_NAME (1)
+#define NBD_OPT_ABORT (2)
+#define NBD_OPT_LIST (3)
+/* #define NBD_OPT_PEEK_EXPORT (4) not in use */
+#define NBD_OPT_STARTTLS (5)
+#define NBD_OPT_INFO (6)
+#define NBD_OPT_GO (7)
+#define NBD_OPT_STRUCTURED_REPLY (8)
+#define NBD_OPT_LIST_META_CONTEXT (9)
+#define NBD_OPT_SET_META_CONTEXT (10)
+
+/* Option reply types. */
+#define NBD_REP_ERR(value) ((UINT32_C(1) << 31) | (value))
+
+#define NBD_REP_ACK (1) /* Data sending finished. */
+#define NBD_REP_SERVER (2) /* Export description. */
+#define NBD_REP_INFO (3) /* NBD_OPT_INFO/GO. */
+#define NBD_REP_META_CONTEXT (4) /* NBD_OPT_{LIST,SET}_META_CONTEXT */
+
+#define NBD_REP_ERR_UNSUP NBD_REP_ERR(1) /* Unknown option */
+#define NBD_REP_ERR_POLICY NBD_REP_ERR(2) /* Server denied */
+#define NBD_REP_ERR_INVALID NBD_REP_ERR(3) /* Invalid length */
+#define NBD_REP_ERR_PLATFORM NBD_REP_ERR(4) /* Not compiled in */
+#define NBD_REP_ERR_TLS_REQD NBD_REP_ERR(5) /* TLS required */
+#define NBD_REP_ERR_UNKNOWN NBD_REP_ERR(6) /* Export unknown */
+#define NBD_REP_ERR_SHUTDOWN NBD_REP_ERR(7) /* Server shutting down */
+#define NBD_REP_ERR_BLOCK_SIZE_REQD NBD_REP_ERR(8) /* Need INFO_BLOCK_SIZE */
+
+/* Info types, used during NBD_REP_INFO */
+#define NBD_INFO_EXPORT 0
+#define NBD_INFO_NAME 1
+#define NBD_INFO_DESCRIPTION 2
+#define NBD_INFO_BLOCK_SIZE 3
+
+/* Request flags, sent from client to server during transmission phase */
+#define NBD_CMD_FLAG_FUA (1 << 0) /* 'force unit access' during write */
+#define NBD_CMD_FLAG_NO_HOLE (1 << 1) /* don't punch hole on zero run */
+#define NBD_CMD_FLAG_DF (1 << 2) /* don't fragment structured read */
+#define NBD_CMD_FLAG_REQ_ONE (1 << 3) /* only one extent in BLOCK_STATUS
+ * reply chunk */
+#define NBD_CMD_FLAG_FAST_ZERO (1 << 4) /* fail if WRITE_ZEROES is not fast */
+
+/* Supported request types */
+enum {
+ NBD_CMD_READ = 0,
+ NBD_CMD_WRITE = 1,
+ NBD_CMD_DISC = 2,
+ NBD_CMD_FLUSH = 3,
+ NBD_CMD_TRIM = 4,
+ NBD_CMD_CACHE = 5,
+ NBD_CMD_WRITE_ZEROES = 6,
+ NBD_CMD_BLOCK_STATUS = 7,
+};
+
+#define NBD_DEFAULT_PORT 10809
+
+/* Maximum size of a single READ/WRITE data buffer */
+#define NBD_MAX_BUFFER_SIZE (32 * 1024 * 1024)
+
+/*
+ * Maximum size of a protocol string (export name, metadata context name,
+ * etc.). Use malloc rather than stack allocation for storage of a
+ * string.
+ */
+#define NBD_MAX_STRING_SIZE 4096
+
+/* Two types of reply structures */
+#define NBD_SIMPLE_REPLY_MAGIC 0x67446698
+#define NBD_STRUCTURED_REPLY_MAGIC 0x668e33ef
+
+/* Structured reply flags */
+#define NBD_REPLY_FLAG_DONE (1 << 0) /* This reply-chunk is last */
+
+/* Structured reply types */
+#define NBD_REPLY_ERR(value) ((1 << 15) | (value))
+
+#define NBD_REPLY_TYPE_NONE 0
+#define NBD_REPLY_TYPE_OFFSET_DATA 1
+#define NBD_REPLY_TYPE_OFFSET_HOLE 2
+#define NBD_REPLY_TYPE_BLOCK_STATUS 5
+#define NBD_REPLY_TYPE_ERROR NBD_REPLY_ERR(1)
+#define NBD_REPLY_TYPE_ERROR_OFFSET NBD_REPLY_ERR(2)
+
+/* Extent flags for base:allocation in NBD_REPLY_TYPE_BLOCK_STATUS */
+#define NBD_STATE_HOLE (1 << 0)
+#define NBD_STATE_ZERO (1 << 1)
+
+/* Extent flags for qemu:dirty-bitmap in NBD_REPLY_TYPE_BLOCK_STATUS */
+#define NBD_STATE_DIRTY (1 << 0)
+
+/* No flags needed for qemu:allocation-depth in NBD_REPLY_TYPE_BLOCK_STATUS */
+
+static inline bool nbd_reply_type_is_error(int type)
+{
+ return type & (1 << 15);
+}
+
+/* NBD errors are based on errno numbers, so there is a 1:1 mapping,
+ * but only a limited set of errno values is specified in the protocol.
+ * Everything else is squashed to EINVAL. + */ +#define NBD_SUCCESS 0 +#define NBD_EPERM 1 +#define NBD_EIO 5 +#define NBD_ENOMEM 12 +#define NBD_EINVAL 22 +#define NBD_ENOSPC 28 +#define NBD_EOVERFLOW 75 +#define NBD_ENOTSUP 95 +#define NBD_ESHUTDOWN 108 + +/* Details collected by NBD_OPT_EXPORT_NAME and NBD_OPT_GO */ +struct NBDExportInfo { + /* Set by client before nbd_receive_negotiate() */ + bool request_sizes; + char *x_dirty_bitmap; + + /* Set by client before nbd_receive_negotiate(), or by server results + * during nbd_receive_export_list() */ + char *name; /* must be non-NULL */ + + /* In-out fields, set by client before nbd_receive_negotiate() and + * updated by server results during nbd_receive_negotiate() */ + bool structured_reply; + bool base_allocation; /* base:allocation context for NBD_CMD_BLOCK_STATUS */ + + /* Set by server results during nbd_receive_negotiate() and + * nbd_receive_export_list() */ + uint64_t size; + uint16_t flags; + uint32_t min_block; + uint32_t opt_block; + uint32_t max_block; + + uint32_t context_id; + + /* Set by server results during nbd_receive_export_list() */ + char *description; + int n_contexts; + char **contexts; +}; +typedef struct NBDExportInfo NBDExportInfo; + +int nbd_receive_negotiate(AioContext *aio_context, QIOChannel *ioc, + QCryptoTLSCreds *tlscreds, + const char *hostname, QIOChannel **outioc, + NBDExportInfo *info, Error **errp); +void nbd_free_export_list(NBDExportInfo *info, int count); +int nbd_receive_export_list(QIOChannel *ioc, QCryptoTLSCreds *tlscreds, + const char *hostname, NBDExportInfo **info, + Error **errp); +int nbd_init(int fd, QIOChannelSocket *sioc, NBDExportInfo *info, + Error **errp); +int nbd_send_request(QIOChannel *ioc, NBDRequest *request); +int coroutine_fn nbd_receive_reply(BlockDriverState *bs, QIOChannel *ioc, + NBDReply *reply, Error **errp); +int nbd_client(int fd); +int nbd_disconnect(int fd); +int nbd_errno_to_system_errno(int err); + +typedef struct NBDExport NBDExport; +typedef struct NBDClient NBDClient; + +void nbd_export_set_on_eject_blk(BlockExport *exp, BlockBackend *blk); + +AioContext *nbd_export_aio_context(NBDExport *exp); +NBDExport *nbd_export_find(const char *name); + +void nbd_client_new(QIOChannelSocket *sioc, + QCryptoTLSCreds *tlscreds, + const char *tlsauthz, + void (*close_fn)(NBDClient *, bool)); +void nbd_client_get(NBDClient *client); +void nbd_client_put(NBDClient *client); + +void nbd_server_is_qemu_nbd(bool value); +bool nbd_server_is_running(void); +void nbd_server_start(SocketAddress *addr, const char *tls_creds, + const char *tls_authz, uint32_t max_connections, + Error **errp); +void nbd_server_start_options(NbdServerOptions *arg, Error **errp); + +/* nbd_read + * Reads @size bytes from @ioc. Returns 0 on success. + */ +static inline int nbd_read(QIOChannel *ioc, void *buffer, size_t size, + const char *desc, Error **errp) +{ + ERRP_GUARD(); + int ret = qio_channel_read_all(ioc, buffer, size, errp) < 0 ? -EIO : 0; + + if (ret < 0) { + if (desc) { + error_prepend(errp, "Failed to read %s: ", desc); + } + return ret; + } + + return 0; +} + +#define DEF_NBD_READ_N(bits) \ +static inline int nbd_read##bits(QIOChannel *ioc, \ + uint##bits##_t *val, \ + const char *desc, Error **errp) \ +{ \ + int ret = nbd_read(ioc, val, sizeof(*val), desc, errp); \ + if (ret < 0) { \ + return ret; \ + } \ + *val = be##bits##_to_cpu(*val); \ + return 0; \ +} + +DEF_NBD_READ_N(16) /* Defines nbd_read16(). */ +DEF_NBD_READ_N(32) /* Defines nbd_read32(). 
*/ +DEF_NBD_READ_N(64) /* Defines nbd_read64(). */ + +#undef DEF_NBD_READ_N + +static inline bool nbd_reply_is_simple(NBDReply *reply) +{ + return reply->magic == NBD_SIMPLE_REPLY_MAGIC; +} + +static inline bool nbd_reply_is_structured(NBDReply *reply) +{ + return reply->magic == NBD_STRUCTURED_REPLY_MAGIC; +} + +const char *nbd_reply_type_lookup(uint16_t type); +const char *nbd_opt_lookup(uint32_t opt); +const char *nbd_rep_lookup(uint32_t rep); +const char *nbd_info_lookup(uint16_t info); +const char *nbd_cmd_lookup(uint16_t info); +const char *nbd_err_lookup(int err); + +/* nbd/client-connection.c */ +typedef struct NBDClientConnection NBDClientConnection; + +void nbd_client_connection_enable_retry(NBDClientConnection *conn); + +NBDClientConnection *nbd_client_connection_new(const SocketAddress *saddr, + bool do_negotiation, + const char *export_name, + const char *x_dirty_bitmap, + QCryptoTLSCreds *tlscreds); +void nbd_client_connection_release(NBDClientConnection *conn); + +QIOChannel *coroutine_fn +nbd_co_establish_connection(NBDClientConnection *conn, NBDExportInfo *info, + bool blocking, Error **errp); + +void coroutine_fn nbd_co_establish_connection_cancel(NBDClientConnection *conn); + +#endif diff --git a/include/block/nvme.h b/include/block/nvme.h new file mode 100644 index 000000000..e3bd47bf7 --- /dev/null +++ b/include/block/nvme.h @@ -0,0 +1,1501 @@ +#ifndef BLOCK_NVME_H +#define BLOCK_NVME_H + +typedef struct QEMU_PACKED NvmeBar { + uint64_t cap; + uint32_t vs; + uint32_t intms; + uint32_t intmc; + uint32_t cc; + uint8_t rsvd24[4]; + uint32_t csts; + uint32_t nssr; + uint32_t aqa; + uint64_t asq; + uint64_t acq; + uint32_t cmbloc; + uint32_t cmbsz; + uint32_t bpinfo; + uint32_t bprsel; + uint64_t bpmbl; + uint64_t cmbmsc; + uint32_t cmbsts; + uint8_t rsvd92[3492]; + uint32_t pmrcap; + uint32_t pmrctl; + uint32_t pmrsts; + uint32_t pmrebs; + uint32_t pmrswtp; + uint32_t pmrmscl; + uint32_t pmrmscu; + uint8_t css[484]; +} NvmeBar; + +enum NvmeBarRegs { + NVME_REG_CAP = offsetof(NvmeBar, cap), + NVME_REG_VS = offsetof(NvmeBar, vs), + NVME_REG_INTMS = offsetof(NvmeBar, intms), + NVME_REG_INTMC = offsetof(NvmeBar, intmc), + NVME_REG_CC = offsetof(NvmeBar, cc), + NVME_REG_CSTS = offsetof(NvmeBar, csts), + NVME_REG_NSSR = offsetof(NvmeBar, nssr), + NVME_REG_AQA = offsetof(NvmeBar, aqa), + NVME_REG_ASQ = offsetof(NvmeBar, asq), + NVME_REG_ACQ = offsetof(NvmeBar, acq), + NVME_REG_CMBLOC = offsetof(NvmeBar, cmbloc), + NVME_REG_CMBSZ = offsetof(NvmeBar, cmbsz), + NVME_REG_BPINFO = offsetof(NvmeBar, bpinfo), + NVME_REG_BPRSEL = offsetof(NvmeBar, bprsel), + NVME_REG_BPMBL = offsetof(NvmeBar, bpmbl), + NVME_REG_CMBMSC = offsetof(NvmeBar, cmbmsc), + NVME_REG_CMBSTS = offsetof(NvmeBar, cmbsts), + NVME_REG_PMRCAP = offsetof(NvmeBar, pmrcap), + NVME_REG_PMRCTL = offsetof(NvmeBar, pmrctl), + NVME_REG_PMRSTS = offsetof(NvmeBar, pmrsts), + NVME_REG_PMREBS = offsetof(NvmeBar, pmrebs), + NVME_REG_PMRSWTP = offsetof(NvmeBar, pmrswtp), + NVME_REG_PMRMSCL = offsetof(NvmeBar, pmrmscl), + NVME_REG_PMRMSCU = offsetof(NvmeBar, pmrmscu), +}; + +enum NvmeCapShift { + CAP_MQES_SHIFT = 0, + CAP_CQR_SHIFT = 16, + CAP_AMS_SHIFT = 17, + CAP_TO_SHIFT = 24, + CAP_DSTRD_SHIFT = 32, + CAP_NSSRS_SHIFT = 36, + CAP_CSS_SHIFT = 37, + CAP_MPSMIN_SHIFT = 48, + CAP_MPSMAX_SHIFT = 52, + CAP_PMRS_SHIFT = 56, + CAP_CMBS_SHIFT = 57, +}; + +enum NvmeCapMask { + CAP_MQES_MASK = 0xffff, + CAP_CQR_MASK = 0x1, + CAP_AMS_MASK = 0x3, + CAP_TO_MASK = 0xff, + CAP_DSTRD_MASK = 0xf, + CAP_NSSRS_MASK = 0x1, + CAP_CSS_MASK = 0xff, + 
CAP_MPSMIN_MASK = 0xf, + CAP_MPSMAX_MASK = 0xf, + CAP_PMRS_MASK = 0x1, + CAP_CMBS_MASK = 0x1, +}; + +#define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK) +#define NVME_CAP_CQR(cap) (((cap) >> CAP_CQR_SHIFT) & CAP_CQR_MASK) +#define NVME_CAP_AMS(cap) (((cap) >> CAP_AMS_SHIFT) & CAP_AMS_MASK) +#define NVME_CAP_TO(cap) (((cap) >> CAP_TO_SHIFT) & CAP_TO_MASK) +#define NVME_CAP_DSTRD(cap) (((cap) >> CAP_DSTRD_SHIFT) & CAP_DSTRD_MASK) +#define NVME_CAP_NSSRS(cap) (((cap) >> CAP_NSSRS_SHIFT) & CAP_NSSRS_MASK) +#define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT) & CAP_CSS_MASK) +#define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK) +#define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK) +#define NVME_CAP_PMRS(cap) (((cap) >> CAP_PMRS_SHIFT) & CAP_PMRS_MASK) +#define NVME_CAP_CMBS(cap) (((cap) >> CAP_CMBS_SHIFT) & CAP_CMBS_MASK) + +#define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \ + << CAP_MQES_SHIFT) +#define NVME_CAP_SET_CQR(cap, val) (cap |= (uint64_t)(val & CAP_CQR_MASK) \ + << CAP_CQR_SHIFT) +#define NVME_CAP_SET_AMS(cap, val) (cap |= (uint64_t)(val & CAP_AMS_MASK) \ + << CAP_AMS_SHIFT) +#define NVME_CAP_SET_TO(cap, val) (cap |= (uint64_t)(val & CAP_TO_MASK) \ + << CAP_TO_SHIFT) +#define NVME_CAP_SET_DSTRD(cap, val) (cap |= (uint64_t)(val & CAP_DSTRD_MASK) \ + << CAP_DSTRD_SHIFT) +#define NVME_CAP_SET_NSSRS(cap, val) (cap |= (uint64_t)(val & CAP_NSSRS_MASK) \ + << CAP_NSSRS_SHIFT) +#define NVME_CAP_SET_CSS(cap, val) (cap |= (uint64_t)(val & CAP_CSS_MASK) \ + << CAP_CSS_SHIFT) +#define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\ + << CAP_MPSMIN_SHIFT) +#define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\ + << CAP_MPSMAX_SHIFT) +#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMRS_MASK) \ + << CAP_PMRS_SHIFT) +#define NVME_CAP_SET_CMBS(cap, val) (cap |= (uint64_t)(val & CAP_CMBS_MASK) \ + << CAP_CMBS_SHIFT) + +enum NvmeCapCss { + NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_CSI_SUPP = 1 << 6, + NVME_CAP_CSS_ADMIN_ONLY = 1 << 7, +}; + +enum NvmeCcShift { + CC_EN_SHIFT = 0, + CC_CSS_SHIFT = 4, + CC_MPS_SHIFT = 7, + CC_AMS_SHIFT = 11, + CC_SHN_SHIFT = 14, + CC_IOSQES_SHIFT = 16, + CC_IOCQES_SHIFT = 20, +}; + +enum NvmeCcMask { + CC_EN_MASK = 0x1, + CC_CSS_MASK = 0x7, + CC_MPS_MASK = 0xf, + CC_AMS_MASK = 0x7, + CC_SHN_MASK = 0x3, + CC_IOSQES_MASK = 0xf, + CC_IOCQES_MASK = 0xf, +}; + +#define NVME_CC_EN(cc) ((cc >> CC_EN_SHIFT) & CC_EN_MASK) +#define NVME_CC_CSS(cc) ((cc >> CC_CSS_SHIFT) & CC_CSS_MASK) +#define NVME_CC_MPS(cc) ((cc >> CC_MPS_SHIFT) & CC_MPS_MASK) +#define NVME_CC_AMS(cc) ((cc >> CC_AMS_SHIFT) & CC_AMS_MASK) +#define NVME_CC_SHN(cc) ((cc >> CC_SHN_SHIFT) & CC_SHN_MASK) +#define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK) +#define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK) + +enum NvmeCcCss { + NVME_CC_CSS_NVM = 0x0, + NVME_CC_CSS_CSI = 0x6, + NVME_CC_CSS_ADMIN_ONLY = 0x7, +}; + +#define NVME_SET_CC_EN(cc, val) \ + (cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT) +#define NVME_SET_CC_CSS(cc, val) \ + (cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT) +#define NVME_SET_CC_MPS(cc, val) \ + (cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT) +#define NVME_SET_CC_AMS(cc, val) \ + (cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT) +#define NVME_SET_CC_SHN(cc, val) \ + (cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT) +#define NVME_SET_CC_IOSQES(cc, val) \ + (cc |= 
(uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT)
+#define NVME_SET_CC_IOCQES(cc, val) \
+ (cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT)
+
+enum NvmeCstsShift {
+ CSTS_RDY_SHIFT = 0,
+ CSTS_CFS_SHIFT = 1,
+ CSTS_SHST_SHIFT = 2,
+ CSTS_NSSRO_SHIFT = 4,
+};
+
+enum NvmeCstsMask {
+ CSTS_RDY_MASK = 0x1,
+ CSTS_CFS_MASK = 0x1,
+ CSTS_SHST_MASK = 0x3,
+ CSTS_NSSRO_MASK = 0x1,
+};
+
+enum NvmeCsts {
+ NVME_CSTS_READY = 1 << CSTS_RDY_SHIFT,
+ NVME_CSTS_FAILED = 1 << CSTS_CFS_SHIFT,
+ NVME_CSTS_SHST_NORMAL = 0 << CSTS_SHST_SHIFT,
+ NVME_CSTS_SHST_PROGRESS = 1 << CSTS_SHST_SHIFT,
+ NVME_CSTS_SHST_COMPLETE = 2 << CSTS_SHST_SHIFT,
+ NVME_CSTS_NSSRO = 1 << CSTS_NSSRO_SHIFT,
+};
+
+#define NVME_CSTS_RDY(csts) ((csts >> CSTS_RDY_SHIFT) & CSTS_RDY_MASK)
+#define NVME_CSTS_CFS(csts) ((csts >> CSTS_CFS_SHIFT) & CSTS_CFS_MASK)
+#define NVME_CSTS_SHST(csts) ((csts >> CSTS_SHST_SHIFT) & CSTS_SHST_MASK)
+#define NVME_CSTS_NSSRO(csts) ((csts >> CSTS_NSSRO_SHIFT) & CSTS_NSSRO_MASK)
+
+enum NvmeAqaShift {
+ AQA_ASQS_SHIFT = 0,
+ AQA_ACQS_SHIFT = 16,
+};
+
+enum NvmeAqaMask {
+ AQA_ASQS_MASK = 0xfff,
+ AQA_ACQS_MASK = 0xfff,
+};
+
+#define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK)
+#define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK)
+
+enum NvmeCmblocShift {
+ CMBLOC_BIR_SHIFT = 0,
+ CMBLOC_CQMMS_SHIFT = 3,
+ CMBLOC_CQPDS_SHIFT = 4,
+ CMBLOC_CDPMLS_SHIFT = 5,
+ CMBLOC_CDPCILS_SHIFT = 6,
+ CMBLOC_CDMMMS_SHIFT = 7,
+ CMBLOC_CQDA_SHIFT = 8,
+ CMBLOC_OFST_SHIFT = 12,
+};
+
+enum NvmeCmblocMask {
+ CMBLOC_BIR_MASK = 0x7,
+ CMBLOC_CQMMS_MASK = 0x1,
+ CMBLOC_CQPDS_MASK = 0x1,
+ CMBLOC_CDPMLS_MASK = 0x1,
+ CMBLOC_CDPCILS_MASK = 0x1,
+ CMBLOC_CDMMMS_MASK = 0x1,
+ CMBLOC_CQDA_MASK = 0x1,
+ CMBLOC_OFST_MASK = 0xfffff,
+};
+
+#define NVME_CMBLOC_BIR(cmbloc) \
+ ((cmbloc >> CMBLOC_BIR_SHIFT) & CMBLOC_BIR_MASK)
+#define NVME_CMBLOC_CQMMS(cmbloc) \
+ ((cmbloc >> CMBLOC_CQMMS_SHIFT) & CMBLOC_CQMMS_MASK)
+#define NVME_CMBLOC_CQPDS(cmbloc) \
+ ((cmbloc >> CMBLOC_CQPDS_SHIFT) & CMBLOC_CQPDS_MASK)
+#define NVME_CMBLOC_CDPMLS(cmbloc) \
+ ((cmbloc >> CMBLOC_CDPMLS_SHIFT) & CMBLOC_CDPMLS_MASK)
+#define NVME_CMBLOC_CDPCILS(cmbloc) \
+ ((cmbloc >> CMBLOC_CDPCILS_SHIFT) & CMBLOC_CDPCILS_MASK)
+#define NVME_CMBLOC_CDMMMS(cmbloc) \
+ ((cmbloc >> CMBLOC_CDMMMS_SHIFT) & CMBLOC_CDMMMS_MASK)
+#define NVME_CMBLOC_CQDA(cmbloc) \
+ ((cmbloc >> CMBLOC_CQDA_SHIFT) & CMBLOC_CQDA_MASK)
+#define NVME_CMBLOC_OFST(cmbloc) \
+ ((cmbloc >> CMBLOC_OFST_SHIFT) & CMBLOC_OFST_MASK)
+
+#define NVME_CMBLOC_SET_BIR(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_BIR_MASK) << CMBLOC_BIR_SHIFT)
+#define NVME_CMBLOC_SET_CQMMS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CQMMS_MASK) << CMBLOC_CQMMS_SHIFT)
+#define NVME_CMBLOC_SET_CQPDS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CQPDS_MASK) << CMBLOC_CQPDS_SHIFT)
+#define NVME_CMBLOC_SET_CDPMLS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CDPMLS_MASK) << CMBLOC_CDPMLS_SHIFT)
+#define NVME_CMBLOC_SET_CDPCILS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CDPCILS_MASK) << CMBLOC_CDPCILS_SHIFT)
+#define NVME_CMBLOC_SET_CDMMMS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CDMMMS_MASK) << CMBLOC_CDMMMS_SHIFT)
+#define NVME_CMBLOC_SET_CQDA(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CQDA_MASK) << CMBLOC_CQDA_SHIFT)
+#define NVME_CMBLOC_SET_OFST(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBLOC_OFST_SHIFT)
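As a point of reference, a hedged sketch of how the shift/mask accessors compose when programming the controller configuration register; the page and queue-entry sizes are illustrative:

    uint32_t cc = 0;

    NVME_SET_CC_CSS(cc, NVME_CC_CSS_NVM);   /* select the NVM command set */
    NVME_SET_CC_MPS(cc, 0);                 /* 2^(12 + 0) = 4 KiB pages */
    NVME_SET_CC_IOSQES(cc, 6);              /* 2^6 = 64-byte SQ entries */
    NVME_SET_CC_IOCQES(cc, 4);              /* 2^4 = 16-byte CQ entries */
    NVME_SET_CC_EN(cc, 1);                  /* finally, enable the controller */

    /* The matching getters reverse the encoding, e.g. NVME_CC_EN(cc) == 1;
     * readiness is then polled via NVME_CSTS_RDY() on the CSTS register. */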
<< CMBMSC_CRE_SHIFT) + +enum NvmeCmbszShift { + CMBSZ_SQS_SHIFT = 0, + CMBSZ_CQS_SHIFT = 1, + CMBSZ_LISTS_SHIFT = 2, + CMBSZ_RDS_SHIFT = 3, + CMBSZ_WDS_SHIFT = 4, + CMBSZ_SZU_SHIFT = 8, + CMBSZ_SZ_SHIFT = 12, +}; + +enum NvmeCmbszMask { + CMBSZ_SQS_MASK = 0x1, + CMBSZ_CQS_MASK = 0x1, + CMBSZ_LISTS_MASK = 0x1, + CMBSZ_RDS_MASK = 0x1, + CMBSZ_WDS_MASK = 0x1, + CMBSZ_SZU_MASK = 0xf, + CMBSZ_SZ_MASK = 0xfffff, +}; + +#define NVME_CMBSZ_SQS(cmbsz) ((cmbsz >> CMBSZ_SQS_SHIFT) & CMBSZ_SQS_MASK) +#define NVME_CMBSZ_CQS(cmbsz) ((cmbsz >> CMBSZ_CQS_SHIFT) & CMBSZ_CQS_MASK) +#define NVME_CMBSZ_LISTS(cmbsz)((cmbsz >> CMBSZ_LISTS_SHIFT) & CMBSZ_LISTS_MASK) +#define NVME_CMBSZ_RDS(cmbsz) ((cmbsz >> CMBSZ_RDS_SHIFT) & CMBSZ_RDS_MASK) +#define NVME_CMBSZ_WDS(cmbsz) ((cmbsz >> CMBSZ_WDS_SHIFT) & CMBSZ_WDS_MASK) +#define NVME_CMBSZ_SZU(cmbsz) ((cmbsz >> CMBSZ_SZU_SHIFT) & CMBSZ_SZU_MASK) +#define NVME_CMBSZ_SZ(cmbsz) ((cmbsz >> CMBSZ_SZ_SHIFT) & CMBSZ_SZ_MASK) + +#define NVME_CMBSZ_SET_SQS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_SQS_MASK) << CMBSZ_SQS_SHIFT) +#define NVME_CMBSZ_SET_CQS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_CQS_MASK) << CMBSZ_CQS_SHIFT) +#define NVME_CMBSZ_SET_LISTS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_LISTS_MASK) << CMBSZ_LISTS_SHIFT) +#define NVME_CMBSZ_SET_RDS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_RDS_MASK) << CMBSZ_RDS_SHIFT) +#define NVME_CMBSZ_SET_WDS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_WDS_MASK) << CMBSZ_WDS_SHIFT) +#define NVME_CMBSZ_SET_SZU(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_SZU_MASK) << CMBSZ_SZU_SHIFT) +#define NVME_CMBSZ_SET_SZ(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_SZ_MASK) << CMBSZ_SZ_SHIFT) + +#define NVME_CMBSZ_GETSIZE(cmbsz) \ + (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz)))) + +enum NvmeCmbmscShift { + CMBMSC_CRE_SHIFT = 0, + CMBMSC_CMSE_SHIFT = 1, + CMBMSC_CBA_SHIFT = 12, +}; + +enum NvmeCmbmscMask { + CMBMSC_CRE_MASK = 0x1, + CMBMSC_CMSE_MASK = 0x1, + CMBMSC_CBA_MASK = ((1ULL << 52) - 1), +}; + +#define NVME_CMBMSC_CRE(cmbmsc) \ + ((cmbmsc >> CMBMSC_CRE_SHIFT) & CMBMSC_CRE_MASK) +#define NVME_CMBMSC_CMSE(cmbmsc) \ + ((cmbmsc >> CMBMSC_CMSE_SHIFT) & CMBMSC_CMSE_MASK) +#define NVME_CMBMSC_CBA(cmbmsc) \ + ((cmbmsc >> CMBMSC_CBA_SHIFT) & CMBMSC_CBA_MASK) + + +#define NVME_CMBMSC_SET_CRE(cmbmsc, val) \ + (cmbmsc |= (uint64_t)(val & CMBMSC_CRE_MASK) << CMBMSC_CRE_SHIFT) +#define NVME_CMBMSC_SET_CMSE(cmbmsc, val) \ + (cmbmsc |= (uint64_t)(val & CMBMSC_CMSE_MASK) << CMBMSC_CMSE_SHIFT) +#define NVME_CMBMSC_SET_CBA(cmbmsc, val) \ + (cmbmsc |= (uint64_t)(val & CMBMSC_CBA_MASK) << CMBMSC_CBA_SHIFT) + +enum NvmeCmbstsShift { + CMBSTS_CBAI_SHIFT = 0, +}; +enum NvmeCmbstsMask { + CMBSTS_CBAI_MASK = 0x1, +}; + +#define NVME_CMBSTS_CBAI(cmbsts) \ + ((cmbsts >> CMBSTS_CBAI_SHIFT) & CMBSTS_CBAI_MASK) + +#define NVME_CMBSTS_SET_CBAI(cmbsts, val) \ + (cmbsts |= (uint64_t)(val & CMBSTS_CBAI_MASK) << CMBSTS_CBAI_SHIFT) + +enum NvmePmrcapShift { + PMRCAP_RDS_SHIFT = 3, + PMRCAP_WDS_SHIFT = 4, + PMRCAP_BIR_SHIFT = 5, + PMRCAP_PMRTU_SHIFT = 8, + PMRCAP_PMRWBM_SHIFT = 10, + PMRCAP_PMRTO_SHIFT = 16, + PMRCAP_CMSS_SHIFT = 24, +}; + +enum NvmePmrcapMask { + PMRCAP_RDS_MASK = 0x1, + PMRCAP_WDS_MASK = 0x1, + PMRCAP_BIR_MASK = 0x7, + PMRCAP_PMRTU_MASK = 0x3, + PMRCAP_PMRWBM_MASK = 0xf, + PMRCAP_PMRTO_MASK = 0xff, + PMRCAP_CMSS_MASK = 0x1, +}; + +#define NVME_PMRCAP_RDS(pmrcap) \ + ((pmrcap >> PMRCAP_RDS_SHIFT) & PMRCAP_RDS_MASK) +#define NVME_PMRCAP_WDS(pmrcap) \ + ((pmrcap >> PMRCAP_WDS_SHIFT) & 
PMRCAP_WDS_MASK) +#define NVME_PMRCAP_BIR(pmrcap) \ + ((pmrcap >> PMRCAP_BIR_SHIFT) & PMRCAP_BIR_MASK) +#define NVME_PMRCAP_PMRTU(pmrcap) \ + ((pmrcap >> PMRCAP_PMRTU_SHIFT) & PMRCAP_PMRTU_MASK) +#define NVME_PMRCAP_PMRWBM(pmrcap) \ + ((pmrcap >> PMRCAP_PMRWBM_SHIFT) & PMRCAP_PMRWBM_MASK) +#define NVME_PMRCAP_PMRTO(pmrcap) \ + ((pmrcap >> PMRCAP_PMRTO_SHIFT) & PMRCAP_PMRTO_MASK) +#define NVME_PMRCAP_CMSS(pmrcap) \ + ((pmrcap >> PMRCAP_CMSS_SHIFT) & PMRCAP_CMSS_MASK) + +#define NVME_PMRCAP_SET_RDS(pmrcap, val) \ + (pmrcap |= (uint64_t)(val & PMRCAP_RDS_MASK) << PMRCAP_RDS_SHIFT) +#define NVME_PMRCAP_SET_WDS(pmrcap, val) \ + (pmrcap |= (uint64_t)(val & PMRCAP_WDS_MASK) << PMRCAP_WDS_SHIFT) +#define NVME_PMRCAP_SET_BIR(pmrcap, val) \ + (pmrcap |= (uint64_t)(val & PMRCAP_BIR_MASK) << PMRCAP_BIR_SHIFT) +#define NVME_PMRCAP_SET_PMRTU(pmrcap, val) \ + (pmrcap |= (uint64_t)(val & PMRCAP_PMRTU_MASK) << PMRCAP_PMRTU_SHIFT) +#define NVME_PMRCAP_SET_PMRWBM(pmrcap, val) \ + (pmrcap |= (uint64_t)(val & PMRCAP_PMRWBM_MASK) << PMRCAP_PMRWBM_SHIFT) +#define NVME_PMRCAP_SET_PMRTO(pmrcap, val) \ + (pmrcap |= (uint64_t)(val & PMRCAP_PMRTO_MASK) << PMRCAP_PMRTO_SHIFT) +#define NVME_PMRCAP_SET_CMSS(pmrcap, val) \ + (pmrcap |= (uint64_t)(val & PMRCAP_CMSS_MASK) << PMRCAP_CMSS_SHIFT) + +enum NvmePmrctlShift { + PMRCTL_EN_SHIFT = 0, +}; + +enum NvmePmrctlMask { + PMRCTL_EN_MASK = 0x1, +}; + +#define NVME_PMRCTL_EN(pmrctl) ((pmrctl >> PMRCTL_EN_SHIFT) & PMRCTL_EN_MASK) + +#define NVME_PMRCTL_SET_EN(pmrctl, val) \ + (pmrctl |= (uint64_t)(val & PMRCTL_EN_MASK) << PMRCTL_EN_SHIFT) + +enum NvmePmrstsShift { + PMRSTS_ERR_SHIFT = 0, + PMRSTS_NRDY_SHIFT = 8, + PMRSTS_HSTS_SHIFT = 9, + PMRSTS_CBAI_SHIFT = 12, +}; + +enum NvmePmrstsMask { + PMRSTS_ERR_MASK = 0xff, + PMRSTS_NRDY_MASK = 0x1, + PMRSTS_HSTS_MASK = 0x7, + PMRSTS_CBAI_MASK = 0x1, +}; + +#define NVME_PMRSTS_ERR(pmrsts) \ + ((pmrsts >> PMRSTS_ERR_SHIFT) & PMRSTS_ERR_MASK) +#define NVME_PMRSTS_NRDY(pmrsts) \ + ((pmrsts >> PMRSTS_NRDY_SHIFT) & PMRSTS_NRDY_MASK) +#define NVME_PMRSTS_HSTS(pmrsts) \ + ((pmrsts >> PMRSTS_HSTS_SHIFT) & PMRSTS_HSTS_MASK) +#define NVME_PMRSTS_CBAI(pmrsts) \ + ((pmrsts >> PMRSTS_CBAI_SHIFT) & PMRSTS_CBAI_MASK) + +#define NVME_PMRSTS_SET_ERR(pmrsts, val) \ + (pmrsts |= (uint64_t)(val & PMRSTS_ERR_MASK) << PMRSTS_ERR_SHIFT) +#define NVME_PMRSTS_SET_NRDY(pmrsts, val) \ + (pmrsts |= (uint64_t)(val & PMRSTS_NRDY_MASK) << PMRSTS_NRDY_SHIFT) +#define NVME_PMRSTS_SET_HSTS(pmrsts, val) \ + (pmrsts |= (uint64_t)(val & PMRSTS_HSTS_MASK) << PMRSTS_HSTS_SHIFT) +#define NVME_PMRSTS_SET_CBAI(pmrsts, val) \ + (pmrsts |= (uint64_t)(val & PMRSTS_CBAI_MASK) << PMRSTS_CBAI_SHIFT) + +enum NvmePmrebsShift { + PMREBS_PMRSZU_SHIFT = 0, + PMREBS_RBB_SHIFT = 4, + PMREBS_PMRWBZ_SHIFT = 8, +}; + +enum NvmePmrebsMask { + PMREBS_PMRSZU_MASK = 0xf, + PMREBS_RBB_MASK = 0x1, + PMREBS_PMRWBZ_MASK = 0xffffff, +}; + +#define NVME_PMREBS_PMRSZU(pmrebs) \ + ((pmrebs >> PMREBS_PMRSZU_SHIFT) & PMREBS_PMRSZU_MASK) +#define NVME_PMREBS_RBB(pmrebs) \ + ((pmrebs >> PMREBS_RBB_SHIFT) & PMREBS_RBB_MASK) +#define NVME_PMREBS_PMRWBZ(pmrebs) \ + ((pmrebs >> PMREBS_PMRWBZ_SHIFT) & PMREBS_PMRWBZ_MASK) + +#define NVME_PMREBS_SET_PMRSZU(pmrebs, val) \ + (pmrebs |= (uint64_t)(val & PMREBS_PMRSZU_MASK) << PMREBS_PMRSZU_SHIFT) +#define NVME_PMREBS_SET_RBB(pmrebs, val) \ + (pmrebs |= (uint64_t)(val & PMREBS_RBB_MASK) << PMREBS_RBB_SHIFT) +#define NVME_PMREBS_SET_PMRWBZ(pmrebs, val) \ + (pmrebs |= (uint64_t)(val & PMREBS_PMRWBZ_MASK) << PMREBS_PMRWBZ_SHIFT) + +enum NvmePmrswtpShift { + 
PMRSWTP_PMRSWTU_SHIFT = 0, + PMRSWTP_PMRSWTV_SHIFT = 8, +}; + +enum NvmePmrswtpMask { + PMRSWTP_PMRSWTU_MASK = 0xf, + PMRSWTP_PMRSWTV_MASK = 0xffffff, +}; + +#define NVME_PMRSWTP_PMRSWTU(pmrswtp) \ + ((pmrswtp >> PMRSWTP_PMRSWTU_SHIFT) & PMRSWTP_PMRSWTU_MASK) +#define NVME_PMRSWTP_PMRSWTV(pmrswtp) \ + ((pmrswtp >> PMRSWTP_PMRSWTV_SHIFT) & PMRSWTP_PMRSWTV_MASK) + +#define NVME_PMRSWTP_SET_PMRSWTU(pmrswtp, val) \ + (pmrswtp |= (uint64_t)(val & PMRSWTP_PMRSWTU_MASK) << PMRSWTP_PMRSWTU_SHIFT) +#define NVME_PMRSWTP_SET_PMRSWTV(pmrswtp, val) \ + (pmrswtp |= (uint64_t)(val & PMRSWTP_PMRSWTV_MASK) << PMRSWTP_PMRSWTV_SHIFT) + +enum NvmePmrmsclShift { + PMRMSCL_CMSE_SHIFT = 1, + PMRMSCL_CBA_SHIFT = 12, +}; + +enum NvmePmrmsclMask { + PMRMSCL_CMSE_MASK = 0x1, + PMRMSCL_CBA_MASK = 0xfffff, +}; + +#define NVME_PMRMSCL_CMSE(pmrmscl) \ + ((pmrmscl >> PMRMSCL_CMSE_SHIFT) & PMRMSCL_CMSE_MASK) +#define NVME_PMRMSCL_CBA(pmrmscl) \ + ((pmrmscl >> PMRMSCL_CBA_SHIFT) & PMRMSCL_CBA_MASK) + +#define NVME_PMRMSCL_SET_CMSE(pmrmscl, val) \ + (pmrmscl |= (uint32_t)(val & PMRMSCL_CMSE_MASK) << PMRMSCL_CMSE_SHIFT) +#define NVME_PMRMSCL_SET_CBA(pmrmscl, val) \ + (pmrmscl |= (uint32_t)(val & PMRMSCL_CBA_MASK) << PMRMSCL_CBA_SHIFT) + +enum NvmeSglDescriptorType { + NVME_SGL_DESCR_TYPE_DATA_BLOCK = 0x0, + NVME_SGL_DESCR_TYPE_BIT_BUCKET = 0x1, + NVME_SGL_DESCR_TYPE_SEGMENT = 0x2, + NVME_SGL_DESCR_TYPE_LAST_SEGMENT = 0x3, + NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK = 0x4, + + NVME_SGL_DESCR_TYPE_VENDOR_SPECIFIC = 0xf, +}; + +enum NvmeSglDescriptorSubtype { + NVME_SGL_DESCR_SUBTYPE_ADDRESS = 0x0, +}; + +typedef struct QEMU_PACKED NvmeSglDescriptor { + uint64_t addr; + uint32_t len; + uint8_t rsvd[3]; + uint8_t type; +} NvmeSglDescriptor; + +#define NVME_SGL_TYPE(type) ((type >> 4) & 0xf) +#define NVME_SGL_SUBTYPE(type) (type & 0xf) + +typedef union NvmeCmdDptr { + struct { + uint64_t prp1; + uint64_t prp2; + }; + + NvmeSglDescriptor sgl; +} NvmeCmdDptr; + +enum NvmePsdt { + NVME_PSDT_PRP = 0x0, + NVME_PSDT_SGL_MPTR_CONTIGUOUS = 0x1, + NVME_PSDT_SGL_MPTR_SGL = 0x2, +}; + +typedef struct QEMU_PACKED NvmeCmd { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint64_t res1; + uint64_t mptr; + NvmeCmdDptr dptr; + uint32_t cdw10; + uint32_t cdw11; + uint32_t cdw12; + uint32_t cdw13; + uint32_t cdw14; + uint32_t cdw15; +} NvmeCmd; + +#define NVME_CMD_FLAGS_FUSE(flags) (flags & 0x3) +#define NVME_CMD_FLAGS_PSDT(flags) ((flags >> 6) & 0x3) + +enum NvmeAdminCommands { + NVME_ADM_CMD_DELETE_SQ = 0x00, + NVME_ADM_CMD_CREATE_SQ = 0x01, + NVME_ADM_CMD_GET_LOG_PAGE = 0x02, + NVME_ADM_CMD_DELETE_CQ = 0x04, + NVME_ADM_CMD_CREATE_CQ = 0x05, + NVME_ADM_CMD_IDENTIFY = 0x06, + NVME_ADM_CMD_ABORT = 0x08, + NVME_ADM_CMD_SET_FEATURES = 0x09, + NVME_ADM_CMD_GET_FEATURES = 0x0a, + NVME_ADM_CMD_ASYNC_EV_REQ = 0x0c, + NVME_ADM_CMD_ACTIVATE_FW = 0x10, + NVME_ADM_CMD_DOWNLOAD_FW = 0x11, + NVME_ADM_CMD_NS_ATTACHMENT = 0x15, + NVME_ADM_CMD_FORMAT_NVM = 0x80, + NVME_ADM_CMD_SECURITY_SEND = 0x81, + NVME_ADM_CMD_SECURITY_RECV = 0x82, +}; + +enum NvmeIoCommands { + NVME_CMD_FLUSH = 0x00, + NVME_CMD_WRITE = 0x01, + NVME_CMD_READ = 0x02, + NVME_CMD_WRITE_UNCOR = 0x04, + NVME_CMD_COMPARE = 0x05, + NVME_CMD_WRITE_ZEROES = 0x08, + NVME_CMD_DSM = 0x09, + NVME_CMD_VERIFY = 0x0c, + NVME_CMD_COPY = 0x19, + NVME_CMD_ZONE_MGMT_SEND = 0x79, + NVME_CMD_ZONE_MGMT_RECV = 0x7a, + NVME_CMD_ZONE_APPEND = 0x7d, +}; + +typedef struct QEMU_PACKED NvmeDeleteQ { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t rsvd1[9]; + uint16_t qid; + uint16_t rsvd10; 
+ uint32_t rsvd11[5]; +} NvmeDeleteQ; + +typedef struct QEMU_PACKED NvmeCreateCq { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t rsvd1[5]; + uint64_t prp1; + uint64_t rsvd8; + uint16_t cqid; + uint16_t qsize; + uint16_t cq_flags; + uint16_t irq_vector; + uint32_t rsvd12[4]; +} NvmeCreateCq; + +#define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1) +#define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1) + +enum NvmeFlagsCq { + NVME_CQ_PC = 1, + NVME_CQ_IEN = 2, +}; + +typedef struct QEMU_PACKED NvmeCreateSq { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t rsvd1[5]; + uint64_t prp1; + uint64_t rsvd8; + uint16_t sqid; + uint16_t qsize; + uint16_t sq_flags; + uint16_t cqid; + uint32_t rsvd12[4]; +} NvmeCreateSq; + +#define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1) +#define NVME_SQ_FLAGS_QPRIO(sq_flags) ((sq_flags >> 1) & 0x3) + +enum NvmeFlagsSq { + NVME_SQ_PC = 1, + + NVME_SQ_PRIO_URGENT = 0, + NVME_SQ_PRIO_HIGH = 1, + NVME_SQ_PRIO_NORMAL = 2, + NVME_SQ_PRIO_LOW = 3, +}; + +typedef struct QEMU_PACKED NvmeIdentify { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint64_t rsvd2[2]; + uint64_t prp1; + uint64_t prp2; + uint8_t cns; + uint8_t rsvd10; + uint16_t ctrlid; + uint16_t nvmsetid; + uint8_t rsvd11; + uint8_t csi; + uint32_t rsvd12[4]; +} NvmeIdentify; + +typedef struct QEMU_PACKED NvmeRwCmd { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint64_t rsvd2; + uint64_t mptr; + NvmeCmdDptr dptr; + uint64_t slba; + uint16_t nlb; + uint16_t control; + uint32_t dsmgmt; + uint32_t reftag; + uint16_t apptag; + uint16_t appmask; +} NvmeRwCmd; + +enum { + NVME_RW_LR = 1 << 15, + NVME_RW_FUA = 1 << 14, + NVME_RW_DSM_FREQ_UNSPEC = 0, + NVME_RW_DSM_FREQ_TYPICAL = 1, + NVME_RW_DSM_FREQ_RARE = 2, + NVME_RW_DSM_FREQ_READS = 3, + NVME_RW_DSM_FREQ_WRITES = 4, + NVME_RW_DSM_FREQ_RW = 5, + NVME_RW_DSM_FREQ_ONCE = 6, + NVME_RW_DSM_FREQ_PREFETCH = 7, + NVME_RW_DSM_FREQ_TEMP = 8, + NVME_RW_DSM_LATENCY_NONE = 0 << 4, + NVME_RW_DSM_LATENCY_IDLE = 1 << 4, + NVME_RW_DSM_LATENCY_NORM = 2 << 4, + NVME_RW_DSM_LATENCY_LOW = 3 << 4, + NVME_RW_DSM_SEQ_REQ = 1 << 6, + NVME_RW_DSM_COMPRESSED = 1 << 7, + NVME_RW_PIREMAP = 1 << 9, + NVME_RW_PRINFO_PRACT = 1 << 13, + NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, + NVME_RW_PRINFO_PRCHK_APP = 1 << 11, + NVME_RW_PRINFO_PRCHK_REF = 1 << 10, + NVME_RW_PRINFO_PRCHK_MASK = 7 << 10, + +}; + +#define NVME_RW_PRINFO(control) ((control >> 10) & 0xf) + +enum { + NVME_PRINFO_PRACT = 1 << 3, + NVME_PRINFO_PRCHK_GUARD = 1 << 2, + NVME_PRINFO_PRCHK_APP = 1 << 1, + NVME_PRINFO_PRCHK_REF = 1 << 0, + NVME_PRINFO_PRCHK_MASK = 7 << 0, +}; + +typedef struct QEMU_PACKED NvmeDsmCmd { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint64_t rsvd2[2]; + NvmeCmdDptr dptr; + uint32_t nr; + uint32_t attributes; + uint32_t rsvd12[4]; +} NvmeDsmCmd; + +enum { + NVME_DSMGMT_IDR = 1 << 0, + NVME_DSMGMT_IDW = 1 << 1, + NVME_DSMGMT_AD = 1 << 2, +}; + +typedef struct QEMU_PACKED NvmeDsmRange { + uint32_t cattr; + uint32_t nlb; + uint64_t slba; +} NvmeDsmRange; + +enum { + NVME_COPY_FORMAT_0 = 0x0, +}; + +typedef struct QEMU_PACKED NvmeCopyCmd { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint32_t rsvd2[4]; + NvmeCmdDptr dptr; + uint64_t sdlba; + uint8_t nr; + uint8_t control[3]; + uint16_t rsvd13; + uint16_t dspec; + uint32_t reftag; + uint16_t apptag; + uint16_t appmask; +} NvmeCopyCmd; + +typedef struct QEMU_PACKED NvmeCopySourceRange { + uint8_t rsvd0[8]; + uint64_t slba; + uint16_t nlb; + 
uint8_t rsvd18[6]; + uint32_t reftag; + uint16_t apptag; + uint16_t appmask; +} NvmeCopySourceRange; + +enum NvmeAsyncEventRequest { + NVME_AER_TYPE_ERROR = 0, + NVME_AER_TYPE_SMART = 1, + NVME_AER_TYPE_NOTICE = 2, + NVME_AER_TYPE_IO_SPECIFIC = 6, + NVME_AER_TYPE_VENDOR_SPECIFIC = 7, + NVME_AER_INFO_ERR_INVALID_DB_REGISTER = 0, + NVME_AER_INFO_ERR_INVALID_DB_VALUE = 1, + NVME_AER_INFO_ERR_DIAG_FAIL = 2, + NVME_AER_INFO_ERR_PERS_INTERNAL_ERR = 3, + NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR = 4, + NVME_AER_INFO_ERR_FW_IMG_LOAD_ERR = 5, + NVME_AER_INFO_SMART_RELIABILITY = 0, + NVME_AER_INFO_SMART_TEMP_THRESH = 1, + NVME_AER_INFO_SMART_SPARE_THRESH = 2, + NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED = 0, +}; + +typedef struct QEMU_PACKED NvmeAerResult { + uint8_t event_type; + uint8_t event_info; + uint8_t log_page; + uint8_t resv; +} NvmeAerResult; + +typedef struct QEMU_PACKED NvmeZonedResult { + uint64_t slba; +} NvmeZonedResult; + +typedef struct QEMU_PACKED NvmeCqe { + uint32_t result; + uint32_t dw1; + uint16_t sq_head; + uint16_t sq_id; + uint16_t cid; + uint16_t status; +} NvmeCqe; + +enum NvmeStatusCodes { + NVME_SUCCESS = 0x0000, + NVME_INVALID_OPCODE = 0x0001, + NVME_INVALID_FIELD = 0x0002, + NVME_CID_CONFLICT = 0x0003, + NVME_DATA_TRAS_ERROR = 0x0004, + NVME_POWER_LOSS_ABORT = 0x0005, + NVME_INTERNAL_DEV_ERROR = 0x0006, + NVME_CMD_ABORT_REQ = 0x0007, + NVME_CMD_ABORT_SQ_DEL = 0x0008, + NVME_CMD_ABORT_FAILED_FUSE = 0x0009, + NVME_CMD_ABORT_MISSING_FUSE = 0x000a, + NVME_INVALID_NSID = 0x000b, + NVME_CMD_SEQ_ERROR = 0x000c, + NVME_INVALID_SGL_SEG_DESCR = 0x000d, + NVME_INVALID_NUM_SGL_DESCRS = 0x000e, + NVME_DATA_SGL_LEN_INVALID = 0x000f, + NVME_MD_SGL_LEN_INVALID = 0x0010, + NVME_SGL_DESCR_TYPE_INVALID = 0x0011, + NVME_INVALID_USE_OF_CMB = 0x0012, + NVME_INVALID_PRP_OFFSET = 0x0013, + NVME_CMD_SET_CMB_REJECTED = 0x002b, + NVME_INVALID_CMD_SET = 0x002c, + NVME_LBA_RANGE = 0x0080, + NVME_CAP_EXCEEDED = 0x0081, + NVME_NS_NOT_READY = 0x0082, + NVME_NS_RESV_CONFLICT = 0x0083, + NVME_FORMAT_IN_PROGRESS = 0x0084, + NVME_INVALID_CQID = 0x0100, + NVME_INVALID_QID = 0x0101, + NVME_MAX_QSIZE_EXCEEDED = 0x0102, + NVME_ACL_EXCEEDED = 0x0103, + NVME_RESERVED = 0x0104, + NVME_AER_LIMIT_EXCEEDED = 0x0105, + NVME_INVALID_FW_SLOT = 0x0106, + NVME_INVALID_FW_IMAGE = 0x0107, + NVME_INVALID_IRQ_VECTOR = 0x0108, + NVME_INVALID_LOG_ID = 0x0109, + NVME_INVALID_FORMAT = 0x010a, + NVME_FW_REQ_RESET = 0x010b, + NVME_INVALID_QUEUE_DEL = 0x010c, + NVME_FID_NOT_SAVEABLE = 0x010d, + NVME_FEAT_NOT_CHANGEABLE = 0x010e, + NVME_FEAT_NOT_NS_SPEC = 0x010f, + NVME_FW_REQ_SUSYSTEM_RESET = 0x0110, + NVME_NS_ALREADY_ATTACHED = 0x0118, + NVME_NS_PRIVATE = 0x0119, + NVME_NS_NOT_ATTACHED = 0x011a, + NVME_NS_CTRL_LIST_INVALID = 0x011c, + NVME_CONFLICTING_ATTRS = 0x0180, + NVME_INVALID_PROT_INFO = 0x0181, + NVME_WRITE_TO_RO = 0x0182, + NVME_CMD_SIZE_LIMIT = 0x0183, + NVME_ZONE_BOUNDARY_ERROR = 0x01b8, + NVME_ZONE_FULL = 0x01b9, + NVME_ZONE_READ_ONLY = 0x01ba, + NVME_ZONE_OFFLINE = 0x01bb, + NVME_ZONE_INVALID_WRITE = 0x01bc, + NVME_ZONE_TOO_MANY_ACTIVE = 0x01bd, + NVME_ZONE_TOO_MANY_OPEN = 0x01be, + NVME_ZONE_INVAL_TRANSITION = 0x01bf, + NVME_WRITE_FAULT = 0x0280, + NVME_UNRECOVERED_READ = 0x0281, + NVME_E2E_GUARD_ERROR = 0x0282, + NVME_E2E_APP_ERROR = 0x0283, + NVME_E2E_REF_ERROR = 0x0284, + NVME_CMP_FAILURE = 0x0285, + NVME_ACCESS_DENIED = 0x0286, + NVME_DULB = 0x0287, + NVME_MORE = 0x2000, + NVME_DNR = 0x4000, + NVME_NO_COMPLETE = 0xffff, +}; + +typedef struct QEMU_PACKED NvmeFwSlotInfoLog { + uint8_t afi; + uint8_t reserved1[7]; + uint8_t 
frs1[8]; + uint8_t frs2[8]; + uint8_t frs3[8]; + uint8_t frs4[8]; + uint8_t frs5[8]; + uint8_t frs6[8]; + uint8_t frs7[8]; + uint8_t reserved2[448]; +} NvmeFwSlotInfoLog; + +typedef struct QEMU_PACKED NvmeErrorLog { + uint64_t error_count; + uint16_t sqid; + uint16_t cid; + uint16_t status_field; + uint16_t param_error_location; + uint64_t lba; + uint32_t nsid; + uint8_t vs; + uint8_t resv[35]; +} NvmeErrorLog; + +typedef struct QEMU_PACKED NvmeSmartLog { + uint8_t critical_warning; + uint16_t temperature; + uint8_t available_spare; + uint8_t available_spare_threshold; + uint8_t percentage_used; + uint8_t reserved1[26]; + uint64_t data_units_read[2]; + uint64_t data_units_written[2]; + uint64_t host_read_commands[2]; + uint64_t host_write_commands[2]; + uint64_t controller_busy_time[2]; + uint64_t power_cycles[2]; + uint64_t power_on_hours[2]; + uint64_t unsafe_shutdowns[2]; + uint64_t media_errors[2]; + uint64_t number_of_error_log_entries[2]; + uint8_t reserved2[320]; +} NvmeSmartLog; + +#define NVME_SMART_WARN_MAX 6 +enum NvmeSmartWarn { + NVME_SMART_SPARE = 1 << 0, + NVME_SMART_TEMPERATURE = 1 << 1, + NVME_SMART_RELIABILITY = 1 << 2, + NVME_SMART_MEDIA_READ_ONLY = 1 << 3, + NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4, + NVME_SMART_PMR_UNRELIABLE = 1 << 5, +}; + +typedef struct NvmeEffectsLog { + uint32_t acs[256]; + uint32_t iocs[256]; + uint8_t resv[2048]; +} NvmeEffectsLog; + +enum { + NVME_CMD_EFF_CSUPP = 1 << 0, + NVME_CMD_EFF_LBCC = 1 << 1, + NVME_CMD_EFF_NCC = 1 << 2, + NVME_CMD_EFF_NIC = 1 << 3, + NVME_CMD_EFF_CCC = 1 << 4, + NVME_CMD_EFF_CSE_MASK = 3 << 16, + NVME_CMD_EFF_UUID_SEL = 1 << 19, +}; + +enum NvmeLogIdentifier { + NVME_LOG_ERROR_INFO = 0x01, + NVME_LOG_SMART_INFO = 0x02, + NVME_LOG_FW_SLOT_INFO = 0x03, + NVME_LOG_CHANGED_NSLIST = 0x04, + NVME_LOG_CMD_EFFECTS = 0x05, +}; + +typedef struct QEMU_PACKED NvmePSD { + uint16_t mp; + uint16_t reserved; + uint32_t enlat; + uint32_t exlat; + uint8_t rrt; + uint8_t rrl; + uint8_t rwt; + uint8_t rwl; + uint8_t resv[16]; +} NvmePSD; + +#define NVME_CONTROLLER_LIST_SIZE 2048 +#define NVME_IDENTIFY_DATA_SIZE 4096 + +enum NvmeIdCns { + NVME_ID_CNS_NS = 0x00, + NVME_ID_CNS_CTRL = 0x01, + NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, + NVME_ID_CNS_NS_DESCR_LIST = 0x03, + NVME_ID_CNS_CS_NS = 0x05, + NVME_ID_CNS_CS_CTRL = 0x06, + NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07, + NVME_ID_CNS_NS_PRESENT_LIST = 0x10, + NVME_ID_CNS_NS_PRESENT = 0x11, + NVME_ID_CNS_NS_ATTACHED_CTRL_LIST = 0x12, + NVME_ID_CNS_CTRL_LIST = 0x13, + NVME_ID_CNS_CS_NS_PRESENT_LIST = 0x1a, + NVME_ID_CNS_CS_NS_PRESENT = 0x1b, + NVME_ID_CNS_IO_COMMAND_SET = 0x1c, +}; + +typedef struct QEMU_PACKED NvmeIdCtrl { + uint16_t vid; + uint16_t ssvid; + uint8_t sn[20]; + uint8_t mn[40]; + uint8_t fr[8]; + uint8_t rab; + uint8_t ieee[3]; + uint8_t cmic; + uint8_t mdts; + uint16_t cntlid; + uint32_t ver; + uint32_t rtd3r; + uint32_t rtd3e; + uint32_t oaes; + uint32_t ctratt; + uint8_t rsvd100[11]; + uint8_t cntrltype; + uint8_t fguid[16]; + uint8_t rsvd128[128]; + uint16_t oacs; + uint8_t acl; + uint8_t aerl; + uint8_t frmw; + uint8_t lpa; + uint8_t elpe; + uint8_t npss; + uint8_t avscc; + uint8_t apsta; + uint16_t wctemp; + uint16_t cctemp; + uint16_t mtfa; + uint32_t hmpre; + uint32_t hmmin; + uint8_t tnvmcap[16]; + uint8_t unvmcap[16]; + uint32_t rpmbs; + uint16_t edstt; + uint8_t dsto; + uint8_t fwug; + uint16_t kas; + uint16_t hctma; + uint16_t mntmt; + uint16_t mxtmt; + uint32_t sanicap; + uint8_t rsvd332[180]; + uint8_t sqes; + uint8_t cqes; + uint16_t maxcmd; + uint32_t nn; + uint16_t oncs; + 
uint16_t fuses; + uint8_t fna; + uint8_t vwc; + uint16_t awun; + uint16_t awupf; + uint8_t nvscc; + uint8_t rsvd531; + uint16_t acwu; + uint16_t ocfs; + uint32_t sgls; + uint8_t rsvd540[228]; + uint8_t subnqn[256]; + uint8_t rsvd1024[1024]; + NvmePSD psd[32]; + uint8_t vs[1024]; +} NvmeIdCtrl; + +typedef struct NvmeIdCtrlZoned { + uint8_t zasl; + uint8_t rsvd1[4095]; +} NvmeIdCtrlZoned; + +typedef struct NvmeIdCtrlNvm { + uint8_t vsl; + uint8_t wzsl; + uint8_t wusl; + uint8_t dmrl; + uint32_t dmrsl; + uint64_t dmsl; + uint8_t rsvd16[4080]; +} NvmeIdCtrlNvm; + +enum NvmeIdCtrlOaes { + NVME_OAES_NS_ATTR = 1 << 8, +}; + +enum NvmeIdCtrlOacs { + NVME_OACS_SECURITY = 1 << 0, + NVME_OACS_FORMAT = 1 << 1, + NVME_OACS_FW = 1 << 2, + NVME_OACS_NS_MGMT = 1 << 3, +}; + +enum NvmeIdCtrlOncs { + NVME_ONCS_COMPARE = 1 << 0, + NVME_ONCS_WRITE_UNCORR = 1 << 1, + NVME_ONCS_DSM = 1 << 2, + NVME_ONCS_WRITE_ZEROES = 1 << 3, + NVME_ONCS_FEATURES = 1 << 4, + NVME_ONCS_RESRVATIONS = 1 << 5, + NVME_ONCS_TIMESTAMP = 1 << 6, + NVME_ONCS_VERIFY = 1 << 7, + NVME_ONCS_COPY = 1 << 8, +}; + +enum NvmeIdCtrlOcfs { + NVME_OCFS_COPY_FORMAT_0 = 1 << 0, +}; + +enum NvmeIdctrlVwc { + NVME_VWC_PRESENT = 1 << 0, + NVME_VWC_NSID_BROADCAST_NO_SUPPORT = 0 << 1, + NVME_VWC_NSID_BROADCAST_RESERVED = 1 << 1, + NVME_VWC_NSID_BROADCAST_CTRL_SPEC = 2 << 1, + NVME_VWC_NSID_BROADCAST_SUPPORT = 3 << 1, +}; + +enum NvmeIdCtrlFrmw { + NVME_FRMW_SLOT1_RO = 1 << 0, +}; + +enum NvmeIdCtrlLpa { + NVME_LPA_NS_SMART = 1 << 0, + NVME_LPA_CSE = 1 << 1, + NVME_LPA_EXTENDED = 1 << 2, +}; + +enum NvmeIdCtrlCmic { + NVME_CMIC_MULTI_CTRL = 1 << 1, +}; + +enum NvmeNsAttachmentOperation { + NVME_NS_ATTACHMENT_ATTACH = 0x0, + NVME_NS_ATTACHMENT_DETACH = 0x1, +}; + +#define NVME_CTRL_SQES_MIN(sqes) ((sqes) & 0xf) +#define NVME_CTRL_SQES_MAX(sqes) (((sqes) >> 4) & 0xf) +#define NVME_CTRL_CQES_MIN(cqes) ((cqes) & 0xf) +#define NVME_CTRL_CQES_MAX(cqes) (((cqes) >> 4) & 0xf) + +#define NVME_CTRL_SGLS_SUPPORT_MASK (0x3 << 0) +#define NVME_CTRL_SGLS_SUPPORT_NO_ALIGN (0x1 << 0) +#define NVME_CTRL_SGLS_SUPPORT_DWORD_ALIGN (0x1 << 1) +#define NVME_CTRL_SGLS_KEYED (0x1 << 2) +#define NVME_CTRL_SGLS_BITBUCKET (0x1 << 16) +#define NVME_CTRL_SGLS_MPTR_CONTIGUOUS (0x1 << 17) +#define NVME_CTRL_SGLS_EXCESS_LENGTH (0x1 << 18) +#define NVME_CTRL_SGLS_MPTR_SGL (0x1 << 19) +#define NVME_CTRL_SGLS_ADDR_OFFSET (0x1 << 20) + +#define NVME_ARB_AB(arb) (arb & 0x7) +#define NVME_ARB_AB_NOLIMIT 0x7 +#define NVME_ARB_LPW(arb) ((arb >> 8) & 0xff) +#define NVME_ARB_MPW(arb) ((arb >> 16) & 0xff) +#define NVME_ARB_HPW(arb) ((arb >> 24) & 0xff) + +#define NVME_INTC_THR(intc) (intc & 0xff) +#define NVME_INTC_TIME(intc) ((intc >> 8) & 0xff) + +#define NVME_INTVC_NOCOALESCING (0x1 << 16) + +#define NVME_TEMP_THSEL(temp) ((temp >> 20) & 0x3) +#define NVME_TEMP_THSEL_OVER 0x0 +#define NVME_TEMP_THSEL_UNDER 0x1 + +#define NVME_TEMP_TMPSEL(temp) ((temp >> 16) & 0xf) +#define NVME_TEMP_TMPSEL_COMPOSITE 0x0 + +#define NVME_TEMP_TMPTH(temp) (temp & 0xffff) + +#define NVME_AEC_SMART(aec) (aec & 0xff) +#define NVME_AEC_NS_ATTR(aec) ((aec >> 8) & 0x1) +#define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1) + +#define NVME_ERR_REC_TLER(err_rec) (err_rec & 0xffff) +#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x10000) + +enum NvmeFeatureIds { + NVME_ARBITRATION = 0x1, + NVME_POWER_MANAGEMENT = 0x2, + NVME_LBA_RANGE_TYPE = 0x3, + NVME_TEMPERATURE_THRESHOLD = 0x4, + NVME_ERROR_RECOVERY = 0x5, + NVME_VOLATILE_WRITE_CACHE = 0x6, + NVME_NUMBER_OF_QUEUES = 0x7, + NVME_INTERRUPT_COALESCING = 0x8, + 
NVME_INTERRUPT_VECTOR_CONF = 0x9, + NVME_WRITE_ATOMICITY = 0xa, + NVME_ASYNCHRONOUS_EVENT_CONF = 0xb, + NVME_TIMESTAMP = 0xe, + NVME_COMMAND_SET_PROFILE = 0x19, + NVME_SOFTWARE_PROGRESS_MARKER = 0x80, + NVME_FID_MAX = 0x100, +}; + +typedef enum NvmeFeatureCap { + NVME_FEAT_CAP_SAVE = 1 << 0, + NVME_FEAT_CAP_NS = 1 << 1, + NVME_FEAT_CAP_CHANGE = 1 << 2, +} NvmeFeatureCap; + +typedef enum NvmeGetFeatureSelect { + NVME_GETFEAT_SELECT_CURRENT = 0x0, + NVME_GETFEAT_SELECT_DEFAULT = 0x1, + NVME_GETFEAT_SELECT_SAVED = 0x2, + NVME_GETFEAT_SELECT_CAP = 0x3, +} NvmeGetFeatureSelect; + +#define NVME_GETSETFEAT_FID_MASK 0xff +#define NVME_GETSETFEAT_FID(dw10) (dw10 & NVME_GETSETFEAT_FID_MASK) + +#define NVME_GETFEAT_SELECT_SHIFT 8 +#define NVME_GETFEAT_SELECT_MASK 0x7 +#define NVME_GETFEAT_SELECT(dw10) \ + ((dw10 >> NVME_GETFEAT_SELECT_SHIFT) & NVME_GETFEAT_SELECT_MASK) + +#define NVME_SETFEAT_SAVE_SHIFT 31 +#define NVME_SETFEAT_SAVE_MASK 0x1 +#define NVME_SETFEAT_SAVE(dw10) \ + ((dw10 >> NVME_SETFEAT_SAVE_SHIFT) & NVME_SETFEAT_SAVE_MASK) + +typedef struct QEMU_PACKED NvmeRangeType { + uint8_t type; + uint8_t attributes; + uint8_t rsvd2[14]; + uint64_t slba; + uint64_t nlb; + uint8_t guid[16]; + uint8_t rsvd48[16]; +} NvmeRangeType; + +typedef struct QEMU_PACKED NvmeLBAF { + uint16_t ms; + uint8_t ds; + uint8_t rp; +} NvmeLBAF; + +typedef struct QEMU_PACKED NvmeLBAFE { + uint64_t zsze; + uint8_t zdes; + uint8_t rsvd9[7]; +} NvmeLBAFE; + +#define NVME_NSID_BROADCAST 0xffffffff + +typedef struct QEMU_PACKED NvmeIdNs { + uint64_t nsze; + uint64_t ncap; + uint64_t nuse; + uint8_t nsfeat; + uint8_t nlbaf; + uint8_t flbas; + uint8_t mc; + uint8_t dpc; + uint8_t dps; + uint8_t nmic; + uint8_t rescap; + uint8_t fpi; + uint8_t dlfeat; + uint16_t nawun; + uint16_t nawupf; + uint16_t nacwu; + uint16_t nabsn; + uint16_t nabo; + uint16_t nabspf; + uint16_t noiob; + uint8_t nvmcap[16]; + uint16_t npwg; + uint16_t npwa; + uint16_t npdg; + uint16_t npda; + uint16_t nows; + uint16_t mssrl; + uint32_t mcl; + uint8_t msrc; + uint8_t rsvd81[23]; + uint8_t nguid[16]; + uint64_t eui64; + NvmeLBAF lbaf[16]; + uint8_t rsvd192[192]; + uint8_t vs[3712]; +} NvmeIdNs; + +typedef struct QEMU_PACKED NvmeIdNsDescr { + uint8_t nidt; + uint8_t nidl; + uint8_t rsvd2[2]; +} NvmeIdNsDescr; + +enum NvmeNsIdentifierLength { + NVME_NIDL_EUI64 = 8, + NVME_NIDL_NGUID = 16, + NVME_NIDL_UUID = 16, + NVME_NIDL_CSI = 1, +}; + +enum NvmeNsIdentifierType { + NVME_NIDT_EUI64 = 0x01, + NVME_NIDT_NGUID = 0x02, + NVME_NIDT_UUID = 0x03, + NVME_NIDT_CSI = 0x04, +}; + +enum NvmeIdNsNmic { + NVME_NMIC_NS_SHARED = 1 << 0, +}; + +enum NvmeCsi { + NVME_CSI_NVM = 0x00, + NVME_CSI_ZONED = 0x02, +}; + +#define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi))) + +typedef struct QEMU_PACKED NvmeIdNsZoned { + uint16_t zoc; + uint16_t ozcs; + uint32_t mar; + uint32_t mor; + uint32_t rrl; + uint32_t frl; + uint8_t rsvd20[2796]; + NvmeLBAFE lbafe[16]; + uint8_t rsvd3072[768]; + uint8_t vs[256]; +} NvmeIdNsZoned; + +/*Deallocate Logical Block Features*/ +#define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat) ((dlfeat) & 0x10) +#define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat) ((dlfeat) & 0x08) + +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR(dlfeat) ((dlfeat) & 0x7) +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_UNDEFINED 0 +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES 1 +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ONES 2 + + +#define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1)) +#define NVME_ID_NS_NSFEAT_DULBE(nsfeat) ((nsfeat >> 2) & 0x1) +#define NVME_ID_NS_FLBAS_EXTENDED(flbas) 
((flbas >> 4) & 0x1) +#define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf)) +#define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1) +#define NVME_ID_NS_MC_EXTENDED(mc) ((mc & 0x1)) +#define NVME_ID_NS_DPC_LAST_EIGHT(dpc) ((dpc >> 4) & 0x1) +#define NVME_ID_NS_DPC_FIRST_EIGHT(dpc) ((dpc >> 3) & 0x1) +#define NVME_ID_NS_DPC_TYPE_3(dpc) ((dpc >> 2) & 0x1) +#define NVME_ID_NS_DPC_TYPE_2(dpc) ((dpc >> 1) & 0x1) +#define NVME_ID_NS_DPC_TYPE_1(dpc) ((dpc & 0x1)) +#define NVME_ID_NS_DPC_TYPE_MASK 0x7 + +enum NvmeIdNsDps { + NVME_ID_NS_DPS_TYPE_NONE = 0, + NVME_ID_NS_DPS_TYPE_1 = 1, + NVME_ID_NS_DPS_TYPE_2 = 2, + NVME_ID_NS_DPS_TYPE_3 = 3, + NVME_ID_NS_DPS_TYPE_MASK = 0x7, + NVME_ID_NS_DPS_FIRST_EIGHT = 8, +}; + +enum NvmeIdNsFlbas { + NVME_ID_NS_FLBAS_EXTENDED = 1 << 4, +}; + +enum NvmeIdNsMc { + NVME_ID_NS_MC_EXTENDED = 1 << 0, + NVME_ID_NS_MC_SEPARATE = 1 << 1, +}; + +#define NVME_ID_NS_DPS_TYPE(dps) (dps & NVME_ID_NS_DPS_TYPE_MASK) + +typedef struct NvmeDifTuple { + uint16_t guard; + uint16_t apptag; + uint32_t reftag; +} NvmeDifTuple; + +enum NvmeZoneAttr { + NVME_ZA_FINISHED_BY_CTLR = 1 << 0, + NVME_ZA_FINISH_RECOMMENDED = 1 << 1, + NVME_ZA_RESET_RECOMMENDED = 1 << 2, + NVME_ZA_ZD_EXT_VALID = 1 << 7, +}; + +typedef struct QEMU_PACKED NvmeZoneReportHeader { + uint64_t nr_zones; + uint8_t rsvd[56]; +} NvmeZoneReportHeader; + +enum NvmeZoneReceiveAction { + NVME_ZONE_REPORT = 0, + NVME_ZONE_REPORT_EXTENDED = 1, +}; + +enum NvmeZoneReportType { + NVME_ZONE_REPORT_ALL = 0, + NVME_ZONE_REPORT_EMPTY = 1, + NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2, + NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3, + NVME_ZONE_REPORT_CLOSED = 4, + NVME_ZONE_REPORT_FULL = 5, + NVME_ZONE_REPORT_READ_ONLY = 6, + NVME_ZONE_REPORT_OFFLINE = 7, +}; + +enum NvmeZoneType { + NVME_ZONE_TYPE_RESERVED = 0x00, + NVME_ZONE_TYPE_SEQ_WRITE = 0x02, +}; + +enum NvmeZoneSendAction { + NVME_ZONE_ACTION_RSD = 0x00, + NVME_ZONE_ACTION_CLOSE = 0x01, + NVME_ZONE_ACTION_FINISH = 0x02, + NVME_ZONE_ACTION_OPEN = 0x03, + NVME_ZONE_ACTION_RESET = 0x04, + NVME_ZONE_ACTION_OFFLINE = 0x05, + NVME_ZONE_ACTION_SET_ZD_EXT = 0x10, +}; + +typedef struct QEMU_PACKED NvmeZoneDescr { + uint8_t zt; + uint8_t zs; + uint8_t za; + uint8_t rsvd3[5]; + uint64_t zcap; + uint64_t zslba; + uint64_t wp; + uint8_t rsvd32[32]; +} NvmeZoneDescr; + +typedef enum NvmeZoneState { + NVME_ZONE_STATE_RESERVED = 0x00, + NVME_ZONE_STATE_EMPTY = 0x01, + NVME_ZONE_STATE_IMPLICITLY_OPEN = 0x02, + NVME_ZONE_STATE_EXPLICITLY_OPEN = 0x03, + NVME_ZONE_STATE_CLOSED = 0x04, + NVME_ZONE_STATE_READ_ONLY = 0x0d, + NVME_ZONE_STATE_FULL = 0x0e, + NVME_ZONE_STATE_OFFLINE = 0x0f, +} NvmeZoneState; + +static inline void _nvme_check_size(void) +{ + QEMU_BUILD_BUG_ON(sizeof(NvmeBar) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4); + QEMU_BUILD_BUG_ON(sizeof(NvmeZonedResult) != 8); + QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16); + QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16); + QEMU_BUILD_BUG_ON(sizeof(NvmeCopySourceRange) != 32); + QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeDeleteQ) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeCreateCq) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeCreateSq) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdentify) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeRwCmd) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeCopyCmd) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512); + QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512); + 
QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlZoned) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlNvm) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeLBAF) != 4); + QEMU_BUILD_BUG_ON(sizeof(NvmeLBAFE) != 16); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsZoned) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsDescr) != 4); + QEMU_BUILD_BUG_ON(sizeof(NvmeZoneDescr) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeDifTuple) != 8); +} +#endif diff --git a/include/block/qapi.h b/include/block/qapi.h new file mode 100644 index 000000000..22c7807c8 --- /dev/null +++ b/include/block/qapi.h @@ -0,0 +1,45 @@ +/* + * Block layer qmp and info dump related functions + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef BLOCK_QAPI_H +#define BLOCK_QAPI_H + +#include "block/block.h" +#include "block/snapshot.h" + +BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, + BlockDriverState *bs, + bool flat, + Error **errp); +int bdrv_query_snapshot_info_list(BlockDriverState *bs, + SnapshotInfoList **p_list, + Error **errp); +void bdrv_query_image_info(BlockDriverState *bs, + ImageInfo **p_info, + Error **errp); + +void bdrv_snapshot_dump(QEMUSnapshotInfo *sn); +void bdrv_image_info_specific_dump(ImageInfoSpecific *info_spec); +void bdrv_image_info_dump(ImageInfo *info); +#endif diff --git a/include/block/qdict.h b/include/block/qdict.h new file mode 100644 index 000000000..ced2acfb9 --- /dev/null +++ b/include/block/qdict.h @@ -0,0 +1,32 @@ +/* + * Special QDict functions used by the block layer + * + * Copyright (c) 2013-2018 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. + * See the COPYING.LIB file in the top-level directory. 
+ */ + +#ifndef BLOCK_QDICT_H +#define BLOCK_QDICT_H + +#include "qapi/qmp/qdict.h" + +void qdict_copy_default(QDict *dst, QDict *src, const char *key); +void qdict_set_default_str(QDict *dst, const char *key, const char *val); + +void qdict_join(QDict *dest, QDict *src, bool overwrite); + +void qdict_extract_subqdict(QDict *src, QDict **dst, const char *start); +void qdict_array_split(QDict *src, QList **dst); +int qdict_array_entries(QDict *src, const char *subqdict); + +typedef struct QDictRenames { + const char *from; + const char *to; +} QDictRenames; +bool qdict_rename_keys(QDict *qdict, const QDictRenames *renames, Error **errp); + +Visitor *qobject_input_visitor_new_flat_confused(QDict *qdict, + Error **errp); +#endif diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h new file mode 100644 index 000000000..21fc10c4c --- /dev/null +++ b/include/block/raw-aio.h @@ -0,0 +1,90 @@ +/* + * Declarations for AIO in the raw protocol + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#ifndef QEMU_RAW_AIO_H +#define QEMU_RAW_AIO_H + +#include "block/aio.h" +#include "qemu/coroutine.h" +#include "qemu/iov.h" + +/* AIO request types */ +#define QEMU_AIO_READ 0x0001 +#define QEMU_AIO_WRITE 0x0002 +#define QEMU_AIO_IOCTL 0x0004 +#define QEMU_AIO_FLUSH 0x0008 +#define QEMU_AIO_DISCARD 0x0010 +#define QEMU_AIO_WRITE_ZEROES 0x0020 +#define QEMU_AIO_COPY_RANGE 0x0040 +#define QEMU_AIO_TRUNCATE 0x0080 +#define QEMU_AIO_TYPE_MASK \ + (QEMU_AIO_READ | \ + QEMU_AIO_WRITE | \ + QEMU_AIO_IOCTL | \ + QEMU_AIO_FLUSH | \ + QEMU_AIO_DISCARD | \ + QEMU_AIO_WRITE_ZEROES | \ + QEMU_AIO_COPY_RANGE | \ + QEMU_AIO_TRUNCATE) + +/* AIO flags */ +#define QEMU_AIO_MISALIGNED 0x1000 +#define QEMU_AIO_BLKDEV 0x2000 +#define QEMU_AIO_NO_FALLBACK 0x4000 + + +/* linux-aio.c - Linux native implementation */ +#ifdef CONFIG_LINUX_AIO +typedef struct LinuxAioState LinuxAioState; +LinuxAioState *laio_init(Error **errp); +void laio_cleanup(LinuxAioState *s); +int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd, + uint64_t offset, QEMUIOVector *qiov, int type, + uint64_t dev_max_batch); +void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context); +void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context); +void laio_io_plug(BlockDriverState *bs, LinuxAioState *s); +void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s, + uint64_t dev_max_batch); +#endif +/* io_uring.c - Linux io_uring implementation */ +#ifdef CONFIG_LINUX_IO_URING +typedef struct LuringState LuringState; +LuringState *luring_init(Error **errp); +void luring_cleanup(LuringState *s); +int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd, + uint64_t offset, QEMUIOVector *qiov, int type); +void luring_detach_aio_context(LuringState *s, AioContext *old_context); +void luring_attach_aio_context(LuringState *s, AioContext *new_context); +void luring_io_plug(BlockDriverState *bs, LuringState *s); +void luring_io_unplug(BlockDriverState *bs, LuringState *s); +#endif + +#ifdef _WIN32 +typedef struct QEMUWin32AIOState QEMUWin32AIOState; +QEMUWin32AIOState *win32_aio_init(void); +void win32_aio_cleanup(QEMUWin32AIOState *aio); +int win32_aio_attach(QEMUWin32AIOState *aio, 
HANDLE hfile);
+BlockAIOCB *win32_aio_submit(BlockDriverState *bs,
+        QEMUWin32AIOState *aio, HANDLE hfile,
+        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
+        BlockCompletionFunc *cb, void *opaque, int type);
+void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
+                                  AioContext *old_context);
+void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
+                                  AioContext *new_context);
+#endif
+
+#endif /* QEMU_RAW_AIO_H */
diff --git a/include/block/replication.h b/include/block/replication.h
new file mode 100644
index 000000000..21931b4f0
--- /dev/null
+++ b/include/block/replication.h
@@ -0,0 +1,175 @@
+/*
+ * Replication filter
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2016 FUJITSU LIMITED
+ *
+ * Author:
+ *   Changlong Xie <xiecl.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef REPLICATION_H
+#define REPLICATION_H
+
+#include "qapi/qapi-types-block-core.h"
+#include "qemu/module.h"
+#include "qemu/queue.h"
+
+typedef struct ReplicationOps ReplicationOps;
+typedef struct ReplicationState ReplicationState;
+
+/**
+ * SECTION:block/replication.h
+ * @title:Base Replication System
+ * @short_description: interfaces for handling replication
+ *
+ * The replication model provides a framework for handling replication.
+ *
+ * <example>
+ * <title>How to use replication interfaces</title>
+ * <programlisting>
+ * #include "block/replication.h"
+ *
+ * typedef struct BDRVReplicationState {
+ *     ReplicationState *rs;
+ * } BDRVReplicationState;
+ *
+ * static void replication_start(ReplicationState *rs, ReplicationMode mode,
+ *                               Error **errp);
+ * static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
+ * static void replication_get_error(ReplicationState *rs, Error **errp);
+ * static void replication_stop(ReplicationState *rs, bool failover,
+ *                              Error **errp);
+ *
+ * static ReplicationOps replication_ops = {
+ *     .start = replication_start,
+ *     .checkpoint = replication_do_checkpoint,
+ *     .get_error = replication_get_error,
+ *     .stop = replication_stop,
+ * };
+ *
+ * static int replication_open(BlockDriverState *bs, QDict *options,
+ *                             int flags, Error **errp)
+ * {
+ *     BDRVReplicationState *s = bs->opaque;
+ *     s->rs = replication_new(bs, &replication_ops);
+ *     return 0;
+ * }
+ *
+ * static void replication_close(BlockDriverState *bs)
+ * {
+ *     BDRVReplicationState *s = bs->opaque;
+ *     replication_remove(s->rs);
+ * }
+ *
+ * BlockDriver bdrv_replication = {
+ *     .format_name = "replication",
+ *     .instance_size = sizeof(BDRVReplicationState),
+ *
+ *     .bdrv_open = replication_open,
+ *     .bdrv_close = replication_close,
+ * };
+ *
+ * static void bdrv_replication_init(void)
+ * {
+ *     bdrv_register(&bdrv_replication);
+ * }
+ *
+ * block_init(bdrv_replication_init);
+ * </programlisting>
+ * </example>
+ *
+ * The example above shows how to use the replication interfaces. During
+ * migration, the replication_(start/stop/do_checkpoint/get_error)_all
+ * helpers then apply the corresponding operation to every registered
+ * ReplicationState.
+ */
+
+/**
+ * ReplicationState:
+ * @opaque: opaque pointer value passed to this ReplicationState
+ * @ops: replication operations of this ReplicationState
+ * @node: list entry that links this state into the @replication_states QLIST
+ */
+struct ReplicationState {
+    void *opaque;
+    ReplicationOps *ops;
+    QLIST_ENTRY(ReplicationState) node;
+};
+
+/**
+ * ReplicationOps:
+ * @start: callback to start replication
+ * @stop: callback to stop replication
+ * @checkpoint: callback to perform a checkpoint
+ * @get_error: callback to check whether an error occurred during replication
+ */
+struct ReplicationOps {
+    void (*start)(ReplicationState *rs, ReplicationMode mode, Error **errp);
+    void (*stop)(ReplicationState *rs, bool failover, Error **errp);
+    void (*checkpoint)(ReplicationState *rs, Error **errp);
+    void (*get_error)(ReplicationState *rs, Error **errp);
+};
+
+/**
+ * replication_new:
+ * @opaque: opaque pointer value passed to ReplicationState
+ * @ops: replication operations of the new ReplicationState
+ *
+ * Called to create a new ReplicationState instance, and then insert it
+ * into the @replication_states QLIST
+ *
+ * Returns: the new ReplicationState instance
+ */
+ReplicationState *replication_new(void *opaque, ReplicationOps *ops);
+
+/**
+ * replication_remove:
+ * @rs: the ReplicationState instance to remove
+ *
+ * Called to remove a ReplicationState instance, and then delete it from
+ * the @replication_states QLIST
+ */
+void replication_remove(ReplicationState *rs);
+
+/**
+ * replication_start_all:
+ * @mode: replication mode, either "primary" or "secondary"
+ * @errp: returns an error if this function fails
+ *
+ * Start replication; called in the migration/checkpoint thread
+ *
+ * Note: the caller of this function MUST make sure the VM is stopped
+ */
+void replication_start_all(ReplicationMode mode, Error **errp);
+
+/**
+ * replication_do_checkpoint_all:
+ * @errp: returns an error if this function fails
+ *
+ * This interface is called after all VM state is transferred to the
+ * Secondary QEMU
+ */
+void replication_do_checkpoint_all(Error **errp);
+
+/**
+ * replication_get_error_all:
+ * @errp: returns an error if this function fails
+ *
+ * This interface is called to check whether an error occurred during
+ * replication
+ */
+void replication_get_error_all(Error **errp);
+
+/**
+ * replication_stop_all:
+ * @failover: boolean value that indicates whether failover should be done
+ * @errp: returns an error if this function fails
+ *
+ * It is called on failover. If this API is used for anything other than
+ * failover (e.g. to shut down the guest), the VM should be stopped before
+ * calling it.
+ */
+void replication_stop_all(bool failover, Error **errp);
+
+#endif /* REPLICATION_H */
diff --git a/include/block/snapshot.h b/include/block/snapshot.h
new file mode 100644
index 000000000..940345692
--- /dev/null
+++ b/include/block/snapshot.h
@@ -0,0 +1,102 @@
+/*
+ * Block layer snapshot related functions
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef SNAPSHOT_H
+#define SNAPSHOT_H
+
+#include "qapi/qapi-builtin-types.h"
+
+#define SNAPSHOT_OPT_BASE "snapshot."
+#define SNAPSHOT_OPT_ID "snapshot.id"
+#define SNAPSHOT_OPT_NAME "snapshot.name"
+
+extern QemuOptsList internal_snapshot_opts;
+
+typedef struct QEMUSnapshotInfo {
+    char id_str[128]; /* unique snapshot id */
+    /* the following fields are informative. They are not needed for
+       the consistency of the snapshot */
+    char name[256]; /* user chosen name */
+    uint64_t vm_state_size; /* VM state info size */
+    uint32_t date_sec; /* UTC date of the snapshot */
+    uint32_t date_nsec;
+    uint64_t vm_clock_nsec; /* VM clock relative to boot */
+    uint64_t icount; /* record/replay step */
+} QEMUSnapshotInfo;
+
+int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info,
+                       const char *name);
+bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs,
+                                       const char *id,
+                                       const char *name,
+                                       QEMUSnapshotInfo *sn_info,
+                                       Error **errp);
+int bdrv_can_snapshot(BlockDriverState *bs);
+int bdrv_snapshot_create(BlockDriverState *bs,
+                         QEMUSnapshotInfo *sn_info);
+int bdrv_snapshot_goto(BlockDriverState *bs,
+                       const char *snapshot_id,
+                       Error **errp);
+int bdrv_snapshot_delete(BlockDriverState *bs,
+                         const char *snapshot_id,
+                         const char *name,
+                         Error **errp);
+int bdrv_snapshot_list(BlockDriverState *bs,
+                       QEMUSnapshotInfo **psn_info);
+int bdrv_snapshot_load_tmp(BlockDriverState *bs,
+                           const char *snapshot_id,
+                           const char *name,
+                           Error **errp);
+int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,
+                                         const char *id_or_name,
+                                         Error **errp);
+
+
+/* Group operations. All block drivers are involved.
+ * These functions will properly handle dataplane (taking aio_context_acquire
+ * when appropriate for the relevant block drivers). */
+
+bool bdrv_all_can_snapshot(bool has_devices, strList *devices,
+                           Error **errp);
+int bdrv_all_delete_snapshot(const char *name,
+                             bool has_devices, strList *devices,
+                             Error **errp);
+int bdrv_all_goto_snapshot(const char *name,
+                           bool has_devices, strList *devices,
+                           Error **errp);
+int bdrv_all_has_snapshot(const char *name,
+                          bool has_devices, strList *devices,
+                          Error **errp);
+int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
+                             BlockDriverState *vm_state_bs,
+                             uint64_t vm_state_size,
+                             bool has_devices,
+                             strList *devices,
+                             Error **errp);
+
+BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs,
+                                           bool has_devices, strList *devices,
+                                           Error **errp);
+
+#endif
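As a usage sketch for the group snapshot operations above: this is an editorial illustration, not part of the patch; the example_group_snapshot name is invented, and vm_state_size is 0 only because no VM state is saved here.

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "block/snapshot.h"

/* Create a snapshot named @name on all snapshottable devices, storing the
 * (empty) VM state in the device chosen by bdrv_all_find_vmstate_bs(). */
static int example_group_snapshot(const char *name, Error **errp)
{
    QEMUSnapshotInfo sn = {};
    BlockDriverState *vm_state_bs;

    if (!bdrv_all_can_snapshot(false, NULL, errp)) {
        return -1;                  /* some device cannot take snapshots */
    }

    vm_state_bs = bdrv_all_find_vmstate_bs(NULL, false, NULL, errp);
    if (!vm_state_bs) {
        return -1;
    }

    pstrcpy(sn.name, sizeof(sn.name), name);
    /* has_devices=false, devices=NULL: operate on all block devices. */
    return bdrv_all_create_snapshot(&sn, vm_state_bs, 0, false, NULL, errp);
}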
diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h
new file mode 100644
index 000000000..7dd7d730a
--- /dev/null
+++ b/include/block/thread-pool.h
@@ -0,0 +1,37 @@
+/*
+ * QEMU block layer thread pool
+ *
+ * Copyright IBM, Corp. 2008
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Anthony Liguori <aliguori@us.ibm.com>
+ *  Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef QEMU_THREAD_POOL_H
+#define QEMU_THREAD_POOL_H
+
+#include "block/block.h"
+
+typedef int ThreadPoolFunc(void *opaque);
+
+typedef struct ThreadPool ThreadPool;
+
+ThreadPool *thread_pool_new(struct AioContext *ctx);
+void thread_pool_free(ThreadPool *pool);
+
+BlockAIOCB *thread_pool_submit_aio(ThreadPool *pool,
+                                   ThreadPoolFunc *func, void *arg,
+                                   BlockCompletionFunc *cb, void *opaque);
+int coroutine_fn thread_pool_submit_co(ThreadPool *pool,
+                                       ThreadPoolFunc *func, void *arg);
+void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg);
+
+#endif
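A similar sketch for the thread pool just above, again an editorial illustration rather than part of the patch: the example_* names are invented, and the doubling "work" stands in for a real blocking operation such as a syscall.

#include "qemu/osdep.h"
#include "block/thread-pool.h"

/* Runs in a worker thread; must not touch AioContext state. */
static int example_blocking_work(void *opaque)
{
    int *arg = opaque;

    return *arg * 2;    /* stand-in for a blocking syscall */
}

/* Runs back in the pool's AioContext once the worker returns; @ret is
 * example_blocking_work()'s return value. */
static void example_work_done(void *opaque, int ret)
{
}

static void example_submit(AioContext *ctx, int *arg)
{
    ThreadPool *pool = thread_pool_new(ctx);

    thread_pool_submit_aio(pool, example_blocking_work, arg,
                           example_work_done, NULL);
    /* ... later, once all work has completed: thread_pool_free(pool). */
}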
diff --git a/include/block/throttle-groups.h b/include/block/throttle-groups.h
new file mode 100644
index 000000000..9541b3243
--- /dev/null
+++ b/include/block/throttle-groups.h
@@ -0,0 +1,91 @@
+/*
+ * QEMU block throttling group infrastructure
+ *
+ * Copyright (C) Nodalink, EURL. 2014
+ * Copyright (C) Igalia, S.L. 2015
+ *
+ * Authors:
+ *   BenoƮt Canet <benoit.canet@nodalink.com>
+ *   Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef THROTTLE_GROUPS_H
+#define THROTTLE_GROUPS_H
+
+#include "qemu/throttle.h"
+#include "block/block_int.h"
+#include "qom/object.h"
+
+/* The ThrottleGroupMember structure indicates membership in a ThrottleGroup
+ * and holds related data.
+ */
+
+typedef struct ThrottleGroupMember {
+    AioContext *aio_context;
+    /* throttled_reqs_lock protects the CoQueues for throttled requests. */
+    CoMutex throttled_reqs_lock;
+    CoQueue throttled_reqs[2];
+
+    /* Nonzero if the I/O limits are currently being ignored; generally
+     * it is zero. Accessed with atomic operations.
+     */
+    unsigned int io_limits_disabled;
+
+    /* Number of pending throttle_group_restart_queue_entry() coroutines.
+     * Accessed with atomic operations.
+     */
+    unsigned int restart_pending;
+
+    /* The following fields are protected by the ThrottleGroup lock.
+     * See the ThrottleGroup documentation for details.
+     * throttle_state tells us if I/O limits are configured. */
+    ThrottleState *throttle_state;
+    ThrottleTimers throttle_timers;
+    unsigned pending_reqs[2];
+    QLIST_ENTRY(ThrottleGroupMember) round_robin;
+
+} ThrottleGroupMember;
+
+#define TYPE_THROTTLE_GROUP "throttle-group"
+OBJECT_DECLARE_SIMPLE_TYPE(ThrottleGroup, THROTTLE_GROUP)
+
+const char *throttle_group_get_name(ThrottleGroupMember *tgm);
+
+ThrottleState *throttle_group_incref(const char *name);
+void throttle_group_unref(ThrottleState *ts);
+
+void throttle_group_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg);
+void throttle_group_get_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg);
+
+void throttle_group_register_tgm(ThrottleGroupMember *tgm,
+                                 const char *groupname,
+                                 AioContext *ctx);
+void throttle_group_unregister_tgm(ThrottleGroupMember *tgm);
+void throttle_group_restart_tgm(ThrottleGroupMember *tgm);
+
+void coroutine_fn throttle_group_co_io_limits_intercept(ThrottleGroupMember *tgm,
+                                                        int64_t bytes,
+                                                        bool is_write);
+void throttle_group_attach_aio_context(ThrottleGroupMember *tgm,
+                                       AioContext *new_context);
+void throttle_group_detach_aio_context(ThrottleGroupMember *tgm);
+/*
+ * throttle_group_exists() must be called under the global
+ * mutex.
+ */
+bool throttle_group_exists(const char *name);
+
+#endif
diff --git a/include/block/write-threshold.h b/include/block/write-threshold.h
new file mode 100644
index 000000000..f50f923e7
--- /dev/null
+++ b/include/block/write-threshold.h
@@ -0,0 +1,47 @@
+/*
+ * QEMU System Emulator block write threshold notification
+ *
+ * Copyright Red Hat, Inc. 2014
+ *
+ * Authors:
+ *  Francesco Romani <fromani@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#ifndef BLOCK_WRITE_THRESHOLD_H
+#define BLOCK_WRITE_THRESHOLD_H
+
+#include "qemu/typedefs.h"
+
+/*
+ * bdrv_write_threshold_set:
+ *
+ * Set the write threshold for block devices, in bytes.
+ * Notify when a write exceeds the threshold, meaning the device
+ * is becoming full, so it can be transparently resized.
+ * To be used with thin-provisioned block devices.
+ *
+ * Use threshold_bytes == 0 to disable.
+ */
+void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes);
+
+/*
+ * bdrv_write_threshold_get
+ *
+ * Get the configured write threshold, in bytes.
+ * Zero means no threshold configured.
+ */
+uint64_t bdrv_write_threshold_get(const BlockDriverState *bs);
+
+/*
+ * bdrv_write_threshold_check_write
+ *
+ * Check whether the specified request exceeds the write threshold.
+ * If so, send a corresponding event and disable write threshold checking.
+ */
+void bdrv_write_threshold_check_write(BlockDriverState *bs, int64_t offset,
+                                      int64_t bytes);
+
+#endif
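Finally, a sketch of how the write-threshold API above is intended to be driven. This too is an editorial illustration, not part of the patch: a real caller would be the block layer's write path, the example_* names are invented, and the 1 GiB figure is arbitrary.

#include "qemu/osdep.h"
#include "block/write-threshold.h"

/* Arm a 1 GiB threshold on a thin-provisioned image; passing 0 instead
 * would disable the notification again. */
static void example_arm_threshold(BlockDriverState *bs)
{
    bdrv_write_threshold_set(bs, 1ULL << 30);
}

/* Called for each write request: if a threshold is armed and the range
 * [offset, offset + bytes) crosses it, the corresponding event is emitted
 * and the threshold is disarmed until it is set again. */
static void example_on_write(BlockDriverState *bs, int64_t offset,
                             int64_t bytes)
{
    if (bdrv_write_threshold_get(bs) > 0) {
        bdrv_write_threshold_check_write(bs, offset, bytes);
    }
}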