Diffstat (limited to 'include/block')
-rw-r--r--  include/block/accounting.h        123
-rw-r--r--  include/block/aio-wait.h          149
-rw-r--r--  include/block/aio.h               770
-rw-r--r--  include/block/aio_task.h           54
-rw-r--r--  include/block/block-copy.h         96
-rw-r--r--  include/block/block-hmp-cmds.h     54
-rw-r--r--  include/block/block.h             864
-rw-r--r--  include/block/block_backup.h       25
-rw-r--r--  include/block/block_int.h        1504
-rw-r--r--  include/block/blockjob.h          176
-rw-r--r--  include/block/blockjob_int.h      122
-rw-r--r--  include/block/dirty-bitmap.h      121
-rw-r--r--  include/block/export.h             89
-rw-r--r--  include/block/fuse.h               30
-rw-r--r--  include/block/nbd.h               427
-rw-r--r--  include/block/nvme.h             1501
-rw-r--r--  include/block/qapi.h               45
-rw-r--r--  include/block/qdict.h              32
-rw-r--r--  include/block/raw-aio.h            90
-rw-r--r--  include/block/replication.h       175
-rw-r--r--  include/block/snapshot.h          102
-rw-r--r--  include/block/thread-pool.h        37
-rw-r--r--  include/block/throttle-groups.h    91
-rw-r--r--  include/block/write-threshold.h    47
24 files changed, 6724 insertions, 0 deletions
diff --git a/include/block/accounting.h b/include/block/accounting.h
new file mode 100644
index 000000000..878b4c358
--- /dev/null
+++ b/include/block/accounting.h
@@ -0,0 +1,123 @@
+/*
+ * QEMU System Emulator block accounting
+ *
+ * Copyright (c) 2011 Christoph Hellwig
+ * Copyright (c) 2015 Igalia, S.L.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef BLOCK_ACCOUNTING_H
+#define BLOCK_ACCOUNTING_H
+
+#include "qemu/timed-average.h"
+#include "qemu/thread.h"
+#include "qapi/qapi-builtin-types.h"
+
+typedef struct BlockAcctTimedStats BlockAcctTimedStats;
+typedef struct BlockAcctStats BlockAcctStats;
+
+enum BlockAcctType {
+ BLOCK_ACCT_NONE = 0,
+ BLOCK_ACCT_READ,
+ BLOCK_ACCT_WRITE,
+ BLOCK_ACCT_FLUSH,
+ BLOCK_ACCT_UNMAP,
+ BLOCK_MAX_IOTYPE,
+};
+
+struct BlockAcctTimedStats {
+ BlockAcctStats *stats;
+ TimedAverage latency[BLOCK_MAX_IOTYPE];
+ unsigned interval_length; /* in seconds */
+ QSLIST_ENTRY(BlockAcctTimedStats) entries;
+};
+
+typedef struct BlockLatencyHistogram {
+ /* The following histogram is represented like this:
+ *
+ * 5| *
+ * 4| *
+ * 3| * *
+ * 2| * * *
+ * 1| * * * *
+ * +------------------
+ * 10 50 100
+ *
+ * BlockLatencyHistogram histogram = {
+ * .nbins = 4,
+ * .boundaries = {10, 50, 100},
+ * .bins = {3, 1, 5, 2},
+ * };
+ *
+ * The @boundaries array defines the histogram intervals as follows:
+ * [0, boundaries[0]), [boundaries[0], boundaries[1]), ...
+ * [boundaries[nbins-2], +inf)
+ *
+ * So, for the example above, the histogram intervals are:
+ * [0, 10), [10, 50), [50, 100), [100, +inf)
+ */
+ int nbins;
+ uint64_t *boundaries; /* @nbins-1 numbers here
+ (all boundaries, except 0 and +inf) */
+ uint64_t *bins;
+} BlockLatencyHistogram;
+
+struct BlockAcctStats {
+ QemuMutex lock;
+ uint64_t nr_bytes[BLOCK_MAX_IOTYPE];
+ uint64_t nr_ops[BLOCK_MAX_IOTYPE];
+ uint64_t invalid_ops[BLOCK_MAX_IOTYPE];
+ uint64_t failed_ops[BLOCK_MAX_IOTYPE];
+ uint64_t total_time_ns[BLOCK_MAX_IOTYPE];
+ uint64_t merged[BLOCK_MAX_IOTYPE];
+ int64_t last_access_time_ns;
+ QSLIST_HEAD(, BlockAcctTimedStats) intervals;
+ bool account_invalid;
+ bool account_failed;
+ BlockLatencyHistogram latency_histogram[BLOCK_MAX_IOTYPE];
+};
+
+typedef struct BlockAcctCookie {
+ int64_t bytes;
+ int64_t start_time_ns;
+ enum BlockAcctType type;
+} BlockAcctCookie;
+
+void block_acct_init(BlockAcctStats *stats);
+void block_acct_setup(BlockAcctStats *stats, bool account_invalid,
+ bool account_failed);
+void block_acct_cleanup(BlockAcctStats *stats);
+void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length);
+BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats,
+ BlockAcctTimedStats *s);
+void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie,
+ int64_t bytes, enum BlockAcctType type);
+void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie);
+void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie);
+void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type);
+void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type,
+ int num_requests);
+int64_t block_acct_idle_time_ns(BlockAcctStats *stats);
+double block_acct_queue_depth(BlockAcctTimedStats *stats,
+ enum BlockAcctType type);
+int block_latency_histogram_set(BlockAcctStats *stats, enum BlockAcctType type,
+ uint64List *boundaries);
+void block_latency_histograms_clear(BlockAcctStats *stats);
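+
+/*
+ * Typical accounting flow for one request (an illustrative sketch; how the
+ * BlockAcctStats pointer and the request size are obtained is assumed, not
+ * part of this header):
+ *
+ *     BlockAcctCookie cookie;
+ *
+ *     block_acct_start(stats, &cookie, 4096, BLOCK_ACCT_READ);
+ *     ret = ...perform the read...;
+ *     if (ret < 0) {
+ *         block_acct_failed(stats, &cookie);
+ *     } else {
+ *         block_acct_done(stats, &cookie);
+ *     }
+ */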
+
+#endif
diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h
new file mode 100644
index 000000000..b39eefb38
--- /dev/null
+++ b/include/block/aio-wait.h
@@ -0,0 +1,149 @@
+/*
+ * AioContext wait support
+ *
+ * Copyright (C) 2018 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_AIO_WAIT_H
+#define QEMU_AIO_WAIT_H
+
+#include "block/aio.h"
+#include "qemu/main-loop.h"
+
+/**
+ * AioWait:
+ *
+ * An object that facilitates synchronous waiting on a condition. A single
+ * global AioWait object (global_aio_wait) is used internally.
+ *
+ * The main loop can wait on an operation running in an IOThread as follows:
+ *
+ * AioContext *ctx = ...;
+ * MyWork work = { .done = false };
+ * schedule_my_work_in_iothread(ctx, &work);
+ * AIO_WAIT_WHILE(ctx, !work.done);
+ *
+ * The IOThread must call aio_wait_kick() to notify the main loop when
+ * work.done changes:
+ *
+ * static void do_work(...)
+ * {
+ * ...
+ * work.done = true;
+ * aio_wait_kick();
+ * }
+ */
+typedef struct {
+ /* Number of waiting AIO_WAIT_WHILE() callers. Accessed with atomic ops. */
+ unsigned num_waiters;
+} AioWait;
+
+extern AioWait global_aio_wait;
+
+/**
+ * AIO_WAIT_WHILE:
+ * @ctx: the aio context, or NULL if multiple aio contexts (for which the
+ * caller does not hold a lock) are involved in the polling condition.
+ * @cond: wait while this conditional expression is true
+ *
+ * Wait while a condition is true. Use this to implement synchronous
+ * operations that require event loop activity.
+ *
+ * The caller must be sure that something calls aio_wait_kick() when the value
+ * of @cond might have changed.
+ *
+ * The caller's thread must be the IOThread that owns @ctx or the main loop
+ * thread (with @ctx acquired exactly once). This function cannot be used to
+ * wait on conditions between two IOThreads since that could lead to deadlock;
+ * go via the main loop instead.
+ */
+#define AIO_WAIT_WHILE(ctx, cond) ({ \
+ bool waited_ = false; \
+ AioWait *wait_ = &global_aio_wait; \
+ AioContext *ctx_ = (ctx); \
+ /* Increment wait_->num_waiters before evaluating cond. */ \
+ qatomic_inc(&wait_->num_waiters); \
+ if (ctx_ && in_aio_context_home_thread(ctx_)) { \
+ while ((cond)) { \
+ aio_poll(ctx_, true); \
+ waited_ = true; \
+ } \
+ } else { \
+ assert(qemu_get_current_aio_context() == \
+ qemu_get_aio_context()); \
+ while ((cond)) { \
+ if (ctx_) { \
+ aio_context_release(ctx_); \
+ } \
+ aio_poll(qemu_get_aio_context(), true); \
+ if (ctx_) { \
+ aio_context_acquire(ctx_); \
+ } \
+ waited_ = true; \
+ } \
+ } \
+ qatomic_dec(&wait_->num_waiters); \
+ waited_; })
+
+/**
+ * aio_wait_kick:
+ * Wake up the main thread if it is waiting on AIO_WAIT_WHILE(). During
+ * synchronous operations performed in an IOThread, the main thread lets the
+ * IOThread's event loop run, waiting for the operation to complete. An
+ * aio_wait_kick() call will wake up the main thread.
+ */
+void aio_wait_kick(void);
+
+/**
+ * aio_wait_bh_oneshot:
+ * @ctx: the aio context
+ * @cb: the BH callback function
+ * @opaque: user data for the BH callback function
+ *
+ * Run a BH in @ctx and wait for it to complete.
+ *
+ * Must be called from the main loop thread with @ctx acquired exactly once.
+ * Note that main loop event processing may occur.
+ */
+void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque);
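+
+/*
+ * For example, a caller might run a state change in the IOThread's context
+ * and wait for it (a sketch; MyState and its field are hypothetical):
+ *
+ *     static void set_started_bh(void *opaque)
+ *     {
+ *         MyState *s = opaque;
+ *         s->started = true;
+ *     }
+ *
+ *     aio_wait_bh_oneshot(ctx, set_started_bh, s);
+ */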
+
+/**
+ * in_aio_context_home_thread:
+ * @ctx: the aio context
+ *
+ * Return whether we are running in the thread that normally runs @ctx. Note
+ * that acquiring/releasing ctx does not affect the outcome; each AioContext
+ * still only has one home thread that is responsible for running it.
+ */
+static inline bool in_aio_context_home_thread(AioContext *ctx)
+{
+ if (ctx == qemu_get_current_aio_context()) {
+ return true;
+ }
+
+ if (ctx == qemu_get_aio_context()) {
+ return qemu_mutex_iothread_locked();
+ } else {
+ return false;
+ }
+}
+
+#endif /* QEMU_AIO_WAIT_H */
diff --git a/include/block/aio.h b/include/block/aio.h
new file mode 100644
index 000000000..47fbe9d81
--- /dev/null
+++ b/include/block/aio.h
@@ -0,0 +1,770 @@
+/*
+ * QEMU aio implementation
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_AIO_H
+#define QEMU_AIO_H
+
+#ifdef CONFIG_LINUX_IO_URING
+#include <liburing.h>
+#endif
+#include "qemu/coroutine.h"
+#include "qemu/queue.h"
+#include "qemu/event_notifier.h"
+#include "qemu/thread.h"
+#include "qemu/timer.h"
+
+typedef struct BlockAIOCB BlockAIOCB;
+typedef void BlockCompletionFunc(void *opaque, int ret);
+
+typedef struct AIOCBInfo {
+ void (*cancel_async)(BlockAIOCB *acb);
+ AioContext *(*get_aio_context)(BlockAIOCB *acb);
+ size_t aiocb_size;
+} AIOCBInfo;
+
+struct BlockAIOCB {
+ const AIOCBInfo *aiocb_info;
+ BlockDriverState *bs;
+ BlockCompletionFunc *cb;
+ void *opaque;
+ int refcnt;
+};
+
+void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs,
+ BlockCompletionFunc *cb, void *opaque);
+void qemu_aio_unref(void *p);
+void qemu_aio_ref(void *p);
+
+typedef struct AioHandler AioHandler;
+typedef QLIST_HEAD(, AioHandler) AioHandlerList;
+typedef void QEMUBHFunc(void *opaque);
+typedef bool AioPollFn(void *opaque);
+typedef void IOHandler(void *opaque);
+
+struct Coroutine;
+struct ThreadPool;
+struct LinuxAioState;
+struct LuringState;
+
+/* Is polling disabled? */
+bool aio_poll_disabled(AioContext *ctx);
+
+/* Callbacks for file descriptor monitoring implementations */
+typedef struct {
+ /*
+ * update:
+ * @ctx: the AioContext
+ * @old_node: the existing handler or NULL if this file descriptor is being
+ * monitored for the first time
+ * @new_node: the new handler or NULL if this file descriptor is being
+ * removed
+ *
+ * Add/remove/modify a monitored file descriptor.
+ *
+ * Called with ctx->list_lock acquired.
+ */
+ void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node);
+
+ /*
+ * wait:
+ * @ctx: the AioContext
+ * @ready_list: list for handlers that become ready
+ * @timeout: maximum duration to wait, in nanoseconds
+ *
+ * Wait for file descriptors to become ready and place them on ready_list.
+ *
+ * Called with ctx->list_lock incremented but not locked.
+ *
+ * Returns: number of ready file descriptors.
+ */
+ int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout);
+
+ /*
+ * need_wait:
+ * @ctx: the AioContext
+ *
+ * Tell aio_poll() when to stop userspace polling early because ->wait()
+ * has fds ready.
+ *
+ * File descriptor monitoring implementations that cannot poll fd readiness
+ * from userspace should use aio_poll_disabled() here. This ensures that
+ * file descriptors are not starved by handlers that frequently make
+ * progress via userspace polling.
+ *
+ * Returns: true if ->wait() should be called, false otherwise.
+ */
+ bool (*need_wait)(AioContext *ctx);
+} FDMonOps;
+
+/*
+ * Each aio_bh_poll() call carves off a slice of the BH list, so that newly
+ * scheduled BHs are not processed until the next aio_bh_poll() call. All
+ * active aio_bh_poll() calls chain their slices together in a list, so that
+ * nested aio_bh_poll() calls process all scheduled bottom halves.
+ */
+typedef QSLIST_HEAD(, QEMUBH) BHList;
+typedef struct BHListSlice BHListSlice;
+struct BHListSlice {
+ BHList bh_list;
+ QSIMPLEQ_ENTRY(BHListSlice) next;
+};
+
+typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
+
+struct AioContext {
+ GSource source;
+
+ /* Used by AioContext users to protect from multi-threaded access. */
+ QemuRecMutex lock;
+
+ /* The list of registered AIO handlers. Protected by ctx->list_lock. */
+ AioHandlerList aio_handlers;
+
+ /* The list of AIO handlers to be deleted. Protected by ctx->list_lock. */
+ AioHandlerList deleted_aio_handlers;
+
+ /* Used to avoid unnecessary event_notifier_set calls in aio_notify;
+ * only written from the AioContext home thread, or under the BQL in
+ * the case of the main AioContext. However, it is read from any
+ * thread so it is still accessed with atomic primitives.
+ *
+ * If this field is 0, everything (file descriptors, bottom halves,
+ * timers) will be re-evaluated before the next blocking poll() or
+ * io_uring wait; therefore, the event_notifier_set call can be
+ * skipped. If it is non-zero, you may need to wake up a concurrent
+ * aio_poll or the glib main event loop, making event_notifier_set
+ * necessary.
+ *
+ * Bit 0 is reserved for GSource usage of the AioContext, and is 1
+ * between a call to aio_ctx_prepare and the next call to aio_ctx_check.
+ * Bits 1-31 simply count the number of active calls to aio_poll
+ * that are in the prepare or poll phase.
+ *
+ * The GSource and aio_poll must use a different mechanism because
+ * there is no certainty that a call to GSource's prepare callback
+ * (via g_main_context_prepare) is indeed followed by check and
+ * dispatch. It's not clear whether this would be a bug, but let's
+ * play safe and allow it---it will just cause extra calls to
+ * event_notifier_set until the next call to dispatch.
+ *
+ * Instead, the aio_poll calls include both the prepare and the
+ * dispatch phase, hence a simple counter is enough for them.
+ */
+ uint32_t notify_me;
+
+ /* A lock to protect against concurrent QEMUBH and AioHandler additions and
+ * deletions, and to ensure that no callbacks are removed while we're
+ * walking and dispatching them.
+ */
+ QemuLockCnt list_lock;
+
+ /* Bottom Halves pending aio_bh_poll() processing */
+ BHList bh_list;
+
+ /* Chained BH list slices for each nested aio_bh_poll() call */
+ QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list;
+
+ /* Used by aio_notify.
+ *
+ * "notified" is used to avoid expensive event_notifier_test_and_clear
+ * calls. When it is clear, the EventNotifier is clear, or one thread
+ * is going to clear "notified" before processing more events. False
+ * positives are possible, i.e. "notified" could be set even though the
+ * EventNotifier is clear.
+ *
+ * Note that event_notifier_set *cannot* be optimized the same way. For
+ * more information on the problem that would result, see "#ifdef BUG2"
+ * in the docs/aio_notify_accept.promela formal model.
+ */
+ bool notified;
+ EventNotifier notifier;
+
+ QSLIST_HEAD(, Coroutine) scheduled_coroutines;
+ QEMUBH *co_schedule_bh;
+
+ /* Thread pool for performing work and receiving completion callbacks.
+ * Has its own locking.
+ */
+ struct ThreadPool *thread_pool;
+
+#ifdef CONFIG_LINUX_AIO
+ /*
+ * State for native Linux AIO. Uses aio_context_acquire/release for
+ * locking.
+ */
+ struct LinuxAioState *linux_aio;
+#endif
+#ifdef CONFIG_LINUX_IO_URING
+ /*
+ * State for Linux io_uring. Uses aio_context_acquire/release for
+ * locking.
+ */
+ struct LuringState *linux_io_uring;
+
+ /* State for file descriptor monitoring using Linux io_uring */
+ struct io_uring fdmon_io_uring;
+ AioHandlerSList submit_list;
+#endif
+
+ /* TimerLists for calling timers - one per clock type. Has its own
+ * locking.
+ */
+ QEMUTimerListGroup tlg;
+
+ int external_disable_cnt;
+
+ /* Number of AioHandlers without .io_poll() */
+ int poll_disable_cnt;
+
+ /* Polling mode parameters */
+ int64_t poll_ns; /* current polling time in nanoseconds */
+ int64_t poll_max_ns; /* maximum polling time in nanoseconds */
+ int64_t poll_grow; /* polling time growth factor */
+ int64_t poll_shrink; /* polling time shrink factor */
+
+ /* AIO engine parameters */
+ int64_t aio_max_batch; /* maximum number of requests in a batch */
+
+ /*
+ * List of handlers participating in userspace polling. Protected by
+ * ctx->list_lock. Iterated and modified mostly by the event loop thread
+ * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler()
+ * only touches the list to delete nodes if ctx->list_lock's count is zero.
+ */
+ AioHandlerList poll_aio_handlers;
+
+ /* Are we in polling mode or monitoring file descriptors? */
+ bool poll_started;
+
+ /* epoll(7) state used when built with CONFIG_EPOLL */
+ int epollfd;
+
+ const FDMonOps *fdmon_ops;
+};
+
+/**
+ * aio_context_new: Allocate a new AioContext.
+ *
+ * AioContexts provide a mini event loop that can be waited on synchronously.
+ * They also provide bottom halves, a service to execute a piece of code
+ * as soon as possible.
+ */
+AioContext *aio_context_new(Error **errp);
+
+/**
+ * aio_context_ref:
+ * @ctx: The AioContext to operate on.
+ *
+ * Add a reference to an AioContext.
+ */
+void aio_context_ref(AioContext *ctx);
+
+/**
+ * aio_context_unref:
+ * @ctx: The AioContext to operate on.
+ *
+ * Drop a reference to an AioContext.
+ */
+void aio_context_unref(AioContext *ctx);
+
+/* Take ownership of the AioContext. If the AioContext will be shared between
+ * threads, and a thread does not want to be interrupted, it will have to
+ * take ownership around calls to aio_poll(). Otherwise, aio_poll()
+ * automatically takes care of calling aio_context_acquire and
+ * aio_context_release.
+ *
+ * Note that this is separate from bdrv_drained_begin/bdrv_drained_end. A
+ * thread still has to call those to avoid being interrupted by the guest.
+ *
+ * Bottom halves, timers and callbacks can be created or removed without
+ * acquiring the AioContext.
+ */
+void aio_context_acquire(AioContext *ctx);
+
+/* Relinquish ownership of the AioContext. */
+void aio_context_release(AioContext *ctx);
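+
+/*
+ * A typical guarded section looks like this (a sketch; the bdrv_pwrite()
+ * call stands in for any operation that must run under the context lock):
+ *
+ *     aio_context_acquire(ctx);
+ *     ret = bdrv_pwrite(child, offset, buf, bytes);
+ *     aio_context_release(ctx);
+ */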
+
+/**
+ * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will
+ * run only once and as soon as possible.
+ *
+ * @name: A human-readable identifier for debugging purposes.
+ */
+void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
+ const char *name);
+
+/**
+ * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run
+ * only once and as soon as possible.
+ *
+ * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as the
+ * name string.
+ */
+#define aio_bh_schedule_oneshot(ctx, cb, opaque) \
+ aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb)))
+
+/**
+ * aio_bh_new_full: Allocate a new bottom half structure.
+ *
+ * Bottom halves are lightweight callbacks whose invocation is guaranteed
+ * to be wait-free, thread-safe and signal-safe. The #QEMUBH structure
+ * is opaque and must be allocated prior to its use.
+ *
+ * @name: A human-readable identifier for debugging purposes.
+ */
+QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque,
+ const char *name);
+
+/**
+ * aio_bh_new: Allocate a new bottom half structure
+ *
+ * A convenience wrapper for aio_bh_new_full() that uses cb as the name
+ * string.
+ */
+#define aio_bh_new(ctx, cb, opaque) \
+ aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb)))
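+
+/*
+ * A minimal bottom half lifecycle (illustrative; my_bh_cb and opaque are
+ * placeholders):
+ *
+ *     static void my_bh_cb(void *opaque)
+ *     {
+ *         ...runs in ctx's event loop...
+ *     }
+ *
+ *     QEMUBH *bh = aio_bh_new(ctx, my_bh_cb, opaque);
+ *     qemu_bh_schedule(bh);
+ *     ...
+ *     qemu_bh_delete(bh);
+ */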
+
+/**
+ * aio_notify: Force processing of pending events.
+ *
+ * Similar to signaling a condition variable, aio_notify forces
+ * aio_poll to exit, so that the next call will re-examine pending events.
+ * The caller of aio_notify will usually call aio_poll again very soon,
+ * or go through another iteration of the GLib main loop. Hence, aio_notify
+ * also has the side effect of recalculating the sets of file descriptors
+ * that the main loop waits for.
+ *
+ * Calling aio_notify is rarely necessary, because for example scheduling
+ * a bottom half calls it already.
+ */
+void aio_notify(AioContext *ctx);
+
+/**
+ * aio_notify_accept: Acknowledge receiving an aio_notify.
+ *
+ * aio_notify() uses an EventNotifier in order to wake up a sleeping
+ * aio_poll() or g_main_context_iteration(). Calls to aio_notify() are
+ * usually rare, but the AioContext has to clear the EventNotifier on
+ * every aio_poll() or g_main_context_iteration() in order to avoid
+ * busy waiting. This event_notifier_test_and_clear() cannot be done
+ * using the usual aio_context_set_event_notifier(), because it must
+ * be done before processing all events (file descriptors, bottom halves,
+ * timers).
+ *
+ * aio_notify_accept() is an optimized event_notifier_test_and_clear()
+ * that is specific to an AioContext's notifier; it is used internally
+ * to clear the EventNotifier only if aio_notify() had been called.
+ */
+void aio_notify_accept(AioContext *ctx);
+
+/**
+ * aio_bh_call: Execute the callback function of the specified BH.
+ */
+void aio_bh_call(QEMUBH *bh);
+
+/**
+ * aio_bh_poll: Poll bottom halves for an AioContext.
+ *
+ * This is an internal function used by the QEMU main loop.
+ * Note that multiple invocations of aio_bh_poll() cannot be
+ * made concurrently.
+ */
+int aio_bh_poll(AioContext *ctx);
+
+/**
+ * qemu_bh_schedule: Schedule a bottom half.
+ *
+ * Scheduling a bottom half interrupts the main loop and causes the
+ * execution of the callback that was passed to qemu_bh_new.
+ *
+ * Bottom halves that are scheduled from a bottom half handler are instantly
+ * invoked. This can create an infinite loop if a bottom half handler
+ * schedules itself.
+ *
+ * @bh: The bottom half to be scheduled.
+ */
+void qemu_bh_schedule(QEMUBH *bh);
+
+/**
+ * qemu_bh_cancel: Cancel execution of a bottom half.
+ *
+ * Canceling execution of a bottom half undoes the effect of calls to
+ * qemu_bh_schedule without freeing its resources yet. While cancellation
+ * itself is also wait-free and thread-safe, it can of course race with the
+ * loop that executes bottom halves unless you are holding the iothread
+ * mutex. This makes it mostly useless if you are not holding the mutex.
+ *
+ * @bh: The bottom half to be canceled.
+ */
+void qemu_bh_cancel(QEMUBH *bh);
+
+/**
+ * qemu_bh_delete: Cancel execution of a bottom half and free its resources.
+ *
+ * Deleting a bottom half frees the memory that was allocated for it by
+ * qemu_bh_new. It also implies canceling the bottom half if it was
+ * scheduled.
+ * This function is asynchronous: the bottom half is only actually deleted
+ * at the end of the current bottom-half processing.
+ *
+ * @bh: The bottom half to be deleted.
+ */
+void qemu_bh_delete(QEMUBH *bh);
+
+/* Return whether there are any pending callbacks from the GSource
+ * attached to the AioContext, before g_poll is invoked.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_prepare(AioContext *ctx);
+
+/* Return whether there are any pending callbacks from the GSource
+ * attached to the AioContext, after g_poll is invoked.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_pending(AioContext *ctx);
+
+/* Dispatch any pending callbacks from the GSource attached to the AioContext.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+void aio_dispatch(AioContext *ctx);
+
+/* Make progress in completing AIO work. This can issue new pending
+ * AIO requests as a result of executing I/O completion or BH callbacks.
+ *
+ * Return whether any progress was made by executing AIO or bottom half
+ * handlers. If @blocking == true, this should always be true except
+ * if someone called aio_notify.
+ *
+ * If there are no pending bottom halves, but there are pending AIO
+ * operations, it may not be possible to make any progress without
+ * blocking. If @blocking is true, this function will wait until one
+ * or more AIO events have completed, to ensure progress is made
+ * before returning.
+ */
+bool aio_poll(AioContext *ctx, bool blocking);
+
+/* Register a file descriptor and associated callbacks. Behaves very similarly
+ * to qemu_set_fd_handler. Unlike qemu_set_fd_handler, these callbacks will
+ * be invoked when using aio_poll().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of qemu_set_fd_handler[2].
+ */
+void aio_set_fd_handler(AioContext *ctx,
+ int fd,
+ bool is_external,
+ IOHandler *io_read,
+ IOHandler *io_write,
+ AioPollFn *io_poll,
+ void *opaque);
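+
+/*
+ * For instance, to monitor a socket for readability and later unregister it
+ * (a sketch; sock_read_cb is a placeholder, and passing NULL handlers
+ * removes the registration):
+ *
+ *     aio_set_fd_handler(ctx, fd, true, sock_read_cb, NULL, NULL, opaque);
+ *     ...
+ *     aio_set_fd_handler(ctx, fd, true, NULL, NULL, NULL, NULL);
+ */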
+
+/* Set polling begin/end callbacks for a file descriptor that has already been
+ * registered with aio_set_fd_handler. Do nothing if the file descriptor is
+ * not registered.
+ */
+void aio_set_fd_poll(AioContext *ctx, int fd,
+ IOHandler *io_poll_begin,
+ IOHandler *io_poll_end);
+
+/* Register an event notifier and associated callbacks. Behaves very similarly
+ * to event_notifier_set_handler. Unlike event_notifier_set_handler, these callbacks
+ * will be invoked when using aio_poll().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of event_notifier_set_handler.
+ */
+void aio_set_event_notifier(AioContext *ctx,
+ EventNotifier *notifier,
+ bool is_external,
+ EventNotifierHandler *io_read,
+ AioPollFn *io_poll);
+
+/* Set polling begin/end callbacks for an event notifier that has already been
+ * registered with aio_set_event_notifier. Do nothing if the event notifier is
+ * not registered.
+ */
+void aio_set_event_notifier_poll(AioContext *ctx,
+ EventNotifier *notifier,
+ EventNotifierHandler *io_poll_begin,
+ EventNotifierHandler *io_poll_end);
+
+/* Return a GSource that lets the main loop poll the file descriptors attached
+ * to this AioContext.
+ */
+GSource *aio_get_g_source(AioContext *ctx);
+
+/* Return the ThreadPool bound to this AioContext */
+struct ThreadPool *aio_get_thread_pool(AioContext *ctx);
+
+/* Setup the LinuxAioState bound to this AioContext */
+struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp);
+
+/* Return the LinuxAioState bound to this AioContext */
+struct LinuxAioState *aio_get_linux_aio(AioContext *ctx);
+
+/* Setup the LuringState bound to this AioContext */
+struct LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp);
+
+/* Return the LuringState bound to this AioContext */
+struct LuringState *aio_get_linux_io_uring(AioContext *ctx);
+
+/**
+ * aio_timer_new_with_attrs:
+ * @ctx: the aio context
+ * @type: the clock type
+ * @scale: the scale
+ * @attributes: 0, or one or more OR'ed QEMU_TIMER_ATTR_<id> values
+ * to assign
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Allocate a new timer (with attributes) attached to the context @ctx.
+ * The function is responsible for memory allocation.
+ *
+ * The preferred interface is aio_timer_init or aio_timer_init_with_attrs.
+ * Use that unless you really need dynamic memory allocation.
+ *
+ * Returns: a pointer to the new timer
+ */
+static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx,
+ QEMUClockType type,
+ int scale, int attributes,
+ QEMUTimerCB *cb, void *opaque)
+{
+ return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque);
+}
+
+/**
+ * aio_timer_new:
+ * @ctx: the aio context
+ * @type: the clock type
+ * @scale: the scale
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Allocate a new timer attached to the context @ctx.
+ * See aio_timer_new_with_attrs for details.
+ *
+ * Returns: a pointer to the new timer
+ */
+static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type,
+ int scale,
+ QEMUTimerCB *cb, void *opaque)
+{
+ return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque);
+}
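+
+/*
+ * For example, arming a one-second timer (a sketch; timer_cb is a
+ * placeholder, and timer_mod()/qemu_clock_get_ms() come from qemu/timer.h):
+ *
+ *     QEMUTimer *t = aio_timer_new(ctx, QEMU_CLOCK_REALTIME, SCALE_MS,
+ *                                  timer_cb, opaque);
+ *     timer_mod(t, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 1000);
+ */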
+
+/**
+ * aio_timer_init_with_attrs:
+ * @ctx: the aio context
+ * @ts: the timer
+ * @type: the clock type
+ * @scale: the scale
+ * @attributes: 0, or one or more OR'ed QEMU_TIMER_ATTR_<id> values
+ * to assign
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Initialise a new timer (with attributes) attached to the context @ctx.
+ * The caller is responsible for memory allocation.
+ */
+static inline void aio_timer_init_with_attrs(AioContext *ctx,
+ QEMUTimer *ts, QEMUClockType type,
+ int scale, int attributes,
+ QEMUTimerCB *cb, void *opaque)
+{
+ timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque);
+}
+
+/**
+ * aio_timer_init:
+ * @ctx: the aio context
+ * @ts: the timer
+ * @type: the clock type
+ * @scale: the scale
+ * @cb: the callback to call on timer expiry
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Initialise a new timer attached to the context @ctx.
+ * See aio_timer_init_with_attrs for details.
+ */
+static inline void aio_timer_init(AioContext *ctx,
+ QEMUTimer *ts, QEMUClockType type,
+ int scale,
+ QEMUTimerCB *cb, void *opaque)
+{
+ timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque);
+}
+
+/**
+ * aio_compute_timeout:
+ * @ctx: the aio context
+ *
+ * Compute the timeout that a blocking aio_poll should use.
+ */
+int64_t aio_compute_timeout(AioContext *ctx);
+
+/**
+ * aio_disable_external:
+ * @ctx: the aio context
+ *
+ * Disable the further processing of external clients.
+ */
+static inline void aio_disable_external(AioContext *ctx)
+{
+ qatomic_inc(&ctx->external_disable_cnt);
+}
+
+/**
+ * aio_enable_external:
+ * @ctx: the aio context
+ *
+ * Enable the processing of external clients.
+ */
+static inline void aio_enable_external(AioContext *ctx)
+{
+ int old;
+
+ old = qatomic_fetch_dec(&ctx->external_disable_cnt);
+ assert(old > 0);
+ if (old == 1) {
+ /* Kick event loop so it re-arms file descriptors */
+ aio_notify(ctx);
+ }
+}
+
+/**
+ * aio_external_disabled:
+ * @ctx: the aio context
+ *
+ * Return true if the external clients are disabled.
+ */
+static inline bool aio_external_disabled(AioContext *ctx)
+{
+ return qatomic_read(&ctx->external_disable_cnt);
+}
+
+/**
+ * aio_node_check:
+ * @ctx: the aio context
+ * @is_external: Whether or not the checked node is an external event source.
+ *
+ * Check whether a node with the given is_external flag may be polled by @ctx
+ * at this moment. A return value of true means it may.
+ */
+static inline bool aio_node_check(AioContext *ctx, bool is_external)
+{
+ return !is_external || !qatomic_read(&ctx->external_disable_cnt);
+}
+
+/**
+ * aio_co_schedule:
+ * @ctx: the aio context
+ * @co: the coroutine
+ *
+ * Start a coroutine on a remote AioContext.
+ *
+ * The coroutine must not be entered by anyone else while aio_co_schedule()
+ * is active. In addition the coroutine must have yielded unless ctx
+ * is the context in which the coroutine is running (i.e. the value of
+ * qemu_get_current_aio_context() from the coroutine itself).
+ */
+void aio_co_schedule(AioContext *ctx, struct Coroutine *co);
+
+/**
+ * aio_co_reschedule_self:
+ * @new_ctx: the new context
+ *
+ * Move the currently running coroutine to new_ctx. If the coroutine is already
+ * running in new_ctx, do nothing.
+ */
+void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx);
+
+/**
+ * aio_co_wake:
+ * @co: the coroutine
+ *
+ * Restart a coroutine on the AioContext where it was running last, thus
+ * preventing coroutines from jumping from one context to another when they
+ * go to sleep.
+ *
+ * aio_co_wake may be executed either in coroutine or non-coroutine
+ * context. The coroutine must not be entered by anyone else while
+ * aio_co_wake() is active.
+ */
+void aio_co_wake(struct Coroutine *co);
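+
+/*
+ * The usual pattern (a sketch; "s" and its co field are placeholders) is for
+ * the coroutine to publish itself and yield, and for the completion path to
+ * wake it:
+ *
+ *     // in the coroutine:
+ *     s->co = qemu_coroutine_self();
+ *     qemu_coroutine_yield();
+ *
+ *     // in the completion callback, possibly from another thread:
+ *     aio_co_wake(s->co);
+ */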
+
+/**
+ * aio_co_enter:
+ * @ctx: the context to run the coroutine
+ * @co: the coroutine to run
+ *
+ * Enter a coroutine in the specified AioContext.
+ */
+void aio_co_enter(AioContext *ctx, struct Coroutine *co);
+
+/**
+ * Return the AioContext whose event loop runs in the current thread.
+ *
+ * If called from an IOThread this will be the IOThread's AioContext. If
+ * called from the main thread or with the "big QEMU lock" taken it
+ * will be the main loop AioContext.
+ */
+AioContext *qemu_get_current_aio_context(void);
+
+void qemu_set_current_aio_context(AioContext *ctx);
+
+/**
+ * aio_context_setup:
+ * @ctx: the aio context
+ *
+ * Initialize the aio context.
+ */
+void aio_context_setup(AioContext *ctx);
+
+/**
+ * aio_context_destroy:
+ * @ctx: the aio context
+ *
+ * Destroy the aio context.
+ */
+void aio_context_destroy(AioContext *ctx);
+
+/* Used internally, do not call outside AioContext code */
+void aio_context_use_g_source(AioContext *ctx);
+
+/**
+ * aio_context_set_poll_params:
+ * @ctx: the aio context
+ * @max_ns: how long to busy poll for, in nanoseconds
+ * @grow: polling time growth factor
+ * @shrink: polling time shrink factor
+ *
+ * Poll mode can be disabled by setting poll_max_ns to 0.
+ */
+void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
+ int64_t grow, int64_t shrink,
+ Error **errp);
+
+/**
+ * aio_context_set_aio_params:
+ * @ctx: the aio context
+ * @max_batch: maximum number of requests in a batch, 0 means that the
+ * engine will use its default
+ */
+void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch,
+ Error **errp);
+
+#endif
diff --git a/include/block/aio_task.h b/include/block/aio_task.h
new file mode 100644
index 000000000..50bc1e181
--- /dev/null
+++ b/include/block/aio_task.h
@@ -0,0 +1,54 @@
+/*
+ * AIO task loops
+ *
+ * Copyright (c) 2019 Virtuozzo International GmbH.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCK_AIO_TASK_H
+#define BLOCK_AIO_TASK_H
+
+#include "qemu/coroutine.h"
+
+typedef struct AioTaskPool AioTaskPool;
+typedef struct AioTask AioTask;
+typedef int coroutine_fn (*AioTaskFunc)(AioTask *task);
+struct AioTask {
+ AioTaskPool *pool;
+ AioTaskFunc func;
+ int ret;
+};
+
+AioTaskPool *coroutine_fn aio_task_pool_new(int max_busy_tasks);
+void aio_task_pool_free(AioTaskPool *);
+
+/* error code of failed task or 0 if all is OK */
+int aio_task_pool_status(AioTaskPool *pool);
+
+bool aio_task_pool_empty(AioTaskPool *pool);
+
+/* The user provides a filled @task; task->pool is set automatically */
+void coroutine_fn aio_task_pool_start_task(AioTaskPool *pool, AioTask *task);
+
+void coroutine_fn aio_task_pool_wait_slot(AioTaskPool *pool);
+void coroutine_fn aio_task_pool_wait_one(AioTaskPool *pool);
+void coroutine_fn aio_task_pool_wait_all(AioTaskPool *pool);
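+
+/*
+ * A sketch of how a caller-defined task embeds AioTask (MyTask and
+ * my_task_func are hypothetical; ownership of a started task is assumed to
+ * pass to the pool, as with the in-tree users):
+ *
+ *     typedef struct MyTask {
+ *         AioTask task;
+ *         int64_t offset;
+ *     } MyTask;
+ *
+ *     static int coroutine_fn my_task_func(AioTask *task)
+ *     {
+ *         MyTask *t = container_of(task, MyTask, task);
+ *         ...process t->offset...
+ *         return 0;
+ *     }
+ *
+ *     AioTaskPool *pool = aio_task_pool_new(16);
+ *     MyTask *t = g_new(MyTask, 1);
+ *     *t = (MyTask) { .task.func = my_task_func, .offset = 0 };
+ *     aio_task_pool_start_task(pool, &t->task);   // from coroutine context
+ *     aio_task_pool_wait_all(pool);
+ *     aio_task_pool_free(pool);
+ */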
+
+#endif /* BLOCK_AIO_TASK_H */
diff --git a/include/block/block-copy.h b/include/block/block-copy.h
new file mode 100644
index 000000000..99370fa38
--- /dev/null
+++ b/include/block/block-copy.h
@@ -0,0 +1,96 @@
+/*
+ * block_copy API
+ *
+ * Copyright (C) 2013 Proxmox Server Solutions
+ * Copyright (c) 2019 Virtuozzo International GmbH.
+ *
+ * Authors:
+ * Dietmar Maurer (dietmar@proxmox.com)
+ * Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef BLOCK_COPY_H
+#define BLOCK_COPY_H
+
+#include "block/block.h"
+#include "qemu/co-shared-resource.h"
+
+/* All APIs are thread-safe */
+
+typedef void (*BlockCopyAsyncCallbackFunc)(void *opaque);
+typedef struct BlockCopyState BlockCopyState;
+typedef struct BlockCopyCallState BlockCopyCallState;
+
+BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+ Error **errp);
+
+/* This function should be called prior to any actual copy request */
+void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range,
+ bool compress);
+void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm);
+
+void block_copy_state_free(BlockCopyState *s);
+
+int64_t block_copy_reset_unallocated(BlockCopyState *s,
+ int64_t offset, int64_t *count);
+
+int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
+ bool ignore_ratelimit);
+
+/*
+ * Run block-copy in a coroutine, create a corresponding BlockCopyCallState
+ * object, and return a pointer to it. Never returns NULL.
+ *
+ * The caller is responsible for calling block_copy_call_free() to free the
+ * BlockCopyCallState object.
+ *
+ * @max_workers is the maximum number of parallel coroutines executing
+ * sub-requests; it must be > 0.
+ *
+ * @max_chunk is the maximum length of one I/O operation. Zero means unlimited.
+ */
+BlockCopyCallState *block_copy_async(BlockCopyState *s,
+ int64_t offset, int64_t bytes,
+ int max_workers, int64_t max_chunk,
+ BlockCopyAsyncCallbackFunc cb,
+ void *cb_opaque);
+
+/*
+ * Free a finished BlockCopyCallState. Trying to free a running
+ * block-copy call will crash.
+ */
+void block_copy_call_free(BlockCopyCallState *call_state);
+
+/*
+ * Note that the block-copy call is marked finished before the callback
+ * is invoked.
+ */
+bool block_copy_call_finished(BlockCopyCallState *call_state);
+bool block_copy_call_succeeded(BlockCopyCallState *call_state);
+bool block_copy_call_failed(BlockCopyCallState *call_state);
+bool block_copy_call_cancelled(BlockCopyCallState *call_state);
+int block_copy_call_status(BlockCopyCallState *call_state, bool *error_is_read);
+
+void block_copy_set_speed(BlockCopyState *s, uint64_t speed);
+void block_copy_kick(BlockCopyCallState *call_state);
+
+/*
+ * Cancel running block-copy call.
+ *
+ * Cancelling leaves the block-copy state valid: dirty bits are correct, and
+ * you may use cancel + <run block_copy with same parameters> to emulate
+ * pause/resume.
+ *
+ * Note also that cancellation is asynchronous: it only marks the block-copy
+ * call to be cancelled. So the call may be cancelled
+ * (block_copy_call_cancelled() reports true) but not yet finished
+ * (block_copy_call_finished() reports false).
+ */
+void block_copy_call_cancel(BlockCopyCallState *call_state);
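+
+/*
+ * A sketch of the asynchronous workflow (copy_done_cb is a placeholder;
+ * one worker, unlimited chunk size):
+ *
+ *     static void copy_done_cb(void *opaque)
+ *     {
+ *         ...the call is already marked finished here...
+ *     }
+ *
+ *     BlockCopyCallState *cs =
+ *         block_copy_async(s, 0, bytes, 1, 0, copy_done_cb, NULL);
+ *     ...
+ *     if (!block_copy_call_finished(cs)) {
+ *         block_copy_call_cancel(cs);    // async: wait for the callback
+ *     }
+ *     ...after the callback has run...
+ *     block_copy_call_free(cs);
+ */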
+
+BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s);
+int64_t block_copy_cluster_size(BlockCopyState *s);
+void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip);
+
+#endif /* BLOCK_COPY_H */
diff --git a/include/block/block-hmp-cmds.h b/include/block/block-hmp-cmds.h
new file mode 100644
index 000000000..3412e108c
--- /dev/null
+++ b/include/block/block-hmp-cmds.h
@@ -0,0 +1,54 @@
+/*
+ * HMP commands related to the block layer
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2020 Red Hat, Inc.
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ */
+
+#ifndef BLOCK_HMP_COMMANDS_H
+#define BLOCK_HMP_COMMANDS_H
+
+void hmp_drive_add(Monitor *mon, const QDict *qdict);
+
+void hmp_commit(Monitor *mon, const QDict *qdict);
+void hmp_drive_del(Monitor *mon, const QDict *qdict);
+
+void hmp_drive_mirror(Monitor *mon, const QDict *qdict);
+void hmp_drive_backup(Monitor *mon, const QDict *qdict);
+
+void hmp_block_job_set_speed(Monitor *mon, const QDict *qdict);
+void hmp_block_job_cancel(Monitor *mon, const QDict *qdict);
+void hmp_block_job_pause(Monitor *mon, const QDict *qdict);
+void hmp_block_job_resume(Monitor *mon, const QDict *qdict);
+void hmp_block_job_complete(Monitor *mon, const QDict *qdict);
+
+void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict);
+void hmp_snapshot_blkdev_internal(Monitor *mon, const QDict *qdict);
+void hmp_snapshot_delete_blkdev_internal(Monitor *mon, const QDict *qdict);
+
+void hmp_nbd_server_start(Monitor *mon, const QDict *qdict);
+void hmp_nbd_server_add(Monitor *mon, const QDict *qdict);
+void hmp_nbd_server_remove(Monitor *mon, const QDict *qdict);
+void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict);
+
+void hmp_block_resize(Monitor *mon, const QDict *qdict);
+void hmp_block_stream(Monitor *mon, const QDict *qdict);
+void hmp_block_passwd(Monitor *mon, const QDict *qdict);
+void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict);
+void hmp_eject(Monitor *mon, const QDict *qdict);
+
+void hmp_qemu_io(Monitor *mon, const QDict *qdict);
+
+void hmp_info_block(Monitor *mon, const QDict *qdict);
+void hmp_info_blockstats(Monitor *mon, const QDict *qdict);
+void hmp_info_block_jobs(Monitor *mon, const QDict *qdict);
+void hmp_info_snapshots(Monitor *mon, const QDict *qdict);
+
+#endif
diff --git a/include/block/block.h b/include/block/block.h
new file mode 100644
index 000000000..e5dd22b03
--- /dev/null
+++ b/include/block/block.h
@@ -0,0 +1,864 @@
+#ifndef BLOCK_H
+#define BLOCK_H
+
+#include "block/aio.h"
+#include "block/aio-wait.h"
+#include "qemu/iov.h"
+#include "qemu/coroutine.h"
+#include "block/accounting.h"
+#include "block/dirty-bitmap.h"
+#include "block/blockjob.h"
+#include "qemu/hbitmap.h"
+#include "qemu/transactions.h"
+
+/*
+ * generated_co_wrapper
+ *
+ * A function specifier that does nothing but mark functions to be
+ * generated by scripts/block-coroutine-wrapper.py.
+ *
+ * Read more in docs/devel/block-coroutine-wrapper.rst
+ */
+#define generated_co_wrapper
+
+/* block.c */
+typedef struct BlockDriver BlockDriver;
+typedef struct BdrvChild BdrvChild;
+typedef struct BdrvChildClass BdrvChildClass;
+
+typedef struct BlockDriverInfo {
+ /* in bytes, 0 if irrelevant */
+ int cluster_size;
+ /* offset at which the VM state can be saved (0 if not possible) */
+ int64_t vm_state_offset;
+ bool is_dirty;
+ /*
+ * True if this block driver only supports compressed writes
+ */
+ bool needs_compressed_writes;
+} BlockDriverInfo;
+
+typedef struct BlockFragInfo {
+ uint64_t allocated_clusters;
+ uint64_t total_clusters;
+ uint64_t fragmented_clusters;
+ uint64_t compressed_clusters;
+} BlockFragInfo;
+
+typedef enum {
+ BDRV_REQ_COPY_ON_READ = 0x1,
+ BDRV_REQ_ZERO_WRITE = 0x2,
+
+ /*
+ * The BDRV_REQ_MAY_UNMAP flag is used in write_zeroes requests to indicate
+ * that the block driver should unmap (discard) blocks if it is guaranteed
+ * that the result will read back as zeroes. The flag is only passed to the
+ * driver if the block device is opened with BDRV_O_UNMAP.
+ */
+ BDRV_REQ_MAY_UNMAP = 0x4,
+
+ BDRV_REQ_FUA = 0x10,
+ BDRV_REQ_WRITE_COMPRESSED = 0x20,
+
+ /* Signifies that this write request will not change the visible disk
+ * content. */
+ BDRV_REQ_WRITE_UNCHANGED = 0x40,
+
+ /* Forces request serialisation. Use only with write requests. */
+ BDRV_REQ_SERIALISING = 0x80,
+
+ /* Execute the request only if the operation can be offloaded or otherwise
+ * be executed efficiently, but return an error instead of using a slow
+ * fallback. */
+ BDRV_REQ_NO_FALLBACK = 0x100,
+
+ /*
+ * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read
+ * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR
+ * filter is involved), in which case it signals that the COR operation
+ * need not read the data into memory (qiov) but only ensure it is
+ * copied to the top layer (i.e., that the COR operation is done).
+ */
+ BDRV_REQ_PREFETCH = 0x200,
+
+ /*
+ * If we need to wait for other requests, just fail immediately. Used
+ * only together with BDRV_REQ_SERIALISING.
+ */
+ BDRV_REQ_NO_WAIT = 0x400,
+
+ /* Mask of valid flags */
+ BDRV_REQ_MASK = 0x7ff,
+} BdrvRequestFlags;
+
+typedef struct BlockSizes {
+ uint32_t phys;
+ uint32_t log;
+} BlockSizes;
+
+typedef struct HDGeometry {
+ uint32_t heads;
+ uint32_t sectors;
+ uint32_t cylinders;
+} HDGeometry;
+
+#define BDRV_O_NO_SHARE 0x0001 /* don't share permissions */
+#define BDRV_O_RDWR 0x0002
+#define BDRV_O_RESIZE 0x0004 /* request permission for resizing the node */
+#define BDRV_O_SNAPSHOT 0x0008 /* open the file read only and save writes in a snapshot */
+#define BDRV_O_TEMPORARY 0x0010 /* delete the file after use */
+#define BDRV_O_NOCACHE 0x0020 /* do not use the host page cache */
+#define BDRV_O_NATIVE_AIO 0x0080 /* use native AIO instead of the thread pool */
+#define BDRV_O_NO_BACKING 0x0100 /* don't open the backing file */
+#define BDRV_O_NO_FLUSH 0x0200 /* disable flushing on this disk */
+#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
+#define BDRV_O_INACTIVE 0x0800 /* consistency hint for migration handoff */
+#define BDRV_O_CHECK 0x1000 /* open solely for consistency check */
+#define BDRV_O_ALLOW_RDWR 0x2000 /* allow reopen to change from r/o to r/w */
+#define BDRV_O_UNMAP 0x4000 /* execute guest UNMAP/TRIM operations */
+#define BDRV_O_PROTOCOL 0x8000 /* if no block driver is explicitly given:
+ select an appropriate protocol driver,
+ ignoring the format layer */
+#define BDRV_O_NO_IO 0x10000 /* don't initialize for I/O */
+#define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening read-write fails */
+#define BDRV_O_IO_URING 0x40000 /* use io_uring instead of the thread pool */
+
+#define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
+
+
+/* Option names of options parsed by the block layer */
+
+#define BDRV_OPT_CACHE_WB "cache.writeback"
+#define BDRV_OPT_CACHE_DIRECT "cache.direct"
+#define BDRV_OPT_CACHE_NO_FLUSH "cache.no-flush"
+#define BDRV_OPT_READ_ONLY "read-only"
+#define BDRV_OPT_AUTO_READ_ONLY "auto-read-only"
+#define BDRV_OPT_DISCARD "discard"
+#define BDRV_OPT_FORCE_SHARE "force-share"
+
+
+#define BDRV_SECTOR_BITS 9
+#define BDRV_SECTOR_SIZE (1ULL << BDRV_SECTOR_BITS)
+
+#define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
+ INT_MAX >> BDRV_SECTOR_BITS)
+#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
+
+/*
+ * We want to allow aligning requests and disk lengths up to any 32-bit
+ * alignment without fear of overflow.
+ * To achieve this, while at the same time using a reasonable value as the
+ * maximum disk size, define the maximum "length" (a limit for any
+ * offset/bytes request and for disk size) to be the greatest power of 2
+ * less than INT64_MAX.
+ */
+#define BDRV_MAX_ALIGNMENT (1L << 30)
+#define BDRV_MAX_LENGTH (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT))
+
+/*
+ * Allocation status flags for bdrv_block_status() and friends.
+ *
+ * Public flags:
+ * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer
+ * BDRV_BLOCK_ZERO: offset reads as zero
+ * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
+ * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
+ * layer rather than any backing, set by block layer
+ * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
+ * layer, set by block layer
+ *
+ * Internal flags:
+ * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
+ * that the block layer recompute the answer from the returned
+ * BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID.
+ * BDRV_BLOCK_RECURSE: request that the block layer recursively search for
+ * zeroes in the file child of the current block node
+ * inside the returned region. Only valid together with both
+ * BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID. Should not
+ * appear with BDRV_BLOCK_ZERO.
+ *
+ * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the
+ * host offset within the returned BDS that is allocated for the
+ * corresponding raw guest data. However, whether that offset
+ * actually contains data also depends on BDRV_BLOCK_DATA, as follows:
+ *
+ * DATA ZERO OFFSET_VALID
+ * t t t sectors read as zero, returned file is zero at offset
+ * t f t sectors read as valid from file at offset
+ * f t t sectors preallocated, read as zero, returned file not
+ * necessarily zero at offset
+ * f f t sectors preallocated but read from backing_hd,
+ * returned file contains garbage at offset
+ * t t f sectors preallocated, read as zero, unknown offset
+ * t f f sectors read from unknown file or offset
+ * f t f not allocated or unknown offset, read as zero
+ * f f f not allocated or unknown offset, read from backing_hd
+ */
+#define BDRV_BLOCK_DATA 0x01
+#define BDRV_BLOCK_ZERO 0x02
+#define BDRV_BLOCK_OFFSET_VALID 0x04
+#define BDRV_BLOCK_RAW 0x08
+#define BDRV_BLOCK_ALLOCATED 0x10
+#define BDRV_BLOCK_EOF 0x20
+#define BDRV_BLOCK_RECURSE 0x40
+
+typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
+
+typedef struct BDRVReopenState {
+ BlockDriverState *bs;
+ int flags;
+ BlockdevDetectZeroesOptions detect_zeroes;
+ bool backing_missing;
+ BlockDriverState *old_backing_bs; /* keep pointer for permissions update */
+ BlockDriverState *old_file_bs; /* keep pointer for permissions update */
+ QDict *options;
+ QDict *explicit_options;
+ void *opaque;
+} BDRVReopenState;
+
+/*
+ * Block operation types
+ */
+typedef enum BlockOpType {
+ BLOCK_OP_TYPE_BACKUP_SOURCE,
+ BLOCK_OP_TYPE_BACKUP_TARGET,
+ BLOCK_OP_TYPE_CHANGE,
+ BLOCK_OP_TYPE_COMMIT_SOURCE,
+ BLOCK_OP_TYPE_COMMIT_TARGET,
+ BLOCK_OP_TYPE_DATAPLANE,
+ BLOCK_OP_TYPE_DRIVE_DEL,
+ BLOCK_OP_TYPE_EJECT,
+ BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT,
+ BLOCK_OP_TYPE_INTERNAL_SNAPSHOT,
+ BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
+ BLOCK_OP_TYPE_MIRROR_SOURCE,
+ BLOCK_OP_TYPE_MIRROR_TARGET,
+ BLOCK_OP_TYPE_RESIZE,
+ BLOCK_OP_TYPE_STREAM,
+ BLOCK_OP_TYPE_REPLACE,
+ BLOCK_OP_TYPE_MAX,
+} BlockOpType;
+
+/* Block node permission constants */
+enum {
+ /**
+ * A user that has the "permission" of consistent reads is guaranteed that
+ * their view of the contents of the block device is complete and
+ * self-consistent, representing the contents of a disk at a specific
+ * point.
+ *
+ * For most block devices (including their backing files) this is true, but
+ * the property cannot be maintained in a few situations, such as for the
+ * intermediate nodes of a commit block job.
+ */
+ BLK_PERM_CONSISTENT_READ = 0x01,
+
+ /** This permission is required to change the visible disk contents. */
+ BLK_PERM_WRITE = 0x02,
+
+ /**
+ * This permission (which is weaker than BLK_PERM_WRITE) is both enough and
+ * required for writes to the block node when the caller promises that
+ * the visible disk content doesn't change.
+ *
+ * As the BLK_PERM_WRITE permission is strictly stronger, either is
+ * sufficient to perform an unchanging write.
+ */
+ BLK_PERM_WRITE_UNCHANGED = 0x04,
+
+ /** This permission is required to change the size of a block node. */
+ BLK_PERM_RESIZE = 0x08,
+
+ /**
+ * This permission is required to change the node that this BdrvChild
+ * points to.
+ */
+ BLK_PERM_GRAPH_MOD = 0x10,
+
+ BLK_PERM_ALL = 0x1f,
+
+ DEFAULT_PERM_PASSTHROUGH = BLK_PERM_CONSISTENT_READ
+ | BLK_PERM_WRITE
+ | BLK_PERM_WRITE_UNCHANGED
+ | BLK_PERM_RESIZE,
+
+ DEFAULT_PERM_UNCHANGED = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH,
+};
+
+/*
+ * Flags that parent nodes assign to child nodes to specify what kind of
+ * role(s) they take.
+ *
+ * At least one of DATA, METADATA, FILTERED, or COW must be set for
+ * every child.
+ */
+enum BdrvChildRoleBits {
+ /*
+ * This child stores data.
+ * Any node may have an arbitrary number of such children.
+ */
+ BDRV_CHILD_DATA = (1 << 0),
+
+ /*
+ * This child stores metadata.
+ * Any node may have an arbitrary number of metadata-storing
+ * children.
+ */
+ BDRV_CHILD_METADATA = (1 << 1),
+
+ /*
+ * A child that always presents exactly the same visible data as
+ * the parent, e.g. by virtue of the parent forwarding all reads
+ * and writes.
+ * This flag is mutually exclusive with DATA, METADATA, and COW.
+ * Any node may have at most one filtered child at a time.
+ */
+ BDRV_CHILD_FILTERED = (1 << 2),
+
+ /*
+ * Child from which to read all data that isn't allocated in the
+ * parent (i.e., the backing child); such data is copied to the
+ * parent through COW (and optionally COR).
+ * This flag is mutually exclusive with DATA, METADATA, and
+ * FILTERED.
+ * Any node may have at most one such backing child at a time.
+ */
+ BDRV_CHILD_COW = (1 << 3),
+
+ /*
+ * The primary child. For most drivers, this is the child whose
+ * filename applies best to the parent node.
+ * Any node may have at most one primary child at a time.
+ */
+ BDRV_CHILD_PRIMARY = (1 << 4),
+
+ /* Useful combination of flags */
+ BDRV_CHILD_IMAGE = BDRV_CHILD_DATA
+ | BDRV_CHILD_METADATA
+ | BDRV_CHILD_PRIMARY,
+};
+
+/* Mask of BdrvChildRoleBits values */
+typedef unsigned int BdrvChildRole;
+
+char *bdrv_perm_names(uint64_t perm);
+uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm);
+
+/* disk I/O throttling */
+void bdrv_init(void);
+void bdrv_init_with_whitelist(void);
+bool bdrv_uses_whitelist(void);
+int bdrv_is_whitelisted(BlockDriver *drv, bool read_only);
+BlockDriver *bdrv_find_protocol(const char *filename,
+ bool allow_protocol_prefix,
+ Error **errp);
+BlockDriver *bdrv_find_format(const char *format_name);
+int bdrv_create(BlockDriver *drv, const char* filename,
+ QemuOpts *opts, Error **errp);
+int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp);
+
+BlockDriverState *bdrv_new(void);
+int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
+ Error **errp);
+int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
+ Error **errp);
+int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs,
+ Error **errp);
+BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options,
+ int flags, Error **errp);
+int bdrv_drop_filter(BlockDriverState *bs, Error **errp);
+
+int bdrv_parse_aio(const char *mode, int *flags);
+int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough);
+int bdrv_parse_discard_flags(const char *mode, int *flags);
+BdrvChild *bdrv_open_child(const char *filename,
+ QDict *options, const char *bdref_key,
+ BlockDriverState* parent,
+ const BdrvChildClass *child_class,
+ BdrvChildRole child_role,
+ bool allow_none, Error **errp);
+BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp);
+int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
+ Error **errp);
+int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options,
+ const char *bdref_key, Error **errp);
+BlockDriverState *bdrv_open(const char *filename, const char *reference,
+ QDict *options, int flags, Error **errp);
+BlockDriverState *bdrv_new_open_driver_opts(BlockDriver *drv,
+ const char *node_name,
+ QDict *options, int flags,
+ Error **errp);
+BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
+ int flags, Error **errp);
+BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
+ BlockDriverState *bs, QDict *options,
+ bool keep_old_opts);
+void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue);
+int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp);
+int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts,
+ Error **errp);
+int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only,
+ Error **errp);
+int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
+ int64_t bytes, BdrvRequestFlags flags);
+int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags);
+int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int64_t bytes);
+int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf,
+ int64_t bytes);
+int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
+ const void *buf, int64_t bytes);
+/*
+ * Efficiently zero a region of the disk image. Note that this is a regular
+ * I/O request like read or write and should have a reasonable size. This
+ * function is not suitable for zeroing the entire image in a single request
+ * because it may allocate memory for the entire region.
+ */
+int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
+ int64_t bytes, BdrvRequestFlags flags);
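+
+/*
+ * A minimal sketch of zeroing a large region in bounded chunks, per the
+ * note above (hypothetical helper; the 1 MiB chunk size is an arbitrary
+ * example, and MIN/MiB are assumed from qemu/osdep.h and qemu/units.h):
+ *
+ *     int coroutine_fn zero_region(BdrvChild *child, int64_t len)
+ *     {
+ *         int64_t offset = 0;
+ *
+ *         while (offset < len) {
+ *             int64_t bytes = MIN(len - offset, 1 * MiB);
+ *             int ret = bdrv_co_pwrite_zeroes(child, offset, bytes, 0);
+ *             if (ret < 0) {
+ *                 return ret;
+ *             }
+ *             offset += bytes;
+ *         }
+ *         return 0;
+ *     }
+ */
+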
+BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs,
+ const char *backing_file);
+void bdrv_refresh_filename(BlockDriverState *bs);
+
+int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact,
+ PreallocMode prealloc, BdrvRequestFlags flags,
+ Error **errp);
+int generated_co_wrapper
+bdrv_truncate(BdrvChild *child, int64_t offset, bool exact,
+ PreallocMode prealloc, BdrvRequestFlags flags, Error **errp);
+
+int64_t bdrv_nb_sectors(BlockDriverState *bs);
+int64_t bdrv_getlength(BlockDriverState *bs);
+int64_t bdrv_get_allocated_file_size(BlockDriverState *bs);
+BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts,
+ BlockDriverState *in_bs, Error **errp);
+void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr);
+void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp);
+int bdrv_commit(BlockDriverState *bs);
+int bdrv_make_empty(BdrvChild *c, Error **errp);
+int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file,
+ const char *backing_fmt, bool warn);
+void bdrv_register(BlockDriver *bdrv);
+int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base,
+ const char *backing_file_str);
+BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
+ BlockDriverState *bs);
+BlockDriverState *bdrv_find_base(BlockDriverState *bs);
+bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
+ Error **errp);
+int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base,
+ Error **errp);
+void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base);
+int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp);
+void coroutine_fn bdrv_co_delete_file_noerr(BlockDriverState *bs);
+
+
+typedef struct BdrvCheckResult {
+ int corruptions;
+ int leaks;
+ int check_errors;
+ int corruptions_fixed;
+ int leaks_fixed;
+ int64_t image_end_offset;
+ BlockFragInfo bfi;
+} BdrvCheckResult;
+
+typedef enum {
+ BDRV_FIX_LEAKS = 1,
+ BDRV_FIX_ERRORS = 2,
+} BdrvCheckMode;
+
+int generated_co_wrapper bdrv_check(BlockDriverState *bs, BdrvCheckResult *res,
+ BdrvCheckMode fix);
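+
+/*
+ * A sketch of a repair invocation (hypothetical caller):
+ *
+ *     BdrvCheckResult res = {0};
+ *     int ret = bdrv_check(bs, &res, BDRV_FIX_LEAKS | BDRV_FIX_ERRORS);
+ *     if (ret == 0) {
+ *         // res.corruptions_fixed/res.leaks_fixed report repairs;
+ *         // res.corruptions/res.leaks report remaining problems.
+ *     }
+ */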
+
+/* The units of offset and total_work_size may be chosen arbitrarily by the
+ * block driver; total_work_size may change during the course of the amendment
+ * operation */
+typedef void BlockDriverAmendStatusCB(BlockDriverState *bs, int64_t offset,
+ int64_t total_work_size, void *opaque);
+int bdrv_amend_options(BlockDriverState *bs_new, QemuOpts *opts,
+ BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
+ bool force,
+ Error **errp);
+
+/* check if a named node can be replaced when doing drive-mirror */
+BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
+ const char *node_name, Error **errp);
+
+/* async block I/O */
+void bdrv_aio_cancel(BlockAIOCB *acb);
+void bdrv_aio_cancel_async(BlockAIOCB *acb);
+
+/* sg packet commands */
+int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf);
+
+/* Invalidate any cached metadata used by image formats */
+int generated_co_wrapper bdrv_invalidate_cache(BlockDriverState *bs,
+ Error **errp);
+void bdrv_invalidate_cache_all(Error **errp);
+int bdrv_inactivate_all(void);
+
+/* Ensure contents are flushed to disk. */
+int generated_co_wrapper bdrv_flush(BlockDriverState *bs);
+int coroutine_fn bdrv_co_flush(BlockDriverState *bs);
+int bdrv_flush_all(void);
+void bdrv_close_all(void);
+void bdrv_drain(BlockDriverState *bs);
+void coroutine_fn bdrv_co_drain(BlockDriverState *bs);
+void bdrv_drain_all_begin(void);
+void bdrv_drain_all_end(void);
+void bdrv_drain_all(void);
+
+#define BDRV_POLL_WHILE(bs, cond) ({ \
+ BlockDriverState *bs_ = (bs); \
+ AIO_WAIT_WHILE(bdrv_get_aio_context(bs_), \
+ cond); })
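+
+/*
+ * Usage sketch: run @bs's AioContext until its in-flight counter (see
+ * block_int.h; accessed with atomic ops) drops to zero:
+ *
+ *     BDRV_POLL_WHILE(bs, qatomic_read(&bs->in_flight) > 0);
+ */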
+
+int generated_co_wrapper bdrv_pdiscard(BdrvChild *child, int64_t offset,
+ int64_t bytes);
+int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes);
+int bdrv_has_zero_init_1(BlockDriverState *bs);
+int bdrv_has_zero_init(BlockDriverState *bs);
+bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
+int bdrv_block_status(BlockDriverState *bs, int64_t offset,
+ int64_t bytes, int64_t *pnum, int64_t *map,
+ BlockDriverState **file);
+int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
+ int64_t offset, int64_t bytes, int64_t *pnum,
+ int64_t *map, BlockDriverState **file);
+int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes,
+ int64_t *pnum);
+int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base,
+ bool include_base, int64_t offset, int64_t bytes,
+ int64_t *pnum);
+int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset,
+ int64_t bytes);
+
+bool bdrv_is_read_only(BlockDriverState *bs);
+int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only,
+ bool ignore_allow_rdw, Error **errp);
+int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg,
+ Error **errp);
+bool bdrv_is_writable(BlockDriverState *bs);
+bool bdrv_is_sg(BlockDriverState *bs);
+bool bdrv_is_inserted(BlockDriverState *bs);
+void bdrv_lock_medium(BlockDriverState *bs, bool locked);
+void bdrv_eject(BlockDriverState *bs, bool eject_flag);
+const char *bdrv_get_format_name(BlockDriverState *bs);
+BlockDriverState *bdrv_find_node(const char *node_name);
+BlockDeviceInfoList *bdrv_named_nodes_list(bool flat, Error **errp);
+XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp);
+BlockDriverState *bdrv_lookup_bs(const char *device,
+ const char *node_name,
+ Error **errp);
+bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base);
+BlockDriverState *bdrv_next_node(BlockDriverState *bs);
+BlockDriverState *bdrv_next_all_states(BlockDriverState *bs);
+
+typedef struct BdrvNextIterator {
+ enum {
+ BDRV_NEXT_BACKEND_ROOTS,
+ BDRV_NEXT_MONITOR_OWNED,
+ } phase;
+ BlockBackend *blk;
+ BlockDriverState *bs;
+} BdrvNextIterator;
+
+BlockDriverState *bdrv_first(BdrvNextIterator *it);
+BlockDriverState *bdrv_next(BdrvNextIterator *it);
+void bdrv_next_cleanup(BdrvNextIterator *it);
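+
+/*
+ * Typical iteration pattern; bdrv_next_cleanup() is only needed when
+ * the loop is exited early (some_condition() is a hypothetical
+ * predicate):
+ *
+ *     BdrvNextIterator it;
+ *     BlockDriverState *bs;
+ *
+ *     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+ *         if (some_condition(bs)) {
+ *             bdrv_next_cleanup(&it);
+ *             break;
+ *         }
+ *     }
+ */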
+
+BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs);
+bool bdrv_supports_compressed_writes(BlockDriverState *bs);
+void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
+ void *opaque, bool read_only);
+const char *bdrv_get_node_name(const BlockDriverState *bs);
+const char *bdrv_get_device_name(const BlockDriverState *bs);
+const char *bdrv_get_device_or_node_name(const BlockDriverState *bs);
+int bdrv_get_flags(BlockDriverState *bs);
+int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
+ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs,
+ Error **errp);
+BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs);
+void bdrv_round_to_clusters(BlockDriverState *bs,
+ int64_t offset, int64_t bytes,
+ int64_t *cluster_offset,
+ int64_t *cluster_bytes);
+
+void bdrv_get_backing_filename(BlockDriverState *bs,
+ char *filename, int filename_size);
+char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp);
+char *bdrv_get_full_backing_filename_from_filename(const char *backed,
+ const char *backing,
+ Error **errp);
+char *bdrv_dirname(BlockDriverState *bs, Error **errp);
+
+int path_has_protocol(const char *path);
+int path_is_absolute(const char *path);
+char *path_combine(const char *base_path, const char *filename);
+
+int generated_co_wrapper
+bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
+int generated_co_wrapper
+bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos);
+int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
+ int64_t pos, int size);
+
+int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
+ int64_t pos, int size);
+
+void bdrv_img_create(const char *filename, const char *fmt,
+ const char *base_filename, const char *base_fmt,
+ char *options, uint64_t img_size, int flags,
+ bool quiet, Error **errp);
+
+/* Returns the alignment in bytes that is required so that no bounce buffer
+ * is needed throughout the stack */
+size_t bdrv_min_mem_align(BlockDriverState *bs);
+/* Returns optimal alignment in bytes for bounce buffer */
+size_t bdrv_opt_mem_align(BlockDriverState *bs);
+void *qemu_blockalign(BlockDriverState *bs, size_t size);
+void *qemu_blockalign0(BlockDriverState *bs, size_t size);
+void *qemu_try_blockalign(BlockDriverState *bs, size_t size);
+void *qemu_try_blockalign0(BlockDriverState *bs, size_t size);
+bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov);
+
+void bdrv_enable_copy_on_read(BlockDriverState *bs);
+void bdrv_disable_copy_on_read(BlockDriverState *bs);
+
+void bdrv_ref(BlockDriverState *bs);
+void bdrv_unref(BlockDriverState *bs);
+void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child);
+BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
+ BlockDriverState *child_bs,
+ const char *child_name,
+ const BdrvChildClass *child_class,
+ BdrvChildRole child_role,
+ Error **errp);
+
+bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp);
+void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason);
+void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason);
+void bdrv_op_block_all(BlockDriverState *bs, Error *reason);
+void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason);
+bool bdrv_op_blocker_is_empty(BlockDriverState *bs);
+
+#define BLKDBG_EVENT(child, evt) \
+ do { \
+ if (child) { \
+ bdrv_debug_event(child->bs, evt); \
+ } \
+ } while (0)
+
+void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event);
+
+int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event,
+ const char *tag);
+int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag);
+int bdrv_debug_resume(BlockDriverState *bs, const char *tag);
+bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag);
+
+/**
+ * bdrv_get_aio_context:
+ *
+ * Returns: the currently bound #AioContext
+ */
+AioContext *bdrv_get_aio_context(BlockDriverState *bs);
+
+/**
+ * Move the current coroutine to the AioContext of @bs and return the old
+ * AioContext of the coroutine. Increase bs->in_flight so that draining @bs
+ * will wait for the operation to proceed until the corresponding
+ * bdrv_co_leave().
+ *
+ * Consequently, you can't call drain inside a bdrv_co_enter/leave() section as
+ * this will deadlock.
+ */
+AioContext *coroutine_fn bdrv_co_enter(BlockDriverState *bs);
+
+/**
+ * Ends a section started by bdrv_co_enter(). Move the current coroutine back
+ * to old_ctx and decrease bs->in_flight again.
+ */
+void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx);
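+
+/*
+ * Sketch of the intended pairing:
+ *
+ *     AioContext *old_ctx = bdrv_co_enter(bs);
+ *     ... work that must run in bs's AioContext; no draining of bs ...
+ *     bdrv_co_leave(bs, old_ctx);
+ */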
+
+/**
+ * Locks the AioContext of @bs if it's not the current AioContext. This avoids
+ * double locking, which could lead to deadlocks: this is a coroutine_fn, so we
+ * know we already own the lock of the current AioContext.
+ *
+ * May only be called in the main thread.
+ */
+void coroutine_fn bdrv_co_lock(BlockDriverState *bs);
+
+/**
+ * Unlocks the AioContext of @bs if it's not the current AioContext.
+ */
+void coroutine_fn bdrv_co_unlock(BlockDriverState *bs);
+
+/**
+ * Transfer control to @co in the aio context of @bs
+ */
+void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co);
+
+void bdrv_set_aio_context_ignore(BlockDriverState *bs,
+ AioContext *new_context, GSList **ignore);
+int bdrv_try_set_aio_context(BlockDriverState *bs, AioContext *ctx,
+ Error **errp);
+int bdrv_child_try_set_aio_context(BlockDriverState *bs, AioContext *ctx,
+ BdrvChild *ignore_child, Error **errp);
+bool bdrv_child_can_set_aio_context(BdrvChild *c, AioContext *ctx,
+ GSList **ignore, Error **errp);
+bool bdrv_can_set_aio_context(BlockDriverState *bs, AioContext *ctx,
+ GSList **ignore, Error **errp);
+AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c);
+AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c);
+
+int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz);
+int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo);
+
+void bdrv_io_plug(BlockDriverState *bs);
+void bdrv_io_unplug(BlockDriverState *bs);
+
+/**
+ * bdrv_parent_drained_begin_single:
+ *
+ * Begin a quiesced section for the parent of @c. If @poll is true, wait for
+ * any pending activity to cease.
+ */
+void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll);
+
+/**
+ * bdrv_parent_drained_end_single:
+ *
+ * End a quiesced section for the parent of @c.
+ *
+ * This polls @c's AioContext until all scheduled sub-drained_ends
+ * have settled, which may result in graph changes.
+ */
+void bdrv_parent_drained_end_single(BdrvChild *c);
+
+/**
+ * bdrv_drain_poll:
+ *
+ * Poll for pending requests in @bs, its parents (except for @ignore_parent),
+ * and, if @recursive is true, its children as well (used for subtree drain).
+ *
+ * If @ignore_bds_parents is true, parents that are BlockDriverStates must
+ * ignore the drain request because they will be drained separately (used for
+ * drain_all).
+ *
+ * This is part of bdrv_drained_begin.
+ */
+bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
+ BdrvChild *ignore_parent, bool ignore_bds_parents);
+
+/**
+ * bdrv_drained_begin:
+ *
+ * Begin a quiesced section for exclusive access to the BDS, by disabling
+ * external request sources, including the NBD server, block jobs, and the
+ * device model.
+ *
+ * This function can be recursive.
+ */
+void bdrv_drained_begin(BlockDriverState *bs);
+
+/**
+ * bdrv_do_drained_begin_quiesce:
+ *
+ * Quiesces a BDS like bdrv_drained_begin(), but does not wait for already
+ * running requests to complete.
+ */
+void bdrv_do_drained_begin_quiesce(BlockDriverState *bs,
+ BdrvChild *parent, bool ignore_bds_parents);
+
+/**
+ * Like bdrv_drained_begin, but recursively begins a quiesced section for
+ * exclusive access to all child nodes as well.
+ */
+void bdrv_subtree_drained_begin(BlockDriverState *bs);
+
+/**
+ * bdrv_drained_end:
+ *
+ * End a quiescent section started by bdrv_drained_begin().
+ *
+ * This polls @bs's AioContext until all scheduled sub-drained_ends
+ * have settled. On one hand, that may result in graph changes. On
+ * the other, this requires that the caller either run in the main
+ * loop, or that all involved nodes (@bs and all of its parents) be
+ * in the caller's AioContext.
+ */
+void bdrv_drained_end(BlockDriverState *bs);
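+
+/*
+ * Typical drained-section pattern (sketch): quiesce the node, perform
+ * an operation that needs exclusive access, then unquiesce:
+ *
+ *     bdrv_drained_begin(bs);
+ *     ... graph manipulation or other exclusive-access work ...
+ *     bdrv_drained_end(bs);
+ */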
+
+/**
+ * bdrv_drained_end_no_poll:
+ *
+ * Same as bdrv_drained_end(), but do not poll for the subgraph to
+ * actually become unquiesced. Therefore, no graph changes will occur
+ * with this function.
+ *
+ * *drained_end_counter is incremented for every background operation
+ * that is scheduled, and will be decremented for every operation once
+ * it settles. The caller must poll until it reaches 0. The counter
+ * should be accessed using atomic operations only.
+ */
+void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter);
+
+/**
+ * End a quiescent section started by bdrv_subtree_drained_begin().
+ */
+void bdrv_subtree_drained_end(BlockDriverState *bs);
+
+void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child,
+ Error **errp);
+void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp);
+
+bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
+ uint32_t granularity, Error **errp);
+/**
+ * bdrv_register_buf/bdrv_unregister_buf:
+ *
+ * Register/unregister a buffer for I/O. For example, VFIO drivers want to
+ * know the memory areas that will later be used for I/O, so that they can
+ * prepare IOMMU mappings etc. in advance for better performance.
+ */
+void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size);
+void bdrv_unregister_buf(BlockDriverState *bs, void *host);
+
+/**
+ * bdrv_co_copy_range:
+ *
+ * Do offloaded copy between two children. If the operation is not implemented
+ * by the driver, or if the backend storage doesn't support it, a negative
+ * error code will be returned.
+ *
+ * Note: the block layer does not emulate or fall back to a bounce buffer
+ * approach because, after the first error, the caller usually should not
+ * retry the offloaded copy (e.g. via copy_file_range(2)), but should
+ * instead fall back to a read+write path at the caller level.
+ *
+ * @src: Source child to copy data from
+ * @src_offset: offset in @src image to read data
+ * @dst: Destination child to copy data to
+ * @dst_offset: offset in @dst image to write data
+ * @bytes: number of bytes to copy
+ * @read_flags, @write_flags: request flags. Supported flags:
+ * BDRV_REQ_ZERO_WRITE - treat the @src range as zero data and do zero
+ * write on @dst as if bdrv_co_pwrite_zeroes is
+ * called. Used to simplify caller code, or
+ * during BlockDriver.bdrv_co_copy_range_from()
+ * recursion.
+ * BDRV_REQ_NO_SERIALISING - do not serialize with other overlapping
+ * requests currently in flight.
+ *
+ * Returns: 0 if succeeded; negative error code if failed.
+ **/
+int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset,
+ BdrvChild *dst, int64_t dst_offset,
+ int64_t bytes, BdrvRequestFlags read_flags,
+ BdrvRequestFlags write_flags);
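+
+/*
+ * Caller-side fallback sketch, per the note above (the read+write path
+ * itself is elided):
+ *
+ *     ret = bdrv_co_copy_range(src, src_offset, dst, dst_offset,
+ *                              bytes, 0, 0);
+ *     if (ret < 0) {
+ *         ... read from @src into a bounce buffer and write it to
+ *         @dst; do not retry the offloaded copy ...
+ *     }
+ */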
+
+void bdrv_cancel_in_flight(BlockDriverState *bs);
+
+#endif
diff --git a/include/block/block_backup.h b/include/block/block_backup.h
new file mode 100644
index 000000000..157596c29
--- /dev/null
+++ b/include/block/block_backup.h
@@ -0,0 +1,25 @@
+/*
+ * QEMU backup
+ *
+ * Copyright (c) 2013 Proxmox Server Solutions
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2016 FUJITSU LIMITED
+ *
+ * Authors:
+ * Dietmar Maurer <dietmar@proxmox.com>
+ * Changlong Xie <xiecl.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef BLOCK_BACKUP_H
+#define BLOCK_BACKUP_H
+
+#include "block/block_int.h"
+
+void backup_do_checkpoint(BlockJob *job, Error **errp);
+
+#endif
diff --git a/include/block/block_int.h b/include/block/block_int.h
new file mode 100644
index 000000000..f4c75e8ba
--- /dev/null
+++ b/include/block/block_int.h
@@ -0,0 +1,1504 @@
+/*
+ * QEMU System Emulator block driver
+ *
+ * Copyright (c) 2003 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef BLOCK_INT_H
+#define BLOCK_INT_H
+
+#include "block/accounting.h"
+#include "block/block.h"
+#include "block/aio-wait.h"
+#include "qemu/queue.h"
+#include "qemu/coroutine.h"
+#include "qemu/stats64.h"
+#include "qemu/timer.h"
+#include "qemu/hbitmap.h"
+#include "block/snapshot.h"
+#include "qemu/throttle.h"
+#include "qemu/rcu.h"
+
+#define BLOCK_FLAG_LAZY_REFCOUNTS 8
+
+#define BLOCK_OPT_SIZE "size"
+#define BLOCK_OPT_ENCRYPT "encryption"
+#define BLOCK_OPT_ENCRYPT_FORMAT "encrypt.format"
+#define BLOCK_OPT_COMPAT6 "compat6"
+#define BLOCK_OPT_HWVERSION "hwversion"
+#define BLOCK_OPT_BACKING_FILE "backing_file"
+#define BLOCK_OPT_BACKING_FMT "backing_fmt"
+#define BLOCK_OPT_CLUSTER_SIZE "cluster_size"
+#define BLOCK_OPT_TABLE_SIZE "table_size"
+#define BLOCK_OPT_PREALLOC "preallocation"
+#define BLOCK_OPT_SUBFMT "subformat"
+#define BLOCK_OPT_COMPAT_LEVEL "compat"
+#define BLOCK_OPT_LAZY_REFCOUNTS "lazy_refcounts"
+#define BLOCK_OPT_ADAPTER_TYPE "adapter_type"
+#define BLOCK_OPT_REDUNDANCY "redundancy"
+#define BLOCK_OPT_NOCOW "nocow"
+#define BLOCK_OPT_EXTENT_SIZE_HINT "extent_size_hint"
+#define BLOCK_OPT_OBJECT_SIZE "object_size"
+#define BLOCK_OPT_REFCOUNT_BITS "refcount_bits"
+#define BLOCK_OPT_DATA_FILE "data_file"
+#define BLOCK_OPT_DATA_FILE_RAW "data_file_raw"
+#define BLOCK_OPT_COMPRESSION_TYPE "compression_type"
+#define BLOCK_OPT_EXTL2 "extended_l2"
+
+#define BLOCK_PROBE_BUF_SIZE 512
+
+enum BdrvTrackedRequestType {
+ BDRV_TRACKED_READ,
+ BDRV_TRACKED_WRITE,
+ BDRV_TRACKED_DISCARD,
+ BDRV_TRACKED_TRUNCATE,
+};
+
+/*
+ * It is not ideal that the BdrvTrackedRequest structure is public, as
+ * block/io.c is very careful about incoming offset/bytes being
+ * correct. Be sure to assert that bdrv_check_request() succeeds after
+ * any modification of a BdrvTrackedRequest object outside of block/io.c.
+ */
+typedef struct BdrvTrackedRequest {
+ BlockDriverState *bs;
+ int64_t offset;
+ int64_t bytes;
+ enum BdrvTrackedRequestType type;
+
+ bool serialising;
+ int64_t overlap_offset;
+ int64_t overlap_bytes;
+
+ QLIST_ENTRY(BdrvTrackedRequest) list;
+ Coroutine *co; /* owner, used for deadlock detection */
+ CoQueue wait_queue; /* coroutines blocked on this request */
+
+ struct BdrvTrackedRequest *waiting_for;
+} BdrvTrackedRequest;
+
+int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
+ QEMUIOVector *qiov, size_t qiov_offset,
+ Error **errp);
+int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp);
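+
+/*
+ * A sketch of the assertion pattern requested above when a
+ * BdrvTrackedRequest is modified outside of block/io.c (new_bytes is a
+ * hypothetical value):
+ *
+ *     req->bytes = new_bytes;
+ *     assert(bdrv_check_request(req->offset, req->bytes, NULL) == 0);
+ */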
+
+struct BlockDriver {
+ const char *format_name;
+ int instance_size;
+
+    /* Set to true if the BlockDriver is a block filter. Block filters pass
+ * certain callbacks that refer to data (see block.c) to their bs->file
+ * or bs->backing (whichever one exists) if the driver doesn't implement
+ * them. Drivers that do not wish to forward must implement them and return
+ * -ENOTSUP.
+ * Note that filters are not allowed to modify data.
+ *
+ * Filters generally cannot have more than a single filtered child,
+ * because the data they present must at all times be the same as
+ * that on their filtered child. That would be impossible to
+ * achieve for multiple filtered children.
+ * (And this filtered child must then be bs->file or bs->backing.)
+ */
+ bool is_filter;
+ /*
+ * Set to true if the BlockDriver is a format driver. Format nodes
+ * generally do not expect their children to be other format nodes
+ * (except for backing files), and so format probing is disabled
+ * on those children.
+ */
+ bool is_format;
+ /*
+ * Return true if @to_replace can be replaced by a BDS with the
+ * same data as @bs without it affecting @bs's behavior (that is,
+ * without it being visible to @bs's parents).
+ */
+ bool (*bdrv_recurse_can_replace)(BlockDriverState *bs,
+ BlockDriverState *to_replace);
+
+ int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
+ int (*bdrv_probe_device)(const char *filename);
+
+ /* Any driver implementing this callback is expected to be able to handle
+ * NULL file names in its .bdrv_open() implementation */
+ void (*bdrv_parse_filename)(const char *filename, QDict *options, Error **errp);
+    /* Drivers implementing neither bdrv_parse_filename nor bdrv_open should
+     * have this field set to true, except ones that are defined only by
+     * their child's bs.
+     * An example of the latter is the quorum block driver.
+ */
+ bool bdrv_needs_filename;
+
+ /*
+ * Set if a driver can support backing files. This also implies the
+ * following semantics:
+ *
+     * - A return status of 0 from .bdrv_co_block_status means that the
+     *   corresponding blocks are not allocated in this layer of the
+     *   backing chain
+     * - For such (unallocated) blocks, a read will:
+     *   - fill the buffer with zeros if there is no backing file
+     *   - read from the backing file otherwise, where the block layer
+     *     takes care of reading zeros beyond EOF if the backing file
+     *     is short
+ */
+ bool supports_backing;
+
+ /* For handling image reopen for split or non-split files */
+ int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state,
+ BlockReopenQueue *queue, Error **errp);
+ void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state);
+ void (*bdrv_reopen_commit_post)(BDRVReopenState *reopen_state);
+ void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state);
+ void (*bdrv_join_options)(QDict *options, QDict *old_options);
+
+ int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags,
+ Error **errp);
+
+ /* Protocol drivers should implement this instead of bdrv_open */
+ int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags,
+ Error **errp);
+ void (*bdrv_close)(BlockDriverState *bs);
+
+
+ int coroutine_fn (*bdrv_co_create)(BlockdevCreateOptions *opts,
+ Error **errp);
+ int coroutine_fn (*bdrv_co_create_opts)(BlockDriver *drv,
+ const char *filename,
+ QemuOpts *opts,
+ Error **errp);
+
+ int coroutine_fn (*bdrv_co_amend)(BlockDriverState *bs,
+ BlockdevAmendOptions *opts,
+ bool force,
+ Error **errp);
+
+ int (*bdrv_amend_options)(BlockDriverState *bs,
+ QemuOpts *opts,
+ BlockDriverAmendStatusCB *status_cb,
+ void *cb_opaque,
+ bool force,
+ Error **errp);
+
+ int (*bdrv_make_empty)(BlockDriverState *bs);
+
+ /*
+ * Refreshes the bs->exact_filename field. If that is impossible,
+ * bs->exact_filename has to be left empty.
+ */
+ void (*bdrv_refresh_filename)(BlockDriverState *bs);
+
+ /*
+ * Gathers the open options for all children into @target.
+ * A simple format driver (without backing file support) might
+ * implement this function like this:
+ *
+ * QINCREF(bs->file->bs->full_open_options);
+ * qdict_put(target, "file", bs->file->bs->full_open_options);
+ *
+ * If not specified, the generic implementation will simply put
+ * all children's options under their respective name.
+ *
+ * @backing_overridden is true when bs->backing seems not to be
+ * the child that would result from opening bs->backing_file.
+ * Therefore, if it is true, the backing child's options should be
+ * gathered; otherwise, there is no need since the backing child
+ * is the one implied by the image header.
+ *
+ * Note that ideally this function would not be needed. Every
+ * block driver which implements it is probably doing something
+ * shady regarding its runtime option structure.
+ */
+ void (*bdrv_gather_child_options)(BlockDriverState *bs, QDict *target,
+ bool backing_overridden);
+
+ /*
+ * Returns an allocated string which is the directory name of this BDS: It
+ * will be used to make relative filenames absolute by prepending this
+ * function's return value to them.
+ */
+ char *(*bdrv_dirname)(BlockDriverState *bs, Error **errp);
+
+ /* aio */
+ BlockAIOCB *(*bdrv_aio_preadv)(BlockDriverState *bs,
+ int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque);
+ BlockAIOCB *(*bdrv_aio_pwritev)(BlockDriverState *bs,
+ int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque);
+ BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
+ BlockCompletionFunc *cb, void *opaque);
+ BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs,
+ int64_t offset, int bytes,
+ BlockCompletionFunc *cb, void *opaque);
+
+ int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+
+ /**
+ * @offset: position in bytes to read at
+ * @bytes: number of bytes to read
+ * @qiov: the buffers to fill with read data
+ * @flags: currently unused, always 0
+ *
+ * @offset and @bytes will be a multiple of 'request_alignment',
+ * but the length of individual @qiov elements does not have to
+ * be a multiple.
+ *
+ * @bytes will always equal the total size of @qiov, and will be
+ * no larger than 'max_transfer'.
+ *
+ * The buffer in @qiov may point directly to guest memory.
+ */
+ int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs,
+ int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+ int coroutine_fn (*bdrv_co_preadv_part)(BlockDriverState *bs,
+ int64_t offset, int64_t bytes,
+ QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags);
+ int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
+ int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags);
+ /**
+ * @offset: position in bytes to write at
+ * @bytes: number of bytes to write
+ * @qiov: the buffers containing data to write
+ * @flags: zero or more bits allowed by 'supported_write_flags'
+ *
+ * @offset and @bytes will be a multiple of 'request_alignment',
+ * but the length of individual @qiov elements does not have to
+ * be a multiple.
+ *
+ * @bytes will always equal the total size of @qiov, and will be
+ * no larger than 'max_transfer'.
+ *
+ * The buffer in @qiov may point directly to guest memory.
+ */
+ int coroutine_fn (*bdrv_co_pwritev)(BlockDriverState *bs,
+ int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+ int coroutine_fn (*bdrv_co_pwritev_part)(BlockDriverState *bs,
+ int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset,
+ BdrvRequestFlags flags);
+
+ /*
+ * Efficiently zero a region of the disk image. Typically an image format
+ * would use a compact metadata representation to implement this. This
+     * function pointer may be NULL or return -ENOTSUP and .bdrv_co_writev()
+ * will be called instead.
+ */
+ int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs,
+ int64_t offset, int64_t bytes, BdrvRequestFlags flags);
+ int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs,
+ int64_t offset, int64_t bytes);
+
+    /* Map [offset, offset + bytes) range onto a child of @bs to copy from,
+ * and invoke bdrv_co_copy_range_from(child, ...), or invoke
+ * bdrv_co_copy_range_to() if @bs is the leaf child to copy data from.
+ *
+ * See the comment of bdrv_co_copy_range for the parameter and return value
+ * semantics.
+ */
+ int coroutine_fn (*bdrv_co_copy_range_from)(BlockDriverState *bs,
+ BdrvChild *src,
+ int64_t offset,
+ BdrvChild *dst,
+ int64_t dst_offset,
+ int64_t bytes,
+ BdrvRequestFlags read_flags,
+ BdrvRequestFlags write_flags);
+
+    /* Map [offset, offset + bytes) range onto a child of bs to copy data to,
+ * and invoke bdrv_co_copy_range_to(child, src, ...), or perform the copy
+ * operation if @bs is the leaf and @src has the same BlockDriver. Return
+ * -ENOTSUP if @bs is the leaf but @src has a different BlockDriver.
+ *
+ * See the comment of bdrv_co_copy_range for the parameter and return value
+ * semantics.
+ */
+ int coroutine_fn (*bdrv_co_copy_range_to)(BlockDriverState *bs,
+ BdrvChild *src,
+ int64_t src_offset,
+ BdrvChild *dst,
+ int64_t dst_offset,
+ int64_t bytes,
+ BdrvRequestFlags read_flags,
+ BdrvRequestFlags write_flags);
+
+ /*
+ * Building block for bdrv_block_status[_above] and
+ * bdrv_is_allocated[_above]. The driver should answer only
+ * according to the current layer, and should only need to set
+ * BDRV_BLOCK_DATA, BDRV_BLOCK_ZERO, BDRV_BLOCK_OFFSET_VALID,
+ * and/or BDRV_BLOCK_RAW; if the current layer defers to a backing
+ * layer, the result should be 0 (and not BDRV_BLOCK_ZERO). See
+ * block.h for the overall meaning of the bits. As a hint, the
+ * flag want_zero is true if the caller cares more about precise
+ * mappings (favor accurate _OFFSET_VALID/_ZERO) or false for
+ * overall allocation (favor larger *pnum, perhaps by reporting
+ * _DATA instead of _ZERO). The block layer guarantees input
+ * clamped to bdrv_getlength() and aligned to request_alignment,
+ * as well as non-NULL pnum, map, and file; in turn, the driver
+ * must return an error or set pnum to an aligned non-zero value.
+ *
+ * Note that @bytes is just a hint on how big of a region the
+ * caller wants to inspect. It is not a limit on *pnum.
+ * Implementations are free to return larger values of *pnum if
+ * doing so does not incur a performance penalty.
+ *
+ * block/io.c's bdrv_co_block_status() will utilize an unclamped
+ * *pnum value for the block-status cache on protocol nodes, prior
+ * to clamping *pnum for return to its caller.
+ */
+ int coroutine_fn (*bdrv_co_block_status)(BlockDriverState *bs,
+ bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum,
+ int64_t *map, BlockDriverState **file);
+
+ /*
+ * This informs the driver that we are no longer interested in the result
+ * of in-flight requests, so don't waste the time if possible.
+ *
+ * One example usage is to avoid waiting for an nbd target node reconnect
+ * timeout during job-cancel with force=true.
+ */
+ void (*bdrv_cancel_in_flight)(BlockDriverState *bs);
+
+ /*
+     * Invalidate any cached metadata.
+ */
+ void coroutine_fn (*bdrv_co_invalidate_cache)(BlockDriverState *bs,
+ Error **errp);
+ int (*bdrv_inactivate)(BlockDriverState *bs);
+
+ /*
+ * Flushes all data for all layers by calling bdrv_co_flush for underlying
+     * layers, if needed. This function is necessary for deterministic
+ * synchronization of the flush finishing callback.
+ */
+ int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs);
+
+ /* Delete a created file. */
+ int coroutine_fn (*bdrv_co_delete_file)(BlockDriverState *bs,
+ Error **errp);
+
+ /*
+ * Flushes all data that was already written to the OS all the way down to
+ * the disk (for example file-posix.c calls fsync()).
+ */
+ int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs);
+
+ /*
+ * Flushes all internal caches to the OS. The data may still sit in a
+ * writeback cache of the host OS, but it will survive a crash of the qemu
+ * process.
+ */
+ int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs);
+
+ /*
+ * Drivers setting this field must be able to work with just a plain
+ * filename with '<protocol_name>:' as a prefix, and no other options.
+ * Options may be extracted from the filename by implementing
+ * bdrv_parse_filename.
+ */
+ const char *protocol_name;
+
+ /*
+ * Truncate @bs to @offset bytes using the given @prealloc mode
+ * when growing. Modes other than PREALLOC_MODE_OFF should be
+ * rejected when shrinking @bs.
+ *
+ * If @exact is true, @bs must be resized to exactly @offset.
+ * Otherwise, it is sufficient for @bs (if it is a host block
+ * device and thus there is no way to resize it) to be at least
+ * @offset bytes in length.
+ *
+ * If @exact is true and this function fails but would succeed
+ * with @exact = false, it should return -ENOTSUP.
+ */
+ int coroutine_fn (*bdrv_co_truncate)(BlockDriverState *bs, int64_t offset,
+ bool exact, PreallocMode prealloc,
+ BdrvRequestFlags flags, Error **errp);
+
+ int64_t (*bdrv_getlength)(BlockDriverState *bs);
+ bool has_variable_length;
+ int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs);
+ BlockMeasureInfo *(*bdrv_measure)(QemuOpts *opts, BlockDriverState *in_bs,
+ Error **errp);
+
+ int coroutine_fn (*bdrv_co_pwritev_compressed)(BlockDriverState *bs,
+ int64_t offset, int64_t bytes, QEMUIOVector *qiov);
+ int coroutine_fn (*bdrv_co_pwritev_compressed_part)(BlockDriverState *bs,
+ int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset);
+
+ int (*bdrv_snapshot_create)(BlockDriverState *bs,
+ QEMUSnapshotInfo *sn_info);
+ int (*bdrv_snapshot_goto)(BlockDriverState *bs,
+ const char *snapshot_id);
+ int (*bdrv_snapshot_delete)(BlockDriverState *bs,
+ const char *snapshot_id,
+ const char *name,
+ Error **errp);
+ int (*bdrv_snapshot_list)(BlockDriverState *bs,
+ QEMUSnapshotInfo **psn_info);
+ int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs,
+ const char *snapshot_id,
+ const char *name,
+ Error **errp);
+ int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi);
+ ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs,
+ Error **errp);
+ BlockStatsSpecific *(*bdrv_get_specific_stats)(BlockDriverState *bs);
+
+ int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs,
+ QEMUIOVector *qiov,
+ int64_t pos);
+ int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs,
+ QEMUIOVector *qiov,
+ int64_t pos);
+
+ int (*bdrv_change_backing_file)(BlockDriverState *bs,
+ const char *backing_file, const char *backing_fmt);
+
+ /* removable device specific */
+ bool (*bdrv_is_inserted)(BlockDriverState *bs);
+ void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
+ void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked);
+
+ /* to control generic scsi devices */
+ BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs,
+ unsigned long int req, void *buf,
+ BlockCompletionFunc *cb, void *opaque);
+ int coroutine_fn (*bdrv_co_ioctl)(BlockDriverState *bs,
+ unsigned long int req, void *buf);
+
+ /* List of options for creating images, terminated by name == NULL */
+ QemuOptsList *create_opts;
+
+ /* List of options for image amend */
+ QemuOptsList *amend_opts;
+
+ /*
+     * If this driver supports reopening images, this contains a
+     * NULL-terminated list of the runtime options that can be
+     * modified. If an option in this list is unspecified during
+     * reopen, it _must_ be reset to its default value or the reopen
+     * must fail with an error.
+ */
+ const char *const *mutable_opts;
+
+ /*
+     * Returns 0 for a completed check, -errno for internal errors.
+     * The check results are stored in @result.
+ */
+ int coroutine_fn (*bdrv_co_check)(BlockDriverState *bs,
+ BdrvCheckResult *result,
+ BdrvCheckMode fix);
+
+ void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event);
+
+ /* TODO Better pass a option string/QDict/QemuOpts to add any rule? */
+ int (*bdrv_debug_breakpoint)(BlockDriverState *bs, const char *event,
+ const char *tag);
+ int (*bdrv_debug_remove_breakpoint)(BlockDriverState *bs,
+ const char *tag);
+ int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
+ bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);
+
+ void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp);
+
+ /*
+ * Returns 1 if newly created images are guaranteed to contain only
+ * zeros, 0 otherwise.
+ */
+ int (*bdrv_has_zero_init)(BlockDriverState *bs);
+
+ /* Remove fd handlers, timers, and other event loop callbacks so the event
+ * loop is no longer in use. Called with no in-flight requests and in
+ * depth-first traversal order with parents before child nodes.
+ */
+ void (*bdrv_detach_aio_context)(BlockDriverState *bs);
+
+ /* Add fd handlers, timers, and other event loop callbacks so I/O requests
+ * can be processed again. Called with no in-flight requests and in
+ * depth-first traversal order with child nodes before parent nodes.
+ */
+ void (*bdrv_attach_aio_context)(BlockDriverState *bs,
+ AioContext *new_context);
+
+ /* io queue for linux-aio */
+ void (*bdrv_io_plug)(BlockDriverState *bs);
+ void (*bdrv_io_unplug)(BlockDriverState *bs);
+
+ /**
+ * Try to get @bs's logical and physical block size.
+ * On success, store them in @bsz and return zero.
+ * On failure, return negative errno.
+ */
+ int (*bdrv_probe_blocksizes)(BlockDriverState *bs, BlockSizes *bsz);
+ /**
+ * Try to get @bs's geometry (cyls, heads, sectors)
+ * On success, store them in @geo and return 0.
+ * On failure return -errno.
+ * Only drivers that want to override guest geometry implement this
+ * callback; see hd_geometry_guess().
+ */
+ int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);
+
+ /**
+     * bdrv_co_drain_begin is called, if implemented, at the beginning of a
+     * drain operation to drain and stop any internal sources of requests in
+     * the driver.
+     * bdrv_co_drain_end is called, if implemented, at the end of the drain.
+ *
+ * They should be used by the driver to e.g. manage scheduled I/O
+ * requests, or toggle an internal state. After the end of the drain new
+ * requests will continue normally.
+ */
+ void coroutine_fn (*bdrv_co_drain_begin)(BlockDriverState *bs);
+ void coroutine_fn (*bdrv_co_drain_end)(BlockDriverState *bs);
+
+ void (*bdrv_add_child)(BlockDriverState *parent, BlockDriverState *child,
+ Error **errp);
+ void (*bdrv_del_child)(BlockDriverState *parent, BdrvChild *child,
+ Error **errp);
+
+ /**
+ * Informs the block driver that a permission change is intended. The
+ * driver checks whether the change is permissible and may take other
+ * preparations for the change (e.g. get file system locks). This operation
+     * is always followed by a call to either .bdrv_set_perm or
+ * .bdrv_abort_perm_update.
+ *
+ * Checks whether the requested set of cumulative permissions in @perm
+ * can be granted for accessing @bs and whether no other users are using
+ * permissions other than those given in @shared (both arguments take
+ * BLK_PERM_* bitmasks).
+ *
+ * If both conditions are met, 0 is returned. Otherwise, -errno is returned
+ * and errp is set to an error describing the conflict.
+ */
+ int (*bdrv_check_perm)(BlockDriverState *bs, uint64_t perm,
+ uint64_t shared, Error **errp);
+
+ /**
+     * Called to inform the driver that the cumulative set of used permissions
+     * for @bs has changed to @perm, and the set of sharable permissions to
+     * @shared. The driver can use this to propagate changes to
+ * its children (i.e. request permissions only if a parent actually needs
+ * them).
+ *
+ * This function is only invoked after bdrv_check_perm(), so block drivers
+ * may rely on preparations made in their .bdrv_check_perm implementation.
+ */
+ void (*bdrv_set_perm)(BlockDriverState *bs, uint64_t perm, uint64_t shared);
+
+ /*
+ * Called to inform the driver that after a previous bdrv_check_perm()
+ * call, the permission update is not performed and any preparations made
+ * for it (e.g. taken file locks) need to be undone.
+ *
+ * This function can be called even for nodes that never saw a
+ * bdrv_check_perm() call. It is a no-op then.
+ */
+ void (*bdrv_abort_perm_update)(BlockDriverState *bs);
+
+ /**
+ * Returns in @nperm and @nshared the permissions that the driver for @bs
+ * needs on its child @c, based on the cumulative permissions requested by
+ * the parents in @parent_perm and @parent_shared.
+ *
+ * If @c is NULL, return the permissions for attaching a new child for the
+ * given @child_class and @role.
+ *
+ * If @reopen_queue is non-NULL, don't return the currently needed
+ * permissions, but those that will be needed after applying the
+ * @reopen_queue.
+ */
+ void (*bdrv_child_perm)(BlockDriverState *bs, BdrvChild *c,
+ BdrvChildRole role,
+ BlockReopenQueue *reopen_queue,
+ uint64_t parent_perm, uint64_t parent_shared,
+ uint64_t *nperm, uint64_t *nshared);
+
+ bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs);
+ bool (*bdrv_co_can_store_new_dirty_bitmap)(BlockDriverState *bs,
+ const char *name,
+ uint32_t granularity,
+ Error **errp);
+ int (*bdrv_co_remove_persistent_dirty_bitmap)(BlockDriverState *bs,
+ const char *name,
+ Error **errp);
+
+ /**
+     * Register/unregister a buffer for I/O. For example, a driver may want
+     * to know the memory areas that will later be used in iovs, so that it
+     * can do IOMMU mapping with VFIO etc. in advance for better performance.
+     * In the case of VFIO drivers, this callback is used to do DMA mapping
+     * for hot buffers.
+ */
+ void (*bdrv_register_buf)(BlockDriverState *bs, void *host, size_t size);
+ void (*bdrv_unregister_buf)(BlockDriverState *bs, void *host);
+ QLIST_ENTRY(BlockDriver) list;
+
+ /* Pointer to a NULL-terminated array of names of strong options
+ * that can be specified for bdrv_open(). A strong option is one
+ * that changes the data of a BDS.
+ * If this pointer is NULL, the array is considered empty.
+ * "filename" and "driver" are always considered strong. */
+ const char *const *strong_runtime_opts;
+};
+
+static inline bool block_driver_can_compress(BlockDriver *drv)
+{
+ return drv->bdrv_co_pwritev_compressed ||
+ drv->bdrv_co_pwritev_compressed_part;
+}
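+
+/*
+ * A sketch of a minimal driver definition (illustrative subset only;
+ * BDRVExampleState and the example_* callbacks are hypothetical, and a
+ * real driver sets many more fields):
+ *
+ *     static BlockDriver bdrv_example = {
+ *         .format_name    = "example",
+ *         .instance_size  = sizeof(BDRVExampleState),
+ *         .bdrv_open      = example_open,
+ *         .bdrv_close     = example_close,
+ *         .bdrv_co_preadv = example_co_preadv,
+ *         .bdrv_getlength = example_getlength,
+ *     };
+ *
+ * Drivers are made known to the block layer by passing such a structure
+ * to bdrv_register(), typically from a block_init() constructor.
+ */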
+
+typedef struct BlockLimits {
+ /* Alignment requirement, in bytes, for offset/length of I/O
+ * requests. Must be a power of 2 less than INT_MAX; defaults to
+ * 1 for drivers with modern byte interfaces, and to 512
+ * otherwise. */
+ uint32_t request_alignment;
+
+ /*
+     * Maximum number of bytes that can be discarded at once. Must be a
+     * multiple of pdiscard_alignment, but need not be a power of 2. May be
+     * 0 if there is no inherent 64-bit limit.
+ */
+ int64_t max_pdiscard;
+
+ /* Optimal alignment for discard requests in bytes. A power of 2
+ * is best but not mandatory. Must be a multiple of
+ * bl.request_alignment, and must be less than max_pdiscard if
+ * that is set. May be 0 if bl.request_alignment is good enough */
+ uint32_t pdiscard_alignment;
+
+ /*
+     * Maximum number of bytes that can be zeroed at once. Must be a multiple
+     * of pwrite_zeroes_alignment. 0 means no limit.
+ */
+ int64_t max_pwrite_zeroes;
+
+ /* Optimal alignment for write zeroes requests in bytes. A power
+ * of 2 is best but not mandatory. Must be a multiple of
+ * bl.request_alignment, and must be less than max_pwrite_zeroes
+ * if that is set. May be 0 if bl.request_alignment is good
+ * enough */
+ uint32_t pwrite_zeroes_alignment;
+
+ /* Optimal transfer length in bytes. A power of 2 is best but not
+ * mandatory. Must be a multiple of bl.request_alignment, or 0 if
+ * no preferred size */
+ uint32_t opt_transfer;
+
+ /* Maximal transfer length in bytes. Need not be power of 2, but
+ * must be multiple of opt_transfer and bl.request_alignment, or 0
+ * for no 32-bit limit. For now, anything larger than INT_MAX is
+ * clamped down. */
+ uint32_t max_transfer;
+
+ /* Maximal hardware transfer length in bytes. Applies whenever
+ * transfers to the device bypass the kernel I/O scheduler, for
+ * example with SG_IO. If larger than max_transfer or if zero,
+ * blk_get_max_hw_transfer will fall back to max_transfer.
+ */
+ uint64_t max_hw_transfer;
+
+ /* Maximal number of scatter/gather elements allowed by the hardware.
+ * Applies whenever transfers to the device bypass the kernel I/O
+ * scheduler, for example with SG_IO. If larger than max_iov
+ * or if zero, blk_get_max_hw_iov will fall back to max_iov.
+ */
+ int max_hw_iov;
+
+    /* memory alignment, in bytes, so that no bounce buffer is needed */
+ size_t min_mem_alignment;
+
+ /* memory alignment, in bytes, for bounce buffer */
+ size_t opt_mem_alignment;
+
+ /* maximum number of iovec elements */
+ int max_iov;
+} BlockLimits;
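+
+/*
+ * A sketch of a driver filling in its limits from .bdrv_refresh_limits
+ * (hypothetical callback; the values are arbitrary examples that respect
+ * the constraints documented above, with MiB from qemu/units.h):
+ *
+ *     static void example_refresh_limits(BlockDriverState *bs, Error **errp)
+ *     {
+ *         bs->bl.request_alignment = 4096;   // all I/O in 4k units
+ *         bs->bl.opt_transfer = 1 * MiB;     // multiple of request_alignment
+ *         bs->bl.max_transfer = 16 * MiB;    // multiple of opt_transfer
+ *     }
+ */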
+
+typedef struct BdrvOpBlocker BdrvOpBlocker;
+
+typedef struct BdrvAioNotifier {
+ void (*attached_aio_context)(AioContext *new_context, void *opaque);
+ void (*detach_aio_context)(void *opaque);
+
+ void *opaque;
+ bool deleted;
+
+ QLIST_ENTRY(BdrvAioNotifier) list;
+} BdrvAioNotifier;
+
+struct BdrvChildClass {
+ /* If true, bdrv_replace_node() doesn't change the node this BdrvChild
+ * points to. */
+ bool stay_at_node;
+
+ /* If true, the parent is a BlockDriverState and bdrv_next_all_states()
+ * will return it. This information is used for drain_all, where every node
+ * will be drained separately, so the drain only needs to be propagated to
+ * non-BDS parents. */
+ bool parent_is_bds;
+
+ void (*inherit_options)(BdrvChildRole role, bool parent_is_format,
+ int *child_flags, QDict *child_options,
+ int parent_flags, QDict *parent_options);
+
+ void (*change_media)(BdrvChild *child, bool load);
+ void (*resize)(BdrvChild *child);
+
+ /* Returns a name that is supposedly more useful for human users than the
+ * node name for identifying the node in question (in particular, a BB
+ * name), or NULL if the parent can't provide a better name. */
+ const char *(*get_name)(BdrvChild *child);
+
+ /* Returns a malloced string that describes the parent of the child for a
+ * human reader. This could be a node-name, BlockBackend name, qdev ID or
+ * QOM path of the device owning the BlockBackend, job type and ID etc. The
+ * caller is responsible for freeing the memory. */
+ char *(*get_parent_desc)(BdrvChild *child);
+
+ /*
+ * If this pair of functions is implemented, the parent doesn't issue new
+ * requests after returning from .drained_begin() until .drained_end() is
+ * called.
+ *
+ * These functions must not change the graph (and therefore also must not
+ * call aio_poll(), which could change the graph indirectly).
+ *
+ * If drained_end() schedules background operations, it must atomically
+ * increment *drained_end_counter for each such operation and atomically
+ * decrement it once the operation has settled.
+ *
+ * Note that this can be nested. If drained_begin() was called twice, new
+ * I/O is allowed only after drained_end() was called twice, too.
+ */
+ void (*drained_begin)(BdrvChild *child);
+ void (*drained_end)(BdrvChild *child, int *drained_end_counter);
+
+ /*
+ * Returns whether the parent has pending requests for the child. This
+ * callback is polled after .drained_begin() has been called until all
+ * activity on the child has stopped.
+ */
+ bool (*drained_poll)(BdrvChild *child);
+
+ /* Notifies the parent that the child has been activated/inactivated (e.g.
+ * when migration is completing) and it can start/stop requesting
+ * permissions and doing I/O on it. */
+ void (*activate)(BdrvChild *child, Error **errp);
+ int (*inactivate)(BdrvChild *child);
+
+ void (*attach)(BdrvChild *child);
+ void (*detach)(BdrvChild *child);
+
+ /* Notifies the parent that the filename of its child has changed (e.g.
+ * because the direct child was removed from the backing chain), so that it
+ * can update its reference. */
+ int (*update_filename)(BdrvChild *child, BlockDriverState *new_base,
+ const char *filename, Error **errp);
+
+ bool (*can_set_aio_ctx)(BdrvChild *child, AioContext *ctx,
+ GSList **ignore, Error **errp);
+ void (*set_aio_ctx)(BdrvChild *child, AioContext *ctx, GSList **ignore);
+
+ AioContext *(*get_parent_aio_context)(BdrvChild *child);
+};
+
+extern const BdrvChildClass child_of_bds;
+
+struct BdrvChild {
+ BlockDriverState *bs;
+ char *name;
+ const BdrvChildClass *klass;
+ BdrvChildRole role;
+ void *opaque;
+
+ /**
+ * Granted permissions for operating on this BdrvChild (BLK_PERM_* bitmask)
+ */
+ uint64_t perm;
+
+ /**
+ * Permissions that can still be granted to other users of @bs while this
+ * BdrvChild is still attached to it. (BLK_PERM_* bitmask)
+ */
+ uint64_t shared_perm;
+
+ /*
+ * This link is frozen: the child can neither be replaced nor
+ * detached from the parent.
+ */
+ bool frozen;
+
+ /*
+ * How many times the parent of this child has been drained
+ * (through klass->drained_*).
+ * Usually, this is equal to bs->quiesce_counter (potentially
+ * reduced by bdrv_drain_all_count). It may differ while the
+ * child is entering or leaving a drained section.
+ */
+ int parent_quiesce_counter;
+
+ QLIST_ENTRY(BdrvChild) next;
+ QLIST_ENTRY(BdrvChild) next_parent;
+};
+
+/*
+ * Allows bdrv_co_block_status() to cache one data region for a
+ * protocol node.
+ *
+ * @valid: Whether the cache is valid (should be accessed with atomic
+ * functions so this can be reset by RCU readers)
+ * @data_start: Offset where we know (or strongly assume) there is data
+ * @data_end: Offset where the data region ends (which is not necessarily
+ * the start of a zeroed region)
+ */
+typedef struct BdrvBlockStatusCache {
+ struct rcu_head rcu;
+
+ bool valid;
+ int64_t data_start;
+ int64_t data_end;
+} BdrvBlockStatusCache;
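+
+/*
+ * A sketch of the intended read pattern (RCU read guard for the pointer,
+ * atomic access to @valid, per the rules above):
+ *
+ *     WITH_RCU_READ_LOCK_GUARD() {
+ *         BdrvBlockStatusCache *bsc =
+ *             qatomic_rcu_read(&bs->block_status_cache);
+ *
+ *         if (qatomic_read(&bsc->valid) &&
+ *             offset >= bsc->data_start && offset < bsc->data_end) {
+ *             ... cache hit: the range is known to contain data ...
+ *         }
+ *     }
+ */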
+
+struct BlockDriverState {
+ /* Protected by big QEMU lock or read-only after opening. No special
+ * locking needed during I/O...
+ */
+ int open_flags; /* flags used to open the file, re-used for re-open */
+ bool encrypted; /* if true, the media is encrypted */
+ bool sg; /* if true, the device is a /dev/sg* */
+ bool probed; /* if true, format was probed rather than specified */
+ bool force_share; /* if true, always allow all shared permissions */
+ bool implicit; /* if true, this filter node was automatically inserted */
+
+ BlockDriver *drv; /* NULL means no media */
+ void *opaque;
+
+ AioContext *aio_context; /* event loop used for fd handlers, timers, etc */
+ /* long-running tasks intended to always use the same AioContext as this
+ * BDS may register themselves in this list to be notified of changes
+ * regarding this BDS's context */
+ QLIST_HEAD(, BdrvAioNotifier) aio_notifiers;
+ bool walking_aio_notifiers; /* to make removal during iteration safe */
+
+ char filename[PATH_MAX];
+ /*
+ * If not empty, this image is a diff in relation to backing_file.
+ * Note that this is the name given in the image header and
+ * therefore may or may not be equal to .backing->bs->filename.
+ * If this field contains a relative path, it is to be resolved
+ * relatively to the overlay's location.
+ */
+ char backing_file[PATH_MAX];
+ /*
+ * The backing filename indicated by the image header. Contrary
+ * to backing_file, if we ever open this file, auto_backing_file
+ * is replaced by the resulting BDS's filename (i.e. after a
+ * bdrv_refresh_filename() run).
+ */
+ char auto_backing_file[PATH_MAX];
+ char backing_format[16]; /* if non-zero and backing_file exists */
+
+ QDict *full_open_options;
+ char exact_filename[PATH_MAX];
+
+ BdrvChild *backing;
+ BdrvChild *file;
+
+ /* I/O Limits */
+ BlockLimits bl;
+
+ /*
+ * Flags honored during pread
+ */
+ unsigned int supported_read_flags;
+ /* Flags honored during pwrite (so far: BDRV_REQ_FUA,
+ * BDRV_REQ_WRITE_UNCHANGED).
+ * If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those
+ * writes will be issued as normal writes without the flag set.
+ * This is important to note for drivers that do not explicitly
+ * request a WRITE permission for their children and instead take
+ * the same permissions as their parent did (this is commonly what
+ * block filters do). Such drivers have to be aware that the
+ * parent may have taken a WRITE_UNCHANGED permission only and is
+ * issuing such requests. Drivers either must make sure that
+ * these requests do not result in plain WRITE accesses (usually
+ * by supporting BDRV_REQ_WRITE_UNCHANGED, and then forwarding
+ * every incoming write request as-is, including potentially that
+ * flag), or they have to explicitly take the WRITE permission for
+ * their children. */
+ unsigned int supported_write_flags;
+ /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
+ * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) */
+ unsigned int supported_zero_flags;
+ /*
+ * Flags honoured during truncate (so far: BDRV_REQ_ZERO_WRITE).
+ *
+ * If BDRV_REQ_ZERO_WRITE is given, the truncate operation must make sure
+ * that any added space reads as all zeros. If this can't be guaranteed,
+ * the operation must fail.
+ */
+ unsigned int supported_truncate_flags;
+
+ /* the following member gives a name to every node on the bs graph. */
+ char node_name[32];
+ /* element of the list of named nodes building the graph */
+ QTAILQ_ENTRY(BlockDriverState) node_list;
+ /* element of the list of all BlockDriverStates (all_bdrv_states) */
+ QTAILQ_ENTRY(BlockDriverState) bs_list;
+ /* element of the list of monitor-owned BDS */
+ QTAILQ_ENTRY(BlockDriverState) monitor_list;
+ int refcnt;
+
+ /* operation blockers */
+ QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];
+
+ /* The node that this node inherited default options from (and a reopen on
+ * which can affect this node by changing these defaults). This is always a
+ * parent node of this node. */
+ BlockDriverState *inherits_from;
+ QLIST_HEAD(, BdrvChild) children;
+ QLIST_HEAD(, BdrvChild) parents;
+
+ QDict *options;
+ QDict *explicit_options;
+ BlockdevDetectZeroesOptions detect_zeroes;
+
+ /* The error object in use for blocking operations on backing_hd */
+ Error *backing_blocker;
+
+ /* Protected by AioContext lock */
+
+ /* If we are reading a disk image, give its size in sectors.
+ * Generally read-only; it is written to by load_snapshot and
+ * save_snapshot, but the block layer is quiescent during those.
+ */
+ int64_t total_sectors;
+
+ /* threshold limit for writes, in bytes. "High water mark". */
+ uint64_t write_threshold_offset;
+
+ /* Writing to the list requires the BQL _and_ the dirty_bitmap_mutex.
+ * Reading from the list can be done with either the BQL or the
+ * dirty_bitmap_mutex. Modifying a bitmap only requires
+ * dirty_bitmap_mutex. */
+ QemuMutex dirty_bitmap_mutex;
+ QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
+
+ /* Offset after the highest byte written to */
+ Stat64 wr_highest_offset;
+
+ /* If nonzero, copy read backing sectors into image. Can be >1 if more
+ * than one client has requested copy-on-read. Accessed with atomic
+ * ops.
+ */
+ int copy_on_read;
+
+ /* number of in-flight requests; overall and serialising.
+ * Accessed with atomic ops.
+ */
+ unsigned int in_flight;
+ unsigned int serialising_in_flight;
+
+ /* counter for nested bdrv_io_plug.
+ * Accessed with atomic ops.
+ */
+ unsigned io_plugged;
+
+ /* do we need to tell the guest if we have a volatile write cache? */
+ int enable_write_cache;
+
+ /* Accessed with atomic ops. */
+ int quiesce_counter;
+ int recursive_quiesce_counter;
+
+ unsigned int write_gen; /* Current data generation */
+
+ /* The following fields are protected by reqs_lock. */
+ CoMutex reqs_lock;
+ QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
+ CoQueue flush_queue; /* Serializing flush queue */
+ bool active_flush_req; /* Flush request in flight? */
+
+ /* Only read/written by whoever has set active_flush_req to true. */
+ unsigned int flushed_gen; /* Flushed write generation */
+
+ /* BdrvChild links to this node may never be frozen */
+ bool never_freeze;
+
+ /* Lock for block-status cache RCU writers */
+ CoMutex bsc_modify_lock;
+ /* Always non-NULL, but must only be dereferenced under an RCU read guard */
+ BdrvBlockStatusCache *block_status_cache;
+};
+
+struct BlockBackendRootState {
+ int open_flags;
+ BlockdevDetectZeroesOptions detect_zeroes;
+};
+
+typedef enum BlockMirrorBackingMode {
+ /* Reuse the existing backing chain from the source for the target.
+ * - sync=full: Set backing BDS to NULL.
+ * - sync=top: Use source's backing BDS.
+ * - sync=none: Use source as the backing BDS. */
+ MIRROR_SOURCE_BACKING_CHAIN,
+
+ /* Open the target's backing chain completely anew */
+ MIRROR_OPEN_BACKING_CHAIN,
+
+ /* Do not change the target's backing BDS after job completion */
+ MIRROR_LEAVE_BACKING_CHAIN,
+} BlockMirrorBackingMode;
+
+
+/* Essential block drivers which must always be statically linked into qemu, and
+ * which therefore can be accessed without using bdrv_find_format() */
+extern BlockDriver bdrv_file;
+extern BlockDriver bdrv_raw;
+extern BlockDriver bdrv_qcow2;
+
+int coroutine_fn bdrv_co_preadv(BdrvChild *child,
+ int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
+ int64_t offset, int64_t bytes,
+ QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags);
+int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
+ int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+ BdrvRequestFlags flags);
+int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
+ int64_t offset, int64_t bytes,
+ QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags);
+
+static inline int coroutine_fn bdrv_co_pread(BdrvChild *child,
+ int64_t offset, unsigned int bytes, void *buf, BdrvRequestFlags flags)
+{
+ QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
+
+ return bdrv_co_preadv(child, offset, bytes, &qiov, flags);
+}
+
+static inline int coroutine_fn bdrv_co_pwrite(BdrvChild *child,
+ int64_t offset, unsigned int bytes, void *buf, BdrvRequestFlags flags)
+{
+ QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
+
+ return bdrv_co_pwritev(child, offset, bytes, &qiov, flags);
+}
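+/*
+ * Illustrative sketch, not part of the original header: from coroutine
+ * context, the buffer-based wrappers above avoid building a QEMUIOVector
+ * by hand. Here "child" stands for any valid BdrvChild, e.g. bs->file:
+ *
+ *     uint8_t buf[512];
+ *     int ret = bdrv_co_pread(child, 0, sizeof(buf), buf, 0);
+ *     if (ret < 0) {
+ *         return ret;    (negative errno on failure)
+ *     }
+ */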
+
+extern unsigned int bdrv_drain_all_count;
+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
+
+bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
+ uint64_t align);
+BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs);
+
+int get_tmp_filename(char *filename, int size);
+BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
+ const char *filename);
+
+void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
+ QDict *options);
+
+bool bdrv_backing_overridden(BlockDriverState *bs);
+
+
+/**
+ * bdrv_add_aio_context_notifier:
+ *
+ * If a long-running job intends to always run in the same AioContext as a
+ * certain BDS, it may use this function to be notified of changes regarding the
+ * association of the BDS to an AioContext.
+ *
+ * attached_aio_context() is called after the target BDS has been attached to a
+ * new AioContext; detach_aio_context() is called before the target BDS is being
+ * detached from its old AioContext.
+ */
+void bdrv_add_aio_context_notifier(BlockDriverState *bs,
+ void (*attached_aio_context)(AioContext *new_context, void *opaque),
+ void (*detach_aio_context)(void *opaque), void *opaque);
+
+/**
+ * bdrv_remove_aio_context_notifier:
+ *
+ * Unsubscribe from change notifications regarding the BDS's AioContext. The
+ * parameters given here have to be the same as those given to
+ * bdrv_add_aio_context_notifier().
+ */
+void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
+ void (*aio_context_attached)(AioContext *,
+ void *),
+ void (*aio_context_detached)(void *),
+ void *opaque);
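+/*
+ * Illustrative sketch, not part of the original header: a task that must
+ * follow @bs between AioContexts pairs these two calls (my_attach and
+ * my_detach are hypothetical callbacks):
+ *
+ *     bdrv_add_aio_context_notifier(bs, my_attach, my_detach, task);
+ *     ... run in bs's context, moving along on notification ...
+ *     bdrv_remove_aio_context_notifier(bs, my_attach, my_detach, task);
+ */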
+
+/**
+ * bdrv_wakeup:
+ * @bs: The BlockDriverState for which an I/O operation has been completed.
+ *
+ * Wake up the main thread if it is waiting on BDRV_POLL_WHILE. During
+ * synchronous I/O on a BlockDriverState that is attached to another
+ * I/O thread, the main thread lets the I/O thread's event loop run,
+ * waiting for the I/O operation to complete. A bdrv_wakeup will wake
+ * up the main thread if necessary.
+ *
+ * Manual calls to bdrv_wakeup are rarely necessary, because
+ * bdrv_dec_in_flight already calls it.
+ */
+void bdrv_wakeup(BlockDriverState *bs);
+
+#ifdef _WIN32
+int is_windows_drive(const char *filename);
+#endif
+
+/**
+ * stream_start:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Block device to operate on.
+ * @base: Block device that will become the new base, or %NULL to
+ * flatten the whole backing file chain onto @bs.
+ * @backing_file_str: The file name that will be written to @bs as the
+ * new backing file if the job completes. Ignored if @base is %NULL.
+ * @bottom: The last node in the chain that should be streamed into
+ * @bs. Mutually exclusive with @base.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_error: The action to take upon error.
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the stream job inserts into the graph above
+ * @bs. NULL means that a node name should be autogenerated.
+ * @errp: Error object.
+ *
+ * Start a streaming operation on @bs. Clusters that are unallocated
+ * in @bs, but allocated in any image between @base and @bs (both
+ * exclusive) will be written to @bs. At the end of a successful
+ * streaming job, the backing file of @bs will be changed to
+ * @backing_file_str in the written image and to @base in the live
+ * BlockDriverState.
+ */
+void stream_start(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *base, const char *backing_file_str,
+ BlockDriverState *bottom,
+ int creation_flags, int64_t speed,
+ BlockdevOnError on_error,
+ const char *filter_node_name,
+ Error **errp);
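+/*
+ * Illustrative sketch, not part of the original header: flattening the
+ * entire backing chain of "bs" into it, with the job id derived from
+ * bs's device name and default options:
+ *
+ *     stream_start(NULL, bs, NULL, NULL, NULL, JOB_DEFAULT, 0,
+ *                  BLOCKDEV_ON_ERROR_REPORT, NULL, &local_err);
+ */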
+
+/**
+ * commit_start:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Active block device.
+ * @top: Top block device to be committed.
+ * @base: Block device that will be written into, and become the new top.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_error: The action to take upon error.
+ * @backing_file_str: String to use as the backing file in @top's overlay
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the commit job inserts into the graph above @top. NULL means
+ * that a node name should be autogenerated.
+ * @errp: Error object.
+ *
+ */
+void commit_start(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *base, BlockDriverState *top,
+ int creation_flags, int64_t speed,
+ BlockdevOnError on_error, const char *backing_file_str,
+ const char *filter_node_name, Error **errp);
+/**
+ * commit_active_start:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Active block device to be committed.
+ * @base: Block device that will be written into, and become the new top.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_error: The action to take upon error.
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the commit job inserts into the graph above @bs. NULL means that
+ * a node name should be autogenerated.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @auto_complete: Auto complete the job.
+ * @errp: Error object.
+ *
+ */
+BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *base, int creation_flags,
+ int64_t speed, BlockdevOnError on_error,
+ const char *filter_node_name,
+ BlockCompletionFunc *cb, void *opaque,
+ bool auto_complete, Error **errp);
+/*
+ * mirror_start:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Block device to operate on.
+ * @target: Block device to write to.
+ * @replaces: Block graph node name to replace once the mirror is done. Can
+ * only be used when full mirroring is selected.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @granularity: The chosen granularity for the dirty bitmap.
+ * @buf_size: The amount of data that can be in flight at one time.
+ * @mode: Whether to collapse all images in the chain to the target.
+ * @backing_mode: How to establish the target's backing chain after completion.
+ * @zero_target: Whether the target should be explicitly zero-initialized
+ * @on_source_error: The action to take upon error reading from the source.
+ * @on_target_error: The action to take upon error writing to the target.
+ * @unmap: Whether to unmap target where source sectors only contain zeroes.
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the mirror job inserts into the graph above @bs. NULL means that
+ * a node name should be autogenerated.
+ * @copy_mode: When to trigger writes to the target.
+ * @errp: Error object.
+ *
+ * Start a mirroring operation on @bs. Clusters that are allocated
+ * in @bs will be written to @target until the job is cancelled or
+ * manually completed. At the end of a successful mirroring job,
+ * @bs will be switched to read from @target.
+ */
+void mirror_start(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *target, const char *replaces,
+ int creation_flags, int64_t speed,
+ uint32_t granularity, int64_t buf_size,
+ MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
+ bool zero_target,
+ BlockdevOnError on_source_error,
+ BlockdevOnError on_target_error,
+ bool unmap, const char *filter_node_name,
+ MirrorCopyMode copy_mode, Error **errp);
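+/*
+ * Illustrative sketch, not part of the original header: mirroring a node
+ * to a freshly added target with full sync, a fresh backing chain on the
+ * target, and default error handling:
+ *
+ *     mirror_start("job0", bs, target_bs, NULL, JOB_DEFAULT, 0,
+ *                  0, 0, MIRROR_SYNC_MODE_FULL, MIRROR_OPEN_BACKING_CHAIN,
+ *                  true, BLOCKDEV_ON_ERROR_REPORT,
+ *                  BLOCKDEV_ON_ERROR_REPORT, false, NULL,
+ *                  MIRROR_COPY_MODE_BACKGROUND, &local_err);
+ */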
+
+/*
+ * backup_job_create:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Block device to operate on.
+ * @target: Block device to write to.
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @sync_mode: What parts of the disk image should be copied to the destination.
+ * @sync_bitmap: The dirty bitmap if sync_mode is 'bitmap' or 'incremental'
+ * @bitmap_mode: The bitmap synchronization policy to use.
+ * @compress: Whether to write compressed data to the target.
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the backup job inserts into the graph above @bs. NULL means
+ * that a node name should be autogenerated.
+ * @perf: Performance options. All actual fields are assumed to be present;
+ * all ".has_*" fields are ignored.
+ * @on_source_error: The action to take upon error reading from the source.
+ * @on_target_error: The action to take upon error writing to the target.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @txn: Transaction that this job is part of (may be NULL).
+ *
+ * Create a backup operation on @bs. Clusters in @bs are written to @target
+ * until the job is cancelled or manually completed.
+ */
+BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *target, int64_t speed,
+ MirrorSyncMode sync_mode,
+ BdrvDirtyBitmap *sync_bitmap,
+ BitmapSyncMode bitmap_mode,
+ bool compress,
+ const char *filter_node_name,
+ BackupPerf *perf,
+ BlockdevOnError on_source_error,
+ BlockdevOnError on_target_error,
+ int creation_flags,
+ BlockCompletionFunc *cb, void *opaque,
+ JobTxn *txn, Error **errp);
+
+BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
+ const char *child_name,
+ const BdrvChildClass *child_class,
+ BdrvChildRole child_role,
+ uint64_t perm, uint64_t shared_perm,
+ void *opaque, Error **errp);
+void bdrv_root_unref_child(BdrvChild *child);
+
+void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
+ uint64_t *shared_perm);
+
+/**
+ * Sets a BdrvChild's permissions. Avoid if the parent is a BDS; use
+ * bdrv_child_refresh_perms() instead and make the parent's
+ * .bdrv_child_perm() implementation return the correct values.
+ */
+int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
+ Error **errp);
+
+/**
+ * Calls bs->drv->bdrv_child_perm() and updates the child's permission
+ * masks with the result.
+ * Drivers should invoke this function whenever an event occurs that
+ * makes their .bdrv_child_perm() implementation return different
+ * values than before, but which will not result in the block layer
+ * automatically refreshing the permissions.
+ */
+int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp);
+
+bool bdrv_recurse_can_replace(BlockDriverState *bs,
+ BlockDriverState *to_replace);
+
+/*
+ * Default implementation for BlockDriver.bdrv_child_perm() that can
+ * be used by block filters and image formats, as long as they use the
+ * child_of_bds child class and set an appropriate BdrvChildRole.
+ */
+void bdrv_default_perms(BlockDriverState *bs, BdrvChild *c,
+ BdrvChildRole role, BlockReopenQueue *reopen_queue,
+ uint64_t perm, uint64_t shared,
+ uint64_t *nperm, uint64_t *nshared);
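+/*
+ * Illustrative sketch, not part of the original header: a filter driver
+ * built on child_of_bds can usually just plug in the default ("my-filter"
+ * is a hypothetical driver):
+ *
+ *     static BlockDriver bdrv_my_filter = {
+ *         .format_name     = "my-filter",
+ *         .is_filter       = true,
+ *         .bdrv_child_perm = bdrv_default_perms,
+ *         ...
+ *     };
+ */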
+
+const char *bdrv_get_parent_name(const BlockDriverState *bs);
+void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp);
+bool blk_dev_has_removable_media(BlockBackend *blk);
+bool blk_dev_has_tray(BlockBackend *blk);
+void blk_dev_eject_request(BlockBackend *blk, bool force);
+bool blk_dev_is_tray_open(BlockBackend *blk);
+bool blk_dev_is_medium_locked(BlockBackend *blk);
+
+void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);
+
+void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
+void bdrv_restore_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *backup);
+bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
+ const BdrvDirtyBitmap *src,
+ HBitmap **backup, bool lock);
+
+void bdrv_inc_in_flight(BlockDriverState *bs);
+void bdrv_dec_in_flight(BlockDriverState *bs);
+
+void blockdev_close_all_bdrv_states(void);
+
+int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset,
+ BdrvChild *dst, int64_t dst_offset,
+ int64_t bytes,
+ BdrvRequestFlags read_flags,
+ BdrvRequestFlags write_flags);
+int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset,
+ BdrvChild *dst, int64_t dst_offset,
+ int64_t bytes,
+ BdrvRequestFlags read_flags,
+ BdrvRequestFlags write_flags);
+
+int refresh_total_sectors(BlockDriverState *bs, int64_t hint);
+
+void bdrv_set_monitor_owned(BlockDriverState *bs);
+BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp);
+
+/**
+ * Simple implementation of bdrv_co_create_opts for protocol drivers
+ * which only support creation via opening a file
+ * (usually an existing raw storage device)
+ */
+int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,
+ const char *filename,
+ QemuOpts *opts,
+ Error **errp);
+extern QemuOptsList bdrv_create_opts_simple;
+
+BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node,
+ const char *name,
+ BlockDriverState **pbs,
+ Error **errp);
+BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target,
+ BlockDirtyBitmapMergeSourceList *bms,
+ HBitmap **backup, Error **errp);
+BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name,
+ bool release,
+ BlockDriverState **bitmap_bs,
+ Error **errp);
+
+BdrvChild *bdrv_cow_child(BlockDriverState *bs);
+BdrvChild *bdrv_filter_child(BlockDriverState *bs);
+BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs);
+BdrvChild *bdrv_primary_child(BlockDriverState *bs);
+BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs);
+BlockDriverState *bdrv_skip_filters(BlockDriverState *bs);
+BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs);
+
+static inline BlockDriverState *child_bs(BdrvChild *child)
+{
+ return child ? child->bs : NULL;
+}
+
+static inline BlockDriverState *bdrv_cow_bs(BlockDriverState *bs)
+{
+ return child_bs(bdrv_cow_child(bs));
+}
+
+static inline BlockDriverState *bdrv_filter_bs(BlockDriverState *bs)
+{
+ return child_bs(bdrv_filter_child(bs));
+}
+
+static inline BlockDriverState *bdrv_filter_or_cow_bs(BlockDriverState *bs)
+{
+ return child_bs(bdrv_filter_or_cow_child(bs));
+}
+
+static inline BlockDriverState *bdrv_primary_bs(BlockDriverState *bs)
+{
+ return child_bs(bdrv_primary_child(bs));
+}
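+/*
+ * Illustrative sketch, not part of the original header: stepping from a
+ * node past any filters to the data-bearing node, and from there to its
+ * backing image:
+ *
+ *     BlockDriverState *data_bs = bdrv_skip_filters(bs);
+ *     BlockDriverState *backing = bdrv_cow_bs(data_bs);
+ *     if (backing) {
+ *         ... data_bs is a COW overlay on top of backing ...
+ *     }
+ */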
+
+/**
+ * End all quiescent sections started by bdrv_drain_all_begin(). This is
+ * needed when deleting a BDS before bdrv_drain_all_end() is called.
+ *
+ * NOTE: this is an internal helper for bdrv_close() *only*. No one else
+ * should call it.
+ */
+void bdrv_drain_all_end_quiesce(BlockDriverState *bs);
+
+/**
+ * Check whether the given offset is in the cached block-status data
+ * region.
+ *
+ * If it is, and @pnum is not NULL, *pnum is set to
+ * `bsc.data_end - offset`, i.e. how many bytes, starting from
+ * @offset, are data (according to the cache).
+ * Otherwise, *pnum is not touched.
+ */
+bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum);
+
+/**
+ * If [offset, offset + bytes) overlaps with the currently cached
+ * block-status region, invalidate the cache.
+ *
+ * (To be used by I/O paths that cause data regions to be zero or
+ * holes.)
+ */
+void bdrv_bsc_invalidate_range(BlockDriverState *bs,
+ int64_t offset, int64_t bytes);
+
+/**
+ * Mark the range [offset, offset + bytes) as a data region.
+ */
+void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes);
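+/*
+ * Illustrative sketch, not part of the original header: how a protocol
+ * driver's block-status path might use the cache (query_backend is a
+ * hypothetical slow path):
+ *
+ *     if (bdrv_bsc_is_data(bs, offset, pnum)) {
+ *         return BDRV_BLOCK_DATA;    (served from the cache)
+ *     }
+ *     ret = query_backend(bs, offset, bytes, &data_start, &data_bytes);
+ *     if (ret >= 0 && (ret & BDRV_BLOCK_DATA)) {
+ *         bdrv_bsc_fill(bs, data_start, data_bytes);
+ *     }
+ */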
+
+#endif /* BLOCK_INT_H */
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
new file mode 100644
index 000000000..d200f33c1
--- /dev/null
+++ b/include/block/blockjob.h
@@ -0,0 +1,176 @@
+/*
+ * Declarations for long-running block device operations
+ *
+ * Copyright (c) 2011 IBM Corp.
+ * Copyright (c) 2012 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCKJOB_H
+#define BLOCKJOB_H
+
+#include "qemu/job.h"
+#include "block/block.h"
+#include "qemu/ratelimit.h"
+
+#define BLOCK_JOB_SLICE_TIME 100000000ULL /* ns */
+
+typedef struct BlockJobDriver BlockJobDriver;
+
+/**
+ * BlockJob:
+ *
+ * Long-running operation on a BlockDriverState.
+ */
+typedef struct BlockJob {
+ /** Data belonging to the generic Job infrastructure */
+ Job job;
+
+ /** The block device on which the job is operating. */
+ BlockBackend *blk;
+
+ /** Status that is published by the query-block-jobs QMP API */
+ BlockDeviceIoStatus iostatus;
+
+ /** Speed that was set with @block_job_set_speed. */
+ int64_t speed;
+
+ /** Rate limiting data structure for implementing @speed. */
+ RateLimit limit;
+
+ /** Block other operations when block job is running */
+ Error *blocker;
+
+ /** Called when a cancelled job is finalised. */
+ Notifier finalize_cancelled_notifier;
+
+ /** Called when a successfully completed job is finalised. */
+ Notifier finalize_completed_notifier;
+
+ /** Called when the job transitions to PENDING */
+ Notifier pending_notifier;
+
+ /** Called when the job transitions to READY */
+ Notifier ready_notifier;
+
+ /** Called when the job coroutine yields or terminates */
+ Notifier idle_notifier;
+
+ /** BlockDriverStates that are involved in this block job */
+ GSList *nodes;
+} BlockJob;
+
+/**
+ * block_job_next:
+ * @job: A block job, or %NULL.
+ *
+ * Get the next element from the list of block jobs after @job, or the
+ * first one if @job is %NULL.
+ *
+ * Returns the requested job, or %NULL if there are no more jobs left.
+ */
+BlockJob *block_job_next(BlockJob *job);
+
+/**
+ * block_job_get:
+ * @id: The id of the block job.
+ *
+ * Get the block job identified by @id (which must not be %NULL).
+ *
+ * Returns the requested job, or %NULL if it doesn't exist.
+ */
+BlockJob *block_job_get(const char *id);
+
+/**
+ * block_job_add_bdrv:
+ * @job: A block job
+ * @name: The name to assign to the new BdrvChild
+ * @bs: A BlockDriverState that is involved in @job
+ * @perm, @shared_perm: Permissions to request on the node
+ *
+ * Add @bs to the list of BlockDriverState that are involved in
+ * @job. This means that all operations will be blocked on @bs while
+ * @job exists.
+ */
+int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs,
+ uint64_t perm, uint64_t shared_perm, Error **errp);
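+/*
+ * Illustrative sketch, not part of the original header: a job wrapper
+ * adding its target node so that conflicting operations on it are
+ * blocked for the lifetime of the job:
+ *
+ *     ret = block_job_add_bdrv(&job->common, "target", target_bs,
+ *                              0, BLK_PERM_ALL, errp);
+ *     if (ret < 0) {
+ *         goto fail;
+ *     }
+ */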
+
+/**
+ * block_job_remove_all_bdrv:
+ * @job: The block job
+ *
+ * Remove all BlockDriverStates from the list of nodes that are involved in the
+ * job. This removes the blockers added with block_job_add_bdrv().
+ */
+void block_job_remove_all_bdrv(BlockJob *job);
+
+/**
+ * block_job_has_bdrv:
+ * @job: The block job
+ * @bs: The node to search for
+ *
+ * Searches for @bs in the list of nodes that are involved in the
+ * job.
+ */
+bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs);
+
+/**
+ * block_job_set_speed:
+ * @job: The job to set the speed for.
+ * @speed: The new value
+ * @errp: Error object.
+ *
+ * Set a rate-limiting parameter for the job; the actual meaning may
+ * vary depending on the job type.
+ */
+bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp);
+
+/**
+ * block_job_query:
+ * @job: The job to get information about.
+ *
+ * Return information about a job.
+ */
+BlockJobInfo *block_job_query(BlockJob *job, Error **errp);
+
+/**
+ * block_job_iostatus_reset:
+ * @job: The job whose I/O status should be reset.
+ *
+ * Reset I/O status on @job and on BlockDriverState objects it uses,
+ * other than job->blk.
+ */
+void block_job_iostatus_reset(BlockJob *job);
+
+/**
+ * block_job_is_internal:
+ * @job: The job to determine if it is user-visible or not.
+ *
+ * Returns true if the job should not be visible to the management layer.
+ */
+bool block_job_is_internal(BlockJob *job);
+
+/**
+ * block_job_driver:
+ *
+ * Returns the driver associated with a block job.
+ */
+const BlockJobDriver *block_job_driver(BlockJob *job);
+
+#endif
diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
new file mode 100644
index 000000000..6633d83da
--- /dev/null
+++ b/include/block/blockjob_int.h
@@ -0,0 +1,122 @@
+/*
+ * Declarations for long-running block device operations
+ *
+ * Copyright (c) 2011 IBM Corp.
+ * Copyright (c) 2012 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCKJOB_INT_H
+#define BLOCKJOB_INT_H
+
+#include "block/blockjob.h"
+#include "block/block.h"
+
+/**
+ * BlockJobDriver:
+ *
+ * A class type for block job drivers.
+ */
+struct BlockJobDriver {
+ /** Generic JobDriver callbacks and settings */
+ JobDriver job_driver;
+
+ /*
+ * Returns whether the job has pending requests for the child or will
+ * submit new requests before the next pause point. This callback is polled
+ * in the context of draining a job node after requesting that the job be
+ * paused, until all activity on the child has stopped.
+ */
+ bool (*drained_poll)(BlockJob *job);
+
+ /*
+ * If the callback is not NULL, it will be invoked before the job is
+ * resumed in a new AioContext. This is the place to move any resources
+ * besides job->blk to the new AioContext.
+ */
+ void (*attached_aio_context)(BlockJob *job, AioContext *new_context);
+
+ void (*set_speed)(BlockJob *job, int64_t speed);
+};
+
+/**
+ * block_job_create:
+ * @job_id: The id of the newly-created job, or %NULL to have one
+ * generated automatically.
+ * @driver: The class object for the newly-created job.
+ * @txn: The transaction this job belongs to, if any. %NULL otherwise.
+ * @bs: The block device on which the job operates.
+ * @perm, @shared_perm: Permissions to request for @bs
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @flags: Creation flags for the Block Job. See @JobCreateFlags.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @errp: Error object.
+ *
+ * Create a new long-running block device job and return it. The job
+ * will call @cb asynchronously when the job completes. Note that
+ * @bs may have been closed by the time @cb is called. If
+ * this is the case, the job may be reported as either cancelled or
+ * completed.
+ *
+ * This function is not part of the public job interface; it should be
+ * called from a wrapper that is specific to the job type.
+ */
+void *block_job_create(const char *job_id, const BlockJobDriver *driver,
+ JobTxn *txn, BlockDriverState *bs, uint64_t perm,
+ uint64_t shared_perm, int64_t speed, int flags,
+ BlockCompletionFunc *cb, void *opaque, Error **errp);
+
+/**
+ * block_job_free:
+ * Callback to be used for JobDriver.free in all block jobs. Frees block job
+ * specific resources in @job.
+ */
+void block_job_free(Job *job);
+
+/**
+ * block_job_user_resume:
+ * Callback to be used for JobDriver.user_resume in all block jobs. Resets the
+ * iostatus when the user resumes @job.
+ */
+void block_job_user_resume(Job *job);
+
+/**
+ * block_job_ratelimit_get_delay:
+ *
+ * Calculate and return delay for the next request in ns. See the documentation
+ * of ratelimit_calculate_delay() for details.
+ */
+int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n);
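+/*
+ * Illustrative sketch, not part of the original header: the usual pattern
+ * in a job's main loop after processing n bytes:
+ *
+ *     delay_ns = block_job_ratelimit_get_delay(job, n);
+ *     if (delay_ns > 0) {
+ *         job_sleep_ns(&job->job, delay_ns);
+ *     }
+ */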
+
+/**
+ * block_job_error_action:
+ * @job: The job to signal an error for.
+ * @on_err: The error action setting.
+ * @is_read: Whether the operation was a read.
+ * @error: The error that was reported.
+ *
+ * Report an I/O error for a block job and possibly stop the VM. Return the
+ * action that was selected based on @on_err and @error.
+ */
+BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
+ int is_read, int error);
+
+#endif
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
new file mode 100644
index 000000000..40950ae3d
--- /dev/null
+++ b/include/block/dirty-bitmap.h
@@ -0,0 +1,121 @@
+#ifndef BLOCK_DIRTY_BITMAP_H
+#define BLOCK_DIRTY_BITMAP_H
+
+#include "qapi/qapi-types-block-core.h"
+#include "qemu/hbitmap.h"
+
+typedef enum BitmapCheckFlags {
+ BDRV_BITMAP_BUSY = 1,
+ BDRV_BITMAP_RO = 2,
+ BDRV_BITMAP_INCONSISTENT = 4,
+} BitmapCheckFlags;
+
+#define BDRV_BITMAP_DEFAULT (BDRV_BITMAP_BUSY | BDRV_BITMAP_RO | \
+ BDRV_BITMAP_INCONSISTENT)
+#define BDRV_BITMAP_ALLOW_RO (BDRV_BITMAP_BUSY | BDRV_BITMAP_INCONSISTENT)
+
+#define BDRV_BITMAP_MAX_NAME_SIZE 1023
+
+bool bdrv_supports_persistent_dirty_bitmap(BlockDriverState *bs);
+BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
+ uint32_t granularity,
+ const char *name,
+ Error **errp);
+int bdrv_dirty_bitmap_create_successor(BdrvDirtyBitmap *bitmap,
+ Error **errp);
+BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BdrvDirtyBitmap *bitmap,
+ Error **errp);
+BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+ Error **errp);
+void bdrv_dirty_bitmap_enable_successor(BdrvDirtyBitmap *bitmap);
+BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs,
+ const char *name);
+int bdrv_dirty_bitmap_check(const BdrvDirtyBitmap *bitmap, uint32_t flags,
+ Error **errp);
+void bdrv_release_dirty_bitmap(BdrvDirtyBitmap *bitmap);
+void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs);
+int bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name,
+ Error **errp);
+void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
+void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
+void bdrv_enable_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap);
+BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs);
+uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs);
+uint32_t bdrv_dirty_bitmap_granularity(const BdrvDirtyBitmap *bitmap);
+bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap);
+bool bdrv_dirty_bitmap_has_successor(BdrvDirtyBitmap *bitmap);
+const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap);
+int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap);
+void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+ int64_t offset, int64_t bytes);
+void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+ int64_t offset, int64_t bytes);
+BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap);
+void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter);
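+/*
+ * Illustrative sketch, not part of the original header: visiting every
+ * dirty offset recorded in a bitmap (bdrv_dirty_iter_next() is declared
+ * in the manual-locking section below):
+ *
+ *     BdrvDirtyBitmapIter *iter = bdrv_dirty_iter_new(bitmap);
+ *     int64_t offset;
+ *
+ *     while ((offset = bdrv_dirty_iter_next(iter)) >= 0) {
+ *         ... handle the dirty range starting at offset ...
+ *     }
+ *     bdrv_dirty_iter_free(iter);
+ */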
+
+uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap,
+ uint64_t offset, uint64_t bytes);
+uint64_t bdrv_dirty_bitmap_serialization_align(const BdrvDirtyBitmap *bitmap);
+uint64_t bdrv_dirty_bitmap_serialization_coverage(int serialized_chunk_size,
+ const BdrvDirtyBitmap *bitmap);
+void bdrv_dirty_bitmap_serialize_part(const BdrvDirtyBitmap *bitmap,
+ uint8_t *buf, uint64_t offset,
+ uint64_t bytes);
+void bdrv_dirty_bitmap_deserialize_part(BdrvDirtyBitmap *bitmap,
+ uint8_t *buf, uint64_t offset,
+ uint64_t bytes, bool finish);
+void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap,
+ uint64_t offset, uint64_t bytes,
+ bool finish);
+void bdrv_dirty_bitmap_deserialize_ones(BdrvDirtyBitmap *bitmap,
+ uint64_t offset, uint64_t bytes,
+ bool finish);
+void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap);
+
+void bdrv_dirty_bitmap_set_readonly(BdrvDirtyBitmap *bitmap, bool value);
+void bdrv_dirty_bitmap_set_persistence(BdrvDirtyBitmap *bitmap,
+ bool persistent);
+void bdrv_dirty_bitmap_set_inconsistent(BdrvDirtyBitmap *bitmap);
+void bdrv_dirty_bitmap_set_busy(BdrvDirtyBitmap *bitmap, bool busy);
+void bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const BdrvDirtyBitmap *src,
+ HBitmap **backup, Error **errp);
+void bdrv_dirty_bitmap_skip_store(BdrvDirtyBitmap *bitmap, bool skip);
+bool bdrv_dirty_bitmap_get(BdrvDirtyBitmap *bitmap, int64_t offset);
+
+/* Functions that require manual locking. */
+void bdrv_dirty_bitmap_lock(BdrvDirtyBitmap *bitmap);
+void bdrv_dirty_bitmap_unlock(BdrvDirtyBitmap *bitmap);
+bool bdrv_dirty_bitmap_get_locked(BdrvDirtyBitmap *bitmap, int64_t offset);
+void bdrv_set_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap,
+ int64_t offset, int64_t bytes);
+void bdrv_reset_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap,
+ int64_t offset, int64_t bytes);
+int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter);
+void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t offset);
+int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap);
+void bdrv_dirty_bitmap_truncate(BlockDriverState *bs, int64_t bytes);
+bool bdrv_dirty_bitmap_readonly(const BdrvDirtyBitmap *bitmap);
+bool bdrv_has_readonly_bitmaps(BlockDriverState *bs);
+bool bdrv_has_named_bitmaps(BlockDriverState *bs);
+bool bdrv_dirty_bitmap_get_autoload(const BdrvDirtyBitmap *bitmap);
+bool bdrv_dirty_bitmap_get_persistence(BdrvDirtyBitmap *bitmap);
+bool bdrv_dirty_bitmap_inconsistent(const BdrvDirtyBitmap *bitmap);
+
+BdrvDirtyBitmap *bdrv_dirty_bitmap_first(BlockDriverState *bs);
+BdrvDirtyBitmap *bdrv_dirty_bitmap_next(BdrvDirtyBitmap *bitmap);
+#define FOR_EACH_DIRTY_BITMAP(bs, bitmap) \
+for (bitmap = bdrv_dirty_bitmap_first(bs); bitmap; \
+ bitmap = bdrv_dirty_bitmap_next(bitmap))
+
+char *bdrv_dirty_bitmap_sha256(const BdrvDirtyBitmap *bitmap, Error **errp);
+int64_t bdrv_dirty_bitmap_next_dirty(BdrvDirtyBitmap *bitmap, int64_t offset,
+ int64_t bytes);
+int64_t bdrv_dirty_bitmap_next_zero(BdrvDirtyBitmap *bitmap, int64_t offset,
+ int64_t bytes);
+bool bdrv_dirty_bitmap_next_dirty_area(BdrvDirtyBitmap *bitmap,
+ int64_t start, int64_t end, int64_t max_dirty_count,
+ int64_t *dirty_start, int64_t *dirty_count);
+BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap,
+ Error **errp);
+
+#endif
diff --git a/include/block/export.h b/include/block/export.h
new file mode 100644
index 000000000..7feb02e10
--- /dev/null
+++ b/include/block/export.h
@@ -0,0 +1,89 @@
+/*
+ * Declarations for block exports
+ *
+ * Copyright (c) 2012, 2020 Red Hat, Inc.
+ *
+ * Authors:
+ * Paolo Bonzini <pbonzini@redhat.com>
+ * Kevin Wolf <kwolf@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#ifndef BLOCK_EXPORT_H
+#define BLOCK_EXPORT_H
+
+#include "qapi/qapi-types-block-export.h"
+#include "qemu/queue.h"
+
+typedef struct BlockExport BlockExport;
+
+typedef struct BlockExportDriver {
+ /* The export type that this driver services */
+ BlockExportType type;
+
+ /*
+ * The size of the driver-specific state that contains BlockExport as its
+ * first field.
+ */
+ size_t instance_size;
+
+ /* Creates and starts a new block export */
+ int (*create)(BlockExport *, BlockExportOptions *, Error **);
+
+ /*
+ * Frees a removed block export. This function is only called after all
+ * references have been dropped.
+ */
+ void (*delete)(BlockExport *);
+
+ /*
+ * Start to disconnect all clients and drop other references held
+ * internally by the export driver. When the function returns, there may
+ * still be active references while the export is in the process of
+ * shutting down.
+ */
+ void (*request_shutdown)(BlockExport *);
+} BlockExportDriver;
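+/*
+ * Illustrative sketch, not part of the original header: the general shape
+ * of a driver definition (MyExport and the mydrv_* callbacks are
+ * hypothetical):
+ *
+ *     static const BlockExportDriver blk_exp_mydrv = {
+ *         .type             = BLOCK_EXPORT_TYPE_NBD,
+ *         .instance_size    = sizeof(MyExport),
+ *         .create           = mydrv_create,
+ *         .delete           = mydrv_delete,
+ *         .request_shutdown = mydrv_request_shutdown,
+ *     };
+ */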
+
+struct BlockExport {
+ const BlockExportDriver *drv;
+
+ /* Unique identifier for the export */
+ char *id;
+
+ /*
+ * Reference count for this block export. This includes strong references
+ * both from the owner (qemu-nbd or the monitor) and clients connected to
+ * the export.
+ */
+ int refcount;
+
+ /*
+ * True if one of the references in refcount belongs to the user. After the
+ * user has dropped their reference, they may not e.g. remove the same
+ * export a second time (which would decrease the refcount without having
+ * it incremented first).
+ */
+ bool user_owned;
+
+ /* The AioContext whose lock protects this BlockExport object. */
+ AioContext *ctx;
+
+ /* The block device to export */
+ BlockBackend *blk;
+
+ /* List entry for block_exports */
+ QLIST_ENTRY(BlockExport) next;
+};
+
+BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp);
+BlockExport *blk_exp_find(const char *id);
+void blk_exp_ref(BlockExport *exp);
+void blk_exp_unref(BlockExport *exp);
+void blk_exp_request_shutdown(BlockExport *exp);
+void blk_exp_close_all(void);
+void blk_exp_close_all_type(BlockExportType type);
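+/*
+ * Illustrative sketch, not part of the original header: code that must
+ * keep an export alive across a region where it might otherwise go away
+ * takes its own reference:
+ *
+ *     BlockExport *exp = blk_exp_find(id);
+ *     if (exp) {
+ *         blk_exp_ref(exp);
+ *         ... use exp ...
+ *         blk_exp_unref(exp);
+ *     }
+ */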
+
+#endif
diff --git a/include/block/fuse.h b/include/block/fuse.h
new file mode 100644
index 000000000..ffa91fe36
--- /dev/null
+++ b/include/block/fuse.h
@@ -0,0 +1,30 @@
+/*
+ * Present a block device as a raw image through FUSE
+ *
+ * Copyright (c) 2020 Max Reitz <mreitz@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 or later of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef BLOCK_FUSE_H
+#define BLOCK_FUSE_H
+
+#ifdef CONFIG_FUSE
+
+#include "block/export.h"
+
+extern const BlockExportDriver blk_exp_fuse;
+
+#endif /* CONFIG_FUSE */
+
+#endif
diff --git a/include/block/nbd.h b/include/block/nbd.h
new file mode 100644
index 000000000..78d101b77
--- /dev/null
+++ b/include/block/nbd.h
@@ -0,0 +1,427 @@
+/*
+ * Copyright (C) 2016-2020 Red Hat, Inc.
+ * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws>
+ *
+ * Network Block Device
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef NBD_H
+#define NBD_H
+
+#include "block/export.h"
+#include "io/channel-socket.h"
+#include "crypto/tlscreds.h"
+#include "qapi/error.h"
+
+extern const BlockExportDriver blk_exp_nbd;
+
+/* Handshake phase structs - this struct is passed on the wire */
+
+struct NBDOption {
+ uint64_t magic; /* NBD_OPTS_MAGIC */
+ uint32_t option; /* NBD_OPT_* */
+ uint32_t length;
+} QEMU_PACKED;
+typedef struct NBDOption NBDOption;
+
+struct NBDOptionReply {
+ uint64_t magic; /* NBD_REP_MAGIC */
+ uint32_t option; /* NBD_OPT_* */
+ uint32_t type; /* NBD_REP_* */
+ uint32_t length;
+} QEMU_PACKED;
+typedef struct NBDOptionReply NBDOptionReply;
+
+typedef struct NBDOptionReplyMetaContext {
+ NBDOptionReply h; /* h.type = NBD_REP_META_CONTEXT, h.length > 4 */
+ uint32_t context_id;
+ /* metadata context name follows */
+} QEMU_PACKED NBDOptionReplyMetaContext;
+
+/* Transmission phase structs
+ *
+ * Note: these are _NOT_ the same as the network representation of an NBD
+ * request and reply!
+ */
+struct NBDRequest {
+ uint64_t handle;
+ uint64_t from;
+ uint32_t len;
+ uint16_t flags; /* NBD_CMD_FLAG_* */
+ uint16_t type; /* NBD_CMD_* */
+};
+typedef struct NBDRequest NBDRequest;
+
+typedef struct NBDSimpleReply {
+ uint32_t magic; /* NBD_SIMPLE_REPLY_MAGIC */
+ uint32_t error;
+ uint64_t handle;
+} QEMU_PACKED NBDSimpleReply;
+
+/* Header of all structured replies */
+typedef struct NBDStructuredReplyChunk {
+ uint32_t magic; /* NBD_STRUCTURED_REPLY_MAGIC */
+ uint16_t flags; /* combination of NBD_REPLY_FLAG_* */
+ uint16_t type; /* NBD_REPLY_TYPE_* */
+ uint64_t handle; /* request handle */
+ uint32_t length; /* length of payload */
+} QEMU_PACKED NBDStructuredReplyChunk;
+
+typedef union NBDReply {
+ NBDSimpleReply simple;
+ NBDStructuredReplyChunk structured;
+ struct {
+ /* @magic and @handle fields have the same offset and size both in
+ * simple reply and structured reply chunk, so let them be accessible
+ * without ".simple." or ".structured." specification
+ */
+ uint32_t magic;
+ uint32_t _skip;
+ uint64_t handle;
+ } QEMU_PACKED;
+} NBDReply;
+
+/* Header of chunk for NBD_REPLY_TYPE_OFFSET_DATA */
+typedef struct NBDStructuredReadData {
+ NBDStructuredReplyChunk h; /* h.length >= 9 */
+ uint64_t offset;
+ /* At least one byte of data payload follows, calculated from h.length */
+} QEMU_PACKED NBDStructuredReadData;
+
+/* Complete chunk for NBD_REPLY_TYPE_OFFSET_HOLE */
+typedef struct NBDStructuredReadHole {
+ NBDStructuredReplyChunk h; /* h.length == 12 */
+ uint64_t offset;
+ uint32_t length;
+} QEMU_PACKED NBDStructuredReadHole;
+
+/* Header of all NBD_REPLY_TYPE_ERROR* errors */
+typedef struct NBDStructuredError {
+ NBDStructuredReplyChunk h; /* h.length >= 6 */
+ uint32_t error;
+ uint16_t message_length;
+} QEMU_PACKED NBDStructuredError;
+
+/* Header of NBD_REPLY_TYPE_BLOCK_STATUS */
+typedef struct NBDStructuredMeta {
+ NBDStructuredReplyChunk h; /* h.length >= 12 (at least one extent) */
+ uint32_t context_id;
+ /* extents follow */
+} QEMU_PACKED NBDStructuredMeta;
+
+/* Extent chunk for NBD_REPLY_TYPE_BLOCK_STATUS */
+typedef struct NBDExtent {
+ uint32_t length;
+ uint32_t flags; /* NBD_STATE_* */
+} QEMU_PACKED NBDExtent;
+
+/* Transmission (export) flags: sent from server to client during handshake,
+ but describe what will happen during transmission */
+enum {
+ NBD_FLAG_HAS_FLAGS_BIT = 0, /* Flags are there */
+ NBD_FLAG_READ_ONLY_BIT = 1, /* Device is read-only */
+ NBD_FLAG_SEND_FLUSH_BIT = 2, /* Send FLUSH */
+ NBD_FLAG_SEND_FUA_BIT = 3, /* Send FUA (Force Unit Access) */
+ NBD_FLAG_ROTATIONAL_BIT = 4, /* Use elevator algorithm -
+ rotational media */
+ NBD_FLAG_SEND_TRIM_BIT = 5, /* Send TRIM (discard) */
+ NBD_FLAG_SEND_WRITE_ZEROES_BIT = 6, /* Send WRITE_ZEROES */
+ NBD_FLAG_SEND_DF_BIT = 7, /* Send DF (Do not Fragment) */
+ NBD_FLAG_CAN_MULTI_CONN_BIT = 8, /* Multi-client cache consistent */
+ NBD_FLAG_SEND_RESIZE_BIT = 9, /* Send resize */
+ NBD_FLAG_SEND_CACHE_BIT = 10, /* Send CACHE (prefetch) */
+ NBD_FLAG_SEND_FAST_ZERO_BIT = 11, /* FAST_ZERO flag for WRITE_ZEROES */
+};
+
+#define NBD_FLAG_HAS_FLAGS (1 << NBD_FLAG_HAS_FLAGS_BIT)
+#define NBD_FLAG_READ_ONLY (1 << NBD_FLAG_READ_ONLY_BIT)
+#define NBD_FLAG_SEND_FLUSH (1 << NBD_FLAG_SEND_FLUSH_BIT)
+#define NBD_FLAG_SEND_FUA (1 << NBD_FLAG_SEND_FUA_BIT)
+#define NBD_FLAG_ROTATIONAL (1 << NBD_FLAG_ROTATIONAL_BIT)
+#define NBD_FLAG_SEND_TRIM (1 << NBD_FLAG_SEND_TRIM_BIT)
+#define NBD_FLAG_SEND_WRITE_ZEROES (1 << NBD_FLAG_SEND_WRITE_ZEROES_BIT)
+#define NBD_FLAG_SEND_DF (1 << NBD_FLAG_SEND_DF_BIT)
+#define NBD_FLAG_CAN_MULTI_CONN (1 << NBD_FLAG_CAN_MULTI_CONN_BIT)
+#define NBD_FLAG_SEND_RESIZE (1 << NBD_FLAG_SEND_RESIZE_BIT)
+#define NBD_FLAG_SEND_CACHE (1 << NBD_FLAG_SEND_CACHE_BIT)
+#define NBD_FLAG_SEND_FAST_ZERO (1 << NBD_FLAG_SEND_FAST_ZERO_BIT)
+
+/* New-style handshake (global) flags, sent from server to client, and
+ control what will happen during handshake phase. */
+#define NBD_FLAG_FIXED_NEWSTYLE (1 << 0) /* Fixed newstyle protocol. */
+#define NBD_FLAG_NO_ZEROES (1 << 1) /* End handshake without zeroes. */
+
+/* New-style client flags, sent from client to server to control what happens
+ during handshake phase. */
+#define NBD_FLAG_C_FIXED_NEWSTYLE (1 << 0) /* Fixed newstyle protocol. */
+#define NBD_FLAG_C_NO_ZEROES (1 << 1) /* End handshake without zeroes. */
+
+/* Option requests. */
+#define NBD_OPT_EXPORT_NAME (1)
+#define NBD_OPT_ABORT (2)
+#define NBD_OPT_LIST (3)
+/* #define NBD_OPT_PEEK_EXPORT (4) not in use */
+#define NBD_OPT_STARTTLS (5)
+#define NBD_OPT_INFO (6)
+#define NBD_OPT_GO (7)
+#define NBD_OPT_STRUCTURED_REPLY (8)
+#define NBD_OPT_LIST_META_CONTEXT (9)
+#define NBD_OPT_SET_META_CONTEXT (10)
+
+/* Option reply types. */
+#define NBD_REP_ERR(value) ((UINT32_C(1) << 31) | (value))
+
+#define NBD_REP_ACK (1) /* Data sending finished. */
+#define NBD_REP_SERVER (2) /* Export description. */
+#define NBD_REP_INFO (3) /* NBD_OPT_INFO/GO. */
+#define NBD_REP_META_CONTEXT (4) /* NBD_OPT_{LIST,SET}_META_CONTEXT */
+
+#define NBD_REP_ERR_UNSUP NBD_REP_ERR(1) /* Unknown option */
+#define NBD_REP_ERR_POLICY NBD_REP_ERR(2) /* Server denied */
+#define NBD_REP_ERR_INVALID NBD_REP_ERR(3) /* Invalid length */
+#define NBD_REP_ERR_PLATFORM NBD_REP_ERR(4) /* Not compiled in */
+#define NBD_REP_ERR_TLS_REQD NBD_REP_ERR(5) /* TLS required */
+#define NBD_REP_ERR_UNKNOWN NBD_REP_ERR(6) /* Export unknown */
+#define NBD_REP_ERR_SHUTDOWN NBD_REP_ERR(7) /* Server shutting down */
+#define NBD_REP_ERR_BLOCK_SIZE_REQD NBD_REP_ERR(8) /* Need INFO_BLOCK_SIZE */
+
+/* Info types, used during NBD_REP_INFO */
+#define NBD_INFO_EXPORT 0
+#define NBD_INFO_NAME 1
+#define NBD_INFO_DESCRIPTION 2
+#define NBD_INFO_BLOCK_SIZE 3
+
+/* Request flags, sent from client to server during transmission phase */
+#define NBD_CMD_FLAG_FUA (1 << 0) /* 'force unit access' during write */
+#define NBD_CMD_FLAG_NO_HOLE (1 << 1) /* don't punch hole on zero run */
+#define NBD_CMD_FLAG_DF (1 << 2) /* don't fragment structured read */
+#define NBD_CMD_FLAG_REQ_ONE (1 << 3) /* only one extent in BLOCK_STATUS
+ * reply chunk */
+#define NBD_CMD_FLAG_FAST_ZERO (1 << 4) /* fail if WRITE_ZEROES is not fast */
+
+/* Supported request types */
+enum {
+ NBD_CMD_READ = 0,
+ NBD_CMD_WRITE = 1,
+ NBD_CMD_DISC = 2,
+ NBD_CMD_FLUSH = 3,
+ NBD_CMD_TRIM = 4,
+ NBD_CMD_CACHE = 5,
+ NBD_CMD_WRITE_ZEROES = 6,
+ NBD_CMD_BLOCK_STATUS = 7,
+};
+
+#define NBD_DEFAULT_PORT 10809
+
+/* Maximum size of a single READ/WRITE data buffer */
+#define NBD_MAX_BUFFER_SIZE (32 * 1024 * 1024)
+
+/*
+ * Maximum size of a protocol string (export name, metadata context name,
+ * etc.). Use malloc rather than stack allocation for storage of a
+ * string.
+ */
+#define NBD_MAX_STRING_SIZE 4096
+
+/* Two types of reply structures */
+#define NBD_SIMPLE_REPLY_MAGIC 0x67446698
+#define NBD_STRUCTURED_REPLY_MAGIC 0x668e33ef
+
+/* Structured reply flags */
+#define NBD_REPLY_FLAG_DONE (1 << 0) /* This reply-chunk is last */
+
+/* Structured reply types */
+#define NBD_REPLY_ERR(value) ((1 << 15) | (value))
+
+#define NBD_REPLY_TYPE_NONE 0
+#define NBD_REPLY_TYPE_OFFSET_DATA 1
+#define NBD_REPLY_TYPE_OFFSET_HOLE 2
+#define NBD_REPLY_TYPE_BLOCK_STATUS 5
+#define NBD_REPLY_TYPE_ERROR NBD_REPLY_ERR(1)
+#define NBD_REPLY_TYPE_ERROR_OFFSET NBD_REPLY_ERR(2)
+
+/* Extent flags for base:allocation in NBD_REPLY_TYPE_BLOCK_STATUS */
+#define NBD_STATE_HOLE (1 << 0)
+#define NBD_STATE_ZERO (1 << 1)
+
+/* Extent flags for qemu:dirty-bitmap in NBD_REPLY_TYPE_BLOCK_STATUS */
+#define NBD_STATE_DIRTY (1 << 0)
+
+/* No flags needed for qemu:allocation-depth in NBD_REPLY_TYPE_BLOCK_STATUS */
+
+static inline bool nbd_reply_type_is_error(int type)
+{
+ return type & (1 << 15);
+}
+
+/* NBD errors are based on errno numbers, so there is a 1:1 mapping,
+ * but only a limited set of errno values is specified in the protocol.
+ * Everything else is squashed to EINVAL.
+ */
+#define NBD_SUCCESS 0
+#define NBD_EPERM 1
+#define NBD_EIO 5
+#define NBD_ENOMEM 12
+#define NBD_EINVAL 22
+#define NBD_ENOSPC 28
+#define NBD_EOVERFLOW 75
+#define NBD_ENOTSUP 95
+#define NBD_ESHUTDOWN 108
+
+/* Details collected by NBD_OPT_EXPORT_NAME and NBD_OPT_GO */
+struct NBDExportInfo {
+ /* Set by client before nbd_receive_negotiate() */
+ bool request_sizes;
+ char *x_dirty_bitmap;
+
+ /* Set by client before nbd_receive_negotiate(), or by server results
+ * during nbd_receive_export_list() */
+ char *name; /* must be non-NULL */
+
+ /* In-out fields, set by client before nbd_receive_negotiate() and
+ * updated by server results during nbd_receive_negotiate() */
+ bool structured_reply;
+ bool base_allocation; /* base:allocation context for NBD_CMD_BLOCK_STATUS */
+
+ /* Set by server results during nbd_receive_negotiate() and
+ * nbd_receive_export_list() */
+ uint64_t size;
+ uint16_t flags;
+ uint32_t min_block;
+ uint32_t opt_block;
+ uint32_t max_block;
+
+ uint32_t context_id;
+
+ /* Set by server results during nbd_receive_export_list() */
+ char *description;
+ int n_contexts;
+ char **contexts;
+};
+typedef struct NBDExportInfo NBDExportInfo;
+
+int nbd_receive_negotiate(AioContext *aio_context, QIOChannel *ioc,
+ QCryptoTLSCreds *tlscreds,
+ const char *hostname, QIOChannel **outioc,
+ NBDExportInfo *info, Error **errp);
+void nbd_free_export_list(NBDExportInfo *info, int count);
+int nbd_receive_export_list(QIOChannel *ioc, QCryptoTLSCreds *tlscreds,
+ const char *hostname, NBDExportInfo **info,
+ Error **errp);
+int nbd_init(int fd, QIOChannelSocket *sioc, NBDExportInfo *info,
+ Error **errp);
+int nbd_send_request(QIOChannel *ioc, NBDRequest *request);
+int coroutine_fn nbd_receive_reply(BlockDriverState *bs, QIOChannel *ioc,
+ NBDReply *reply, Error **errp);
+int nbd_client(int fd);
+int nbd_disconnect(int fd);
+int nbd_errno_to_system_errno(int err);
+
+typedef struct NBDExport NBDExport;
+typedef struct NBDClient NBDClient;
+
+void nbd_export_set_on_eject_blk(BlockExport *exp, BlockBackend *blk);
+
+AioContext *nbd_export_aio_context(NBDExport *exp);
+NBDExport *nbd_export_find(const char *name);
+
+void nbd_client_new(QIOChannelSocket *sioc,
+ QCryptoTLSCreds *tlscreds,
+ const char *tlsauthz,
+ void (*close_fn)(NBDClient *, bool));
+void nbd_client_get(NBDClient *client);
+void nbd_client_put(NBDClient *client);
+
+void nbd_server_is_qemu_nbd(bool value);
+bool nbd_server_is_running(void);
+void nbd_server_start(SocketAddress *addr, const char *tls_creds,
+ const char *tls_authz, uint32_t max_connections,
+ Error **errp);
+void nbd_server_start_options(NbdServerOptions *arg, Error **errp);
+
+/* nbd_read
+ * Reads @size bytes from @ioc. Returns 0 on success, a negative errno
+ * value on failure.
+ */
+static inline int nbd_read(QIOChannel *ioc, void *buffer, size_t size,
+ const char *desc, Error **errp)
+{
+ ERRP_GUARD();
+ int ret = qio_channel_read_all(ioc, buffer, size, errp) < 0 ? -EIO : 0;
+
+ if (ret < 0) {
+ if (desc) {
+ error_prepend(errp, "Failed to read %s: ", desc);
+ }
+ return ret;
+ }
+
+ return 0;
+}
+
+#define DEF_NBD_READ_N(bits) \
+static inline int nbd_read##bits(QIOChannel *ioc, \
+ uint##bits##_t *val, \
+ const char *desc, Error **errp) \
+{ \
+ int ret = nbd_read(ioc, val, sizeof(*val), desc, errp); \
+ if (ret < 0) { \
+ return ret; \
+ } \
+ *val = be##bits##_to_cpu(*val); \
+ return 0; \
+}
+
+DEF_NBD_READ_N(16) /* Defines nbd_read16(). */
+DEF_NBD_READ_N(32) /* Defines nbd_read32(). */
+DEF_NBD_READ_N(64) /* Defines nbd_read64(). */
+
+#undef DEF_NBD_READ_N
+
+static inline bool nbd_reply_is_simple(NBDReply *reply)
+{
+ return reply->magic == NBD_SIMPLE_REPLY_MAGIC;
+}
+
+static inline bool nbd_reply_is_structured(NBDReply *reply)
+{
+ return reply->magic == NBD_STRUCTURED_REPLY_MAGIC;
+}
+
+const char *nbd_reply_type_lookup(uint16_t type);
+const char *nbd_opt_lookup(uint32_t opt);
+const char *nbd_rep_lookup(uint32_t rep);
+const char *nbd_info_lookup(uint16_t info);
+const char *nbd_cmd_lookup(uint16_t cmd);
+const char *nbd_err_lookup(int err);
+
+/* nbd/client-connection.c */
+typedef struct NBDClientConnection NBDClientConnection;
+
+void nbd_client_connection_enable_retry(NBDClientConnection *conn);
+
+NBDClientConnection *nbd_client_connection_new(const SocketAddress *saddr,
+ bool do_negotiation,
+ const char *export_name,
+ const char *x_dirty_bitmap,
+ QCryptoTLSCreds *tlscreds);
+void nbd_client_connection_release(NBDClientConnection *conn);
+
+QIOChannel *coroutine_fn
+nbd_co_establish_connection(NBDClientConnection *conn, NBDExportInfo *info,
+ bool blocking, Error **errp);
+
+void coroutine_fn nbd_co_establish_connection_cancel(NBDClientConnection *conn);
+
+#endif
diff --git a/include/block/nvme.h b/include/block/nvme.h
new file mode 100644
index 000000000..e3bd47bf7
--- /dev/null
+++ b/include/block/nvme.h
@@ -0,0 +1,1501 @@
+#ifndef BLOCK_NVME_H
+#define BLOCK_NVME_H
+
+typedef struct QEMU_PACKED NvmeBar {
+ uint64_t cap;
+ uint32_t vs;
+ uint32_t intms;
+ uint32_t intmc;
+ uint32_t cc;
+ uint8_t rsvd24[4];
+ uint32_t csts;
+ uint32_t nssr;
+ uint32_t aqa;
+ uint64_t asq;
+ uint64_t acq;
+ uint32_t cmbloc;
+ uint32_t cmbsz;
+ uint32_t bpinfo;
+ uint32_t bprsel;
+ uint64_t bpmbl;
+ uint64_t cmbmsc;
+ uint32_t cmbsts;
+ uint8_t rsvd92[3492];
+ uint32_t pmrcap;
+ uint32_t pmrctl;
+ uint32_t pmrsts;
+ uint32_t pmrebs;
+ uint32_t pmrswtp;
+ uint32_t pmrmscl;
+ uint32_t pmrmscu;
+ uint8_t css[484];
+} NvmeBar;
+
+enum NvmeBarRegs {
+ NVME_REG_CAP = offsetof(NvmeBar, cap),
+ NVME_REG_VS = offsetof(NvmeBar, vs),
+ NVME_REG_INTMS = offsetof(NvmeBar, intms),
+ NVME_REG_INTMC = offsetof(NvmeBar, intmc),
+ NVME_REG_CC = offsetof(NvmeBar, cc),
+ NVME_REG_CSTS = offsetof(NvmeBar, csts),
+ NVME_REG_NSSR = offsetof(NvmeBar, nssr),
+ NVME_REG_AQA = offsetof(NvmeBar, aqa),
+ NVME_REG_ASQ = offsetof(NvmeBar, asq),
+ NVME_REG_ACQ = offsetof(NvmeBar, acq),
+ NVME_REG_CMBLOC = offsetof(NvmeBar, cmbloc),
+ NVME_REG_CMBSZ = offsetof(NvmeBar, cmbsz),
+ NVME_REG_BPINFO = offsetof(NvmeBar, bpinfo),
+ NVME_REG_BPRSEL = offsetof(NvmeBar, bprsel),
+ NVME_REG_BPMBL = offsetof(NvmeBar, bpmbl),
+ NVME_REG_CMBMSC = offsetof(NvmeBar, cmbmsc),
+ NVME_REG_CMBSTS = offsetof(NvmeBar, cmbsts),
+ NVME_REG_PMRCAP = offsetof(NvmeBar, pmrcap),
+ NVME_REG_PMRCTL = offsetof(NvmeBar, pmrctl),
+ NVME_REG_PMRSTS = offsetof(NvmeBar, pmrsts),
+ NVME_REG_PMREBS = offsetof(NvmeBar, pmrebs),
+ NVME_REG_PMRSWTP = offsetof(NvmeBar, pmrswtp),
+ NVME_REG_PMRMSCL = offsetof(NvmeBar, pmrmscl),
+ NVME_REG_PMRMSCU = offsetof(NvmeBar, pmrmscu),
+};
+
+enum NvmeCapShift {
+ CAP_MQES_SHIFT = 0,
+ CAP_CQR_SHIFT = 16,
+ CAP_AMS_SHIFT = 17,
+ CAP_TO_SHIFT = 24,
+ CAP_DSTRD_SHIFT = 32,
+ CAP_NSSRS_SHIFT = 36,
+ CAP_CSS_SHIFT = 37,
+ CAP_MPSMIN_SHIFT = 48,
+ CAP_MPSMAX_SHIFT = 52,
+ CAP_PMRS_SHIFT = 56,
+ CAP_CMBS_SHIFT = 57,
+};
+
+enum NvmeCapMask {
+ CAP_MQES_MASK = 0xffff,
+ CAP_CQR_MASK = 0x1,
+ CAP_AMS_MASK = 0x3,
+ CAP_TO_MASK = 0xff,
+ CAP_DSTRD_MASK = 0xf,
+ CAP_NSSRS_MASK = 0x1,
+ CAP_CSS_MASK = 0xff,
+ CAP_MPSMIN_MASK = 0xf,
+ CAP_MPSMAX_MASK = 0xf,
+ CAP_PMRS_MASK = 0x1,
+ CAP_CMBS_MASK = 0x1,
+};
+
+#define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK)
+#define NVME_CAP_CQR(cap) (((cap) >> CAP_CQR_SHIFT) & CAP_CQR_MASK)
+#define NVME_CAP_AMS(cap) (((cap) >> CAP_AMS_SHIFT) & CAP_AMS_MASK)
+#define NVME_CAP_TO(cap) (((cap) >> CAP_TO_SHIFT) & CAP_TO_MASK)
+#define NVME_CAP_DSTRD(cap) (((cap) >> CAP_DSTRD_SHIFT) & CAP_DSTRD_MASK)
+#define NVME_CAP_NSSRS(cap) (((cap) >> CAP_NSSRS_SHIFT) & CAP_NSSRS_MASK)
+#define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT) & CAP_CSS_MASK)
+#define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK)
+#define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK)
+#define NVME_CAP_PMRS(cap) (((cap) >> CAP_PMRS_SHIFT) & CAP_PMRS_MASK)
+#define NVME_CAP_CMBS(cap) (((cap) >> CAP_CMBS_SHIFT) & CAP_CMBS_MASK)
+
+#define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \
+ << CAP_MQES_SHIFT)
+#define NVME_CAP_SET_CQR(cap, val) (cap |= (uint64_t)(val & CAP_CQR_MASK) \
+ << CAP_CQR_SHIFT)
+#define NVME_CAP_SET_AMS(cap, val) (cap |= (uint64_t)(val & CAP_AMS_MASK) \
+ << CAP_AMS_SHIFT)
+#define NVME_CAP_SET_TO(cap, val) (cap |= (uint64_t)(val & CAP_TO_MASK) \
+ << CAP_TO_SHIFT)
+#define NVME_CAP_SET_DSTRD(cap, val) (cap |= (uint64_t)(val & CAP_DSTRD_MASK) \
+ << CAP_DSTRD_SHIFT)
+#define NVME_CAP_SET_NSSRS(cap, val) (cap |= (uint64_t)(val & CAP_NSSRS_MASK) \
+ << CAP_NSSRS_SHIFT)
+#define NVME_CAP_SET_CSS(cap, val) (cap |= (uint64_t)(val & CAP_CSS_MASK) \
+ << CAP_CSS_SHIFT)
+#define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\
+ << CAP_MPSMIN_SHIFT)
+#define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\
+ << CAP_MPSMAX_SHIFT)
+#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMRS_MASK) \
+ << CAP_PMRS_SHIFT)
+#define NVME_CAP_SET_CMBS(cap, val) (cap |= (uint64_t)(val & CAP_CMBS_MASK) \
+ << CAP_CMBS_SHIFT)
+
+enum NvmeCapCss {
+ NVME_CAP_CSS_NVM = 1 << 0,
+ NVME_CAP_CSS_CSI_SUPP = 1 << 6,
+ NVME_CAP_CSS_ADMIN_ONLY = 1 << 7,
+};
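+
+/*
+ * Example (illustrative): the CAP accessor macros above extract a
+ * masked field, while the SET macros OR a masked value into place, so
+ * a register image is normally built up from zero:
+ *
+ *     uint64_t cap = 0;
+ *     NVME_CAP_SET_MQES(cap, 0x7ff);           // 2048 entries, 0's based
+ *     NVME_CAP_SET_CSS(cap, NVME_CAP_CSS_NVM);
+ *     assert(NVME_CAP_MQES(cap) == 0x7ff);
+ */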
+
+enum NvmeCcShift {
+ CC_EN_SHIFT = 0,
+ CC_CSS_SHIFT = 4,
+ CC_MPS_SHIFT = 7,
+ CC_AMS_SHIFT = 11,
+ CC_SHN_SHIFT = 14,
+ CC_IOSQES_SHIFT = 16,
+ CC_IOCQES_SHIFT = 20,
+};
+
+enum NvmeCcMask {
+ CC_EN_MASK = 0x1,
+ CC_CSS_MASK = 0x7,
+ CC_MPS_MASK = 0xf,
+ CC_AMS_MASK = 0x7,
+ CC_SHN_MASK = 0x3,
+ CC_IOSQES_MASK = 0xf,
+ CC_IOCQES_MASK = 0xf,
+};
+
+#define NVME_CC_EN(cc) ((cc >> CC_EN_SHIFT) & CC_EN_MASK)
+#define NVME_CC_CSS(cc) ((cc >> CC_CSS_SHIFT) & CC_CSS_MASK)
+#define NVME_CC_MPS(cc) ((cc >> CC_MPS_SHIFT) & CC_MPS_MASK)
+#define NVME_CC_AMS(cc) ((cc >> CC_AMS_SHIFT) & CC_AMS_MASK)
+#define NVME_CC_SHN(cc) ((cc >> CC_SHN_SHIFT) & CC_SHN_MASK)
+#define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK)
+#define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK)
+
+enum NvmeCcCss {
+ NVME_CC_CSS_NVM = 0x0,
+ NVME_CC_CSS_CSI = 0x6,
+ NVME_CC_CSS_ADMIN_ONLY = 0x7,
+};
+
+#define NVME_SET_CC_EN(cc, val) \
+ (cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT)
+#define NVME_SET_CC_CSS(cc, val) \
+ (cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT)
+#define NVME_SET_CC_MPS(cc, val) \
+ (cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT)
+#define NVME_SET_CC_AMS(cc, val) \
+ (cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT)
+#define NVME_SET_CC_SHN(cc, val) \
+ (cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT)
+#define NVME_SET_CC_IOSQES(cc, val) \
+ (cc |= (uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT)
+#define NVME_SET_CC_IOCQES(cc, val) \
+ (cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT)
+
+enum NvmeCstsShift {
+ CSTS_RDY_SHIFT = 0,
+ CSTS_CFS_SHIFT = 1,
+ CSTS_SHST_SHIFT = 2,
+ CSTS_NSSRO_SHIFT = 4,
+};
+
+enum NvmeCstsMask {
+ CSTS_RDY_MASK = 0x1,
+ CSTS_CFS_MASK = 0x1,
+ CSTS_SHST_MASK = 0x3,
+ CSTS_NSSRO_MASK = 0x1,
+};
+
+enum NvmeCsts {
+ NVME_CSTS_READY = 1 << CSTS_RDY_SHIFT,
+ NVME_CSTS_FAILED = 1 << CSTS_CFS_SHIFT,
+ NVME_CSTS_SHST_NORMAL = 0 << CSTS_SHST_SHIFT,
+ NVME_CSTS_SHST_PROGRESS = 1 << CSTS_SHST_SHIFT,
+ NVME_CSTS_SHST_COMPLETE = 2 << CSTS_SHST_SHIFT,
+ NVME_CSTS_NSSRO = 1 << CSTS_NSSRO_SHIFT,
+};
+
+#define NVME_CSTS_RDY(csts) ((csts >> CSTS_RDY_SHIFT) & CSTS_RDY_MASK)
+#define NVME_CSTS_CFS(csts) ((csts >> CSTS_CFS_SHIFT) & CSTS_CFS_MASK)
+#define NVME_CSTS_SHST(csts) ((csts >> CSTS_SHST_SHIFT) & CSTS_SHST_MASK)
+#define NVME_CSTS_NSSRO(csts) ((csts >> CSTS_NSSRO_SHIFT) & CSTS_NSSRO_MASK)
+
+enum NvmeAqaShift {
+ AQA_ASQS_SHIFT = 0,
+ AQA_ACQS_SHIFT = 16,
+};
+
+enum NvmeAqaMask {
+ AQA_ASQS_MASK = 0xfff,
+ AQA_ACQS_MASK = 0xfff,
+};
+
+#define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK)
+#define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK)
+
+enum NvmeCmblocShift {
+ CMBLOC_BIR_SHIFT = 0,
+ CMBLOC_CQMMS_SHIFT = 3,
+ CMBLOC_CQPDS_SHIFT = 4,
+ CMBLOC_CDPMLS_SHIFT = 5,
+ CMBLOC_CDPCILS_SHIFT = 6,
+ CMBLOC_CDMMMS_SHIFT = 7,
+ CMBLOC_CQDA_SHIFT = 8,
+ CMBLOC_OFST_SHIFT = 12,
+};
+
+enum NvmeCmblocMask {
+ CMBLOC_BIR_MASK = 0x7,
+ CMBLOC_CQMMS_MASK = 0x1,
+ CMBLOC_CQPDS_MASK = 0x1,
+ CMBLOC_CDPMLS_MASK = 0x1,
+ CMBLOC_CDPCILS_MASK = 0x1,
+ CMBLOC_CDMMMS_MASK = 0x1,
+ CMBLOC_CQDA_MASK = 0x1,
+ CMBLOC_OFST_MASK = 0xfffff,
+};
+
+#define NVME_CMBLOC_BIR(cmbloc) \
+ ((cmbloc >> CMBLOC_BIR_SHIFT) & CMBLOC_BIR_MASK)
+#define NVME_CMBLOC_CQMMS(cmbloc) \
+ ((cmbloc >> CMBLOC_CQMMS_SHIFT) & CMBLOC_CQMMS_MASK)
+#define NVME_CMBLOC_CQPDS(cmbloc) \
+ ((cmbloc >> CMBLOC_CQPDS_SHIFT) & CMBLOC_CQPDS_MASK)
+#define NVME_CMBLOC_CDPMLS(cmbloc) \
+ ((cmbloc >> CMBLOC_CDPMLS_SHIFT) & CMBLOC_CDPMLS_MASK)
+#define NVME_CMBLOC_CDPCILS(cmbloc) \
+ ((cmbloc >> CMBLOC_CDPCILS_SHIFT) & CMBLOC_CDPCILS_MASK)
+#define NVME_CMBLOC_CDMMMS(cmbloc) \
+ ((cmbloc >> CMBLOC_CDMMMS_SHIFT) & CMBLOC_CDMMMS_MASK)
+#define NVME_CMBLOC_CQDA(cmbloc) \
+ ((cmbloc >> CMBLOC_CQDA_SHIFT) & CMBLOC_CQDA_MASK)
+#define NVME_CMBLOC_OFST(cmbloc) \
+ ((cmbloc >> CMBLOC_OFST_SHIFT) & CMBLOC_OFST_MASK)
+
+#define NVME_CMBLOC_SET_BIR(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_BIR_MASK) << CMBLOC_BIR_SHIFT)
+#define NVME_CMBLOC_SET_CQMMS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CQMMS_MASK) << CMBLOC_CQMMS_SHIFT)
+#define NVME_CMBLOC_SET_CQPDS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CQPDS_MASK) << CMBLOC_CQPDS_SHIFT)
+#define NVME_CMBLOC_SET_CDPMLS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CDPMLS_MASK) << CMBLOC_CDPMLS_SHIFT)
+#define NVME_CMBLOC_SET_CDPCILS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CDPCILS_MASK) << CMBLOC_CDPCILS_SHIFT)
+#define NVME_CMBLOC_SET_CDMMMS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CDMMMS_MASK) << CMBLOC_CDMMMS_SHIFT)
+#define NVME_CMBLOC_SET_CQDA(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CQDA_MASK) << CMBLOC_CQDA_SHIFT)
+#define NVME_CMBLOC_SET_OFST(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBLOC_OFST_SHIFT)
+
+enum NvmeCmbszShift {
+ CMBSZ_SQS_SHIFT = 0,
+ CMBSZ_CQS_SHIFT = 1,
+ CMBSZ_LISTS_SHIFT = 2,
+ CMBSZ_RDS_SHIFT = 3,
+ CMBSZ_WDS_SHIFT = 4,
+ CMBSZ_SZU_SHIFT = 8,
+ CMBSZ_SZ_SHIFT = 12,
+};
+
+enum NvmeCmbszMask {
+ CMBSZ_SQS_MASK = 0x1,
+ CMBSZ_CQS_MASK = 0x1,
+ CMBSZ_LISTS_MASK = 0x1,
+ CMBSZ_RDS_MASK = 0x1,
+ CMBSZ_WDS_MASK = 0x1,
+ CMBSZ_SZU_MASK = 0xf,
+ CMBSZ_SZ_MASK = 0xfffff,
+};
+
+#define NVME_CMBSZ_SQS(cmbsz) ((cmbsz >> CMBSZ_SQS_SHIFT) & CMBSZ_SQS_MASK)
+#define NVME_CMBSZ_CQS(cmbsz) ((cmbsz >> CMBSZ_CQS_SHIFT) & CMBSZ_CQS_MASK)
+#define NVME_CMBSZ_LISTS(cmbsz)((cmbsz >> CMBSZ_LISTS_SHIFT) & CMBSZ_LISTS_MASK)
+#define NVME_CMBSZ_RDS(cmbsz) ((cmbsz >> CMBSZ_RDS_SHIFT) & CMBSZ_RDS_MASK)
+#define NVME_CMBSZ_WDS(cmbsz) ((cmbsz >> CMBSZ_WDS_SHIFT) & CMBSZ_WDS_MASK)
+#define NVME_CMBSZ_SZU(cmbsz) ((cmbsz >> CMBSZ_SZU_SHIFT) & CMBSZ_SZU_MASK)
+#define NVME_CMBSZ_SZ(cmbsz) ((cmbsz >> CMBSZ_SZ_SHIFT) & CMBSZ_SZ_MASK)
+
+#define NVME_CMBSZ_SET_SQS(cmbsz, val) \
+ (cmbsz |= (uint64_t)(val & CMBSZ_SQS_MASK) << CMBSZ_SQS_SHIFT)
+#define NVME_CMBSZ_SET_CQS(cmbsz, val) \
+ (cmbsz |= (uint64_t)(val & CMBSZ_CQS_MASK) << CMBSZ_CQS_SHIFT)
+#define NVME_CMBSZ_SET_LISTS(cmbsz, val) \
+ (cmbsz |= (uint64_t)(val & CMBSZ_LISTS_MASK) << CMBSZ_LISTS_SHIFT)
+#define NVME_CMBSZ_SET_RDS(cmbsz, val) \
+ (cmbsz |= (uint64_t)(val & CMBSZ_RDS_MASK) << CMBSZ_RDS_SHIFT)
+#define NVME_CMBSZ_SET_WDS(cmbsz, val) \
+ (cmbsz |= (uint64_t)(val & CMBSZ_WDS_MASK) << CMBSZ_WDS_SHIFT)
+#define NVME_CMBSZ_SET_SZU(cmbsz, val) \
+ (cmbsz |= (uint64_t)(val & CMBSZ_SZU_MASK) << CMBSZ_SZU_SHIFT)
+#define NVME_CMBSZ_SET_SZ(cmbsz, val) \
+ (cmbsz |= (uint64_t)(val & CMBSZ_SZ_MASK) << CMBSZ_SZ_SHIFT)
+
+#define NVME_CMBSZ_GETSIZE(cmbsz) \
+ (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz))))
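+
+/*
+ * Worked example (illustrative): SZU selects the size unit,
+ * 1 << (12 + 4 * SZU) bytes, i.e. SZU = 0 means 4 KiB units and
+ * SZU = 2 means 1 MiB units; SZ = 16 with SZU = 2 therefore describes
+ * a 16 MiB controller memory buffer:
+ *
+ *     uint32_t cmbsz = 0;
+ *     NVME_CMBSZ_SET_SZU(cmbsz, 2);
+ *     NVME_CMBSZ_SET_SZ(cmbsz, 16);
+ *     assert(NVME_CMBSZ_GETSIZE(cmbsz) == 16 * 1024 * 1024);
+ */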
+
+enum NvmeCmbmscShift {
+ CMBMSC_CRE_SHIFT = 0,
+ CMBMSC_CMSE_SHIFT = 1,
+ CMBMSC_CBA_SHIFT = 12,
+};
+
+enum NvmeCmbmscMask {
+ CMBMSC_CRE_MASK = 0x1,
+ CMBMSC_CMSE_MASK = 0x1,
+ CMBMSC_CBA_MASK = ((1ULL << 52) - 1),
+};
+
+#define NVME_CMBMSC_CRE(cmbmsc) \
+ ((cmbmsc >> CMBMSC_CRE_SHIFT) & CMBMSC_CRE_MASK)
+#define NVME_CMBMSC_CMSE(cmbmsc) \
+ ((cmbmsc >> CMBMSC_CMSE_SHIFT) & CMBMSC_CMSE_MASK)
+#define NVME_CMBMSC_CBA(cmbmsc) \
+ ((cmbmsc >> CMBMSC_CBA_SHIFT) & CMBMSC_CBA_MASK)
+
+#define NVME_CMBMSC_SET_CRE(cmbmsc, val) \
+ (cmbmsc |= (uint64_t)(val & CMBMSC_CRE_MASK) << CMBMSC_CRE_SHIFT)
+#define NVME_CMBMSC_SET_CMSE(cmbmsc, val) \
+ (cmbmsc |= (uint64_t)(val & CMBMSC_CMSE_MASK) << CMBMSC_CMSE_SHIFT)
+#define NVME_CMBMSC_SET_CBA(cmbmsc, val) \
+ (cmbmsc |= (uint64_t)(val & CMBMSC_CBA_MASK) << CMBMSC_CBA_SHIFT)
+
+enum NvmeCmbstsShift {
+ CMBSTS_CBAI_SHIFT = 0,
+};
+enum NvmeCmbstsMask {
+ CMBSTS_CBAI_MASK = 0x1,
+};
+
+#define NVME_CMBSTS_CBAI(cmbsts) \
+ ((cmbsts >> CMBSTS_CBAI_SHIFT) & CMBSTS_CBAI_MASK)
+
+#define NVME_CMBSTS_SET_CBAI(cmbsts, val) \
+ (cmbsts |= (uint64_t)(val & CMBSTS_CBAI_MASK) << CMBSTS_CBAI_SHIFT)
+
+enum NvmePmrcapShift {
+ PMRCAP_RDS_SHIFT = 3,
+ PMRCAP_WDS_SHIFT = 4,
+ PMRCAP_BIR_SHIFT = 5,
+ PMRCAP_PMRTU_SHIFT = 8,
+ PMRCAP_PMRWBM_SHIFT = 10,
+ PMRCAP_PMRTO_SHIFT = 16,
+ PMRCAP_CMSS_SHIFT = 24,
+};
+
+enum NvmePmrcapMask {
+ PMRCAP_RDS_MASK = 0x1,
+ PMRCAP_WDS_MASK = 0x1,
+ PMRCAP_BIR_MASK = 0x7,
+ PMRCAP_PMRTU_MASK = 0x3,
+ PMRCAP_PMRWBM_MASK = 0xf,
+ PMRCAP_PMRTO_MASK = 0xff,
+ PMRCAP_CMSS_MASK = 0x1,
+};
+
+#define NVME_PMRCAP_RDS(pmrcap) \
+ ((pmrcap >> PMRCAP_RDS_SHIFT) & PMRCAP_RDS_MASK)
+#define NVME_PMRCAP_WDS(pmrcap) \
+ ((pmrcap >> PMRCAP_WDS_SHIFT) & PMRCAP_WDS_MASK)
+#define NVME_PMRCAP_BIR(pmrcap) \
+ ((pmrcap >> PMRCAP_BIR_SHIFT) & PMRCAP_BIR_MASK)
+#define NVME_PMRCAP_PMRTU(pmrcap) \
+ ((pmrcap >> PMRCAP_PMRTU_SHIFT) & PMRCAP_PMRTU_MASK)
+#define NVME_PMRCAP_PMRWBM(pmrcap) \
+ ((pmrcap >> PMRCAP_PMRWBM_SHIFT) & PMRCAP_PMRWBM_MASK)
+#define NVME_PMRCAP_PMRTO(pmrcap) \
+ ((pmrcap >> PMRCAP_PMRTO_SHIFT) & PMRCAP_PMRTO_MASK)
+#define NVME_PMRCAP_CMSS(pmrcap) \
+ ((pmrcap >> PMRCAP_CMSS_SHIFT) & PMRCAP_CMSS_MASK)
+
+#define NVME_PMRCAP_SET_RDS(pmrcap, val) \
+ (pmrcap |= (uint64_t)(val & PMRCAP_RDS_MASK) << PMRCAP_RDS_SHIFT)
+#define NVME_PMRCAP_SET_WDS(pmrcap, val) \
+ (pmrcap |= (uint64_t)(val & PMRCAP_WDS_MASK) << PMRCAP_WDS_SHIFT)
+#define NVME_PMRCAP_SET_BIR(pmrcap, val) \
+ (pmrcap |= (uint64_t)(val & PMRCAP_BIR_MASK) << PMRCAP_BIR_SHIFT)
+#define NVME_PMRCAP_SET_PMRTU(pmrcap, val) \
+ (pmrcap |= (uint64_t)(val & PMRCAP_PMRTU_MASK) << PMRCAP_PMRTU_SHIFT)
+#define NVME_PMRCAP_SET_PMRWBM(pmrcap, val) \
+ (pmrcap |= (uint64_t)(val & PMRCAP_PMRWBM_MASK) << PMRCAP_PMRWBM_SHIFT)
+#define NVME_PMRCAP_SET_PMRTO(pmrcap, val) \
+ (pmrcap |= (uint64_t)(val & PMRCAP_PMRTO_MASK) << PMRCAP_PMRTO_SHIFT)
+#define NVME_PMRCAP_SET_CMSS(pmrcap, val) \
+ (pmrcap |= (uint64_t)(val & PMRCAP_CMSS_MASK) << PMRCAP_CMSS_SHIFT)
+
+enum NvmePmrctlShift {
+ PMRCTL_EN_SHIFT = 0,
+};
+
+enum NvmePmrctlMask {
+ PMRCTL_EN_MASK = 0x1,
+};
+
+#define NVME_PMRCTL_EN(pmrctl) ((pmrctl >> PMRCTL_EN_SHIFT) & PMRCTL_EN_MASK)
+
+#define NVME_PMRCTL_SET_EN(pmrctl, val) \
+ (pmrctl |= (uint64_t)(val & PMRCTL_EN_MASK) << PMRCTL_EN_SHIFT)
+
+enum NvmePmrstsShift {
+ PMRSTS_ERR_SHIFT = 0,
+ PMRSTS_NRDY_SHIFT = 8,
+ PMRSTS_HSTS_SHIFT = 9,
+ PMRSTS_CBAI_SHIFT = 12,
+};
+
+enum NvmePmrstsMask {
+ PMRSTS_ERR_MASK = 0xff,
+ PMRSTS_NRDY_MASK = 0x1,
+ PMRSTS_HSTS_MASK = 0x7,
+ PMRSTS_CBAI_MASK = 0x1,
+};
+
+#define NVME_PMRSTS_ERR(pmrsts) \
+ ((pmrsts >> PMRSTS_ERR_SHIFT) & PMRSTS_ERR_MASK)
+#define NVME_PMRSTS_NRDY(pmrsts) \
+ ((pmrsts >> PMRSTS_NRDY_SHIFT) & PMRSTS_NRDY_MASK)
+#define NVME_PMRSTS_HSTS(pmrsts) \
+ ((pmrsts >> PMRSTS_HSTS_SHIFT) & PMRSTS_HSTS_MASK)
+#define NVME_PMRSTS_CBAI(pmrsts) \
+ ((pmrsts >> PMRSTS_CBAI_SHIFT) & PMRSTS_CBAI_MASK)
+
+#define NVME_PMRSTS_SET_ERR(pmrsts, val) \
+ (pmrsts |= (uint64_t)(val & PMRSTS_ERR_MASK) << PMRSTS_ERR_SHIFT)
+#define NVME_PMRSTS_SET_NRDY(pmrsts, val) \
+ (pmrsts |= (uint64_t)(val & PMRSTS_NRDY_MASK) << PMRSTS_NRDY_SHIFT)
+#define NVME_PMRSTS_SET_HSTS(pmrsts, val) \
+ (pmrsts |= (uint64_t)(val & PMRSTS_HSTS_MASK) << PMRSTS_HSTS_SHIFT)
+#define NVME_PMRSTS_SET_CBAI(pmrsts, val) \
+ (pmrsts |= (uint64_t)(val & PMRSTS_CBAI_MASK) << PMRSTS_CBAI_SHIFT)
+
+enum NvmePmrebsShift {
+ PMREBS_PMRSZU_SHIFT = 0,
+ PMREBS_RBB_SHIFT = 4,
+ PMREBS_PMRWBZ_SHIFT = 8,
+};
+
+enum NvmePmrebsMask {
+ PMREBS_PMRSZU_MASK = 0xf,
+ PMREBS_RBB_MASK = 0x1,
+ PMREBS_PMRWBZ_MASK = 0xffffff,
+};
+
+#define NVME_PMREBS_PMRSZU(pmrebs) \
+ ((pmrebs >> PMREBS_PMRSZU_SHIFT) & PMREBS_PMRSZU_MASK)
+#define NVME_PMREBS_RBB(pmrebs) \
+ ((pmrebs >> PMREBS_RBB_SHIFT) & PMREBS_RBB_MASK)
+#define NVME_PMREBS_PMRWBZ(pmrebs) \
+ ((pmrebs >> PMREBS_PMRWBZ_SHIFT) & PMREBS_PMRWBZ_MASK)
+
+#define NVME_PMREBS_SET_PMRSZU(pmrebs, val) \
+ (pmrebs |= (uint64_t)(val & PMREBS_PMRSZU_MASK) << PMREBS_PMRSZU_SHIFT)
+#define NVME_PMREBS_SET_RBB(pmrebs, val) \
+ (pmrebs |= (uint64_t)(val & PMREBS_RBB_MASK) << PMREBS_RBB_SHIFT)
+#define NVME_PMREBS_SET_PMRWBZ(pmrebs, val) \
+ (pmrebs |= (uint64_t)(val & PMREBS_PMRWBZ_MASK) << PMREBS_PMRWBZ_SHIFT)
+
+enum NvmePmrswtpShift {
+ PMRSWTP_PMRSWTU_SHIFT = 0,
+ PMRSWTP_PMRSWTV_SHIFT = 8,
+};
+
+enum NvmePmrswtpMask {
+ PMRSWTP_PMRSWTU_MASK = 0xf,
+ PMRSWTP_PMRSWTV_MASK = 0xffffff,
+};
+
+#define NVME_PMRSWTP_PMRSWTU(pmrswtp) \
+ ((pmrswtp >> PMRSWTP_PMRSWTU_SHIFT) & PMRSWTP_PMRSWTU_MASK)
+#define NVME_PMRSWTP_PMRSWTV(pmrswtp) \
+ ((pmrswtp >> PMRSWTP_PMRSWTV_SHIFT) & PMRSWTP_PMRSWTV_MASK)
+
+#define NVME_PMRSWTP_SET_PMRSWTU(pmrswtp, val) \
+ (pmrswtp |= (uint64_t)(val & PMRSWTP_PMRSWTU_MASK) << PMRSWTP_PMRSWTU_SHIFT)
+#define NVME_PMRSWTP_SET_PMRSWTV(pmrswtp, val) \
+ (pmrswtp |= (uint64_t)(val & PMRSWTP_PMRSWTV_MASK) << PMRSWTP_PMRSWTV_SHIFT)
+
+enum NvmePmrmsclShift {
+ PMRMSCL_CMSE_SHIFT = 1,
+ PMRMSCL_CBA_SHIFT = 12,
+};
+
+enum NvmePmrmsclMask {
+ PMRMSCL_CMSE_MASK = 0x1,
+ PMRMSCL_CBA_MASK = 0xfffff,
+};
+
+#define NVME_PMRMSCL_CMSE(pmrmscl) \
+ ((pmrmscl >> PMRMSCL_CMSE_SHIFT) & PMRMSCL_CMSE_MASK)
+#define NVME_PMRMSCL_CBA(pmrmscl) \
+ ((pmrmscl >> PMRMSCL_CBA_SHIFT) & PMRMSCL_CBA_MASK)
+
+#define NVME_PMRMSCL_SET_CMSE(pmrmscl, val) \
+ (pmrmscl |= (uint32_t)(val & PMRMSCL_CMSE_MASK) << PMRMSCL_CMSE_SHIFT)
+#define NVME_PMRMSCL_SET_CBA(pmrmscl, val) \
+ (pmrmscl |= (uint32_t)(val & PMRMSCL_CBA_MASK) << PMRMSCL_CBA_SHIFT)
+
+enum NvmeSglDescriptorType {
+ NVME_SGL_DESCR_TYPE_DATA_BLOCK = 0x0,
+ NVME_SGL_DESCR_TYPE_BIT_BUCKET = 0x1,
+ NVME_SGL_DESCR_TYPE_SEGMENT = 0x2,
+ NVME_SGL_DESCR_TYPE_LAST_SEGMENT = 0x3,
+ NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK = 0x4,
+
+ NVME_SGL_DESCR_TYPE_VENDOR_SPECIFIC = 0xf,
+};
+
+enum NvmeSglDescriptorSubtype {
+ NVME_SGL_DESCR_SUBTYPE_ADDRESS = 0x0,
+};
+
+typedef struct QEMU_PACKED NvmeSglDescriptor {
+ uint64_t addr;
+ uint32_t len;
+ uint8_t rsvd[3];
+ uint8_t type;
+} NvmeSglDescriptor;
+
+#define NVME_SGL_TYPE(type) ((type >> 4) & 0xf)
+#define NVME_SGL_SUBTYPE(type) (type & 0xf)
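+
+/*
+ * Example (illustrative): the descriptor's type byte packs the type in
+ * the high nibble and the subtype in the low nibble, so classifying a
+ * descriptor looks like:
+ *
+ *     if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK &&
+ *         NVME_SGL_SUBTYPE(sgl.type) == NVME_SGL_DESCR_SUBTYPE_ADDRESS) {
+ *         // addr/len describe a plain data buffer
+ *     }
+ */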
+
+typedef union NvmeCmdDptr {
+ struct {
+ uint64_t prp1;
+ uint64_t prp2;
+ };
+
+ NvmeSglDescriptor sgl;
+} NvmeCmdDptr;
+
+enum NvmePsdt {
+ NVME_PSDT_PRP = 0x0,
+ NVME_PSDT_SGL_MPTR_CONTIGUOUS = 0x1,
+ NVME_PSDT_SGL_MPTR_SGL = 0x2,
+};
+
+typedef struct QEMU_PACKED NvmeCmd {
+ uint8_t opcode;
+ uint8_t flags;
+ uint16_t cid;
+ uint32_t nsid;
+ uint64_t res1;
+ uint64_t mptr;
+ NvmeCmdDptr dptr;
+ uint32_t cdw10;
+ uint32_t cdw11;
+ uint32_t cdw12;
+ uint32_t cdw13;
+ uint32_t cdw14;
+ uint32_t cdw15;
+} NvmeCmd;
+
+#define NVME_CMD_FLAGS_FUSE(flags) (flags & 0x3)
+#define NVME_CMD_FLAGS_PSDT(flags) ((flags >> 6) & 0x3)
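+
+/*
+ * Example (illustrative): PSDT selects how the command's dptr union is
+ * to be interpreted:
+ *
+ *     switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
+ *     case NVME_PSDT_PRP:
+ *         // use cmd->dptr.prp1 / cmd->dptr.prp2
+ *         break;
+ *     default:
+ *         // use cmd->dptr.sgl
+ *         break;
+ *     }
+ */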
+
+enum NvmeAdminCommands {
+ NVME_ADM_CMD_DELETE_SQ = 0x00,
+ NVME_ADM_CMD_CREATE_SQ = 0x01,
+ NVME_ADM_CMD_GET_LOG_PAGE = 0x02,
+ NVME_ADM_CMD_DELETE_CQ = 0x04,
+ NVME_ADM_CMD_CREATE_CQ = 0x05,
+ NVME_ADM_CMD_IDENTIFY = 0x06,
+ NVME_ADM_CMD_ABORT = 0x08,
+ NVME_ADM_CMD_SET_FEATURES = 0x09,
+ NVME_ADM_CMD_GET_FEATURES = 0x0a,
+ NVME_ADM_CMD_ASYNC_EV_REQ = 0x0c,
+ NVME_ADM_CMD_ACTIVATE_FW = 0x10,
+ NVME_ADM_CMD_DOWNLOAD_FW = 0x11,
+ NVME_ADM_CMD_NS_ATTACHMENT = 0x15,
+ NVME_ADM_CMD_FORMAT_NVM = 0x80,
+ NVME_ADM_CMD_SECURITY_SEND = 0x81,
+ NVME_ADM_CMD_SECURITY_RECV = 0x82,
+};
+
+enum NvmeIoCommands {
+ NVME_CMD_FLUSH = 0x00,
+ NVME_CMD_WRITE = 0x01,
+ NVME_CMD_READ = 0x02,
+ NVME_CMD_WRITE_UNCOR = 0x04,
+ NVME_CMD_COMPARE = 0x05,
+ NVME_CMD_WRITE_ZEROES = 0x08,
+ NVME_CMD_DSM = 0x09,
+ NVME_CMD_VERIFY = 0x0c,
+ NVME_CMD_COPY = 0x19,
+ NVME_CMD_ZONE_MGMT_SEND = 0x79,
+ NVME_CMD_ZONE_MGMT_RECV = 0x7a,
+ NVME_CMD_ZONE_APPEND = 0x7d,
+};
+
+typedef struct QEMU_PACKED NvmeDeleteQ {
+ uint8_t opcode;
+ uint8_t flags;
+ uint16_t cid;
+ uint32_t rsvd1[9];
+ uint16_t qid;
+ uint16_t rsvd10;
+ uint32_t rsvd11[5];
+} NvmeDeleteQ;
+
+typedef struct QEMU_PACKED NvmeCreateCq {
+ uint8_t opcode;
+ uint8_t flags;
+ uint16_t cid;
+ uint32_t rsvd1[5];
+ uint64_t prp1;
+ uint64_t rsvd8;
+ uint16_t cqid;
+ uint16_t qsize;
+ uint16_t cq_flags;
+ uint16_t irq_vector;
+ uint32_t rsvd12[4];
+} NvmeCreateCq;
+
+#define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1)
+#define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1)
+
+enum NvmeFlagsCq {
+ NVME_CQ_PC = 1,
+ NVME_CQ_IEN = 2,
+};
+
+typedef struct QEMU_PACKED NvmeCreateSq {
+ uint8_t opcode;
+ uint8_t flags;
+ uint16_t cid;
+ uint32_t rsvd1[5];
+ uint64_t prp1;
+ uint64_t rsvd8;
+ uint16_t sqid;
+ uint16_t qsize;
+ uint16_t sq_flags;
+ uint16_t cqid;
+ uint32_t rsvd12[4];
+} NvmeCreateSq;
+
+#define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1)
+#define NVME_SQ_FLAGS_QPRIO(sq_flags) ((sq_flags >> 1) & 0x3)
+
+enum NvmeFlagsSq {
+ NVME_SQ_PC = 1,
+
+ NVME_SQ_PRIO_URGENT = 0,
+ NVME_SQ_PRIO_HIGH = 1,
+ NVME_SQ_PRIO_NORMAL = 2,
+ NVME_SQ_PRIO_LOW = 3,
+};
+
+typedef struct QEMU_PACKED NvmeIdentify {
+ uint8_t opcode;
+ uint8_t flags;
+ uint16_t cid;
+ uint32_t nsid;
+ uint64_t rsvd2[2];
+ uint64_t prp1;
+ uint64_t prp2;
+ uint8_t cns;
+ uint8_t rsvd10;
+ uint16_t ctrlid;
+ uint16_t nvmsetid;
+ uint8_t rsvd11;
+ uint8_t csi;
+ uint32_t rsvd12[4];
+} NvmeIdentify;
+
+typedef struct QEMU_PACKED NvmeRwCmd {
+ uint8_t opcode;
+ uint8_t flags;
+ uint16_t cid;
+ uint32_t nsid;
+ uint64_t rsvd2;
+ uint64_t mptr;
+ NvmeCmdDptr dptr;
+ uint64_t slba;
+ uint16_t nlb;
+ uint16_t control;
+ uint32_t dsmgmt;
+ uint32_t reftag;
+ uint16_t apptag;
+ uint16_t appmask;
+} NvmeRwCmd;
+
+enum {
+ NVME_RW_LR = 1 << 15,
+ NVME_RW_FUA = 1 << 14,
+ NVME_RW_DSM_FREQ_UNSPEC = 0,
+ NVME_RW_DSM_FREQ_TYPICAL = 1,
+ NVME_RW_DSM_FREQ_RARE = 2,
+ NVME_RW_DSM_FREQ_READS = 3,
+ NVME_RW_DSM_FREQ_WRITES = 4,
+ NVME_RW_DSM_FREQ_RW = 5,
+ NVME_RW_DSM_FREQ_ONCE = 6,
+ NVME_RW_DSM_FREQ_PREFETCH = 7,
+ NVME_RW_DSM_FREQ_TEMP = 8,
+ NVME_RW_DSM_LATENCY_NONE = 0 << 4,
+ NVME_RW_DSM_LATENCY_IDLE = 1 << 4,
+ NVME_RW_DSM_LATENCY_NORM = 2 << 4,
+ NVME_RW_DSM_LATENCY_LOW = 3 << 4,
+ NVME_RW_DSM_SEQ_REQ = 1 << 6,
+ NVME_RW_DSM_COMPRESSED = 1 << 7,
+ NVME_RW_PIREMAP = 1 << 9,
+ NVME_RW_PRINFO_PRACT = 1 << 13,
+ NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
+ NVME_RW_PRINFO_PRCHK_APP = 1 << 11,
+ NVME_RW_PRINFO_PRCHK_REF = 1 << 10,
+ NVME_RW_PRINFO_PRCHK_MASK = 7 << 10,
+};
+
+#define NVME_RW_PRINFO(control) ((control >> 10) & 0xf)
+
+enum {
+ NVME_PRINFO_PRACT = 1 << 3,
+ NVME_PRINFO_PRCHK_GUARD = 1 << 2,
+ NVME_PRINFO_PRCHK_APP = 1 << 1,
+ NVME_PRINFO_PRCHK_REF = 1 << 0,
+ NVME_PRINFO_PRCHK_MASK = 7 << 0,
+};
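+
+/*
+ * Example (illustrative, assuming a little-endian NvmeRwCmd *rw):
+ * NVME_RW_PRINFO() shifts the four protection-information bits of the
+ * control word down so they line up with the NVME_PRINFO_* values:
+ *
+ *     uint8_t prinfo = NVME_RW_PRINFO(le16_to_cpu(rw->control));
+ *     if (prinfo & NVME_PRINFO_PRACT) {
+ *         // the controller generates/strips protection information
+ *     }
+ */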
+
+typedef struct QEMU_PACKED NvmeDsmCmd {
+ uint8_t opcode;
+ uint8_t flags;
+ uint16_t cid;
+ uint32_t nsid;
+ uint64_t rsvd2[2];
+ NvmeCmdDptr dptr;
+ uint32_t nr;
+ uint32_t attributes;
+ uint32_t rsvd12[4];
+} NvmeDsmCmd;
+
+enum {
+ NVME_DSMGMT_IDR = 1 << 0,
+ NVME_DSMGMT_IDW = 1 << 1,
+ NVME_DSMGMT_AD = 1 << 2,
+};
+
+typedef struct QEMU_PACKED NvmeDsmRange {
+ uint32_t cattr;
+ uint32_t nlb;
+ uint64_t slba;
+} NvmeDsmRange;
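+
+/*
+ * Example (illustrative): the DSM data buffer is an array of
+ * NvmeDsmRange entries; per the NVMe spec NvmeDsmCmd.nr is a 0's based
+ * count of ranges, while nlb here is a plain length in logical blocks:
+ *
+ *     NvmeDsmRange range = {
+ *         .cattr = cpu_to_le32(0),
+ *         .nlb   = cpu_to_le32(nlb),
+ *         .slba  = cpu_to_le64(slba),
+ *     };
+ */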
+
+enum {
+ NVME_COPY_FORMAT_0 = 0x0,
+};
+
+typedef struct QEMU_PACKED NvmeCopyCmd {
+ uint8_t opcode;
+ uint8_t flags;
+ uint16_t cid;
+ uint32_t nsid;
+ uint32_t rsvd2[4];
+ NvmeCmdDptr dptr;
+ uint64_t sdlba;
+ uint8_t nr;
+ uint8_t control[3];
+ uint16_t rsvd13;
+ uint16_t dspec;
+ uint32_t reftag;
+ uint16_t apptag;
+ uint16_t appmask;
+} NvmeCopyCmd;
+
+typedef struct QEMU_PACKED NvmeCopySourceRange {
+ uint8_t rsvd0[8];
+ uint64_t slba;
+ uint16_t nlb;
+ uint8_t rsvd18[6];
+ uint32_t reftag;
+ uint16_t apptag;
+ uint16_t appmask;
+} NvmeCopySourceRange;
+
+enum NvmeAsyncEventRequest {
+ NVME_AER_TYPE_ERROR = 0,
+ NVME_AER_TYPE_SMART = 1,
+ NVME_AER_TYPE_NOTICE = 2,
+ NVME_AER_TYPE_IO_SPECIFIC = 6,
+ NVME_AER_TYPE_VENDOR_SPECIFIC = 7,
+ NVME_AER_INFO_ERR_INVALID_DB_REGISTER = 0,
+ NVME_AER_INFO_ERR_INVALID_DB_VALUE = 1,
+ NVME_AER_INFO_ERR_DIAG_FAIL = 2,
+ NVME_AER_INFO_ERR_PERS_INTERNAL_ERR = 3,
+ NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR = 4,
+ NVME_AER_INFO_ERR_FW_IMG_LOAD_ERR = 5,
+ NVME_AER_INFO_SMART_RELIABILITY = 0,
+ NVME_AER_INFO_SMART_TEMP_THRESH = 1,
+ NVME_AER_INFO_SMART_SPARE_THRESH = 2,
+ NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED = 0,
+};
+
+typedef struct QEMU_PACKED NvmeAerResult {
+ uint8_t event_type;
+ uint8_t event_info;
+ uint8_t log_page;
+ uint8_t resv;
+} NvmeAerResult;
+
+typedef struct QEMU_PACKED NvmeZonedResult {
+ uint64_t slba;
+} NvmeZonedResult;
+
+typedef struct QEMU_PACKED NvmeCqe {
+ uint32_t result;
+ uint32_t dw1;
+ uint16_t sq_head;
+ uint16_t sq_id;
+ uint16_t cid;
+ uint16_t status;
+} NvmeCqe;
+
+enum NvmeStatusCodes {
+ NVME_SUCCESS = 0x0000,
+ NVME_INVALID_OPCODE = 0x0001,
+ NVME_INVALID_FIELD = 0x0002,
+ NVME_CID_CONFLICT = 0x0003,
+ NVME_DATA_TRAS_ERROR = 0x0004,
+ NVME_POWER_LOSS_ABORT = 0x0005,
+ NVME_INTERNAL_DEV_ERROR = 0x0006,
+ NVME_CMD_ABORT_REQ = 0x0007,
+ NVME_CMD_ABORT_SQ_DEL = 0x0008,
+ NVME_CMD_ABORT_FAILED_FUSE = 0x0009,
+ NVME_CMD_ABORT_MISSING_FUSE = 0x000a,
+ NVME_INVALID_NSID = 0x000b,
+ NVME_CMD_SEQ_ERROR = 0x000c,
+ NVME_INVALID_SGL_SEG_DESCR = 0x000d,
+ NVME_INVALID_NUM_SGL_DESCRS = 0x000e,
+ NVME_DATA_SGL_LEN_INVALID = 0x000f,
+ NVME_MD_SGL_LEN_INVALID = 0x0010,
+ NVME_SGL_DESCR_TYPE_INVALID = 0x0011,
+ NVME_INVALID_USE_OF_CMB = 0x0012,
+ NVME_INVALID_PRP_OFFSET = 0x0013,
+ NVME_CMD_SET_CMB_REJECTED = 0x002b,
+ NVME_INVALID_CMD_SET = 0x002c,
+ NVME_LBA_RANGE = 0x0080,
+ NVME_CAP_EXCEEDED = 0x0081,
+ NVME_NS_NOT_READY = 0x0082,
+ NVME_NS_RESV_CONFLICT = 0x0083,
+ NVME_FORMAT_IN_PROGRESS = 0x0084,
+ NVME_INVALID_CQID = 0x0100,
+ NVME_INVALID_QID = 0x0101,
+ NVME_MAX_QSIZE_EXCEEDED = 0x0102,
+ NVME_ACL_EXCEEDED = 0x0103,
+ NVME_RESERVED = 0x0104,
+ NVME_AER_LIMIT_EXCEEDED = 0x0105,
+ NVME_INVALID_FW_SLOT = 0x0106,
+ NVME_INVALID_FW_IMAGE = 0x0107,
+ NVME_INVALID_IRQ_VECTOR = 0x0108,
+ NVME_INVALID_LOG_ID = 0x0109,
+ NVME_INVALID_FORMAT = 0x010a,
+ NVME_FW_REQ_RESET = 0x010b,
+ NVME_INVALID_QUEUE_DEL = 0x010c,
+ NVME_FID_NOT_SAVEABLE = 0x010d,
+ NVME_FEAT_NOT_CHANGEABLE = 0x010e,
+ NVME_FEAT_NOT_NS_SPEC = 0x010f,
+    NVME_FW_REQ_SUBSYSTEM_RESET = 0x0110,
+ NVME_NS_ALREADY_ATTACHED = 0x0118,
+ NVME_NS_PRIVATE = 0x0119,
+ NVME_NS_NOT_ATTACHED = 0x011a,
+ NVME_NS_CTRL_LIST_INVALID = 0x011c,
+ NVME_CONFLICTING_ATTRS = 0x0180,
+ NVME_INVALID_PROT_INFO = 0x0181,
+ NVME_WRITE_TO_RO = 0x0182,
+ NVME_CMD_SIZE_LIMIT = 0x0183,
+ NVME_ZONE_BOUNDARY_ERROR = 0x01b8,
+ NVME_ZONE_FULL = 0x01b9,
+ NVME_ZONE_READ_ONLY = 0x01ba,
+ NVME_ZONE_OFFLINE = 0x01bb,
+ NVME_ZONE_INVALID_WRITE = 0x01bc,
+ NVME_ZONE_TOO_MANY_ACTIVE = 0x01bd,
+ NVME_ZONE_TOO_MANY_OPEN = 0x01be,
+ NVME_ZONE_INVAL_TRANSITION = 0x01bf,
+ NVME_WRITE_FAULT = 0x0280,
+ NVME_UNRECOVERED_READ = 0x0281,
+ NVME_E2E_GUARD_ERROR = 0x0282,
+ NVME_E2E_APP_ERROR = 0x0283,
+ NVME_E2E_REF_ERROR = 0x0284,
+ NVME_CMP_FAILURE = 0x0285,
+ NVME_ACCESS_DENIED = 0x0286,
+ NVME_DULB = 0x0287,
+ NVME_MORE = 0x2000,
+ NVME_DNR = 0x4000,
+ NVME_NO_COMPLETE = 0xffff,
+};
+
+typedef struct QEMU_PACKED NvmeFwSlotInfoLog {
+ uint8_t afi;
+ uint8_t reserved1[7];
+ uint8_t frs1[8];
+ uint8_t frs2[8];
+ uint8_t frs3[8];
+ uint8_t frs4[8];
+ uint8_t frs5[8];
+ uint8_t frs6[8];
+ uint8_t frs7[8];
+ uint8_t reserved2[448];
+} NvmeFwSlotInfoLog;
+
+typedef struct QEMU_PACKED NvmeErrorLog {
+ uint64_t error_count;
+ uint16_t sqid;
+ uint16_t cid;
+ uint16_t status_field;
+ uint16_t param_error_location;
+ uint64_t lba;
+ uint32_t nsid;
+ uint8_t vs;
+ uint8_t resv[35];
+} NvmeErrorLog;
+
+typedef struct QEMU_PACKED NvmeSmartLog {
+ uint8_t critical_warning;
+ uint16_t temperature;
+ uint8_t available_spare;
+ uint8_t available_spare_threshold;
+ uint8_t percentage_used;
+ uint8_t reserved1[26];
+ uint64_t data_units_read[2];
+ uint64_t data_units_written[2];
+ uint64_t host_read_commands[2];
+ uint64_t host_write_commands[2];
+ uint64_t controller_busy_time[2];
+ uint64_t power_cycles[2];
+ uint64_t power_on_hours[2];
+ uint64_t unsafe_shutdowns[2];
+ uint64_t media_errors[2];
+ uint64_t number_of_error_log_entries[2];
+ uint8_t reserved2[320];
+} NvmeSmartLog;
+
+#define NVME_SMART_WARN_MAX 6
+enum NvmeSmartWarn {
+ NVME_SMART_SPARE = 1 << 0,
+ NVME_SMART_TEMPERATURE = 1 << 1,
+ NVME_SMART_RELIABILITY = 1 << 2,
+ NVME_SMART_MEDIA_READ_ONLY = 1 << 3,
+ NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4,
+ NVME_SMART_PMR_UNRELIABLE = 1 << 5,
+};
+
+typedef struct NvmeEffectsLog {
+ uint32_t acs[256];
+ uint32_t iocs[256];
+ uint8_t resv[2048];
+} NvmeEffectsLog;
+
+enum {
+ NVME_CMD_EFF_CSUPP = 1 << 0,
+ NVME_CMD_EFF_LBCC = 1 << 1,
+ NVME_CMD_EFF_NCC = 1 << 2,
+ NVME_CMD_EFF_NIC = 1 << 3,
+ NVME_CMD_EFF_CCC = 1 << 4,
+ NVME_CMD_EFF_CSE_MASK = 3 << 16,
+ NVME_CMD_EFF_UUID_SEL = 1 << 19,
+};
+
+enum NvmeLogIdentifier {
+ NVME_LOG_ERROR_INFO = 0x01,
+ NVME_LOG_SMART_INFO = 0x02,
+ NVME_LOG_FW_SLOT_INFO = 0x03,
+ NVME_LOG_CHANGED_NSLIST = 0x04,
+ NVME_LOG_CMD_EFFECTS = 0x05,
+};
+
+typedef struct QEMU_PACKED NvmePSD {
+ uint16_t mp;
+ uint16_t reserved;
+ uint32_t enlat;
+ uint32_t exlat;
+ uint8_t rrt;
+ uint8_t rrl;
+ uint8_t rwt;
+ uint8_t rwl;
+ uint8_t resv[16];
+} NvmePSD;
+
+#define NVME_CONTROLLER_LIST_SIZE 2048
+#define NVME_IDENTIFY_DATA_SIZE 4096
+
+enum NvmeIdCns {
+ NVME_ID_CNS_NS = 0x00,
+ NVME_ID_CNS_CTRL = 0x01,
+ NVME_ID_CNS_NS_ACTIVE_LIST = 0x02,
+ NVME_ID_CNS_NS_DESCR_LIST = 0x03,
+ NVME_ID_CNS_CS_NS = 0x05,
+ NVME_ID_CNS_CS_CTRL = 0x06,
+ NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07,
+ NVME_ID_CNS_NS_PRESENT_LIST = 0x10,
+ NVME_ID_CNS_NS_PRESENT = 0x11,
+ NVME_ID_CNS_NS_ATTACHED_CTRL_LIST = 0x12,
+ NVME_ID_CNS_CTRL_LIST = 0x13,
+ NVME_ID_CNS_CS_NS_PRESENT_LIST = 0x1a,
+ NVME_ID_CNS_CS_NS_PRESENT = 0x1b,
+ NVME_ID_CNS_IO_COMMAND_SET = 0x1c,
+};
+
+typedef struct QEMU_PACKED NvmeIdCtrl {
+ uint16_t vid;
+ uint16_t ssvid;
+ uint8_t sn[20];
+ uint8_t mn[40];
+ uint8_t fr[8];
+ uint8_t rab;
+ uint8_t ieee[3];
+ uint8_t cmic;
+ uint8_t mdts;
+ uint16_t cntlid;
+ uint32_t ver;
+ uint32_t rtd3r;
+ uint32_t rtd3e;
+ uint32_t oaes;
+ uint32_t ctratt;
+ uint8_t rsvd100[11];
+ uint8_t cntrltype;
+ uint8_t fguid[16];
+ uint8_t rsvd128[128];
+ uint16_t oacs;
+ uint8_t acl;
+ uint8_t aerl;
+ uint8_t frmw;
+ uint8_t lpa;
+ uint8_t elpe;
+ uint8_t npss;
+ uint8_t avscc;
+ uint8_t apsta;
+ uint16_t wctemp;
+ uint16_t cctemp;
+ uint16_t mtfa;
+ uint32_t hmpre;
+ uint32_t hmmin;
+ uint8_t tnvmcap[16];
+ uint8_t unvmcap[16];
+ uint32_t rpmbs;
+ uint16_t edstt;
+ uint8_t dsto;
+ uint8_t fwug;
+ uint16_t kas;
+ uint16_t hctma;
+ uint16_t mntmt;
+ uint16_t mxtmt;
+ uint32_t sanicap;
+ uint8_t rsvd332[180];
+ uint8_t sqes;
+ uint8_t cqes;
+ uint16_t maxcmd;
+ uint32_t nn;
+ uint16_t oncs;
+ uint16_t fuses;
+ uint8_t fna;
+ uint8_t vwc;
+ uint16_t awun;
+ uint16_t awupf;
+ uint8_t nvscc;
+ uint8_t rsvd531;
+ uint16_t acwu;
+ uint16_t ocfs;
+ uint32_t sgls;
+ uint8_t rsvd540[228];
+ uint8_t subnqn[256];
+ uint8_t rsvd1024[1024];
+ NvmePSD psd[32];
+ uint8_t vs[1024];
+} NvmeIdCtrl;
+
+typedef struct NvmeIdCtrlZoned {
+ uint8_t zasl;
+ uint8_t rsvd1[4095];
+} NvmeIdCtrlZoned;
+
+typedef struct NvmeIdCtrlNvm {
+ uint8_t vsl;
+ uint8_t wzsl;
+ uint8_t wusl;
+ uint8_t dmrl;
+ uint32_t dmrsl;
+ uint64_t dmsl;
+ uint8_t rsvd16[4080];
+} NvmeIdCtrlNvm;
+
+enum NvmeIdCtrlOaes {
+ NVME_OAES_NS_ATTR = 1 << 8,
+};
+
+enum NvmeIdCtrlOacs {
+ NVME_OACS_SECURITY = 1 << 0,
+ NVME_OACS_FORMAT = 1 << 1,
+ NVME_OACS_FW = 1 << 2,
+ NVME_OACS_NS_MGMT = 1 << 3,
+};
+
+enum NvmeIdCtrlOncs {
+ NVME_ONCS_COMPARE = 1 << 0,
+ NVME_ONCS_WRITE_UNCORR = 1 << 1,
+ NVME_ONCS_DSM = 1 << 2,
+ NVME_ONCS_WRITE_ZEROES = 1 << 3,
+ NVME_ONCS_FEATURES = 1 << 4,
+    NVME_ONCS_RESERVATIONS  = 1 << 5,
+ NVME_ONCS_TIMESTAMP = 1 << 6,
+ NVME_ONCS_VERIFY = 1 << 7,
+ NVME_ONCS_COPY = 1 << 8,
+};
+
+enum NvmeIdCtrlOcfs {
+ NVME_OCFS_COPY_FORMAT_0 = 1 << 0,
+};
+
+enum NvmeIdctrlVwc {
+ NVME_VWC_PRESENT = 1 << 0,
+ NVME_VWC_NSID_BROADCAST_NO_SUPPORT = 0 << 1,
+ NVME_VWC_NSID_BROADCAST_RESERVED = 1 << 1,
+ NVME_VWC_NSID_BROADCAST_CTRL_SPEC = 2 << 1,
+ NVME_VWC_NSID_BROADCAST_SUPPORT = 3 << 1,
+};
+
+enum NvmeIdCtrlFrmw {
+ NVME_FRMW_SLOT1_RO = 1 << 0,
+};
+
+enum NvmeIdCtrlLpa {
+ NVME_LPA_NS_SMART = 1 << 0,
+ NVME_LPA_CSE = 1 << 1,
+ NVME_LPA_EXTENDED = 1 << 2,
+};
+
+enum NvmeIdCtrlCmic {
+ NVME_CMIC_MULTI_CTRL = 1 << 1,
+};
+
+enum NvmeNsAttachmentOperation {
+ NVME_NS_ATTACHMENT_ATTACH = 0x0,
+ NVME_NS_ATTACHMENT_DETACH = 0x1,
+};
+
+#define NVME_CTRL_SQES_MIN(sqes) ((sqes) & 0xf)
+#define NVME_CTRL_SQES_MAX(sqes) (((sqes) >> 4) & 0xf)
+#define NVME_CTRL_CQES_MIN(cqes) ((cqes) & 0xf)
+#define NVME_CTRL_CQES_MAX(cqes) (((cqes) >> 4) & 0xf)
+
+#define NVME_CTRL_SGLS_SUPPORT_MASK (0x3 << 0)
+#define NVME_CTRL_SGLS_SUPPORT_NO_ALIGN (0x1 << 0)
+#define NVME_CTRL_SGLS_SUPPORT_DWORD_ALIGN (0x1 << 1)
+#define NVME_CTRL_SGLS_KEYED (0x1 << 2)
+#define NVME_CTRL_SGLS_BITBUCKET (0x1 << 16)
+#define NVME_CTRL_SGLS_MPTR_CONTIGUOUS (0x1 << 17)
+#define NVME_CTRL_SGLS_EXCESS_LENGTH (0x1 << 18)
+#define NVME_CTRL_SGLS_MPTR_SGL (0x1 << 19)
+#define NVME_CTRL_SGLS_ADDR_OFFSET (0x1 << 20)
+
+#define NVME_ARB_AB(arb) (arb & 0x7)
+#define NVME_ARB_AB_NOLIMIT 0x7
+#define NVME_ARB_LPW(arb) ((arb >> 8) & 0xff)
+#define NVME_ARB_MPW(arb) ((arb >> 16) & 0xff)
+#define NVME_ARB_HPW(arb) ((arb >> 24) & 0xff)
+
+#define NVME_INTC_THR(intc) (intc & 0xff)
+#define NVME_INTC_TIME(intc) ((intc >> 8) & 0xff)
+
+#define NVME_INTVC_NOCOALESCING (0x1 << 16)
+
+#define NVME_TEMP_THSEL(temp) ((temp >> 20) & 0x3)
+#define NVME_TEMP_THSEL_OVER 0x0
+#define NVME_TEMP_THSEL_UNDER 0x1
+
+#define NVME_TEMP_TMPSEL(temp) ((temp >> 16) & 0xf)
+#define NVME_TEMP_TMPSEL_COMPOSITE 0x0
+
+#define NVME_TEMP_TMPTH(temp) (temp & 0xffff)
+
+#define NVME_AEC_SMART(aec) (aec & 0xff)
+#define NVME_AEC_NS_ATTR(aec) ((aec >> 8) & 0x1)
+#define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1)
+
+#define NVME_ERR_REC_TLER(err_rec) (err_rec & 0xffff)
+#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x10000)
+
+enum NvmeFeatureIds {
+ NVME_ARBITRATION = 0x1,
+ NVME_POWER_MANAGEMENT = 0x2,
+ NVME_LBA_RANGE_TYPE = 0x3,
+ NVME_TEMPERATURE_THRESHOLD = 0x4,
+ NVME_ERROR_RECOVERY = 0x5,
+ NVME_VOLATILE_WRITE_CACHE = 0x6,
+ NVME_NUMBER_OF_QUEUES = 0x7,
+ NVME_INTERRUPT_COALESCING = 0x8,
+ NVME_INTERRUPT_VECTOR_CONF = 0x9,
+ NVME_WRITE_ATOMICITY = 0xa,
+ NVME_ASYNCHRONOUS_EVENT_CONF = 0xb,
+ NVME_TIMESTAMP = 0xe,
+ NVME_COMMAND_SET_PROFILE = 0x19,
+ NVME_SOFTWARE_PROGRESS_MARKER = 0x80,
+ NVME_FID_MAX = 0x100,
+};
+
+typedef enum NvmeFeatureCap {
+ NVME_FEAT_CAP_SAVE = 1 << 0,
+ NVME_FEAT_CAP_NS = 1 << 1,
+ NVME_FEAT_CAP_CHANGE = 1 << 2,
+} NvmeFeatureCap;
+
+typedef enum NvmeGetFeatureSelect {
+ NVME_GETFEAT_SELECT_CURRENT = 0x0,
+ NVME_GETFEAT_SELECT_DEFAULT = 0x1,
+ NVME_GETFEAT_SELECT_SAVED = 0x2,
+ NVME_GETFEAT_SELECT_CAP = 0x3,
+} NvmeGetFeatureSelect;
+
+#define NVME_GETSETFEAT_FID_MASK 0xff
+#define NVME_GETSETFEAT_FID(dw10) (dw10 & NVME_GETSETFEAT_FID_MASK)
+
+#define NVME_GETFEAT_SELECT_SHIFT 8
+#define NVME_GETFEAT_SELECT_MASK 0x7
+#define NVME_GETFEAT_SELECT(dw10) \
+ ((dw10 >> NVME_GETFEAT_SELECT_SHIFT) & NVME_GETFEAT_SELECT_MASK)
+
+#define NVME_SETFEAT_SAVE_SHIFT 31
+#define NVME_SETFEAT_SAVE_MASK 0x1
+#define NVME_SETFEAT_SAVE(dw10) \
+ ((dw10 >> NVME_SETFEAT_SAVE_SHIFT) & NVME_SETFEAT_SAVE_MASK)
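+
+/*
+ * Example (illustrative): decoding cdw10 of a Get Features command
+ * with the helpers above:
+ *
+ *     uint32_t dw10 = le32_to_cpu(cmd->cdw10);
+ *     uint8_t fid = NVME_GETSETFEAT_FID(dw10);
+ *     NvmeGetFeatureSelect sel = NVME_GETFEAT_SELECT(dw10);
+ *     if (sel == NVME_GETFEAT_SELECT_CAP) {
+ *         // report the NvmeFeatureCap bits for fid
+ *     }
+ */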
+
+typedef struct QEMU_PACKED NvmeRangeType {
+ uint8_t type;
+ uint8_t attributes;
+ uint8_t rsvd2[14];
+ uint64_t slba;
+ uint64_t nlb;
+ uint8_t guid[16];
+ uint8_t rsvd48[16];
+} NvmeRangeType;
+
+typedef struct QEMU_PACKED NvmeLBAF {
+ uint16_t ms;
+ uint8_t ds;
+ uint8_t rp;
+} NvmeLBAF;
+
+typedef struct QEMU_PACKED NvmeLBAFE {
+ uint64_t zsze;
+ uint8_t zdes;
+ uint8_t rsvd9[7];
+} NvmeLBAFE;
+
+#define NVME_NSID_BROADCAST 0xffffffff
+
+typedef struct QEMU_PACKED NvmeIdNs {
+ uint64_t nsze;
+ uint64_t ncap;
+ uint64_t nuse;
+ uint8_t nsfeat;
+ uint8_t nlbaf;
+ uint8_t flbas;
+ uint8_t mc;
+ uint8_t dpc;
+ uint8_t dps;
+ uint8_t nmic;
+ uint8_t rescap;
+ uint8_t fpi;
+ uint8_t dlfeat;
+ uint16_t nawun;
+ uint16_t nawupf;
+ uint16_t nacwu;
+ uint16_t nabsn;
+ uint16_t nabo;
+ uint16_t nabspf;
+ uint16_t noiob;
+ uint8_t nvmcap[16];
+ uint16_t npwg;
+ uint16_t npwa;
+ uint16_t npdg;
+ uint16_t npda;
+ uint16_t nows;
+ uint16_t mssrl;
+ uint32_t mcl;
+ uint8_t msrc;
+ uint8_t rsvd81[23];
+ uint8_t nguid[16];
+ uint64_t eui64;
+ NvmeLBAF lbaf[16];
+ uint8_t rsvd192[192];
+ uint8_t vs[3712];
+} NvmeIdNs;
+
+typedef struct QEMU_PACKED NvmeIdNsDescr {
+ uint8_t nidt;
+ uint8_t nidl;
+ uint8_t rsvd2[2];
+} NvmeIdNsDescr;
+
+enum NvmeNsIdentifierLength {
+ NVME_NIDL_EUI64 = 8,
+ NVME_NIDL_NGUID = 16,
+ NVME_NIDL_UUID = 16,
+ NVME_NIDL_CSI = 1,
+};
+
+enum NvmeNsIdentifierType {
+ NVME_NIDT_EUI64 = 0x01,
+ NVME_NIDT_NGUID = 0x02,
+ NVME_NIDT_UUID = 0x03,
+ NVME_NIDT_CSI = 0x04,
+};
+
+enum NvmeIdNsNmic {
+ NVME_NMIC_NS_SHARED = 1 << 0,
+};
+
+enum NvmeCsi {
+ NVME_CSI_NVM = 0x00,
+ NVME_CSI_ZONED = 0x02,
+};
+
+#define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi)))
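+
+/*
+ * Example (illustrative): NVME_SET_CSI() builds the per-command-set
+ * bit vector returned for NVME_ID_CNS_IO_COMMAND_SET (0x1c), one bit
+ * per supported command set identifier:
+ *
+ *     uint8_t vec = 0;
+ *     NVME_SET_CSI(vec, NVME_CSI_NVM);     // bit 0
+ *     NVME_SET_CSI(vec, NVME_CSI_ZONED);   // bit 2
+ */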
+
+typedef struct QEMU_PACKED NvmeIdNsZoned {
+ uint16_t zoc;
+ uint16_t ozcs;
+ uint32_t mar;
+ uint32_t mor;
+ uint32_t rrl;
+ uint32_t frl;
+ uint8_t rsvd20[2796];
+ NvmeLBAFE lbafe[16];
+ uint8_t rsvd3072[768];
+ uint8_t vs[256];
+} NvmeIdNsZoned;
+
+/* Deallocate Logical Block Features */
+#define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat) ((dlfeat) & 0x10)
+#define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat) ((dlfeat) & 0x08)
+
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR(dlfeat) ((dlfeat) & 0x7)
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_UNDEFINED 0
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES 1
+#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ONES 2
+
+#define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1))
+#define NVME_ID_NS_NSFEAT_DULBE(nsfeat) ((nsfeat >> 2) & 0x1)
+#define NVME_ID_NS_FLBAS_EXTENDED(flbas) ((flbas >> 4) & 0x1)
+#define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf))
+#define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1)
+#define NVME_ID_NS_MC_EXTENDED(mc) ((mc & 0x1))
+#define NVME_ID_NS_DPC_LAST_EIGHT(dpc) ((dpc >> 4) & 0x1)
+#define NVME_ID_NS_DPC_FIRST_EIGHT(dpc) ((dpc >> 3) & 0x1)
+#define NVME_ID_NS_DPC_TYPE_3(dpc) ((dpc >> 2) & 0x1)
+#define NVME_ID_NS_DPC_TYPE_2(dpc) ((dpc >> 1) & 0x1)
+#define NVME_ID_NS_DPC_TYPE_1(dpc) ((dpc & 0x1))
+#define NVME_ID_NS_DPC_TYPE_MASK 0x7
+
+enum NvmeIdNsDps {
+ NVME_ID_NS_DPS_TYPE_NONE = 0,
+ NVME_ID_NS_DPS_TYPE_1 = 1,
+ NVME_ID_NS_DPS_TYPE_2 = 2,
+ NVME_ID_NS_DPS_TYPE_3 = 3,
+ NVME_ID_NS_DPS_TYPE_MASK = 0x7,
+ NVME_ID_NS_DPS_FIRST_EIGHT = 8,
+};
+
+enum NvmeIdNsFlbas {
+ NVME_ID_NS_FLBAS_EXTENDED = 1 << 4,
+};
+
+enum NvmeIdNsMc {
+ NVME_ID_NS_MC_EXTENDED = 1 << 0,
+ NVME_ID_NS_MC_SEPARATE = 1 << 1,
+};
+
+#define NVME_ID_NS_DPS_TYPE(dps) (dps & NVME_ID_NS_DPS_TYPE_MASK)
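+
+/*
+ * Example (illustrative): checking a namespace's end-to-end protection
+ * settings from Identify Namespace:
+ *
+ *     if (NVME_ID_NS_DPS_TYPE(id_ns->dps) == NVME_ID_NS_DPS_TYPE_1 &&
+ *         (id_ns->dps & NVME_ID_NS_DPS_FIRST_EIGHT)) {
+ *         // protection information is in the first eight metadata bytes
+ *     }
+ */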
+
+typedef struct NvmeDifTuple {
+ uint16_t guard;
+ uint16_t apptag;
+ uint32_t reftag;
+} NvmeDifTuple;
+
+enum NvmeZoneAttr {
+ NVME_ZA_FINISHED_BY_CTLR = 1 << 0,
+ NVME_ZA_FINISH_RECOMMENDED = 1 << 1,
+ NVME_ZA_RESET_RECOMMENDED = 1 << 2,
+ NVME_ZA_ZD_EXT_VALID = 1 << 7,
+};
+
+typedef struct QEMU_PACKED NvmeZoneReportHeader {
+ uint64_t nr_zones;
+ uint8_t rsvd[56];
+} NvmeZoneReportHeader;
+
+enum NvmeZoneReceiveAction {
+ NVME_ZONE_REPORT = 0,
+ NVME_ZONE_REPORT_EXTENDED = 1,
+};
+
+enum NvmeZoneReportType {
+ NVME_ZONE_REPORT_ALL = 0,
+ NVME_ZONE_REPORT_EMPTY = 1,
+ NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2,
+ NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3,
+ NVME_ZONE_REPORT_CLOSED = 4,
+ NVME_ZONE_REPORT_FULL = 5,
+ NVME_ZONE_REPORT_READ_ONLY = 6,
+ NVME_ZONE_REPORT_OFFLINE = 7,
+};
+
+enum NvmeZoneType {
+ NVME_ZONE_TYPE_RESERVED = 0x00,
+ NVME_ZONE_TYPE_SEQ_WRITE = 0x02,
+};
+
+enum NvmeZoneSendAction {
+ NVME_ZONE_ACTION_RSD = 0x00,
+ NVME_ZONE_ACTION_CLOSE = 0x01,
+ NVME_ZONE_ACTION_FINISH = 0x02,
+ NVME_ZONE_ACTION_OPEN = 0x03,
+ NVME_ZONE_ACTION_RESET = 0x04,
+ NVME_ZONE_ACTION_OFFLINE = 0x05,
+ NVME_ZONE_ACTION_SET_ZD_EXT = 0x10,
+};
+
+typedef struct QEMU_PACKED NvmeZoneDescr {
+ uint8_t zt;
+ uint8_t zs;
+ uint8_t za;
+ uint8_t rsvd3[5];
+ uint64_t zcap;
+ uint64_t zslba;
+ uint64_t wp;
+ uint8_t rsvd32[32];
+} NvmeZoneDescr;
+
+typedef enum NvmeZoneState {
+ NVME_ZONE_STATE_RESERVED = 0x00,
+ NVME_ZONE_STATE_EMPTY = 0x01,
+ NVME_ZONE_STATE_IMPLICITLY_OPEN = 0x02,
+ NVME_ZONE_STATE_EXPLICITLY_OPEN = 0x03,
+ NVME_ZONE_STATE_CLOSED = 0x04,
+ NVME_ZONE_STATE_READ_ONLY = 0x0d,
+ NVME_ZONE_STATE_FULL = 0x0e,
+ NVME_ZONE_STATE_OFFLINE = 0x0f,
+} NvmeZoneState;
+
+static inline void _nvme_check_size(void)
+{
+ QEMU_BUILD_BUG_ON(sizeof(NvmeBar) != 4096);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeZonedResult) != 8);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeCopySourceRange) != 32);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeDeleteQ) != 64);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeCreateCq) != 64);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeCreateSq) != 64);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeIdentify) != 64);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeRwCmd) != 64);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeCopyCmd) != 64);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlZoned) != 4096);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlNvm) != 4096);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeLBAF) != 4);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeLBAFE) != 16);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsZoned) != 4096);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsDescr) != 4);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeZoneDescr) != 64);
+ QEMU_BUILD_BUG_ON(sizeof(NvmeDifTuple) != 8);
+}
+#endif
diff --git a/include/block/qapi.h b/include/block/qapi.h
new file mode 100644
index 000000000..22c7807c8
--- /dev/null
+++ b/include/block/qapi.h
@@ -0,0 +1,45 @@
+/*
+ * Block layer qmp and info dump related functions
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCK_QAPI_H
+#define BLOCK_QAPI_H
+
+#include "block/block.h"
+#include "block/snapshot.h"
+
+BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
+ BlockDriverState *bs,
+ bool flat,
+ Error **errp);
+int bdrv_query_snapshot_info_list(BlockDriverState *bs,
+ SnapshotInfoList **p_list,
+ Error **errp);
+void bdrv_query_image_info(BlockDriverState *bs,
+ ImageInfo **p_info,
+ Error **errp);
+
+void bdrv_snapshot_dump(QEMUSnapshotInfo *sn);
+void bdrv_image_info_specific_dump(ImageInfoSpecific *info_spec);
+void bdrv_image_info_dump(ImageInfo *info);
+#endif
diff --git a/include/block/qdict.h b/include/block/qdict.h
new file mode 100644
index 000000000..ced2acfb9
--- /dev/null
+++ b/include/block/qdict.h
@@ -0,0 +1,32 @@
+/*
+ * Special QDict functions used by the block layer
+ *
+ * Copyright (c) 2013-2018 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2.1 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#ifndef BLOCK_QDICT_H
+#define BLOCK_QDICT_H
+
+#include "qapi/qmp/qdict.h"
+
+void qdict_copy_default(QDict *dst, QDict *src, const char *key);
+void qdict_set_default_str(QDict *dst, const char *key, const char *val);
+
+void qdict_join(QDict *dest, QDict *src, bool overwrite);
+
+void qdict_extract_subqdict(QDict *src, QDict **dst, const char *start);
+void qdict_array_split(QDict *src, QList **dst);
+int qdict_array_entries(QDict *src, const char *subqdict);
+
+typedef struct QDictRenames {
+ const char *from;
+ const char *to;
+} QDictRenames;
+bool qdict_rename_keys(QDict *qdict, const QDictRenames *renames, Error **errp);
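+
+/*
+ * Example (illustrative; the option names are made up): rename tables
+ * are terminated by an all-NULL entry:
+ *
+ *     static const QDictRenames opt_renames[] = {
+ *         { "iops", "throttling.iops-total" },
+ *         { NULL, NULL },
+ *     };
+ *     qdict_rename_keys(options, opt_renames, errp);
+ */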
+
+Visitor *qobject_input_visitor_new_flat_confused(QDict *qdict,
+ Error **errp);
+#endif
diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h
new file mode 100644
index 000000000..21fc10c4c
--- /dev/null
+++ b/include/block/raw-aio.h
@@ -0,0 +1,90 @@
+/*
+ * Declarations for AIO in the raw protocol
+ *
+ * Copyright IBM, Corp. 2008
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef QEMU_RAW_AIO_H
+#define QEMU_RAW_AIO_H
+
+#include "block/aio.h"
+#include "qemu/coroutine.h"
+#include "qemu/iov.h"
+
+/* AIO request types */
+#define QEMU_AIO_READ 0x0001
+#define QEMU_AIO_WRITE 0x0002
+#define QEMU_AIO_IOCTL 0x0004
+#define QEMU_AIO_FLUSH 0x0008
+#define QEMU_AIO_DISCARD 0x0010
+#define QEMU_AIO_WRITE_ZEROES 0x0020
+#define QEMU_AIO_COPY_RANGE 0x0040
+#define QEMU_AIO_TRUNCATE 0x0080
+#define QEMU_AIO_TYPE_MASK \
+ (QEMU_AIO_READ | \
+ QEMU_AIO_WRITE | \
+ QEMU_AIO_IOCTL | \
+ QEMU_AIO_FLUSH | \
+ QEMU_AIO_DISCARD | \
+ QEMU_AIO_WRITE_ZEROES | \
+ QEMU_AIO_COPY_RANGE | \
+ QEMU_AIO_TRUNCATE)
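+
+/*
+ * Example (illustrative; "aio_type" is a placeholder field name): each
+ * request carries exactly one type bit plus optional flag bits, so
+ * classification masks with QEMU_AIO_TYPE_MASK:
+ *
+ *     switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) {
+ *     case QEMU_AIO_READ:
+ *     case QEMU_AIO_WRITE:
+ *         // data transfer request
+ *         break;
+ *     }
+ */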
+
+/* AIO flags */
+#define QEMU_AIO_MISALIGNED 0x1000
+#define QEMU_AIO_BLKDEV 0x2000
+#define QEMU_AIO_NO_FALLBACK 0x4000
+
+
+/* linux-aio.c - Linux native implementation */
+#ifdef CONFIG_LINUX_AIO
+typedef struct LinuxAioState LinuxAioState;
+LinuxAioState *laio_init(Error **errp);
+void laio_cleanup(LinuxAioState *s);
+int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd,
+ uint64_t offset, QEMUIOVector *qiov, int type,
+ uint64_t dev_max_batch);
+void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context);
+void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context);
+void laio_io_plug(BlockDriverState *bs, LinuxAioState *s);
+void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s,
+ uint64_t dev_max_batch);
+#endif
+/* io_uring.c - Linux io_uring implementation */
+#ifdef CONFIG_LINUX_IO_URING
+typedef struct LuringState LuringState;
+LuringState *luring_init(Error **errp);
+void luring_cleanup(LuringState *s);
+int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd,
+ uint64_t offset, QEMUIOVector *qiov, int type);
+void luring_detach_aio_context(LuringState *s, AioContext *old_context);
+void luring_attach_aio_context(LuringState *s, AioContext *new_context);
+void luring_io_plug(BlockDriverState *bs, LuringState *s);
+void luring_io_unplug(BlockDriverState *bs, LuringState *s);
+#endif
+
+#ifdef _WIN32
+typedef struct QEMUWin32AIOState QEMUWin32AIOState;
+QEMUWin32AIOState *win32_aio_init(void);
+void win32_aio_cleanup(QEMUWin32AIOState *aio);
+int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile);
+BlockAIOCB *win32_aio_submit(BlockDriverState *bs,
+ QEMUWin32AIOState *aio, HANDLE hfile,
+ uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
+ BlockCompletionFunc *cb, void *opaque, int type);
+void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
+ AioContext *old_context);
+void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
+ AioContext *new_context);
+#endif
+
+#endif /* QEMU_RAW_AIO_H */
diff --git a/include/block/replication.h b/include/block/replication.h
new file mode 100644
index 000000000..21931b4f0
--- /dev/null
+++ b/include/block/replication.h
@@ -0,0 +1,175 @@
+/*
+ * Replication filter
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2016 FUJITSU LIMITED
+ *
+ * Author:
+ * Changlong Xie <xiecl.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef REPLICATION_H
+#define REPLICATION_H
+
+#include "qapi/qapi-types-block-core.h"
+#include "qemu/module.h"
+#include "qemu/queue.h"
+
+typedef struct ReplicationOps ReplicationOps;
+typedef struct ReplicationState ReplicationState;
+
+/**
+ * SECTION:block/replication.h
+ * @title:Base Replication System
+ * @short_description: interfaces for handling replication
+ *
+ * The replication model provides a framework for handling replication.
+ *
+ * <example>
+ * <title>How to use replication interfaces</title>
+ * <programlisting>
+ * #include "block/replication.h"
+ *
+ * typedef struct BDRVReplicationState {
+ * ReplicationState *rs;
+ * } BDRVReplicationState;
+ *
+ * static void replication_start(ReplicationState *rs, ReplicationMode mode,
+ * Error **errp);
+ * static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
+ * static void replication_get_error(ReplicationState *rs, Error **errp);
+ * static void replication_stop(ReplicationState *rs, bool failover,
+ * Error **errp);
+ *
+ * static ReplicationOps replication_ops = {
+ * .start = replication_start,
+ * .checkpoint = replication_do_checkpoint,
+ * .get_error = replication_get_error,
+ * .stop = replication_stop,
+ * }
+ *
+ * static int replication_open(BlockDriverState *bs, QDict *options,
+ * int flags, Error **errp)
+ * {
+ * BDRVReplicationState *s = bs->opaque;
+ * s->rs = replication_new(bs, &replication_ops);
+ * return 0;
+ * }
+ *
+ * static void replication_close(BlockDriverState *bs)
+ * {
+ * BDRVReplicationState *s = bs->opaque;
+ * replication_remove(s->rs);
+ * }
+ *
+ * BlockDriver bdrv_replication = {
+ * .format_name = "replication",
+ * .instance_size = sizeof(BDRVReplicationState),
+ *
+ * .bdrv_open = replication_open,
+ * .bdrv_close = replication_close,
+ * };
+ *
+ * static void bdrv_replication_init(void)
+ * {
+ * bdrv_register(&bdrv_replication);
+ * }
+ *
+ * block_init(bdrv_replication_init);
+ * </programlisting>
+ * </example>
+ *
+ * The example above shows how to use the replication interfaces. During
+ * migration, the replication_(start/stop/do_checkpoint/get_error)_all
+ * helpers can then be used to drive all registered replication operations.
+ */
+
+/**
+ * ReplicationState:
+ * @opaque: opaque pointer value passed to this ReplicationState
+ * @ops: replication operation of this ReplicationState
+ * @node: node that we will insert into @replication_states QLIST
+ */
+struct ReplicationState {
+ void *opaque;
+ ReplicationOps *ops;
+ QLIST_ENTRY(ReplicationState) node;
+};
+
+/**
+ * ReplicationOps:
+ * @start: callback to start replication
+ * @stop: callback to stop replication
+ * @checkpoint: callback to do checkpoint
+ * @get_error: callback to check if an error occurred during replication
+ */
+struct ReplicationOps {
+ void (*start)(ReplicationState *rs, ReplicationMode mode, Error **errp);
+ void (*stop)(ReplicationState *rs, bool failover, Error **errp);
+ void (*checkpoint)(ReplicationState *rs, Error **errp);
+ void (*get_error)(ReplicationState *rs, Error **errp);
+};
+
+/**
+ * replication_new:
+ * @opaque: opaque pointer value passed to ReplicationState
+ * @ops: replication operation of the new relevant ReplicationState
+ *
+ * Called to create a new ReplicationState instance, and then insert it
+ * into @replication_states QLIST
+ *
+ * Returns: the new ReplicationState instance
+ */
+ReplicationState *replication_new(void *opaque, ReplicationOps *ops);
+
+/**
+ * replication_remove:
+ * @rs: the ReplicationState instance to remove
+ *
+ * Called to remove a ReplicationState instance, and then delete it from
+ * @replication_states QLIST
+ */
+void replication_remove(ReplicationState *rs);
+
+/**
+ * replication_start_all:
+ * @mode: replication mode that could be "primary" or "secondary"
+ * @errp: returns an error if this function fails
+ *
+ * Start replication, called in migration/checkpoint thread
+ *
+ * Note: the caller of the function MUST make sure the VM is stopped
+ */
+void replication_start_all(ReplicationMode mode, Error **errp);
+
+/**
+ * replication_do_checkpoint_all:
+ * @errp: returns an error if this function fails
+ *
+ * This interface is called after all VM state is transferred to the
+ * Secondary QEMU
+ */
+void replication_do_checkpoint_all(Error **errp);
+
+/**
+ * replication_get_error_all:
+ * @errp: returns an error if this function fails
+ *
+ * This interface is called to check whether an error occurred during
+ * replication
+ */
+void replication_get_error_all(Error **errp);
+
+/**
+ * replication_stop_all:
+ * @failover: boolean value that indicates whether we need to do failover
+ * @errp: returns an error if this function fails
+ *
+ * Called on failover. If you use this API to shut down the guest, or for
+ * anything other than failover, the VM should be stopped before calling it.
+ */
+void replication_stop_all(bool failover, Error **errp);
+
+#endif /* REPLICATION_H */
diff --git a/include/block/snapshot.h b/include/block/snapshot.h
new file mode 100644
index 000000000..940345692
--- /dev/null
+++ b/include/block/snapshot.h
@@ -0,0 +1,102 @@
+/*
+ * Block layer snapshot related functions
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef SNAPSHOT_H
+#define SNAPSHOT_H
+
+#include "qapi/qapi-builtin-types.h"
+
+#define SNAPSHOT_OPT_BASE "snapshot."
+#define SNAPSHOT_OPT_ID "snapshot.id"
+#define SNAPSHOT_OPT_NAME "snapshot.name"
+
+extern QemuOptsList internal_snapshot_opts;
+
+typedef struct QEMUSnapshotInfo {
+ char id_str[128]; /* unique snapshot id */
+    /* The following fields are informative. They are not needed for
+       the consistency of the snapshot. */
+ char name[256]; /* user chosen name */
+ uint64_t vm_state_size; /* VM state info size */
+ uint32_t date_sec; /* UTC date of the snapshot */
+ uint32_t date_nsec;
+ uint64_t vm_clock_nsec; /* VM clock relative to boot */
+ uint64_t icount; /* record/replay step */
+} QEMUSnapshotInfo;
+
+int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info,
+ const char *name);
+bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs,
+ const char *id,
+ const char *name,
+ QEMUSnapshotInfo *sn_info,
+ Error **errp);
+int bdrv_can_snapshot(BlockDriverState *bs);
+int bdrv_snapshot_create(BlockDriverState *bs,
+ QEMUSnapshotInfo *sn_info);
+int bdrv_snapshot_goto(BlockDriverState *bs,
+ const char *snapshot_id,
+ Error **errp);
+int bdrv_snapshot_delete(BlockDriverState *bs,
+ const char *snapshot_id,
+ const char *name,
+ Error **errp);
+int bdrv_snapshot_list(BlockDriverState *bs,
+ QEMUSnapshotInfo **psn_info);
+int bdrv_snapshot_load_tmp(BlockDriverState *bs,
+ const char *snapshot_id,
+ const char *name,
+ Error **errp);
+int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,
+ const char *id_or_name,
+ Error **errp);
+
+/* Group operations. All block drivers are involved.
+ * These functions will properly handle dataplane (taking aio_context_acquire
+ * when appropriate for the relevant block drivers). */
+
+bool bdrv_all_can_snapshot(bool has_devices, strList *devices,
+ Error **errp);
+int bdrv_all_delete_snapshot(const char *name,
+ bool has_devices, strList *devices,
+ Error **errp);
+int bdrv_all_goto_snapshot(const char *name,
+ bool has_devices, strList *devices,
+ Error **errp);
+int bdrv_all_has_snapshot(const char *name,
+ bool has_devices, strList *devices,
+ Error **errp);
+int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
+ BlockDriverState *vm_state_bs,
+ uint64_t vm_state_size,
+ bool has_devices,
+ strList *devices,
+ Error **errp);
+
+BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs,
+ bool has_devices, strList *devices,
+ Error **errp);
+
+#endif
diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h
new file mode 100644
index 000000000..7dd7d730a
--- /dev/null
+++ b/include/block/thread-pool.h
@@ -0,0 +1,37 @@
+/*
+ * QEMU block layer thread pool
+ *
+ * Copyright IBM, Corp. 2008
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ * Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef QEMU_THREAD_POOL_H
+#define QEMU_THREAD_POOL_H
+
+#include "block/block.h"
+
+typedef int ThreadPoolFunc(void *opaque);
+
+typedef struct ThreadPool ThreadPool;
+
+ThreadPool *thread_pool_new(struct AioContext *ctx);
+void thread_pool_free(ThreadPool *pool);
+
+BlockAIOCB *thread_pool_submit_aio(ThreadPool *pool,
+ ThreadPoolFunc *func, void *arg,
+ BlockCompletionFunc *cb, void *opaque);
+int coroutine_fn thread_pool_submit_co(ThreadPool *pool,
+ ThreadPoolFunc *func, void *arg);
+void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg);
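+
+/*
+ * Example (illustrative; my_worker is a placeholder name): a worker is
+ * any int (*)(void *) function; from coroutine context the blocking
+ * variant is the simplest:
+ *
+ *     static int my_worker(void *opaque)
+ *     {
+ *         return 0; // do blocking work here
+ *     }
+ *
+ *     int ret = thread_pool_submit_co(pool, my_worker, &data);
+ */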
+
+#endif
diff --git a/include/block/throttle-groups.h b/include/block/throttle-groups.h
new file mode 100644
index 000000000..9541b3243
--- /dev/null
+++ b/include/block/throttle-groups.h
@@ -0,0 +1,91 @@
+/*
+ * QEMU block throttling group infrastructure
+ *
+ * Copyright (C) Nodalink, EURL. 2014
+ * Copyright (C) Igalia, S.L. 2015
+ *
+ * Authors:
+ * BenoƮt Canet <benoit.canet@nodalink.com>
+ * Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef THROTTLE_GROUPS_H
+#define THROTTLE_GROUPS_H
+
+#include "qemu/throttle.h"
+#include "block/block_int.h"
+#include "qom/object.h"
+
+/* The ThrottleGroupMember structure indicates membership in a ThrottleGroup
+ * and holds related data.
+ */
+
+typedef struct ThrottleGroupMember {
+ AioContext *aio_context;
+ /* throttled_reqs_lock protects the CoQueues for throttled requests. */
+ CoMutex throttled_reqs_lock;
+ CoQueue throttled_reqs[2];
+
+ /* Nonzero if the I/O limits are currently being ignored; generally
+ * it is zero. Accessed with atomic operations.
+ */
+ unsigned int io_limits_disabled;
+
+ /* Number of pending throttle_group_restart_queue_entry() coroutines.
+ * Accessed with atomic operations.
+ */
+ unsigned int restart_pending;
+
+ /* The following fields are protected by the ThrottleGroup lock.
+ * See the ThrottleGroup documentation for details.
+ * throttle_state tells us if I/O limits are configured. */
+ ThrottleState *throttle_state;
+ ThrottleTimers throttle_timers;
+ unsigned pending_reqs[2];
+ QLIST_ENTRY(ThrottleGroupMember) round_robin;
+
+} ThrottleGroupMember;
+
+#define TYPE_THROTTLE_GROUP "throttle-group"
+OBJECT_DECLARE_SIMPLE_TYPE(ThrottleGroup, THROTTLE_GROUP)
+
+const char *throttle_group_get_name(ThrottleGroupMember *tgm);
+
+ThrottleState *throttle_group_incref(const char *name);
+void throttle_group_unref(ThrottleState *ts);
+
+void throttle_group_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg);
+void throttle_group_get_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg);
+
+void throttle_group_register_tgm(ThrottleGroupMember *tgm,
+ const char *groupname,
+ AioContext *ctx);
+void throttle_group_unregister_tgm(ThrottleGroupMember *tgm);
+void throttle_group_restart_tgm(ThrottleGroupMember *tgm);
+
+void coroutine_fn throttle_group_co_io_limits_intercept(ThrottleGroupMember *tgm,
+ int64_t bytes,
+ bool is_write);
+void throttle_group_attach_aio_context(ThrottleGroupMember *tgm,
+ AioContext *new_context);
+void throttle_group_detach_aio_context(ThrottleGroupMember *tgm);
+/*
+ * throttle_group_exists() must be called under the global mutex.
+ */
+bool throttle_group_exists(const char *name);
+
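+/*
+ * Example (illustrative sketch): joining a named group and applying a
+ * 100 IOPS cap.  throttle_config_init() and the bucket layout come from
+ * qemu/throttle.h; tgm and ctx are placeholders.
+ *
+ *     ThrottleConfig cfg;
+ *
+ *     throttle_config_init(&cfg);
+ *     cfg.buckets[THROTTLE_OPS_TOTAL].avg = 100;
+ *
+ *     throttle_group_register_tgm(tgm, "group0", ctx);
+ *     throttle_group_config(tgm, &cfg);
+ *     ...
+ *     throttle_group_unregister_tgm(tgm);   // when done
+ */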
+#endif
diff --git a/include/block/write-threshold.h b/include/block/write-threshold.h
new file mode 100644
index 000000000..f50f923e7
--- /dev/null
+++ b/include/block/write-threshold.h
@@ -0,0 +1,47 @@
+/*
+ * QEMU System Emulator block write threshold notification
+ *
+ * Copyright Red Hat, Inc. 2014
+ *
+ * Authors:
+ * Francesco Romani <fromani@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#ifndef BLOCK_WRITE_THRESHOLD_H
+#define BLOCK_WRITE_THRESHOLD_H
+
+#include "qemu/typedefs.h"
+
+/*
+ * bdrv_write_threshold_set:
+ *
+ * Set the write threshold for a block device, in bytes.
+ * An event is emitted when a write exceeds the threshold, signalling
+ * that the device is becoming full, so it can be transparently resized.
+ * Intended for use with thin-provisioned block devices.
+ *
+ * Use threshold_bytes == 0 to disable.
+ */
+void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes);
+
+/*
+ * bdrv_write_threshold_get:
+ *
+ * Get the configured write threshold, in bytes.
+ * Zero means no threshold configured.
+ */
+uint64_t bdrv_write_threshold_get(const BlockDriverState *bs);
+
+/*
+ * bdrv_write_threshold_check_write:
+ *
+ * Check whether the specified request exceeds the write threshold.
+ * If so, send a corresponding event and disable write threshold checking.
+ */
+void bdrv_write_threshold_check_write(BlockDriverState *bs, int64_t offset,
+ int64_t bytes);
+
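+/*
+ * Example (illustrative sketch): arming a 1 GiB threshold and checking
+ * writes against it.  offset/bytes describe the request being written.
+ *
+ *     bdrv_write_threshold_set(bs, 1ULL << 30);   // notify past 1 GiB
+ *
+ *     // in the write path:
+ *     bdrv_write_threshold_check_write(bs, offset, bytes);
+ *
+ *     // zero once the event has fired (the threshold disarms itself),
+ *     // or if no threshold was ever set:
+ *     if (bdrv_write_threshold_get(bs) == 0) { ... }
+ */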
+#endif