Diffstat (limited to 'include/block')
-rw-r--r-- | include/block/accounting.h | 123
-rw-r--r-- | include/block/aio-wait.h | 149
-rw-r--r-- | include/block/aio.h | 770
-rw-r--r-- | include/block/aio_task.h | 54
-rw-r--r-- | include/block/block-copy.h | 96
-rw-r--r-- | include/block/block-hmp-cmds.h | 54
-rw-r--r-- | include/block/block.h | 864
-rw-r--r-- | include/block/block_backup.h | 25
-rw-r--r-- | include/block/block_int.h | 1504
-rw-r--r-- | include/block/blockjob.h | 176
-rw-r--r-- | include/block/blockjob_int.h | 122
-rw-r--r-- | include/block/dirty-bitmap.h | 121
-rw-r--r-- | include/block/export.h | 89
-rw-r--r-- | include/block/fuse.h | 30
-rw-r--r-- | include/block/nbd.h | 427
-rw-r--r-- | include/block/nvme.h | 1501
-rw-r--r-- | include/block/qapi.h | 45
-rw-r--r-- | include/block/qdict.h | 32
-rw-r--r-- | include/block/raw-aio.h | 90
-rw-r--r-- | include/block/replication.h | 175
-rw-r--r-- | include/block/snapshot.h | 102
-rw-r--r-- | include/block/thread-pool.h | 37
-rw-r--r-- | include/block/throttle-groups.h | 91
-rw-r--r-- | include/block/write-threshold.h | 47
24 files changed, 6724 insertions, 0 deletions
diff --git a/include/block/accounting.h b/include/block/accounting.h new file mode 100644 index 000000000..878b4c358 --- /dev/null +++ b/include/block/accounting.h @@ -0,0 +1,123 @@ +/* + * QEMU System Emulator block accounting + * + * Copyright (c) 2011 Christoph Hellwig + * Copyright (c) 2015 Igalia, S.L. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef BLOCK_ACCOUNTING_H +#define BLOCK_ACCOUNTING_H + +#include "qemu/timed-average.h" +#include "qemu/thread.h" +#include "qapi/qapi-builtin-types.h" + +typedef struct BlockAcctTimedStats BlockAcctTimedStats; +typedef struct BlockAcctStats BlockAcctStats; + +enum BlockAcctType { + BLOCK_ACCT_NONE = 0, + BLOCK_ACCT_READ, + BLOCK_ACCT_WRITE, + BLOCK_ACCT_FLUSH, + BLOCK_ACCT_UNMAP, + BLOCK_MAX_IOTYPE, +}; + +struct BlockAcctTimedStats { + BlockAcctStats *stats; + TimedAverage latency[BLOCK_MAX_IOTYPE]; + unsigned interval_length; /* in seconds */ + QSLIST_ENTRY(BlockAcctTimedStats) entries; +}; + +typedef struct BlockLatencyHistogram { + /* The following histogram is represented like this: + * + * 5| * + * 4| * + * 3| * * + * 2| * * * + * 1| * * * * + * +------------------ + * 10 50 100 + * + * BlockLatencyHistogram histogram = { + * .nbins = 4, + * .boundaries = {10, 50, 100}, + * .bins = {3, 1, 5, 2}, + * }; + * + * @boundaries array define histogram intervals as follows: + * [0, boundaries[0]), [boundaries[0], boundaries[1]), ... 
+ * [boundaries[nbins-2], +inf) + * + * So, for example above, histogram intervals are: + * [0, 10), [10, 50), [50, 100), [100, +inf) + */ + int nbins; + uint64_t *boundaries; /* @nbins-1 numbers here + (all boundaries, except 0 and +inf) */ + uint64_t *bins; +} BlockLatencyHistogram; + +struct BlockAcctStats { + QemuMutex lock; + uint64_t nr_bytes[BLOCK_MAX_IOTYPE]; + uint64_t nr_ops[BLOCK_MAX_IOTYPE]; + uint64_t invalid_ops[BLOCK_MAX_IOTYPE]; + uint64_t failed_ops[BLOCK_MAX_IOTYPE]; + uint64_t total_time_ns[BLOCK_MAX_IOTYPE]; + uint64_t merged[BLOCK_MAX_IOTYPE]; + int64_t last_access_time_ns; + QSLIST_HEAD(, BlockAcctTimedStats) intervals; + bool account_invalid; + bool account_failed; + BlockLatencyHistogram latency_histogram[BLOCK_MAX_IOTYPE]; +}; + +typedef struct BlockAcctCookie { + int64_t bytes; + int64_t start_time_ns; + enum BlockAcctType type; +} BlockAcctCookie; + +void block_acct_init(BlockAcctStats *stats); +void block_acct_setup(BlockAcctStats *stats, bool account_invalid, + bool account_failed); +void block_acct_cleanup(BlockAcctStats *stats); +void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length); +BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats, + BlockAcctTimedStats *s); +void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie, + int64_t bytes, enum BlockAcctType type); +void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie); +void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie); +void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type); +void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type, + int num_requests); +int64_t block_acct_idle_time_ns(BlockAcctStats *stats); +double block_acct_queue_depth(BlockAcctTimedStats *stats, + enum BlockAcctType type); +int block_latency_histogram_set(BlockAcctStats *stats, enum BlockAcctType type, + uint64List *boundaries); +void block_latency_histograms_clear(BlockAcctStats *stats); + +#endif diff --git a/include/block/aio-wait.h b/include/block/aio-wait.h new file mode 100644 index 000000000..b39eefb38 --- /dev/null +++ b/include/block/aio-wait.h @@ -0,0 +1,149 @@ +/* + * AioContext wait support + * + * Copyright (C) 2018 Red Hat, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef QEMU_AIO_WAIT_H +#define QEMU_AIO_WAIT_H + +#include "block/aio.h" +#include "qemu/main-loop.h" + +/** + * AioWait: + * + * An object that facilitates synchronous waiting on a condition. 
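The accounting functions declared above are used in start/finish pairs around each request. A minimal sketch of the intended pattern (not part of the patch; `stats`, `req`, and `submit_write()` are assumed names):

    BlockAcctCookie cookie;

    block_acct_start(stats, &cookie, req->bytes, BLOCK_ACCT_WRITE);
    ret = submit_write(req);                  /* hypothetical I/O helper */
    if (ret < 0) {
        block_acct_failed(stats, &cookie);    /* counted in failed_ops */
    } else {
        block_acct_done(stats, &cookie);      /* latency measured from start */
    }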
A single + * global AioWait object (global_aio_wait) is used internally. + * + * The main loop can wait on an operation running in an IOThread as follows: + * + * AioContext *ctx = ...; + * MyWork work = { .done = false }; + * schedule_my_work_in_iothread(ctx, &work); + * AIO_WAIT_WHILE(ctx, !work.done); + * + * The IOThread must call aio_wait_kick() to notify the main loop when + * work.done changes: + * + * static void do_work(...) + * { + * ... + * work.done = true; + * aio_wait_kick(); + * } + */ +typedef struct { + /* Number of waiting AIO_WAIT_WHILE() callers. Accessed with atomic ops. */ + unsigned num_waiters; +} AioWait; + +extern AioWait global_aio_wait; + +/** + * AIO_WAIT_WHILE: + * @ctx: the aio context, or NULL if multiple aio contexts (for which the + * caller does not hold a lock) are involved in the polling condition. + * @cond: wait while this conditional expression is true + * + * Wait while a condition is true. Use this to implement synchronous + * operations that require event loop activity. + * + * The caller must be sure that something calls aio_wait_kick() when the value + * of @cond might have changed. + * + * The caller's thread must be the IOThread that owns @ctx or the main loop + * thread (with @ctx acquired exactly once). This function cannot be used to + * wait on conditions between two IOThreads since that could lead to deadlock, + * go via the main loop instead. + */ +#define AIO_WAIT_WHILE(ctx, cond) ({ \ + bool waited_ = false; \ + AioWait *wait_ = &global_aio_wait; \ + AioContext *ctx_ = (ctx); \ + /* Increment wait_->num_waiters before evaluating cond. */ \ + qatomic_inc(&wait_->num_waiters); \ + if (ctx_ && in_aio_context_home_thread(ctx_)) { \ + while ((cond)) { \ + aio_poll(ctx_, true); \ + waited_ = true; \ + } \ + } else { \ + assert(qemu_get_current_aio_context() == \ + qemu_get_aio_context()); \ + while ((cond)) { \ + if (ctx_) { \ + aio_context_release(ctx_); \ + } \ + aio_poll(qemu_get_aio_context(), true); \ + if (ctx_) { \ + aio_context_acquire(ctx_); \ + } \ + waited_ = true; \ + } \ + } \ + qatomic_dec(&wait_->num_waiters); \ + waited_; }) + +/** + * aio_wait_kick: + * Wake up the main thread if it is waiting on AIO_WAIT_WHILE(). During + * synchronous operations performed in an IOThread, the main thread lets the + * IOThread's event loop run, waiting for the operation to complete. A + * aio_wait_kick() call will wake up the main thread. + */ +void aio_wait_kick(void); + +/** + * aio_wait_bh_oneshot: + * @ctx: the aio context + * @cb: the BH callback function + * @opaque: user data for the BH callback function + * + * Run a BH in @ctx and wait for it to complete. + * + * Must be called from the main loop thread with @ctx acquired exactly once. + * Note that main loop event processing may occur. + */ +void aio_wait_bh_oneshot(AioContext *ctx, QEMUBHFunc *cb, void *opaque); + +/** + * in_aio_context_home_thread: + * @ctx: the aio context + * + * Return whether we are running in the thread that normally runs @ctx. Note + * that acquiring/releasing ctx does not affect the outcome, each AioContext + * still only has one home thread that is responsible for running it. 
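aio_wait_bh_oneshot(), declared above, bundles this wait pattern for the common case of running one function in another context and waiting for it to finish. A hedged sketch; MyDevice and do_attach_bh() are invented for illustration:

    static void do_attach_bh(void *opaque)
    {
        MyDevice *s = opaque;
        s->attached = true;    /* runs in ctx's home thread */
    }

    /* from the main loop thread, with ctx acquired exactly once */
    aio_wait_bh_oneshot(ctx, do_attach_bh, s);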
+ */ +static inline bool in_aio_context_home_thread(AioContext *ctx) +{ + if (ctx == qemu_get_current_aio_context()) { + return true; + } + + if (ctx == qemu_get_aio_context()) { + return qemu_mutex_iothread_locked(); + } else { + return false; + } +} + +#endif /* QEMU_AIO_WAIT_H */ diff --git a/include/block/aio.h b/include/block/aio.h new file mode 100644 index 000000000..47fbe9d81 --- /dev/null +++ b/include/block/aio.h @@ -0,0 +1,770 @@ +/* + * QEMU aio implementation + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef QEMU_AIO_H +#define QEMU_AIO_H + +#ifdef CONFIG_LINUX_IO_URING +#include <liburing.h> +#endif +#include "qemu/coroutine.h" +#include "qemu/queue.h" +#include "qemu/event_notifier.h" +#include "qemu/thread.h" +#include "qemu/timer.h" + +typedef struct BlockAIOCB BlockAIOCB; +typedef void BlockCompletionFunc(void *opaque, int ret); + +typedef struct AIOCBInfo { + void (*cancel_async)(BlockAIOCB *acb); + AioContext *(*get_aio_context)(BlockAIOCB *acb); + size_t aiocb_size; +} AIOCBInfo; + +struct BlockAIOCB { + const AIOCBInfo *aiocb_info; + BlockDriverState *bs; + BlockCompletionFunc *cb; + void *opaque; + int refcnt; +}; + +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque); +void qemu_aio_unref(void *p); +void qemu_aio_ref(void *p); + +typedef struct AioHandler AioHandler; +typedef QLIST_HEAD(, AioHandler) AioHandlerList; +typedef void QEMUBHFunc(void *opaque); +typedef bool AioPollFn(void *opaque); +typedef void IOHandler(void *opaque); + +struct Coroutine; +struct ThreadPool; +struct LinuxAioState; +struct LuringState; + +/* Is polling disabled? */ +bool aio_poll_disabled(AioContext *ctx); + +/* Callbacks for file descriptor monitoring implementations */ +typedef struct { + /* + * update: + * @ctx: the AioContext + * @old_node: the existing handler or NULL if this file descriptor is being + * monitored for the first time + * @new_node: the new handler or NULL if this file descriptor is being + * removed + * + * Add/remove/modify a monitored file descriptor. + * + * Called with ctx->list_lock acquired. + */ + void (*update)(AioContext *ctx, AioHandler *old_node, AioHandler *new_node); + + /* + * wait: + * @ctx: the AioContext + * @ready_list: list for handlers that become ready + * @timeout: maximum duration to wait, in nanoseconds + * + * Wait for file descriptors to become ready and place them on ready_list. + * + * Called with ctx->list_lock incremented but not locked. + * + * Returns: number of ready file descriptors. + */ + int (*wait)(AioContext *ctx, AioHandlerList *ready_list, int64_t timeout); + + /* + * need_wait: + * @ctx: the AioContext + * + * Tell aio_poll() when to stop userspace polling early because ->wait() + * has fds ready. + * + * File descriptor monitoring implementations that cannot poll fd readiness + * from userspace should use aio_poll_disabled() here. This ensures that + * file descriptors are not starved by handlers that frequently make + * progress via userspace polling. + * + * Returns: true if ->wait() should be called, false otherwise. + */ + bool (*need_wait)(AioContext *ctx); +} FDMonOps; + +/* + * Each aio_bh_poll() call carves off a slice of the BH list, so that newly + * scheduled BHs are not processed until the next aio_bh_poll() call. 
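Drivers typically define a static AIOCBInfo and allocate their per-request AIOCB with qemu_aio_get(), embedding BlockAIOCB as the first member so the allocation can be cast both ways. A sketch under assumed names (MyAIOCB is invented):

    typedef struct MyAIOCB {
        BlockAIOCB common;     /* must be first: qemu_aio_get() returns it */
        int my_state;
    } MyAIOCB;

    static const AIOCBInfo my_aiocb_info = {
        .aiocb_size = sizeof(MyAIOCB),
    };

    MyAIOCB *acb = qemu_aio_get(&my_aiocb_info, bs, cb, opaque);
    /* ... submit the request; on completion: */
    acb->common.cb(acb->common.opaque, 0);
    qemu_aio_unref(acb);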
All + * active aio_bh_poll() calls chain their slices together in a list, so that + * nested aio_bh_poll() calls process all scheduled bottom halves. + */ +typedef QSLIST_HEAD(, QEMUBH) BHList; +typedef struct BHListSlice BHListSlice; +struct BHListSlice { + BHList bh_list; + QSIMPLEQ_ENTRY(BHListSlice) next; +}; + +typedef QSLIST_HEAD(, AioHandler) AioHandlerSList; + +struct AioContext { + GSource source; + + /* Used by AioContext users to protect from multi-threaded access. */ + QemuRecMutex lock; + + /* The list of registered AIO handlers. Protected by ctx->list_lock. */ + AioHandlerList aio_handlers; + + /* The list of AIO handlers to be deleted. Protected by ctx->list_lock. */ + AioHandlerList deleted_aio_handlers; + + /* Used to avoid unnecessary event_notifier_set calls in aio_notify; + * only written from the AioContext home thread, or under the BQL in + * the case of the main AioContext. However, it is read from any + * thread so it is still accessed with atomic primitives. + * + * If this field is 0, everything (file descriptors, bottom halves, + * timers) will be re-evaluated before the next blocking poll() or + * io_uring wait; therefore, the event_notifier_set call can be + * skipped. If it is non-zero, you may need to wake up a concurrent + * aio_poll or the glib main event loop, making event_notifier_set + * necessary. + * + * Bit 0 is reserved for GSource usage of the AioContext, and is 1 + * between a call to aio_ctx_prepare and the next call to aio_ctx_check. + * Bits 1-31 simply count the number of active calls to aio_poll + * that are in the prepare or poll phase. + * + * The GSource and aio_poll must use a different mechanism because + * there is no certainty that a call to GSource's prepare callback + * (via g_main_context_prepare) is indeed followed by check and + * dispatch. It's not clear whether this would be a bug, but let's + * play safe and allow it---it will just cause extra calls to + * event_notifier_set until the next call to dispatch. + * + * Instead, the aio_poll calls include both the prepare and the + * dispatch phase, hence a simple counter is enough for them. + */ + uint32_t notify_me; + + /* A lock to protect between QEMUBH and AioHandler adders and deleter, + * and to ensure that no callbacks are removed while we're walking and + * dispatching them. + */ + QemuLockCnt list_lock; + + /* Bottom Halves pending aio_bh_poll() processing */ + BHList bh_list; + + /* Chained BH list slices for each nested aio_bh_poll() call */ + QSIMPLEQ_HEAD(, BHListSlice) bh_slice_list; + + /* Used by aio_notify. + * + * "notified" is used to avoid expensive event_notifier_test_and_clear + * calls. When it is clear, the EventNotifier is clear, or one thread + * is going to clear "notified" before processing more events. False + * positives are possible, i.e. "notified" could be set even though the + * EventNotifier is clear. + * + * Note that event_notifier_set *cannot* be optimized the same way. For + * more information on the problem that would result, see "#ifdef BUG2" + * in the docs/aio_notify_accept.promela formal model. + */ + bool notified; + EventNotifier notifier; + + QSLIST_HEAD(, Coroutine) scheduled_coroutines; + QEMUBH *co_schedule_bh; + + /* Thread pool for performing work and receiving completion callbacks. + * Has its own locking. + */ + struct ThreadPool *thread_pool; + +#ifdef CONFIG_LINUX_AIO + /* + * State for native Linux AIO. Uses aio_context_acquire/release for + * locking. 
+ */ + struct LinuxAioState *linux_aio; +#endif +#ifdef CONFIG_LINUX_IO_URING + /* + * State for Linux io_uring. Uses aio_context_acquire/release for + * locking. + */ + struct LuringState *linux_io_uring; + + /* State for file descriptor monitoring using Linux io_uring */ + struct io_uring fdmon_io_uring; + AioHandlerSList submit_list; +#endif + + /* TimerLists for calling timers - one per clock type. Has its own + * locking. + */ + QEMUTimerListGroup tlg; + + int external_disable_cnt; + + /* Number of AioHandlers without .io_poll() */ + int poll_disable_cnt; + + /* Polling mode parameters */ + int64_t poll_ns; /* current polling time in nanoseconds */ + int64_t poll_max_ns; /* maximum polling time in nanoseconds */ + int64_t poll_grow; /* polling time growth factor */ + int64_t poll_shrink; /* polling time shrink factor */ + + /* AIO engine parameters */ + int64_t aio_max_batch; /* maximum number of requests in a batch */ + + /* + * List of handlers participating in userspace polling. Protected by + * ctx->list_lock. Iterated and modified mostly by the event loop thread + * from aio_poll() with ctx->list_lock incremented. aio_set_fd_handler() + * only touches the list to delete nodes if ctx->list_lock's count is zero. + */ + AioHandlerList poll_aio_handlers; + + /* Are we in polling mode or monitoring file descriptors? */ + bool poll_started; + + /* epoll(7) state used when built with CONFIG_EPOLL */ + int epollfd; + + const FDMonOps *fdmon_ops; +}; + +/** + * aio_context_new: Allocate a new AioContext. + * + * AioContext provide a mini event-loop that can be waited on synchronously. + * They also provide bottom halves, a service to execute a piece of code + * as soon as possible. + */ +AioContext *aio_context_new(Error **errp); + +/** + * aio_context_ref: + * @ctx: The AioContext to operate on. + * + * Add a reference to an AioContext. + */ +void aio_context_ref(AioContext *ctx); + +/** + * aio_context_unref: + * @ctx: The AioContext to operate on. + * + * Drop a reference to an AioContext. + */ +void aio_context_unref(AioContext *ctx); + +/* Take ownership of the AioContext. If the AioContext will be shared between + * threads, and a thread does not want to be interrupted, it will have to + * take ownership around calls to aio_poll(). Otherwise, aio_poll() + * automatically takes care of calling aio_context_acquire and + * aio_context_release. + * + * Note that this is separate from bdrv_drained_begin/bdrv_drained_end. A + * thread still has to call those to avoid being interrupted by the guest. + * + * Bottom halves, timers and callbacks can be created or removed without + * acquiring the AioContext. + */ +void aio_context_acquire(AioContext *ctx); + +/* Relinquish ownership of the AioContext. */ +void aio_context_release(AioContext *ctx); + +/** + * aio_bh_schedule_oneshot_full: Allocate a new bottom half structure that will + * run only once and as soon as possible. + * + * @name: A human-readable identifier for debugging purposes. + */ +void aio_bh_schedule_oneshot_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque, + const char *name); + +/** + * aio_bh_schedule_oneshot: Allocate a new bottom half structure that will run + * only once and as soon as possible. + * + * A convenience wrapper for aio_bh_schedule_oneshot_full() that uses cb as the + * name string. + */ +#define aio_bh_schedule_oneshot(ctx, cb, opaque) \ + aio_bh_schedule_oneshot_full((ctx), (cb), (opaque), (stringify(cb))) + +/** + * aio_bh_new_full: Allocate a new bottom half structure. 
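The aio_context_acquire()/aio_context_release() pair documented above behaves like a recursive mutex around the context. A minimal hedged sketch; bdrv_get_aio_context() is declared later in block.h, outside the portion shown here:

    AioContext *ctx = bdrv_get_aio_context(bs);

    aio_context_acquire(ctx);
    /* safe to operate on bs here, even from another thread */
    ret = bdrv_flush(bs);
    aio_context_release(ctx);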
+ * + * Bottom halves are lightweight callbacks whose invocation is guaranteed + * to be wait-free, thread-safe and signal-safe. The #QEMUBH structure + * is opaque and must be allocated prior to its use. + * + * @name: A human-readable identifier for debugging purposes. + */ +QEMUBH *aio_bh_new_full(AioContext *ctx, QEMUBHFunc *cb, void *opaque, + const char *name); + +/** + * aio_bh_new: Allocate a new bottom half structure + * + * A convenience wrapper for aio_bh_new_full() that uses the cb as the name + * string. + */ +#define aio_bh_new(ctx, cb, opaque) \ + aio_bh_new_full((ctx), (cb), (opaque), (stringify(cb))) + +/** + * aio_notify: Force processing of pending events. + * + * Similar to signaling a condition variable, aio_notify forces + * aio_poll to exit, so that the next call will re-examine pending events. + * The caller of aio_notify will usually call aio_poll again very soon, + * or go through another iteration of the GLib main loop. Hence, aio_notify + * also has the side effect of recalculating the sets of file descriptors + * that the main loop waits for. + * + * Calling aio_notify is rarely necessary, because for example scheduling + * a bottom half calls it already. + */ +void aio_notify(AioContext *ctx); + +/** + * aio_notify_accept: Acknowledge receiving an aio_notify. + * + * aio_notify() uses an EventNotifier in order to wake up a sleeping + * aio_poll() or g_main_context_iteration(). Calls to aio_notify() are + * usually rare, but the AioContext has to clear the EventNotifier on + * every aio_poll() or g_main_context_iteration() in order to avoid + * busy waiting. This event_notifier_test_and_clear() cannot be done + * using the usual aio_context_set_event_notifier(), because it must + * be done before processing all events (file descriptors, bottom halves, + * timers). + * + * aio_notify_accept() is an optimized event_notifier_test_and_clear() + * that is specific to an AioContext's notifier; it is used internally + * to clear the EventNotifier only if aio_notify() had been called. + */ +void aio_notify_accept(AioContext *ctx); + +/** + * aio_bh_call: Executes callback function of the specified BH. + */ +void aio_bh_call(QEMUBH *bh); + +/** + * aio_bh_poll: Poll bottom halves for an AioContext. + * + * These are internal functions used by the QEMU main loop. + * And notice that multiple occurrences of aio_bh_poll cannot + * be called concurrently + */ +int aio_bh_poll(AioContext *ctx); + +/** + * qemu_bh_schedule: Schedule a bottom half. + * + * Scheduling a bottom half interrupts the main loop and causes the + * execution of the callback that was passed to qemu_bh_new. + * + * Bottom halves that are scheduled from a bottom half handler are instantly + * invoked. This can create an infinite loop if a bottom half handler + * schedules itself. + * + * @bh: The bottom half to be scheduled. + */ +void qemu_bh_schedule(QEMUBH *bh); + +/** + * qemu_bh_cancel: Cancel execution of a bottom half. + * + * Canceling execution of a bottom half undoes the effect of calls to + * qemu_bh_schedule without freeing its resources yet. While cancellation + * itself is also wait-free and thread-safe, it can of course race with the + * loop that executes bottom halves unless you are holding the iothread + * mutex. This makes it mostly useless if you are not holding the mutex. + * + * @bh: The bottom half to be canceled. + */ +void qemu_bh_cancel(QEMUBH *bh); + +/** + *qemu_bh_delete: Cancel execution of a bottom half and free its resources. 
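Taken together, the BH functions above (aio_bh_new(), qemu_bh_schedule(), qemu_bh_cancel(), qemu_bh_delete()) follow a create/schedule/delete lifecycle. A sketch with invented names:

    static void my_bh_cb(void *opaque)
    {
        MyState *s = opaque;
        s->pending = false;
    }

    QEMUBH *bh = aio_bh_new(ctx, my_bh_cb, s);
    qemu_bh_schedule(bh);   /* wakes the event loop; cb runs in ctx's thread */
    /* ... */
    qemu_bh_delete(bh);     /* cancels if still pending, then frees */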
+ *
+ * Deleting a bottom half frees the memory that was allocated for it by
+ * qemu_bh_new. It also implies canceling the bottom half if it was
+ * scheduled.
+ * This function is asynchronous: the bottom half performs the actual
+ * deletion at the very end of its processing.
+ *
+ * @bh: The bottom half to be deleted.
+ */
+void qemu_bh_delete(QEMUBH *bh);
+
+/* Return whether there are any pending callbacks from the GSource
+ * attached to the AioContext, before g_poll is invoked.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_prepare(AioContext *ctx);
+
+/* Return whether there are any pending callbacks from the GSource
+ * attached to the AioContext, after g_poll is invoked.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+bool aio_pending(AioContext *ctx);
+
+/* Dispatch any pending callbacks from the GSource attached to the AioContext.
+ *
+ * This is used internally in the implementation of the GSource.
+ */
+void aio_dispatch(AioContext *ctx);
+
+/* Make progress in completing AIO work. This can issue new pending
+ * aio as a result of executing I/O completion or bh callbacks.
+ *
+ * Return whether any progress was made by executing AIO or bottom half
+ * handlers. If @blocking == true, this should always be true except
+ * if someone called aio_notify.
+ *
+ * If there are no pending bottom halves, but there are pending AIO
+ * operations, it may not be possible to make any progress without
+ * blocking. If @blocking is true, this function will wait until one
+ * or more AIO events have completed, to ensure something has moved
+ * before returning.
+ */
+bool aio_poll(AioContext *ctx, bool blocking);
+
+/* Register a file descriptor and associated callbacks. Behaves very similarly
+ * to qemu_set_fd_handler. Unlike qemu_set_fd_handler, these callbacks will
+ * be invoked when using aio_poll().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of qemu_set_fd_handler[2].
+ */
+void aio_set_fd_handler(AioContext *ctx,
+                        int fd,
+                        bool is_external,
+                        IOHandler *io_read,
+                        IOHandler *io_write,
+                        AioPollFn *io_poll,
+                        void *opaque);
+
+/* Set polling begin/end callbacks for a file descriptor that has already been
+ * registered with aio_set_fd_handler. Do nothing if the file descriptor is
+ * not registered.
+ */
+void aio_set_fd_poll(AioContext *ctx, int fd,
+                     IOHandler *io_poll_begin,
+                     IOHandler *io_poll_end);
+
+/* Register an event notifier and associated callbacks. Behaves very similarly
+ * to event_notifier_set_handler. Unlike event_notifier_set_handler, these
+ * callbacks will be invoked when using aio_poll().
+ *
+ * Code that invokes AIO completion functions should rely on this function
+ * instead of event_notifier_set_handler.
+ */
+void aio_set_event_notifier(AioContext *ctx,
+                            EventNotifier *notifier,
+                            bool is_external,
+                            EventNotifierHandler *io_read,
+                            AioPollFn *io_poll);
+
+/* Set polling begin/end callbacks for an event notifier that has already been
+ * registered with aio_set_event_notifier. Do nothing if the event notifier is
+ * not registered.
+ */
+void aio_set_event_notifier_poll(AioContext *ctx,
+                                 EventNotifier *notifier,
+                                 EventNotifierHandler *io_poll_begin,
+                                 EventNotifierHandler *io_poll_end);
+
+/* Return a GSource that lets the main loop poll the file descriptors attached
+ * to this AioContext.
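aio_set_fd_handler() above is the AioContext-aware replacement for qemu_set_fd_handler(). A hedged sketch that registers a read handler for a socket and later removes it; MyConn and my_read_ready() are invented:

    static void my_read_ready(void *opaque)
    {
        MyConn *conn = opaque;
        /* read from conn->fd; must not block */
    }

    aio_set_fd_handler(ctx, conn->fd, true /* is_external */,
                       my_read_ready, NULL, NULL, conn);
    /* ... later, passing NULL for all handlers unregisters the fd */
    aio_set_fd_handler(ctx, conn->fd, true, NULL, NULL, NULL, NULL);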
+ */ +GSource *aio_get_g_source(AioContext *ctx); + +/* Return the ThreadPool bound to this AioContext */ +struct ThreadPool *aio_get_thread_pool(AioContext *ctx); + +/* Setup the LinuxAioState bound to this AioContext */ +struct LinuxAioState *aio_setup_linux_aio(AioContext *ctx, Error **errp); + +/* Return the LinuxAioState bound to this AioContext */ +struct LinuxAioState *aio_get_linux_aio(AioContext *ctx); + +/* Setup the LuringState bound to this AioContext */ +struct LuringState *aio_setup_linux_io_uring(AioContext *ctx, Error **errp); + +/* Return the LuringState bound to this AioContext */ +struct LuringState *aio_get_linux_io_uring(AioContext *ctx); +/** + * aio_timer_new_with_attrs: + * @ctx: the aio context + * @type: the clock type + * @scale: the scale + * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values + * to assign + * @cb: the callback to call on timer expiry + * @opaque: the opaque pointer to pass to the callback + * + * Allocate a new timer (with attributes) attached to the context @ctx. + * The function is responsible for memory allocation. + * + * The preferred interface is aio_timer_init or aio_timer_init_with_attrs. + * Use that unless you really need dynamic memory allocation. + * + * Returns: a pointer to the new timer + */ +static inline QEMUTimer *aio_timer_new_with_attrs(AioContext *ctx, + QEMUClockType type, + int scale, int attributes, + QEMUTimerCB *cb, void *opaque) +{ + return timer_new_full(&ctx->tlg, type, scale, attributes, cb, opaque); +} + +/** + * aio_timer_new: + * @ctx: the aio context + * @type: the clock type + * @scale: the scale + * @cb: the callback to call on timer expiry + * @opaque: the opaque pointer to pass to the callback + * + * Allocate a new timer attached to the context @ctx. + * See aio_timer_new_with_attrs for details. + * + * Returns: a pointer to the new timer + */ +static inline QEMUTimer *aio_timer_new(AioContext *ctx, QEMUClockType type, + int scale, + QEMUTimerCB *cb, void *opaque) +{ + return timer_new_full(&ctx->tlg, type, scale, 0, cb, opaque); +} + +/** + * aio_timer_init_with_attrs: + * @ctx: the aio context + * @ts: the timer + * @type: the clock type + * @scale: the scale + * @attributes: 0, or one to multiple OR'ed QEMU_TIMER_ATTR_<id> values + * to assign + * @cb: the callback to call on timer expiry + * @opaque: the opaque pointer to pass to the callback + * + * Initialise a new timer (with attributes) attached to the context @ctx. + * The caller is responsible for memory allocation. + */ +static inline void aio_timer_init_with_attrs(AioContext *ctx, + QEMUTimer *ts, QEMUClockType type, + int scale, int attributes, + QEMUTimerCB *cb, void *opaque) +{ + timer_init_full(ts, &ctx->tlg, type, scale, attributes, cb, opaque); +} + +/** + * aio_timer_init: + * @ctx: the aio context + * @ts: the timer + * @type: the clock type + * @scale: the scale + * @cb: the callback to call on timer expiry + * @opaque: the opaque pointer to pass to the callback + * + * Initialise a new timer attached to the context @ctx. + * See aio_timer_init_with_attrs for details. + */ +static inline void aio_timer_init(AioContext *ctx, + QEMUTimer *ts, QEMUClockType type, + int scale, + QEMUTimerCB *cb, void *opaque) +{ + timer_init_full(ts, &ctx->tlg, type, scale, 0, cb, opaque); +} + +/** + * aio_compute_timeout: + * @ctx: the aio context + * + * Compute the timeout that a blocking aio_poll should use. 
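The timer helpers above attach a QEMUTimer to @ctx's timer list group. A sketch of the preferred caller-allocated form; timer_mod() and qemu_clock_get_ms() come from qemu/timer.h, and the other names are illustrative:

    static void my_timeout_cb(void *opaque)
    {
        MyState *s = opaque;
        s->expired = true;
    }

    QEMUTimer timer;    /* caller-allocated, per the aio_timer_init contract */

    aio_timer_init(ctx, &timer, QEMU_CLOCK_REALTIME, SCALE_MS,
                   my_timeout_cb, s);
    timer_mod(&timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + 100);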
+ */ +int64_t aio_compute_timeout(AioContext *ctx); + +/** + * aio_disable_external: + * @ctx: the aio context + * + * Disable the further processing of external clients. + */ +static inline void aio_disable_external(AioContext *ctx) +{ + qatomic_inc(&ctx->external_disable_cnt); +} + +/** + * aio_enable_external: + * @ctx: the aio context + * + * Enable the processing of external clients. + */ +static inline void aio_enable_external(AioContext *ctx) +{ + int old; + + old = qatomic_fetch_dec(&ctx->external_disable_cnt); + assert(old > 0); + if (old == 1) { + /* Kick event loop so it re-arms file descriptors */ + aio_notify(ctx); + } +} + +/** + * aio_external_disabled: + * @ctx: the aio context + * + * Return true if the external clients are disabled. + */ +static inline bool aio_external_disabled(AioContext *ctx) +{ + return qatomic_read(&ctx->external_disable_cnt); +} + +/** + * aio_node_check: + * @ctx: the aio context + * @is_external: Whether or not the checked node is an external event source. + * + * Check if the node's is_external flag is okay to be polled by the ctx at this + * moment. True means green light. + */ +static inline bool aio_node_check(AioContext *ctx, bool is_external) +{ + return !is_external || !qatomic_read(&ctx->external_disable_cnt); +} + +/** + * aio_co_schedule: + * @ctx: the aio context + * @co: the coroutine + * + * Start a coroutine on a remote AioContext. + * + * The coroutine must not be entered by anyone else while aio_co_schedule() + * is active. In addition the coroutine must have yielded unless ctx + * is the context in which the coroutine is running (i.e. the value of + * qemu_get_current_aio_context() from the coroutine itself). + */ +void aio_co_schedule(AioContext *ctx, struct Coroutine *co); + +/** + * aio_co_reschedule_self: + * @new_ctx: the new context + * + * Move the currently running coroutine to new_ctx. If the coroutine is already + * running in new_ctx, do nothing. + */ +void coroutine_fn aio_co_reschedule_self(AioContext *new_ctx); + +/** + * aio_co_wake: + * @co: the coroutine + * + * Restart a coroutine on the AioContext where it was running last, thus + * preventing coroutines from jumping from one context to another when they + * go to sleep. + * + * aio_co_wake may be executed either in coroutine or non-coroutine + * context. The coroutine must not be entered by anyone else while + * aio_co_wake() is active. + */ +void aio_co_wake(struct Coroutine *co); + +/** + * aio_co_enter: + * @ctx: the context to run the coroutine + * @co: the coroutine to run + * + * Enter a coroutine in the specified AioContext. + */ +void aio_co_enter(AioContext *ctx, struct Coroutine *co); + +/** + * Return the AioContext whose event loop runs in the current thread. + * + * If called from an IOThread this will be the IOThread's AioContext. If + * called from the main thread or with the "big QEMU lock" taken it + * will be the main loop AioContext. + */ +AioContext *qemu_get_current_aio_context(void); + +void qemu_set_current_aio_context(AioContext *ctx); + +/** + * aio_context_setup: + * @ctx: the aio context + * + * Initialize the aio context. + */ +void aio_context_setup(AioContext *ctx); + +/** + * aio_context_destroy: + * @ctx: the aio context + * + * Destroy the aio context. 
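aio_co_wake() and aio_co_schedule() above are the standard way to resume a coroutine from another thread. A hedged sketch of a coroutine parking itself until woken; s->co is an assumed Coroutine pointer field:

    /* in the coroutine */
    s->co = qemu_coroutine_self();
    qemu_coroutine_yield();          /* sleep until the wake below */

    /* elsewhere, once s->co has yielded */
    aio_co_wake(s->co);              /* resumes co in its last AioContext */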
+ */ +void aio_context_destroy(AioContext *ctx); + +/* Used internally, do not call outside AioContext code */ +void aio_context_use_g_source(AioContext *ctx); + +/** + * aio_context_set_poll_params: + * @ctx: the aio context + * @max_ns: how long to busy poll for, in nanoseconds + * @grow: polling time growth factor + * @shrink: polling time shrink factor + * + * Poll mode can be disabled by setting poll_max_ns to 0. + */ +void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns, + int64_t grow, int64_t shrink, + Error **errp); + +/** + * aio_context_set_aio_params: + * @ctx: the aio context + * @max_batch: maximum number of requests in a batch, 0 means that the + * engine will use its default + */ +void aio_context_set_aio_params(AioContext *ctx, int64_t max_batch, + Error **errp); + +#endif diff --git a/include/block/aio_task.h b/include/block/aio_task.h new file mode 100644 index 000000000..50bc1e181 --- /dev/null +++ b/include/block/aio_task.h @@ -0,0 +1,54 @@ +/* + * Aio tasks loops + * + * Copyright (c) 2019 Virtuozzo International GmbH. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef BLOCK_AIO_TASK_H +#define BLOCK_AIO_TASK_H + +#include "qemu/coroutine.h" + +typedef struct AioTaskPool AioTaskPool; +typedef struct AioTask AioTask; +typedef int coroutine_fn (*AioTaskFunc)(AioTask *task); +struct AioTask { + AioTaskPool *pool; + AioTaskFunc func; + int ret; +}; + +AioTaskPool *coroutine_fn aio_task_pool_new(int max_busy_tasks); +void aio_task_pool_free(AioTaskPool *); + +/* error code of failed task or 0 if all is OK */ +int aio_task_pool_status(AioTaskPool *pool); + +bool aio_task_pool_empty(AioTaskPool *pool); + +/* User provides filled @task, however task->pool will be set automatically */ +void coroutine_fn aio_task_pool_start_task(AioTaskPool *pool, AioTask *task); + +void coroutine_fn aio_task_pool_wait_slot(AioTaskPool *pool); +void coroutine_fn aio_task_pool_wait_one(AioTaskPool *pool); +void coroutine_fn aio_task_pool_wait_all(AioTaskPool *pool); + +#endif /* BLOCK_AIO_TASK_H */ diff --git a/include/block/block-copy.h b/include/block/block-copy.h new file mode 100644 index 000000000..99370fa38 --- /dev/null +++ b/include/block/block-copy.h @@ -0,0 +1,96 @@ +/* + * block_copy API + * + * Copyright (C) 2013 Proxmox Server Solutions + * Copyright (c) 2019 Virtuozzo International GmbH. 
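The aio_task_pool API in aio_task.h above caps the number of concurrently running coroutine tasks. A sketch of the intended producer loop, running in coroutine context; MyTask, make_task(), have_work(), and do_one_chunk() are invented:

    typedef struct MyTask {
        AioTask base;                /* base.func filled in by the caller */
        int64_t offset;
    } MyTask;

    static int coroutine_fn my_task_func(AioTask *task)
    {
        MyTask *t = container_of(task, MyTask, base);
        return do_one_chunk(t->offset);
    }

    AioTaskPool *pool = aio_task_pool_new(16);

    while (have_work() && aio_task_pool_status(pool) == 0) {
        MyTask *t = make_task(my_task_func);
        aio_task_pool_wait_slot(pool);           /* throttle to 16 busy tasks */
        aio_task_pool_start_task(pool, &t->base);
    }
    aio_task_pool_wait_all(pool);
    ret = aio_task_pool_status(pool);            /* 0 or first task error */
    aio_task_pool_free(pool);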
+ *
+ * Authors:
+ *  Dietmar Maurer (dietmar@proxmox.com)
+ *  Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef BLOCK_COPY_H
+#define BLOCK_COPY_H
+
+#include "block/block.h"
+#include "qemu/co-shared-resource.h"
+
+/* All APIs are thread-safe */
+
+typedef void (*BlockCopyAsyncCallbackFunc)(void *opaque);
+typedef struct BlockCopyState BlockCopyState;
+typedef struct BlockCopyCallState BlockCopyCallState;
+
+BlockCopyState *block_copy_state_new(BdrvChild *source, BdrvChild *target,
+                                     Error **errp);
+
+/* This function must be called prior to any actual copy request */
+void block_copy_set_copy_opts(BlockCopyState *s, bool use_copy_range,
+                              bool compress);
+void block_copy_set_progress_meter(BlockCopyState *s, ProgressMeter *pm);
+
+void block_copy_state_free(BlockCopyState *s);
+
+int64_t block_copy_reset_unallocated(BlockCopyState *s,
+                                     int64_t offset, int64_t *count);
+
+int coroutine_fn block_copy(BlockCopyState *s, int64_t offset, int64_t bytes,
+                            bool ignore_ratelimit);
+
+/*
+ * Run block-copy in a coroutine, create a corresponding BlockCopyCallState
+ * object and return a pointer to it. Never returns NULL.
+ *
+ * The caller is responsible for calling block_copy_call_free() to free the
+ * BlockCopyCallState object.
+ *
+ * @max_workers is the maximum number of parallel coroutines used to execute
+ * sub-requests; it must be > 0.
+ *
+ * @max_chunk is the maximum length of one I/O operation. Zero means unlimited.
+ */
+BlockCopyCallState *block_copy_async(BlockCopyState *s,
+                                     int64_t offset, int64_t bytes,
+                                     int max_workers, int64_t max_chunk,
+                                     BlockCopyAsyncCallbackFunc cb,
+                                     void *cb_opaque);
+
+/*
+ * Free a finished BlockCopyCallState. Trying to free a running block-copy
+ * call will crash.
+ */
+void block_copy_call_free(BlockCopyCallState *call_state);
+
+/*
+ * Note that the block-copy call is marked finished before the callback is
+ * invoked.
+ */
+bool block_copy_call_finished(BlockCopyCallState *call_state);
+bool block_copy_call_succeeded(BlockCopyCallState *call_state);
+bool block_copy_call_failed(BlockCopyCallState *call_state);
+bool block_copy_call_cancelled(BlockCopyCallState *call_state);
+int block_copy_call_status(BlockCopyCallState *call_state, bool *error_is_read);
+
+void block_copy_set_speed(BlockCopyState *s, uint64_t speed);
+void block_copy_kick(BlockCopyCallState *call_state);
+
+/*
+ * Cancel a running block-copy call.
+ *
+ * Cancelling leaves the block-copy state valid: dirty bits are correct, so
+ * you may use cancel + <run block_copy with the same parameters> to emulate
+ * pause/resume.
+ *
+ * Note also that cancellation is asynchronous: it only marks the block-copy
+ * call as cancelled. The call may therefore be cancelled
+ * (block_copy_call_cancelled() reports true) but not yet finished
+ * (block_copy_call_finished() reports false).
+ */
+void block_copy_call_cancel(BlockCopyCallState *call_state);
+
+BdrvDirtyBitmap *block_copy_dirty_bitmap(BlockCopyState *s);
+int64_t block_copy_cluster_size(BlockCopyState *s);
+void block_copy_set_skip_unallocated(BlockCopyState *s, bool skip);
+
+#endif /* BLOCK_COPY_H */
diff --git a/include/block/block-hmp-cmds.h b/include/block/block-hmp-cmds.h
new file mode 100644
index 000000000..3412e108c
--- /dev/null
+++ b/include/block/block-hmp-cmds.h
@@ -0,0 +1,54 @@
+/*
+ * HMP commands related to the block layer
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2020 Red Hat, Inc.
+ * Copyright IBM, Corp.
2011 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + */ + +#ifndef BLOCK_HMP_COMMANDS_H +#define BLOCK_HMP_COMMANDS_H + +void hmp_drive_add(Monitor *mon, const QDict *qdict); + +void hmp_commit(Monitor *mon, const QDict *qdict); +void hmp_drive_del(Monitor *mon, const QDict *qdict); + +void hmp_drive_mirror(Monitor *mon, const QDict *qdict); +void hmp_drive_backup(Monitor *mon, const QDict *qdict); + +void hmp_block_job_set_speed(Monitor *mon, const QDict *qdict); +void hmp_block_job_cancel(Monitor *mon, const QDict *qdict); +void hmp_block_job_pause(Monitor *mon, const QDict *qdict); +void hmp_block_job_resume(Monitor *mon, const QDict *qdict); +void hmp_block_job_complete(Monitor *mon, const QDict *qdict); + +void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict); +void hmp_snapshot_blkdev_internal(Monitor *mon, const QDict *qdict); +void hmp_snapshot_delete_blkdev_internal(Monitor *mon, const QDict *qdict); + +void hmp_nbd_server_start(Monitor *mon, const QDict *qdict); +void hmp_nbd_server_add(Monitor *mon, const QDict *qdict); +void hmp_nbd_server_remove(Monitor *mon, const QDict *qdict); +void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict); + +void hmp_block_resize(Monitor *mon, const QDict *qdict); +void hmp_block_stream(Monitor *mon, const QDict *qdict); +void hmp_block_passwd(Monitor *mon, const QDict *qdict); +void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict); +void hmp_eject(Monitor *mon, const QDict *qdict); + +void hmp_qemu_io(Monitor *mon, const QDict *qdict); + +void hmp_info_block(Monitor *mon, const QDict *qdict); +void hmp_info_blockstats(Monitor *mon, const QDict *qdict); +void hmp_info_block_jobs(Monitor *mon, const QDict *qdict); +void hmp_info_snapshots(Monitor *mon, const QDict *qdict); + +#endif diff --git a/include/block/block.h b/include/block/block.h new file mode 100644 index 000000000..e5dd22b03 --- /dev/null +++ b/include/block/block.h @@ -0,0 +1,864 @@ +#ifndef BLOCK_H +#define BLOCK_H + +#include "block/aio.h" +#include "block/aio-wait.h" +#include "qemu/iov.h" +#include "qemu/coroutine.h" +#include "block/accounting.h" +#include "block/dirty-bitmap.h" +#include "block/blockjob.h" +#include "qemu/hbitmap.h" +#include "qemu/transactions.h" + +/* + * generated_co_wrapper + * + * Function specifier, which does nothing but mark functions to be + * generated by scripts/block-coroutine-wrapper.py + * + * Read more in docs/devel/block-coroutine-wrapper.rst + */ +#define generated_co_wrapper + +/* block.c */ +typedef struct BlockDriver BlockDriver; +typedef struct BdrvChild BdrvChild; +typedef struct BdrvChildClass BdrvChildClass; + +typedef struct BlockDriverInfo { + /* in bytes, 0 if irrelevant */ + int cluster_size; + /* offset at which the VM state can be saved (0 if not possible) */ + int64_t vm_state_offset; + bool is_dirty; + /* + * True if this block driver only supports compressed writes + */ + bool needs_compressed_writes; +} BlockDriverInfo; + +typedef struct BlockFragInfo { + uint64_t allocated_clusters; + uint64_t total_clusters; + uint64_t fragmented_clusters; + uint64_t compressed_clusters; +} BlockFragInfo; + +typedef enum { + BDRV_REQ_COPY_ON_READ = 0x1, + BDRV_REQ_ZERO_WRITE = 0x2, + + /* + * The BDRV_REQ_MAY_UNMAP flag is used in write_zeroes requests to indicate + * that the block driver should unmap (discard) blocks if it is guaranteed + * that the result will read back as zeroes. 
+ * The flag is only passed to the driver if the block device is opened
+ * with BDRV_O_UNMAP.
+ */
+    BDRV_REQ_MAY_UNMAP          = 0x4,
+
+    BDRV_REQ_FUA                = 0x10,
+    BDRV_REQ_WRITE_COMPRESSED   = 0x20,
+
+    /* Signifies that this write request will not change the visible disk
+     * content. */
+    BDRV_REQ_WRITE_UNCHANGED    = 0x40,
+
+    /* Forces request serialisation. Use only with write requests. */
+    BDRV_REQ_SERIALISING        = 0x80,
+
+    /* Execute the request only if the operation can be offloaded or otherwise
+     * be executed efficiently, but return an error instead of using a slow
+     * fallback. */
+    BDRV_REQ_NO_FALLBACK        = 0x100,
+
+    /*
+     * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read
+     * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR
+     * filter is involved), in which case it signals that the COR operation
+     * need not read the data into memory (qiov) but only ensure it is
+     * copied to the top layer (i.e., that the COR operation is done).
+     */
+    BDRV_REQ_PREFETCH           = 0x200,
+
+    /*
+     * If we need to wait for other requests, fail immediately instead. Used
+     * only together with BDRV_REQ_SERIALISING.
+     */
+    BDRV_REQ_NO_WAIT            = 0x400,
+
+    /* Mask of valid flags */
+    BDRV_REQ_MASK               = 0x7ff,
+} BdrvRequestFlags;
+
+typedef struct BlockSizes {
+    uint32_t phys;
+    uint32_t log;
+} BlockSizes;
+
+typedef struct HDGeometry {
+    uint32_t heads;
+    uint32_t sectors;
+    uint32_t cylinders;
+} HDGeometry;
+
+#define BDRV_O_NO_SHARE    0x0001 /* don't share permissions */
+#define BDRV_O_RDWR        0x0002
+#define BDRV_O_RESIZE      0x0004 /* request permission for resizing the node */
+#define BDRV_O_SNAPSHOT    0x0008 /* open the file read only and save writes in a snapshot */
+#define BDRV_O_TEMPORARY   0x0010 /* delete the file after use */
+#define BDRV_O_NOCACHE     0x0020 /* do not use the host page cache */
+#define BDRV_O_NATIVE_AIO  0x0080 /* use native AIO instead of the thread pool */
+#define BDRV_O_NO_BACKING  0x0100 /* don't open the backing file */
+#define BDRV_O_NO_FLUSH    0x0200 /* disable flushing on this disk */
+#define BDRV_O_COPY_ON_READ 0x0400 /* copy read backing sectors into image */
+#define BDRV_O_INACTIVE    0x0800 /* consistency hint for migration handoff */
+#define BDRV_O_CHECK       0x1000 /* open solely for consistency check */
+#define BDRV_O_ALLOW_RDWR  0x2000 /* allow reopen to change from r/o to r/w */
+#define BDRV_O_UNMAP       0x4000 /* execute guest UNMAP/TRIM operations */
+#define BDRV_O_PROTOCOL    0x8000 /* if no block driver is explicitly given:
+                                     select an appropriate protocol driver,
+                                     ignoring the format layer */
+#define BDRV_O_NO_IO       0x10000 /* don't initialize for I/O */
+#define BDRV_O_AUTO_RDONLY 0x20000 /* degrade to read-only if opening read-write fails */
+#define BDRV_O_IO_URING    0x40000 /* use io_uring instead of the thread pool */
+
+#define BDRV_O_CACHE_MASK (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
+
+
+/* Option names of options parsed by the block layer */
+
+#define BDRV_OPT_CACHE_WB       "cache.writeback"
+#define BDRV_OPT_CACHE_DIRECT   "cache.direct"
+#define BDRV_OPT_CACHE_NO_FLUSH "cache.no-flush"
+#define BDRV_OPT_READ_ONLY      "read-only"
+#define BDRV_OPT_AUTO_READ_ONLY "auto-read-only"
+#define BDRV_OPT_DISCARD        "discard"
+#define BDRV_OPT_FORCE_SHARE    "force-share"
+
+
+#define BDRV_SECTOR_BITS   9
+#define BDRV_SECTOR_SIZE   (1ULL << BDRV_SECTOR_BITS)
+
+#define BDRV_REQUEST_MAX_SECTORS MIN_CONST(SIZE_MAX >> BDRV_SECTOR_BITS, \
+                                           INT_MAX >> BDRV_SECTOR_BITS)
+#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
+
+/*
+ * We want to allow aligning requests and disk lengths up to any 32-bit
+ * alignment without fear of overflow. To achieve this, while still using
+ * a nicely rounded number as the maximum disk size, define the maximum
+ * "length" (a limit for any offset/bytes request and for disk size) to be
+ * the greatest multiple of BDRV_MAX_ALIGNMENT below INT64_MAX.
+ */
+#define BDRV_MAX_ALIGNMENT (1L << 30)
+#define BDRV_MAX_LENGTH (QEMU_ALIGN_DOWN(INT64_MAX, BDRV_MAX_ALIGNMENT))
+
+/*
+ * Allocation status flags for bdrv_block_status() and friends.
+ *
+ * Public flags:
+ * BDRV_BLOCK_DATA: allocation for data at offset is tied to this layer
+ * BDRV_BLOCK_ZERO: offset reads as zero
+ * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
+ * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
+ *                       layer rather than any backing, set by block layer
+ * BDRV_BLOCK_EOF: the returned pnum covers through end of file for this
+ *                 layer, set by block layer
+ *
+ * Internal flags:
+ * BDRV_BLOCK_RAW: for use by passthrough drivers, such as raw, to request
+ *                 that the block layer recompute the answer from the returned
+ *                 BDS; must be accompanied by just BDRV_BLOCK_OFFSET_VALID.
+ * BDRV_BLOCK_RECURSE: request that the block layer recursively search for
+ *                     zeroes in the file child of the current block node
+ *                     inside the returned region. Only valid together with
+ *                     both BDRV_BLOCK_DATA and BDRV_BLOCK_OFFSET_VALID.
+ *                     Should not appear with BDRV_BLOCK_ZERO.
+ *
+ * If BDRV_BLOCK_OFFSET_VALID is set, the map parameter represents the
+ * host offset within the returned BDS that is allocated for the
+ * corresponding raw guest data. However, whether that offset
+ * actually contains data also depends on BDRV_BLOCK_DATA, as follows:
+ *
+ * DATA ZERO OFFSET_VALID
+ *  t    t        t       sectors read as zero, returned file is zero at offset
+ *  t    f        t       sectors read as valid from file at offset
+ *  f    t        t       sectors preallocated, read as zero, returned file not
+ *                        necessarily zero at offset
+ *  f    f        t       sectors preallocated but read from backing_hd,
+ *                        returned file contains garbage at offset
+ *  t    t        f       sectors preallocated, read as zero, unknown offset
+ *  t    f        f       sectors read from unknown file or offset
+ *  f    t        f       not allocated or unknown offset, read as zero
+ *  f    f        f       not allocated or unknown offset, read from backing_hd
+ */
+#define BDRV_BLOCK_DATA         0x01
+#define BDRV_BLOCK_ZERO         0x02
+#define BDRV_BLOCK_OFFSET_VALID 0x04
+#define BDRV_BLOCK_RAW          0x08
+#define BDRV_BLOCK_ALLOCATED    0x10
+#define BDRV_BLOCK_EOF          0x20
+#define BDRV_BLOCK_RECURSE      0x40
+
+typedef QTAILQ_HEAD(BlockReopenQueue, BlockReopenQueueEntry) BlockReopenQueue;
+
+typedef struct BDRVReopenState {
+    BlockDriverState *bs;
+    int flags;
+    BlockdevDetectZeroesOptions detect_zeroes;
+    bool backing_missing;
+    BlockDriverState *old_backing_bs; /* keep pointer for permissions update */
+    BlockDriverState *old_file_bs;    /* keep pointer for permissions update */
+    QDict *options;
+    QDict *explicit_options;
+    void *opaque;
+} BDRVReopenState;
+
+/*
+ * Block operation types
+ */
+typedef enum BlockOpType {
+    BLOCK_OP_TYPE_BACKUP_SOURCE,
+    BLOCK_OP_TYPE_BACKUP_TARGET,
+    BLOCK_OP_TYPE_CHANGE,
+    BLOCK_OP_TYPE_COMMIT_SOURCE,
+    BLOCK_OP_TYPE_COMMIT_TARGET,
+    BLOCK_OP_TYPE_DATAPLANE,
+    BLOCK_OP_TYPE_DRIVE_DEL,
+    BLOCK_OP_TYPE_EJECT,
+    BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT,
+    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT,
+    BLOCK_OP_TYPE_INTERNAL_SNAPSHOT_DELETE,
+    BLOCK_OP_TYPE_MIRROR_SOURCE,
+    BLOCK_OP_TYPE_MIRROR_TARGET,
+    BLOCK_OP_TYPE_RESIZE,
+    BLOCK_OP_TYPE_STREAM,
+    BLOCK_OP_TYPE_REPLACE,
+    BLOCK_OP_TYPE_MAX,
+} BlockOpType;
+
+/*
Block node permission constants */ +enum { + /** + * A user that has the "permission" of consistent reads is guaranteed that + * their view of the contents of the block device is complete and + * self-consistent, representing the contents of a disk at a specific + * point. + * + * For most block devices (including their backing files) this is true, but + * the property cannot be maintained in a few situations like for + * intermediate nodes of a commit block job. + */ + BLK_PERM_CONSISTENT_READ = 0x01, + + /** This permission is required to change the visible disk contents. */ + BLK_PERM_WRITE = 0x02, + + /** + * This permission (which is weaker than BLK_PERM_WRITE) is both enough and + * required for writes to the block node when the caller promises that + * the visible disk content doesn't change. + * + * As the BLK_PERM_WRITE permission is strictly stronger, either is + * sufficient to perform an unchanging write. + */ + BLK_PERM_WRITE_UNCHANGED = 0x04, + + /** This permission is required to change the size of a block node. */ + BLK_PERM_RESIZE = 0x08, + + /** + * This permission is required to change the node that this BdrvChild + * points to. + */ + BLK_PERM_GRAPH_MOD = 0x10, + + BLK_PERM_ALL = 0x1f, + + DEFAULT_PERM_PASSTHROUGH = BLK_PERM_CONSISTENT_READ + | BLK_PERM_WRITE + | BLK_PERM_WRITE_UNCHANGED + | BLK_PERM_RESIZE, + + DEFAULT_PERM_UNCHANGED = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH, +}; + +/* + * Flags that parent nodes assign to child nodes to specify what kind of + * role(s) they take. + * + * At least one of DATA, METADATA, FILTERED, or COW must be set for + * every child. + */ +enum BdrvChildRoleBits { + /* + * This child stores data. + * Any node may have an arbitrary number of such children. + */ + BDRV_CHILD_DATA = (1 << 0), + + /* + * This child stores metadata. + * Any node may have an arbitrary number of metadata-storing + * children. + */ + BDRV_CHILD_METADATA = (1 << 1), + + /* + * A child that always presents exactly the same visible data as + * the parent, e.g. by virtue of the parent forwarding all reads + * and writes. + * This flag is mutually exclusive with DATA, METADATA, and COW. + * Any node may have at most one filtered child at a time. + */ + BDRV_CHILD_FILTERED = (1 << 2), + + /* + * Child from which to read all data that isn't allocated in the + * parent (i.e., the backing child); such data is copied to the + * parent through COW (and optionally COR). + * This field is mutually exclusive with DATA, METADATA, and + * FILTERED. + * Any node may have at most one such backing child at a time. + */ + BDRV_CHILD_COW = (1 << 3), + + /* + * The primary child. For most drivers, this is the child whose + * filename applies best to the parent node. + * Any node may have at most one primary child at a time. 
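One illustrative combination of the permission and role bits above, as a parent might compute them when attaching a child (values are invented for illustration, not prescriptive; BDRV_CHILD_IMAGE is defined just below):

    BdrvChildRole role = BDRV_CHILD_IMAGE;   /* DATA | METADATA | PRIMARY */
    uint64_t perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
    uint64_t shared = BLK_PERM_ALL & ~(BLK_PERM_WRITE | BLK_PERM_RESIZE);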
+ */ + BDRV_CHILD_PRIMARY = (1 << 4), + + /* Useful combination of flags */ + BDRV_CHILD_IMAGE = BDRV_CHILD_DATA + | BDRV_CHILD_METADATA + | BDRV_CHILD_PRIMARY, +}; + +/* Mask of BdrvChildRoleBits values */ +typedef unsigned int BdrvChildRole; + +char *bdrv_perm_names(uint64_t perm); +uint64_t bdrv_qapi_perm_to_blk_perm(BlockPermission qapi_perm); + +/* disk I/O throttling */ +void bdrv_init(void); +void bdrv_init_with_whitelist(void); +bool bdrv_uses_whitelist(void); +int bdrv_is_whitelisted(BlockDriver *drv, bool read_only); +BlockDriver *bdrv_find_protocol(const char *filename, + bool allow_protocol_prefix, + Error **errp); +BlockDriver *bdrv_find_format(const char *format_name); +int bdrv_create(BlockDriver *drv, const char* filename, + QemuOpts *opts, Error **errp); +int bdrv_create_file(const char *filename, QemuOpts *opts, Error **errp); + +BlockDriverState *bdrv_new(void); +int bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top, + Error **errp); +int bdrv_replace_node(BlockDriverState *from, BlockDriverState *to, + Error **errp); +int bdrv_replace_child_bs(BdrvChild *child, BlockDriverState *new_bs, + Error **errp); +BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options, + int flags, Error **errp); +int bdrv_drop_filter(BlockDriverState *bs, Error **errp); + +int bdrv_parse_aio(const char *mode, int *flags); +int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough); +int bdrv_parse_discard_flags(const char *mode, int *flags); +BdrvChild *bdrv_open_child(const char *filename, + QDict *options, const char *bdref_key, + BlockDriverState* parent, + const BdrvChildClass *child_class, + BdrvChildRole child_role, + bool allow_none, Error **errp); +BlockDriverState *bdrv_open_blockdev_ref(BlockdevRef *ref, Error **errp); +int bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd, + Error **errp); +int bdrv_open_backing_file(BlockDriverState *bs, QDict *parent_options, + const char *bdref_key, Error **errp); +BlockDriverState *bdrv_open(const char *filename, const char *reference, + QDict *options, int flags, Error **errp); +BlockDriverState *bdrv_new_open_driver_opts(BlockDriver *drv, + const char *node_name, + QDict *options, int flags, + Error **errp); +BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name, + int flags, Error **errp); +BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue, + BlockDriverState *bs, QDict *options, + bool keep_old_opts); +void bdrv_reopen_queue_free(BlockReopenQueue *bs_queue); +int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp); +int bdrv_reopen(BlockDriverState *bs, QDict *opts, bool keep_old_opts, + Error **errp); +int bdrv_reopen_set_read_only(BlockDriverState *bs, bool read_only, + Error **errp); +int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset, + int64_t bytes, BdrvRequestFlags flags); +int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags); +int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int64_t bytes); +int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, + int64_t bytes); +int bdrv_pwrite_sync(BdrvChild *child, int64_t offset, + const void *buf, int64_t bytes); +/* + * Efficiently zero a region of the disk image. Note that this is a regular + * I/O request like read or write and should have a reasonable size. This + * function is not suitable for zeroing the entire image in a single request + * because it may allocate memory for the entire region. 
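A hedged sketch of the zeroing helper documented above (declared just below), using the request flags from earlier in this header; zero_region() is an invented wrapper:

    /* zero one region, unmapping it if the driver can guarantee zero reads */
    int coroutine_fn zero_region(BdrvChild *child, int64_t offset,
                                 int64_t bytes)
    {
        return bdrv_co_pwrite_zeroes(child, offset, bytes,
                                     BDRV_REQ_MAY_UNMAP);
    }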
+ */ +int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, + int64_t bytes, BdrvRequestFlags flags); +BlockDriverState *bdrv_find_backing_image(BlockDriverState *bs, + const char *backing_file); +void bdrv_refresh_filename(BlockDriverState *bs); + +int coroutine_fn bdrv_co_truncate(BdrvChild *child, int64_t offset, bool exact, + PreallocMode prealloc, BdrvRequestFlags flags, + Error **errp); +int generated_co_wrapper +bdrv_truncate(BdrvChild *child, int64_t offset, bool exact, + PreallocMode prealloc, BdrvRequestFlags flags, Error **errp); + +int64_t bdrv_nb_sectors(BlockDriverState *bs); +int64_t bdrv_getlength(BlockDriverState *bs); +int64_t bdrv_get_allocated_file_size(BlockDriverState *bs); +BlockMeasureInfo *bdrv_measure(BlockDriver *drv, QemuOpts *opts, + BlockDriverState *in_bs, Error **errp); +void bdrv_get_geometry(BlockDriverState *bs, uint64_t *nb_sectors_ptr); +void bdrv_refresh_limits(BlockDriverState *bs, Transaction *tran, Error **errp); +int bdrv_commit(BlockDriverState *bs); +int bdrv_make_empty(BdrvChild *c, Error **errp); +int bdrv_change_backing_file(BlockDriverState *bs, const char *backing_file, + const char *backing_fmt, bool warn); +void bdrv_register(BlockDriver *bdrv); +int bdrv_drop_intermediate(BlockDriverState *top, BlockDriverState *base, + const char *backing_file_str); +BlockDriverState *bdrv_find_overlay(BlockDriverState *active, + BlockDriverState *bs); +BlockDriverState *bdrv_find_base(BlockDriverState *bs); +bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base, + Error **errp); +int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base, + Error **errp); +void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base); +int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp); +void coroutine_fn bdrv_co_delete_file_noerr(BlockDriverState *bs); + + +typedef struct BdrvCheckResult { + int corruptions; + int leaks; + int check_errors; + int corruptions_fixed; + int leaks_fixed; + int64_t image_end_offset; + BlockFragInfo bfi; +} BdrvCheckResult; + +typedef enum { + BDRV_FIX_LEAKS = 1, + BDRV_FIX_ERRORS = 2, +} BdrvCheckMode; + +int generated_co_wrapper bdrv_check(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix); + +/* The units of offset and total_work_size may be chosen arbitrarily by the + * block driver; total_work_size may change during the course of the amendment + * operation */ +typedef void BlockDriverAmendStatusCB(BlockDriverState *bs, int64_t offset, + int64_t total_work_size, void *opaque); +int bdrv_amend_options(BlockDriverState *bs_new, QemuOpts *opts, + BlockDriverAmendStatusCB *status_cb, void *cb_opaque, + bool force, + Error **errp); + +/* check if a named node can be replaced when doing drive-mirror */ +BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs, + const char *node_name, Error **errp); + +/* async block I/O */ +void bdrv_aio_cancel(BlockAIOCB *acb); +void bdrv_aio_cancel_async(BlockAIOCB *acb); + +/* sg packet commands */ +int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); + +/* Invalidate any cached metadata used by image formats */ +int generated_co_wrapper bdrv_invalidate_cache(BlockDriverState *bs, + Error **errp); +void bdrv_invalidate_cache_all(Error **errp); +int bdrv_inactivate_all(void); + +/* Ensure contents are flushed to disk. 
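+ *
+ * For example, a caller that must guarantee durability of a just-written
+ * region might pair the write with an explicit flush (a sketch):
+ *
+ *     ret = bdrv_pwrite(child, offset, buf, bytes);
+ *     if (ret >= 0) {
+ *         ret = bdrv_flush(child->bs);
+ *     }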
*/ +int generated_co_wrapper bdrv_flush(BlockDriverState *bs); +int coroutine_fn bdrv_co_flush(BlockDriverState *bs); +int bdrv_flush_all(void); +void bdrv_close_all(void); +void bdrv_drain(BlockDriverState *bs); +void coroutine_fn bdrv_co_drain(BlockDriverState *bs); +void bdrv_drain_all_begin(void); +void bdrv_drain_all_end(void); +void bdrv_drain_all(void); + +#define BDRV_POLL_WHILE(bs, cond) ({ \ + BlockDriverState *bs_ = (bs); \ + AIO_WAIT_WHILE(bdrv_get_aio_context(bs_), \ + cond); }) + +int generated_co_wrapper bdrv_pdiscard(BdrvChild *child, int64_t offset, + int64_t bytes); +int bdrv_co_pdiscard(BdrvChild *child, int64_t offset, int64_t bytes); +int bdrv_has_zero_init_1(BlockDriverState *bs); +int bdrv_has_zero_init(BlockDriverState *bs); +bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); +int bdrv_block_status(BlockDriverState *bs, int64_t offset, + int64_t bytes, int64_t *pnum, int64_t *map, + BlockDriverState **file); +int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base, + int64_t offset, int64_t bytes, int64_t *pnum, + int64_t *map, BlockDriverState **file); +int bdrv_is_allocated(BlockDriverState *bs, int64_t offset, int64_t bytes, + int64_t *pnum); +int bdrv_is_allocated_above(BlockDriverState *top, BlockDriverState *base, + bool include_base, int64_t offset, int64_t bytes, + int64_t *pnum); +int coroutine_fn bdrv_co_is_zero_fast(BlockDriverState *bs, int64_t offset, + int64_t bytes); + +bool bdrv_is_read_only(BlockDriverState *bs); +int bdrv_can_set_read_only(BlockDriverState *bs, bool read_only, + bool ignore_allow_rdw, Error **errp); +int bdrv_apply_auto_read_only(BlockDriverState *bs, const char *errmsg, + Error **errp); +bool bdrv_is_writable(BlockDriverState *bs); +bool bdrv_is_sg(BlockDriverState *bs); +bool bdrv_is_inserted(BlockDriverState *bs); +void bdrv_lock_medium(BlockDriverState *bs, bool locked); +void bdrv_eject(BlockDriverState *bs, bool eject_flag); +const char *bdrv_get_format_name(BlockDriverState *bs); +BlockDriverState *bdrv_find_node(const char *node_name); +BlockDeviceInfoList *bdrv_named_nodes_list(bool flat, Error **errp); +XDbgBlockGraph *bdrv_get_xdbg_block_graph(Error **errp); +BlockDriverState *bdrv_lookup_bs(const char *device, + const char *node_name, + Error **errp); +bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base); +BlockDriverState *bdrv_next_node(BlockDriverState *bs); +BlockDriverState *bdrv_next_all_states(BlockDriverState *bs); + +typedef struct BdrvNextIterator { + enum { + BDRV_NEXT_BACKEND_ROOTS, + BDRV_NEXT_MONITOR_OWNED, + } phase; + BlockBackend *blk; + BlockDriverState *bs; +} BdrvNextIterator; + +BlockDriverState *bdrv_first(BdrvNextIterator *it); +BlockDriverState *bdrv_next(BdrvNextIterator *it); +void bdrv_next_cleanup(BdrvNextIterator *it); + +BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs); +bool bdrv_supports_compressed_writes(BlockDriverState *bs); +void bdrv_iterate_format(void (*it)(void *opaque, const char *name), + void *opaque, bool read_only); +const char *bdrv_get_node_name(const BlockDriverState *bs); +const char *bdrv_get_device_name(const BlockDriverState *bs); +const char *bdrv_get_device_or_node_name(const BlockDriverState *bs); +int bdrv_get_flags(BlockDriverState *bs); +int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi); +ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs, + Error **errp); +BlockStatsSpecific *bdrv_get_specific_stats(BlockDriverState *bs); +void bdrv_round_to_clusters(BlockDriverState *bs, + 
int64_t offset, int64_t bytes, + int64_t *cluster_offset, + int64_t *cluster_bytes); + +void bdrv_get_backing_filename(BlockDriverState *bs, + char *filename, int filename_size); +char *bdrv_get_full_backing_filename(BlockDriverState *bs, Error **errp); +char *bdrv_get_full_backing_filename_from_filename(const char *backed, + const char *backing, + Error **errp); +char *bdrv_dirname(BlockDriverState *bs, Error **errp); + +int path_has_protocol(const char *path); +int path_is_absolute(const char *path); +char *path_combine(const char *base_path, const char *filename); + +int generated_co_wrapper +bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); +int generated_co_wrapper +bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos); +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size); + +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size); + +void bdrv_img_create(const char *filename, const char *fmt, + const char *base_filename, const char *base_fmt, + char *options, uint64_t img_size, int flags, + bool quiet, Error **errp); + +/* Returns the alignment in bytes that is required so that no bounce buffer + * is required throughout the stack */ +size_t bdrv_min_mem_align(BlockDriverState *bs); +/* Returns optimal alignment in bytes for bounce buffer */ +size_t bdrv_opt_mem_align(BlockDriverState *bs); +void *qemu_blockalign(BlockDriverState *bs, size_t size); +void *qemu_blockalign0(BlockDriverState *bs, size_t size); +void *qemu_try_blockalign(BlockDriverState *bs, size_t size); +void *qemu_try_blockalign0(BlockDriverState *bs, size_t size); +bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov); + +void bdrv_enable_copy_on_read(BlockDriverState *bs); +void bdrv_disable_copy_on_read(BlockDriverState *bs); + +void bdrv_ref(BlockDriverState *bs); +void bdrv_unref(BlockDriverState *bs); +void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child); +BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs, + BlockDriverState *child_bs, + const char *child_name, + const BdrvChildClass *child_class, + BdrvChildRole child_role, + Error **errp); + +bool bdrv_op_is_blocked(BlockDriverState *bs, BlockOpType op, Error **errp); +void bdrv_op_block(BlockDriverState *bs, BlockOpType op, Error *reason); +void bdrv_op_unblock(BlockDriverState *bs, BlockOpType op, Error *reason); +void bdrv_op_block_all(BlockDriverState *bs, Error *reason); +void bdrv_op_unblock_all(BlockDriverState *bs, Error *reason); +bool bdrv_op_blocker_is_empty(BlockDriverState *bs); + +#define BLKDBG_EVENT(child, evt) \ + do { \ + if (child) { \ + bdrv_debug_event(child->bs, evt); \ + } \ + } while (0) + +void bdrv_debug_event(BlockDriverState *bs, BlkdebugEvent event); + +int bdrv_debug_breakpoint(BlockDriverState *bs, const char *event, + const char *tag); +int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag); +int bdrv_debug_resume(BlockDriverState *bs, const char *tag); +bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag); + +/** + * bdrv_get_aio_context: + * + * Returns: the currently bound #AioContext + */ +AioContext *bdrv_get_aio_context(BlockDriverState *bs); + +/** + * Move the current coroutine to the AioContext of @bs and return the old + * AioContext of the coroutine. Increase bs->in_flight so that draining @bs + * will wait for the operation to proceed until the corresponding + * bdrv_co_leave(). 
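+ *
+ * A typical pairing looks like this (a sketch):
+ *
+ *     old_ctx = bdrv_co_enter(bs);
+ *     ... issue requests against bs ...
+ *     bdrv_co_leave(bs, old_ctx);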
+ * + * Consequently, you can't call drain inside a bdrv_co_enter/leave() section as + * this will deadlock. + */ +AioContext *coroutine_fn bdrv_co_enter(BlockDriverState *bs); + +/** + * Ends a section started by bdrv_co_enter(). Move the current coroutine back + * to old_ctx and decrease bs->in_flight again. + */ +void coroutine_fn bdrv_co_leave(BlockDriverState *bs, AioContext *old_ctx); + +/** + * Locks the AioContext of @bs if it's not the current AioContext. This avoids + * double locking which could lead to deadlocks: This is a coroutine_fn, so we + * know we already own the lock of the current AioContext. + * + * May only be called in the main thread. + */ +void coroutine_fn bdrv_co_lock(BlockDriverState *bs); + +/** + * Unlocks the AioContext of @bs if it's not the current AioContext. + */ +void coroutine_fn bdrv_co_unlock(BlockDriverState *bs); + +/** + * Transfer control to @co in the aio context of @bs + */ +void bdrv_coroutine_enter(BlockDriverState *bs, Coroutine *co); + +void bdrv_set_aio_context_ignore(BlockDriverState *bs, + AioContext *new_context, GSList **ignore); +int bdrv_try_set_aio_context(BlockDriverState *bs, AioContext *ctx, + Error **errp); +int bdrv_child_try_set_aio_context(BlockDriverState *bs, AioContext *ctx, + BdrvChild *ignore_child, Error **errp); +bool bdrv_child_can_set_aio_context(BdrvChild *c, AioContext *ctx, + GSList **ignore, Error **errp); +bool bdrv_can_set_aio_context(BlockDriverState *bs, AioContext *ctx, + GSList **ignore, Error **errp); +AioContext *bdrv_child_get_parent_aio_context(BdrvChild *c); +AioContext *child_of_bds_get_parent_aio_context(BdrvChild *c); + +int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz); +int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo); + +void bdrv_io_plug(BlockDriverState *bs); +void bdrv_io_unplug(BlockDriverState *bs); + +/** + * bdrv_parent_drained_begin_single: + * + * Begin a quiesced section for the parent of @c. If @poll is true, wait for + * any pending activity to cease. + */ +void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll); + +/** + * bdrv_parent_drained_end_single: + * + * End a quiesced section for the parent of @c. + * + * This polls @bs's AioContext until all scheduled sub-drained_ends + * have settled, which may result in graph changes. + */ +void bdrv_parent_drained_end_single(BdrvChild *c); + +/** + * bdrv_drain_poll: + * + * Poll for pending requests in @bs, its parents (except for @ignore_parent), + * and if @recursive is true its children as well (used for subtree drain). + * + * If @ignore_bds_parents is true, parents that are BlockDriverStates must + * ignore the drain request because they will be drained separately (used for + * drain_all). + * + * This is part of bdrv_drained_begin. + */ +bool bdrv_drain_poll(BlockDriverState *bs, bool recursive, + BdrvChild *ignore_parent, bool ignore_bds_parents); + +/** + * bdrv_drained_begin: + * + * Begin a quiesced section for exclusive access to the BDS, by disabling + * external request sources including NBD server, block jobs, and device model. + * + * This function can be recursive. + */ +void bdrv_drained_begin(BlockDriverState *bs); + +/** + * bdrv_do_drained_begin_quiesce: + * + * Quiesces a BDS like bdrv_drained_begin(), but does not wait for already + * running requests to complete. 
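+ *
+ * For most callers, the fully polling variants are used as a pair around
+ * the code that needs exclusive access (a sketch):
+ *
+ *     bdrv_drained_begin(bs);
+ *     ... modify the graph or otherwise rely on no new I/O arriving ...
+ *     bdrv_drained_end(bs);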
+ */ +void bdrv_do_drained_begin_quiesce(BlockDriverState *bs, + BdrvChild *parent, bool ignore_bds_parents); + +/** + * Like bdrv_drained_begin, but recursively begins a quiesced section for + * exclusive access to all child nodes as well. + */ +void bdrv_subtree_drained_begin(BlockDriverState *bs); + +/** + * bdrv_drained_end: + * + * End a quiescent section started by bdrv_drained_begin(). + * + * This polls @bs's AioContext until all scheduled sub-drained_ends + * have settled. On one hand, that may result in graph changes. On + * the other, this requires that the caller either runs in the main + * loop; or that all involved nodes (@bs and all of its parents) are + * in the caller's AioContext. + */ +void bdrv_drained_end(BlockDriverState *bs); + +/** + * bdrv_drained_end_no_poll: + * + * Same as bdrv_drained_end(), but do not poll for the subgraph to + * actually become unquiesced. Therefore, no graph changes will occur + * with this function. + * + * *drained_end_counter is incremented for every background operation + * that is scheduled, and will be decremented for every operation once + * it settles. The caller must poll until it reaches 0. The counter + * should be accessed using atomic operations only. + */ +void bdrv_drained_end_no_poll(BlockDriverState *bs, int *drained_end_counter); + +/** + * End a quiescent section started by bdrv_subtree_drained_begin(). + */ +void bdrv_subtree_drained_end(BlockDriverState *bs); + +void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child, + Error **errp); +void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp); + +bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name, + uint32_t granularity, Error **errp); +/** + * + * bdrv_register_buf/bdrv_unregister_buf: + * + * Register/unregister a buffer for I/O. For example, VFIO drivers are + * interested to know the memory areas that would later be used for I/O, so + * that they can prepare IOMMU mapping etc., to get better performance. + */ +void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size); +void bdrv_unregister_buf(BlockDriverState *bs, void *host); + +/** + * + * bdrv_co_copy_range: + * + * Do offloaded copy between two children. If the operation is not implemented + * by the driver, or if the backend storage doesn't support it, a negative + * error code will be returned. + * + * Note: block layer doesn't emulate or fallback to a bounce buffer approach + * because usually the caller shouldn't attempt offloaded copy any more (e.g. + * calling copy_file_range(2)) after the first error, thus it should fall back + * to a read+write path in the caller level. + * + * @src: Source child to copy data from + * @src_offset: offset in @src image to read data + * @dst: Destination child to copy data to + * @dst_offset: offset in @dst image to write data + * @bytes: number of bytes to copy + * @flags: request flags. Supported flags: + * BDRV_REQ_ZERO_WRITE - treat the @src range as zero data and do zero + * write on @dst as if bdrv_co_pwrite_zeroes is + * called. Used to simplify caller code, or + * during BlockDriver.bdrv_co_copy_range_from() + * recursion. + * BDRV_REQ_NO_SERIALISING - do not serialize with other overlapping + * requests currently in flight. + * + * Returns: 0 if succeeded; negative error code if failed. 
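+ *
+ * A caller implementing the read+write fallback described above might
+ * look like this (a sketch; buffer allocation omitted, and src_off,
+ * dst_off and buf are hypothetical names):
+ *
+ *     ret = bdrv_co_copy_range(src, src_off, dst, dst_off, bytes, 0, 0);
+ *     if (ret < 0) {
+ *         ret = bdrv_co_pread(src, src_off, bytes, buf, 0);
+ *         if (ret >= 0) {
+ *             ret = bdrv_co_pwrite(dst, dst_off, bytes, buf, 0);
+ *         }
+ *     }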
+ **/ +int coroutine_fn bdrv_co_copy_range(BdrvChild *src, int64_t src_offset, + BdrvChild *dst, int64_t dst_offset, + int64_t bytes, BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); + +void bdrv_cancel_in_flight(BlockDriverState *bs); + +#endif diff --git a/include/block/block_backup.h b/include/block/block_backup.h new file mode 100644 index 000000000..157596c29 --- /dev/null +++ b/include/block/block_backup.h @@ -0,0 +1,25 @@ +/* + * QEMU backup + * + * Copyright (c) 2013 Proxmox Server Solutions + * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD. + * Copyright (c) 2016 Intel Corporation + * Copyright (c) 2016 FUJITSU LIMITED + * + * Authors: + * Dietmar Maurer <dietmar@proxmox.com> + * Changlong Xie <xiecl.fnst@cn.fujitsu.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#ifndef BLOCK_BACKUP_H +#define BLOCK_BACKUP_H + +#include "block/block_int.h" + +void backup_do_checkpoint(BlockJob *job, Error **errp); + +#endif diff --git a/include/block/block_int.h b/include/block/block_int.h new file mode 100644 index 000000000..f4c75e8ba --- /dev/null +++ b/include/block/block_int.h @@ -0,0 +1,1504 @@ +/* + * QEMU System Emulator block driver + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */
+#ifndef BLOCK_INT_H
+#define BLOCK_INT_H
+
+#include "block/accounting.h"
+#include "block/block.h"
+#include "block/aio-wait.h"
+#include "qemu/queue.h"
+#include "qemu/coroutine.h"
+#include "qemu/stats64.h"
+#include "qemu/timer.h"
+#include "qemu/hbitmap.h"
+#include "block/snapshot.h"
+#include "qemu/throttle.h"
+#include "qemu/rcu.h"
+
+#define BLOCK_FLAG_LAZY_REFCOUNTS 8
+
+#define BLOCK_OPT_SIZE "size"
+#define BLOCK_OPT_ENCRYPT "encryption"
+#define BLOCK_OPT_ENCRYPT_FORMAT "encrypt.format"
+#define BLOCK_OPT_COMPAT6 "compat6"
+#define BLOCK_OPT_HWVERSION "hwversion"
+#define BLOCK_OPT_BACKING_FILE "backing_file"
+#define BLOCK_OPT_BACKING_FMT "backing_fmt"
+#define BLOCK_OPT_CLUSTER_SIZE "cluster_size"
+#define BLOCK_OPT_TABLE_SIZE "table_size"
+#define BLOCK_OPT_PREALLOC "preallocation"
+#define BLOCK_OPT_SUBFMT "subformat"
+#define BLOCK_OPT_COMPAT_LEVEL "compat"
+#define BLOCK_OPT_LAZY_REFCOUNTS "lazy_refcounts"
+#define BLOCK_OPT_ADAPTER_TYPE "adapter_type"
+#define BLOCK_OPT_REDUNDANCY "redundancy"
+#define BLOCK_OPT_NOCOW "nocow"
+#define BLOCK_OPT_EXTENT_SIZE_HINT "extent_size_hint"
+#define BLOCK_OPT_OBJECT_SIZE "object_size"
+#define BLOCK_OPT_REFCOUNT_BITS "refcount_bits"
+#define BLOCK_OPT_DATA_FILE "data_file"
+#define BLOCK_OPT_DATA_FILE_RAW "data_file_raw"
+#define BLOCK_OPT_COMPRESSION_TYPE "compression_type"
+#define BLOCK_OPT_EXTL2 "extended_l2"
+
+#define BLOCK_PROBE_BUF_SIZE 512
+
+enum BdrvTrackedRequestType {
+    BDRV_TRACKED_READ,
+    BDRV_TRACKED_WRITE,
+    BDRV_TRACKED_DISCARD,
+    BDRV_TRACKED_TRUNCATE,
+};
+
+/*
+ * It is unfortunate that the BdrvTrackedRequest structure is public,
+ * as block/io.c is very careful about incoming offset/bytes being
+ * correct. Be sure to assert that bdrv_check_request() succeeds after
+ * any modification of a BdrvTrackedRequest object outside of block/io.c.
+ */
+typedef struct BdrvTrackedRequest {
+    BlockDriverState *bs;
+    int64_t offset;
+    int64_t bytes;
+    enum BdrvTrackedRequestType type;
+
+    bool serialising;
+    int64_t overlap_offset;
+    int64_t overlap_bytes;
+
+    QLIST_ENTRY(BdrvTrackedRequest) list;
+    Coroutine *co; /* owner, used for deadlock detection */
+    CoQueue wait_queue; /* coroutines blocked on this request */
+
+    struct BdrvTrackedRequest *waiting_for;
+} BdrvTrackedRequest;
+
+int bdrv_check_qiov_request(int64_t offset, int64_t bytes,
+                            QEMUIOVector *qiov, size_t qiov_offset,
+                            Error **errp);
+int bdrv_check_request(int64_t offset, int64_t bytes, Error **errp);
+
+struct BlockDriver {
+    const char *format_name;
+    int instance_size;
+
+    /* Set to true if the BlockDriver is a block filter. Block filters pass
+     * certain callbacks that refer to data (see block.c) to their bs->file
+     * or bs->backing (whichever one exists) if the driver doesn't implement
+     * them. Drivers that do not wish to forward must implement them and return
+     * -ENOTSUP.
+     * Note that filters are not allowed to modify data.
+     *
+     * Filters generally cannot have more than a single filtered child,
+     * because the data they present must at all times be the same as
+     * that on their filtered child. That would be impossible to
+     * achieve for multiple filtered children.
+     * (And this filtered child must then be bs->file or bs->backing.)
+     */
+    bool is_filter;
+    /*
+     * Set to true if the BlockDriver is a format driver. Format nodes
+     * generally do not expect their children to be other format nodes
+     * (except for backing files), and so format probing is disabled
+     * on those children.
+     */
+    bool is_format;
+    /*
+     * Return true if @to_replace can be replaced by a BDS with the
+     * same data as @bs without it affecting @bs's behavior (that is,
+     * without it being visible to @bs's parents).
+     */
+    bool (*bdrv_recurse_can_replace)(BlockDriverState *bs,
+                                     BlockDriverState *to_replace);
+
+    int (*bdrv_probe)(const uint8_t *buf, int buf_size, const char *filename);
+    int (*bdrv_probe_device)(const char *filename);
+
+    /* Any driver implementing this callback is expected to be able to handle
+     * NULL file names in its .bdrv_open() implementation */
+    void (*bdrv_parse_filename)(const char *filename, QDict *options, Error **errp);
+    /* Drivers implementing neither bdrv_parse_filename nor bdrv_open should
+     * have this field set to true, except ones that are defined only by
+     * their child's bs.
+     * An example of the latter type is the quorum block driver.
+     */
+    bool bdrv_needs_filename;
+
+    /*
+     * Set if a driver can support backing files. This also implies the
+     * following semantics:
+     *
+     * - A return status of 0 from .bdrv_co_block_status means that the
+     *   corresponding blocks are not allocated in this layer of the
+     *   backing chain
+     * - For such (unallocated) blocks, read will:
+     *   - fill buffer with zeros if there is no backing file
+     *   - read from the backing file otherwise, where the block layer
+     *     takes care of reading zeros beyond EOF if backing file is short
+     */
+    bool supports_backing;
+
+    /* For handling image reopen for split or non-split files */
+    int (*bdrv_reopen_prepare)(BDRVReopenState *reopen_state,
+                               BlockReopenQueue *queue, Error **errp);
+    void (*bdrv_reopen_commit)(BDRVReopenState *reopen_state);
+    void (*bdrv_reopen_commit_post)(BDRVReopenState *reopen_state);
+    void (*bdrv_reopen_abort)(BDRVReopenState *reopen_state);
+    void (*bdrv_join_options)(QDict *options, QDict *old_options);
+
+    int (*bdrv_open)(BlockDriverState *bs, QDict *options, int flags,
+                     Error **errp);
+
+    /* Protocol drivers should implement this instead of bdrv_open */
+    int (*bdrv_file_open)(BlockDriverState *bs, QDict *options, int flags,
+                          Error **errp);
+    void (*bdrv_close)(BlockDriverState *bs);
+
+
+    int coroutine_fn (*bdrv_co_create)(BlockdevCreateOptions *opts,
+                                       Error **errp);
+    int coroutine_fn (*bdrv_co_create_opts)(BlockDriver *drv,
+                                            const char *filename,
+                                            QemuOpts *opts,
+                                            Error **errp);
+
+    int coroutine_fn (*bdrv_co_amend)(BlockDriverState *bs,
+                                      BlockdevAmendOptions *opts,
+                                      bool force,
+                                      Error **errp);
+
+    int (*bdrv_amend_options)(BlockDriverState *bs,
+                              QemuOpts *opts,
+                              BlockDriverAmendStatusCB *status_cb,
+                              void *cb_opaque,
+                              bool force,
+                              Error **errp);
+
+    int (*bdrv_make_empty)(BlockDriverState *bs);
+
+    /*
+     * Refreshes the bs->exact_filename field. If that is impossible,
+     * bs->exact_filename has to be left empty.
+     */
+    void (*bdrv_refresh_filename)(BlockDriverState *bs);
+
+    /*
+     * Gathers the open options for all children into @target.
+     * A simple format driver (without backing file support) might
+     * implement this function like this:
+     *
+     *     QINCREF(bs->file->bs->full_open_options);
+     *     qdict_put(target, "file", bs->file->bs->full_open_options);
+     *
+     * If not specified, the generic implementation will simply put
+     * all children's options under their respective name.
+     *
+     * @backing_overridden is true when bs->backing seems not to be
+     * the child that would result from opening bs->backing_file.
+     * Therefore, if it is true, the backing child's options should be
+     * gathered; otherwise, there is no need since the backing child
+     * is the one implied by the image header.
+     *
+     * Note that ideally this function would not be needed. Every
+     * block driver which implements it is probably doing something
+     * shady regarding its runtime option structure.
+     */
+    void (*bdrv_gather_child_options)(BlockDriverState *bs, QDict *target,
+                                      bool backing_overridden);
+
+    /*
+     * Returns an allocated string which is the directory name of this BDS;
+     * it will be used to make relative filenames absolute by prepending
+     * this function's return value to them.
+     */
+    char *(*bdrv_dirname)(BlockDriverState *bs, Error **errp);
+
+    /* aio */
+    BlockAIOCB *(*bdrv_aio_preadv)(BlockDriverState *bs,
+        int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+        BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque);
+    BlockAIOCB *(*bdrv_aio_pwritev)(BlockDriverState *bs,
+        int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+        BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque);
+    BlockAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs,
+        BlockCompletionFunc *cb, void *opaque);
+    BlockAIOCB *(*bdrv_aio_pdiscard)(BlockDriverState *bs,
+        int64_t offset, int bytes,
+        BlockCompletionFunc *cb, void *opaque);
+
+    int coroutine_fn (*bdrv_co_readv)(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+
+    /**
+     * @offset: position in bytes to read at
+     * @bytes: number of bytes to read
+     * @qiov: the buffers to fill with read data
+     * @flags: currently unused, always 0
+     *
+     * @offset and @bytes will be a multiple of 'request_alignment',
+     * but the length of individual @qiov elements does not have to
+     * be a multiple.
+     *
+     * @bytes will always equal the total size of @qiov, and will be
+     * no larger than 'max_transfer'.
+     *
+     * The buffer in @qiov may point directly to guest memory.
+     */
+    int coroutine_fn (*bdrv_co_preadv)(BlockDriverState *bs,
+        int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+        BdrvRequestFlags flags);
+    int coroutine_fn (*bdrv_co_preadv_part)(BlockDriverState *bs,
+        int64_t offset, int64_t bytes,
+        QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags);
+    int coroutine_fn (*bdrv_co_writev)(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int flags);
+    /**
+     * @offset: position in bytes to write at
+     * @bytes: number of bytes to write
+     * @qiov: the buffers containing data to write
+     * @flags: zero or more bits allowed by 'supported_write_flags'
+     *
+     * @offset and @bytes will be a multiple of 'request_alignment',
+     * but the length of individual @qiov elements does not have to
+     * be a multiple.
+     *
+     * @bytes will always equal the total size of @qiov, and will be
+     * no larger than 'max_transfer'.
+     *
+     * The buffer in @qiov may point directly to guest memory.
+     */
+    int coroutine_fn (*bdrv_co_pwritev)(BlockDriverState *bs,
+        int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+        BdrvRequestFlags flags);
+    int coroutine_fn (*bdrv_co_pwritev_part)(BlockDriverState *bs,
+        int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset,
+        BdrvRequestFlags flags);
+
+    /*
+     * Efficiently zero a region of the disk image. Typically an image format
+     * would use a compact metadata representation to implement this. This
+     * function pointer may be NULL or return -ENOTSUP and .bdrv_co_writev()
+     * will be called instead.
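+     *
+     * A minimal implementation might look like this (a sketch; the my_*
+     * names are hypothetical). Returning -ENOTSUP makes the block layer
+     * fall back to explicit writes through .bdrv_co_writev():
+     *
+     *     static int coroutine_fn my_co_pwrite_zeroes(BlockDriverState *bs,
+     *         int64_t offset, int64_t bytes, BdrvRequestFlags flags)
+     *     {
+     *         BDRVMyState *s = bs->opaque;
+     *
+     *         if (!my_can_zero_efficiently(s, offset, bytes)) {
+     *             return -ENOTSUP;
+     *         }
+     *         return my_mark_zeroed(s, offset, bytes);
+     *     }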
+ */ + int coroutine_fn (*bdrv_co_pwrite_zeroes)(BlockDriverState *bs, + int64_t offset, int64_t bytes, BdrvRequestFlags flags); + int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs, + int64_t offset, int64_t bytes); + + /* Map [offset, offset + nbytes) range onto a child of @bs to copy from, + * and invoke bdrv_co_copy_range_from(child, ...), or invoke + * bdrv_co_copy_range_to() if @bs is the leaf child to copy data from. + * + * See the comment of bdrv_co_copy_range for the parameter and return value + * semantics. + */ + int coroutine_fn (*bdrv_co_copy_range_from)(BlockDriverState *bs, + BdrvChild *src, + int64_t offset, + BdrvChild *dst, + int64_t dst_offset, + int64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); + + /* Map [offset, offset + nbytes) range onto a child of bs to copy data to, + * and invoke bdrv_co_copy_range_to(child, src, ...), or perform the copy + * operation if @bs is the leaf and @src has the same BlockDriver. Return + * -ENOTSUP if @bs is the leaf but @src has a different BlockDriver. + * + * See the comment of bdrv_co_copy_range for the parameter and return value + * semantics. + */ + int coroutine_fn (*bdrv_co_copy_range_to)(BlockDriverState *bs, + BdrvChild *src, + int64_t src_offset, + BdrvChild *dst, + int64_t dst_offset, + int64_t bytes, + BdrvRequestFlags read_flags, + BdrvRequestFlags write_flags); + + /* + * Building block for bdrv_block_status[_above] and + * bdrv_is_allocated[_above]. The driver should answer only + * according to the current layer, and should only need to set + * BDRV_BLOCK_DATA, BDRV_BLOCK_ZERO, BDRV_BLOCK_OFFSET_VALID, + * and/or BDRV_BLOCK_RAW; if the current layer defers to a backing + * layer, the result should be 0 (and not BDRV_BLOCK_ZERO). See + * block.h for the overall meaning of the bits. As a hint, the + * flag want_zero is true if the caller cares more about precise + * mappings (favor accurate _OFFSET_VALID/_ZERO) or false for + * overall allocation (favor larger *pnum, perhaps by reporting + * _DATA instead of _ZERO). The block layer guarantees input + * clamped to bdrv_getlength() and aligned to request_alignment, + * as well as non-NULL pnum, map, and file; in turn, the driver + * must return an error or set pnum to an aligned non-zero value. + * + * Note that @bytes is just a hint on how big of a region the + * caller wants to inspect. It is not a limit on *pnum. + * Implementations are free to return larger values of *pnum if + * doing so does not incur a performance penalty. + * + * block/io.c's bdrv_co_block_status() will utilize an unclamped + * *pnum value for the block-status cache on protocol nodes, prior + * to clamping *pnum for return to its caller. + */ + int coroutine_fn (*bdrv_co_block_status)(BlockDriverState *bs, + bool want_zero, int64_t offset, int64_t bytes, int64_t *pnum, + int64_t *map, BlockDriverState **file); + + /* + * This informs the driver that we are no longer interested in the result + * of in-flight requests, so don't waste the time if possible. + * + * One example usage is to avoid waiting for an nbd target node reconnect + * timeout during job-cancel with force=true. + */ + void (*bdrv_cancel_in_flight)(BlockDriverState *bs); + + /* + * Invalidate any cached meta-data. + */ + void coroutine_fn (*bdrv_co_invalidate_cache)(BlockDriverState *bs, + Error **errp); + int (*bdrv_inactivate)(BlockDriverState *bs); + + /* + * Flushes all data for all layers by calling bdrv_co_flush for underlying + * layers, if needed. 
This function is needed for deterministic + * synchronization of the flush finishing callback. + */ + int coroutine_fn (*bdrv_co_flush)(BlockDriverState *bs); + + /* Delete a created file. */ + int coroutine_fn (*bdrv_co_delete_file)(BlockDriverState *bs, + Error **errp); + + /* + * Flushes all data that was already written to the OS all the way down to + * the disk (for example file-posix.c calls fsync()). + */ + int coroutine_fn (*bdrv_co_flush_to_disk)(BlockDriverState *bs); + + /* + * Flushes all internal caches to the OS. The data may still sit in a + * writeback cache of the host OS, but it will survive a crash of the qemu + * process. + */ + int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs); + + /* + * Drivers setting this field must be able to work with just a plain + * filename with '<protocol_name>:' as a prefix, and no other options. + * Options may be extracted from the filename by implementing + * bdrv_parse_filename. + */ + const char *protocol_name; + + /* + * Truncate @bs to @offset bytes using the given @prealloc mode + * when growing. Modes other than PREALLOC_MODE_OFF should be + * rejected when shrinking @bs. + * + * If @exact is true, @bs must be resized to exactly @offset. + * Otherwise, it is sufficient for @bs (if it is a host block + * device and thus there is no way to resize it) to be at least + * @offset bytes in length. + * + * If @exact is true and this function fails but would succeed + * with @exact = false, it should return -ENOTSUP. + */ + int coroutine_fn (*bdrv_co_truncate)(BlockDriverState *bs, int64_t offset, + bool exact, PreallocMode prealloc, + BdrvRequestFlags flags, Error **errp); + + int64_t (*bdrv_getlength)(BlockDriverState *bs); + bool has_variable_length; + int64_t (*bdrv_get_allocated_file_size)(BlockDriverState *bs); + BlockMeasureInfo *(*bdrv_measure)(QemuOpts *opts, BlockDriverState *in_bs, + Error **errp); + + int coroutine_fn (*bdrv_co_pwritev_compressed)(BlockDriverState *bs, + int64_t offset, int64_t bytes, QEMUIOVector *qiov); + int coroutine_fn (*bdrv_co_pwritev_compressed_part)(BlockDriverState *bs, + int64_t offset, int64_t bytes, QEMUIOVector *qiov, size_t qiov_offset); + + int (*bdrv_snapshot_create)(BlockDriverState *bs, + QEMUSnapshotInfo *sn_info); + int (*bdrv_snapshot_goto)(BlockDriverState *bs, + const char *snapshot_id); + int (*bdrv_snapshot_delete)(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp); + int (*bdrv_snapshot_list)(BlockDriverState *bs, + QEMUSnapshotInfo **psn_info); + int (*bdrv_snapshot_load_tmp)(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp); + int (*bdrv_get_info)(BlockDriverState *bs, BlockDriverInfo *bdi); + ImageInfoSpecific *(*bdrv_get_specific_info)(BlockDriverState *bs, + Error **errp); + BlockStatsSpecific *(*bdrv_get_specific_stats)(BlockDriverState *bs); + + int coroutine_fn (*bdrv_save_vmstate)(BlockDriverState *bs, + QEMUIOVector *qiov, + int64_t pos); + int coroutine_fn (*bdrv_load_vmstate)(BlockDriverState *bs, + QEMUIOVector *qiov, + int64_t pos); + + int (*bdrv_change_backing_file)(BlockDriverState *bs, + const char *backing_file, const char *backing_fmt); + + /* removable device specific */ + bool (*bdrv_is_inserted)(BlockDriverState *bs); + void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag); + void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked); + + /* to control generic scsi devices */ + BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs, + unsigned long int req, void *buf, + 
BlockCompletionFunc *cb, void *opaque);
+    int coroutine_fn (*bdrv_co_ioctl)(BlockDriverState *bs,
+                                      unsigned long int req, void *buf);
+
+    /* List of options for creating images, terminated by name == NULL */
+    QemuOptsList *create_opts;
+
+    /* List of options for image amend */
+    QemuOptsList *amend_opts;
+
+    /*
+     * If this driver supports reopening images this contains a
+     * NULL-terminated list of the runtime options that can be
+     * modified. If an option in this list is unspecified during
+     * reopen then it _must_ be reset to its default value or return
+     * an error.
+     */
+    const char *const *mutable_opts;
+
+    /*
+     * Returns 0 for completed check, -errno for internal errors.
+     * The check results are stored in @result.
+     */
+    int coroutine_fn (*bdrv_co_check)(BlockDriverState *bs,
+                                      BdrvCheckResult *result,
+                                      BdrvCheckMode fix);
+
+    void (*bdrv_debug_event)(BlockDriverState *bs, BlkdebugEvent event);
+
+    /* TODO Better to pass an option string/QDict/QemuOpts to add any rule? */
+    int (*bdrv_debug_breakpoint)(BlockDriverState *bs, const char *event,
+                                 const char *tag);
+    int (*bdrv_debug_remove_breakpoint)(BlockDriverState *bs,
+                                        const char *tag);
+    int (*bdrv_debug_resume)(BlockDriverState *bs, const char *tag);
+    bool (*bdrv_debug_is_suspended)(BlockDriverState *bs, const char *tag);
+
+    void (*bdrv_refresh_limits)(BlockDriverState *bs, Error **errp);
+
+    /*
+     * Returns 1 if newly created images are guaranteed to contain only
+     * zeros, 0 otherwise.
+     */
+    int (*bdrv_has_zero_init)(BlockDriverState *bs);
+
+    /* Remove fd handlers, timers, and other event loop callbacks so the event
+     * loop is no longer in use. Called with no in-flight requests and in
+     * depth-first traversal order with parents before child nodes.
+     */
+    void (*bdrv_detach_aio_context)(BlockDriverState *bs);
+
+    /* Add fd handlers, timers, and other event loop callbacks so I/O requests
+     * can be processed again. Called with no in-flight requests and in
+     * depth-first traversal order with child nodes before parent nodes.
+     */
+    void (*bdrv_attach_aio_context)(BlockDriverState *bs,
+                                    AioContext *new_context);
+
+    /* io queue for linux-aio */
+    void (*bdrv_io_plug)(BlockDriverState *bs);
+    void (*bdrv_io_unplug)(BlockDriverState *bs);
+
+    /**
+     * Try to get @bs's logical and physical block size.
+     * On success, store them in @bsz and return zero.
+     * On failure, return negative errno.
+     */
+    int (*bdrv_probe_blocksizes)(BlockDriverState *bs, BlockSizes *bsz);
+    /**
+     * Try to get @bs's geometry (cyls, heads, sectors).
+     * On success, store them in @geo and return 0.
+     * On failure, return -errno.
+     * Only drivers that want to override guest geometry implement this
+     * callback; see hd_geometry_guess().
+     */
+    int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);
+
+    /**
+     * bdrv_co_drain_begin is called, if implemented, at the beginning of a
+     * drain operation to drain and stop any internal sources of requests in
+     * the driver.
+     * bdrv_co_drain_end is called, if implemented, at the end of the drain.
+     *
+     * They should be used by the driver to e.g. manage scheduled I/O
+     * requests, or toggle an internal state. After the end of the drain new
+     * requests will continue normally.
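+     *
+     * A driver with an internal request source might implement the pair
+     * like this (a sketch; the my_* and s-> names are hypothetical):
+     *
+     *     static void coroutine_fn my_co_drain_begin(BlockDriverState *bs)
+     *     {
+     *         BDRVMyState *s = bs->opaque;
+     *
+     *         s->quiesced = true;
+     *         while (s->internal_requests > 0) {
+     *             qemu_co_queue_wait(&s->drain_queue, NULL);
+     *         }
+     *     }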
+     */
+    void coroutine_fn (*bdrv_co_drain_begin)(BlockDriverState *bs);
+    void coroutine_fn (*bdrv_co_drain_end)(BlockDriverState *bs);
+
+    void (*bdrv_add_child)(BlockDriverState *parent, BlockDriverState *child,
+                           Error **errp);
+    void (*bdrv_del_child)(BlockDriverState *parent, BdrvChild *child,
+                           Error **errp);
+
+    /**
+     * Informs the block driver that a permission change is intended. The
+     * driver checks whether the change is permissible and may take other
+     * preparations for the change (e.g. get file system locks). This operation
+     * is always followed by a call to either .bdrv_set_perm or
+     * .bdrv_abort_perm_update.
+     *
+     * Checks whether the requested set of cumulative permissions in @perm
+     * can be granted for accessing @bs and whether no other users are using
+     * permissions other than those given in @shared (both arguments take
+     * BLK_PERM_* bitmasks).
+     *
+     * If both conditions are met, 0 is returned. Otherwise, -errno is returned
+     * and errp is set to an error describing the conflict.
+     */
+    int (*bdrv_check_perm)(BlockDriverState *bs, uint64_t perm,
+                           uint64_t shared, Error **errp);
+
+    /**
+     * Called to inform the driver that the cumulative set of used
+     * permissions for @bs has changed to @perm, and the set of sharable
+     * permissions to @shared. The driver can use this to propagate changes to
+     * its children (i.e. request permissions only if a parent actually needs
+     * them).
+     *
+     * This function is only invoked after bdrv_check_perm(), so block drivers
+     * may rely on preparations made in their .bdrv_check_perm implementation.
+     */
+    void (*bdrv_set_perm)(BlockDriverState *bs, uint64_t perm, uint64_t shared);
+
+    /*
+     * Called to inform the driver that after a previous bdrv_check_perm()
+     * call, the permission update is not performed and any preparations made
+     * for it (e.g. taken file locks) need to be undone.
+     *
+     * This function can be called even for nodes that never saw a
+     * bdrv_check_perm() call. It is a no-op then.
+     */
+    void (*bdrv_abort_perm_update)(BlockDriverState *bs);
+
+    /**
+     * Returns in @nperm and @nshared the permissions that the driver for @bs
+     * needs on its child @c, based on the cumulative permissions requested by
+     * the parents in @parent_perm and @parent_shared.
+     *
+     * If @c is NULL, return the permissions for attaching a new child for the
+     * given @child_class and @role.
+     *
+     * If @reopen_queue is non-NULL, don't return the currently needed
+     * permissions, but those that will be needed after applying the
+     * @reopen_queue.
+     */
+    void (*bdrv_child_perm)(BlockDriverState *bs, BdrvChild *c,
+                            BdrvChildRole role,
+                            BlockReopenQueue *reopen_queue,
+                            uint64_t parent_perm, uint64_t parent_shared,
+                            uint64_t *nperm, uint64_t *nshared);
+
+    bool (*bdrv_supports_persistent_dirty_bitmap)(BlockDriverState *bs);
+    bool (*bdrv_co_can_store_new_dirty_bitmap)(BlockDriverState *bs,
+                                               const char *name,
+                                               uint32_t granularity,
+                                               Error **errp);
+    int (*bdrv_co_remove_persistent_dirty_bitmap)(BlockDriverState *bs,
+                                                  const char *name,
+                                                  Error **errp);
+
+    /**
+     * Register/unregister a buffer for I/O. Useful for example when the
+     * driver is interested in knowing the memory areas that will later be
+     * used in iovs, so that it can do IOMMU mapping with VFIO etc. in
+     * order to get better performance. In the case of VFIO drivers, this
+     * callback is used to do DMA mapping for hot buffers.
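+     *
+     * A sketch of such a pair (my_iommu_map/my_iommu_unmap stand in for
+     * whatever mapping primitive the driver uses; hypothetical names):
+     *
+     *     static void my_register_buf(BlockDriverState *bs, void *host,
+     *                                 size_t size)
+     *     {
+     *         my_iommu_map(bs->opaque, host, size);
+     *     }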
+     */
+    void (*bdrv_register_buf)(BlockDriverState *bs, void *host, size_t size);
+    void (*bdrv_unregister_buf)(BlockDriverState *bs, void *host);
+    QLIST_ENTRY(BlockDriver) list;
+
+    /* Pointer to a NULL-terminated array of names of strong options
+     * that can be specified for bdrv_open(). A strong option is one
+     * that changes the data of a BDS.
+     * If this pointer is NULL, the array is considered empty.
+     * "filename" and "driver" are always considered strong. */
+    const char *const *strong_runtime_opts;
+};
+
+static inline bool block_driver_can_compress(BlockDriver *drv)
+{
+    return drv->bdrv_co_pwritev_compressed ||
+           drv->bdrv_co_pwritev_compressed_part;
+}
+
+typedef struct BlockLimits {
+    /* Alignment requirement, in bytes, for offset/length of I/O
+     * requests. Must be a power of 2 less than INT_MAX; defaults to
+     * 1 for drivers with modern byte interfaces, and to 512
+     * otherwise. */
+    uint32_t request_alignment;
+
+    /*
+     * Maximum number of bytes that can be discarded at once. Must be a
+     * multiple of pdiscard_alignment, but need not be a power of 2. May
+     * be 0 if there is no inherent 64-bit limit.
+     */
+    int64_t max_pdiscard;
+
+    /* Optimal alignment for discard requests in bytes. A power of 2
+     * is best but not mandatory. Must be a multiple of
+     * bl.request_alignment, and must be less than max_pdiscard if
+     * that is set. May be 0 if bl.request_alignment is good enough */
+    uint32_t pdiscard_alignment;
+
+    /*
+     * Maximum number of bytes that can be zeroed at once. Must be a
+     * multiple of pwrite_zeroes_alignment. 0 means no limit.
+     */
+    int64_t max_pwrite_zeroes;
+
+    /* Optimal alignment for write zeroes requests in bytes. A power
+     * of 2 is best but not mandatory. Must be a multiple of
+     * bl.request_alignment, and must be less than max_pwrite_zeroes
+     * if that is set. May be 0 if bl.request_alignment is good
+     * enough */
+    uint32_t pwrite_zeroes_alignment;
+
+    /* Optimal transfer length in bytes. A power of 2 is best but not
+     * mandatory. Must be a multiple of bl.request_alignment, or 0 if
+     * no preferred size */
+    uint32_t opt_transfer;
+
+    /* Maximal transfer length in bytes. Need not be a power of 2, but
+     * must be a multiple of opt_transfer and bl.request_alignment, or 0
+     * for no 32-bit limit. For now, anything larger than INT_MAX is
+     * clamped down. */
+    uint32_t max_transfer;
+
+    /* Maximal hardware transfer length in bytes. Applies whenever
+     * transfers to the device bypass the kernel I/O scheduler, for
+     * example with SG_IO. If larger than max_transfer or if zero,
+     * blk_get_max_hw_transfer will fall back to max_transfer.
+     */
+    uint64_t max_hw_transfer;
+
+    /* Maximal number of scatter/gather elements allowed by the hardware.
+     * Applies whenever transfers to the device bypass the kernel I/O
+     * scheduler, for example with SG_IO. If larger than max_iov
+     * or if zero, blk_get_max_hw_iov will fall back to max_iov.
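+     *
+     * Limits such as these are typically filled in by the driver's
+     * .bdrv_refresh_limits callback, e.g. (a sketch with arbitrary
+     * illustrative values; my_refresh_limits is a hypothetical name):
+     *
+     *     static void my_refresh_limits(BlockDriverState *bs, Error **errp)
+     *     {
+     *         bs->bl.request_alignment = 512;
+     *         bs->bl.max_transfer = 2 * MiB;
+     *     }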
+ */ + int max_hw_iov; + + /* memory alignment, in bytes so that no bounce buffer is needed */ + size_t min_mem_alignment; + + /* memory alignment, in bytes, for bounce buffer */ + size_t opt_mem_alignment; + + /* maximum number of iovec elements */ + int max_iov; +} BlockLimits; + +typedef struct BdrvOpBlocker BdrvOpBlocker; + +typedef struct BdrvAioNotifier { + void (*attached_aio_context)(AioContext *new_context, void *opaque); + void (*detach_aio_context)(void *opaque); + + void *opaque; + bool deleted; + + QLIST_ENTRY(BdrvAioNotifier) list; +} BdrvAioNotifier; + +struct BdrvChildClass { + /* If true, bdrv_replace_node() doesn't change the node this BdrvChild + * points to. */ + bool stay_at_node; + + /* If true, the parent is a BlockDriverState and bdrv_next_all_states() + * will return it. This information is used for drain_all, where every node + * will be drained separately, so the drain only needs to be propagated to + * non-BDS parents. */ + bool parent_is_bds; + + void (*inherit_options)(BdrvChildRole role, bool parent_is_format, + int *child_flags, QDict *child_options, + int parent_flags, QDict *parent_options); + + void (*change_media)(BdrvChild *child, bool load); + void (*resize)(BdrvChild *child); + + /* Returns a name that is supposedly more useful for human users than the + * node name for identifying the node in question (in particular, a BB + * name), or NULL if the parent can't provide a better name. */ + const char *(*get_name)(BdrvChild *child); + + /* Returns a malloced string that describes the parent of the child for a + * human reader. This could be a node-name, BlockBackend name, qdev ID or + * QOM path of the device owning the BlockBackend, job type and ID etc. The + * caller is responsible for freeing the memory. */ + char *(*get_parent_desc)(BdrvChild *child); + + /* + * If this pair of functions is implemented, the parent doesn't issue new + * requests after returning from .drained_begin() until .drained_end() is + * called. + * + * These functions must not change the graph (and therefore also must not + * call aio_poll(), which could change the graph indirectly). + * + * If drained_end() schedules background operations, it must atomically + * increment *drained_end_counter for each such operation and atomically + * decrement it once the operation has settled. + * + * Note that this can be nested. If drained_begin() was called twice, new + * I/O is allowed only after drained_end() was called twice, too. + */ + void (*drained_begin)(BdrvChild *child); + void (*drained_end)(BdrvChild *child, int *drained_end_counter); + + /* + * Returns whether the parent has pending requests for the child. This + * callback is polled after .drained_begin() has been called until all + * activity on the child has stopped. + */ + bool (*drained_poll)(BdrvChild *child); + + /* Notifies the parent that the child has been activated/inactivated (e.g. + * when migration is completing) and it can start/stop requesting + * permissions and doing I/O on it. */ + void (*activate)(BdrvChild *child, Error **errp); + int (*inactivate)(BdrvChild *child); + + void (*attach)(BdrvChild *child); + void (*detach)(BdrvChild *child); + + /* Notifies the parent that the filename of its child has changed (e.g. + * because the direct child was removed from the backing chain), so that it + * can update its reference. 
*/ + int (*update_filename)(BdrvChild *child, BlockDriverState *new_base, + const char *filename, Error **errp); + + bool (*can_set_aio_ctx)(BdrvChild *child, AioContext *ctx, + GSList **ignore, Error **errp); + void (*set_aio_ctx)(BdrvChild *child, AioContext *ctx, GSList **ignore); + + AioContext *(*get_parent_aio_context)(BdrvChild *child); +}; + +extern const BdrvChildClass child_of_bds; + +struct BdrvChild { + BlockDriverState *bs; + char *name; + const BdrvChildClass *klass; + BdrvChildRole role; + void *opaque; + + /** + * Granted permissions for operating on this BdrvChild (BLK_PERM_* bitmask) + */ + uint64_t perm; + + /** + * Permissions that can still be granted to other users of @bs while this + * BdrvChild is still attached to it. (BLK_PERM_* bitmask) + */ + uint64_t shared_perm; + + /* + * This link is frozen: the child can neither be replaced nor + * detached from the parent. + */ + bool frozen; + + /* + * How many times the parent of this child has been drained + * (through klass->drained_*). + * Usually, this is equal to bs->quiesce_counter (potentially + * reduced by bdrv_drain_all_count). It may differ while the + * child is entering or leaving a drained section. + */ + int parent_quiesce_counter; + + QLIST_ENTRY(BdrvChild) next; + QLIST_ENTRY(BdrvChild) next_parent; +}; + +/* + * Allows bdrv_co_block_status() to cache one data region for a + * protocol node. + * + * @valid: Whether the cache is valid (should be accessed with atomic + * functions so this can be reset by RCU readers) + * @data_start: Offset where we know (or strongly assume) is data + * @data_end: Offset where the data region ends (which is not necessarily + * the start of a zeroed region) + */ +typedef struct BdrvBlockStatusCache { + struct rcu_head rcu; + + bool valid; + int64_t data_start; + int64_t data_end; +} BdrvBlockStatusCache; + +struct BlockDriverState { + /* Protected by big QEMU lock or read-only after opening. No special + * locking needed during I/O... + */ + int open_flags; /* flags used to open the file, re-used for re-open */ + bool encrypted; /* if true, the media is encrypted */ + bool sg; /* if true, the device is a /dev/sg* */ + bool probed; /* if true, format was probed rather than specified */ + bool force_share; /* if true, always allow all shared permissions */ + bool implicit; /* if true, this filter node was automatically inserted */ + + BlockDriver *drv; /* NULL means no media */ + void *opaque; + + AioContext *aio_context; /* event loop used for fd handlers, timers, etc */ + /* long-running tasks intended to always use the same AioContext as this + * BDS may register themselves in this list to be notified of changes + * regarding this BDS's context */ + QLIST_HEAD(, BdrvAioNotifier) aio_notifiers; + bool walking_aio_notifiers; /* to make removal during iteration safe */ + + char filename[PATH_MAX]; + /* + * If not empty, this image is a diff in relation to backing_file. + * Note that this is the name given in the image header and + * therefore may or may not be equal to .backing->bs->filename. + * If this field contains a relative path, it is to be resolved + * relatively to the overlay's location. + */ + char backing_file[PATH_MAX]; + /* + * The backing filename indicated by the image header. Contrary + * to backing_file, if we ever open this file, auto_backing_file + * is replaced by the resulting BDS's filename (i.e. after a + * bdrv_refresh_filename() run). 
+     */
+    char auto_backing_file[PATH_MAX];
+    char backing_format[16]; /* if non-zero and backing_file exists */
+
+    QDict *full_open_options;
+    char exact_filename[PATH_MAX];
+
+    BdrvChild *backing;
+    BdrvChild *file;
+
+    /* I/O Limits */
+    BlockLimits bl;
+
+    /*
+     * Flags honored during pread
+     */
+    unsigned int supported_read_flags;
+    /* Flags honored during pwrite (so far: BDRV_REQ_FUA,
+     * BDRV_REQ_WRITE_UNCHANGED).
+     * If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those
+     * writes will be issued as normal writes without the flag set.
+     * This is important to note for drivers that do not explicitly
+     * request a WRITE permission for their children and instead take
+     * the same permissions as their parent did (this is commonly what
+     * block filters do). Such drivers have to be aware that the
+     * parent may have taken a WRITE_UNCHANGED permission only and is
+     * issuing such requests. Drivers either must make sure that
+     * these requests do not result in plain WRITE accesses (usually
+     * by supporting BDRV_REQ_WRITE_UNCHANGED, and then forwarding
+     * every incoming write request as-is, including potentially that
+     * flag), or they have to explicitly take the WRITE permission for
+     * their children. */
+    unsigned int supported_write_flags;
+    /* Flags honored during pwrite_zeroes (so far: BDRV_REQ_FUA,
+     * BDRV_REQ_MAY_UNMAP, BDRV_REQ_WRITE_UNCHANGED) */
+    unsigned int supported_zero_flags;
+    /*
+     * Flags honored during truncate (so far: BDRV_REQ_ZERO_WRITE).
+     *
+     * If BDRV_REQ_ZERO_WRITE is given, the truncate operation must make sure
+     * that any added space reads as all zeros. If this can't be guaranteed,
+     * the operation must fail.
+     */
+    unsigned int supported_truncate_flags;
+
+    /* the following member gives a name to every node on the bs graph. */
+    char node_name[32];
+    /* element of the list of named nodes building the graph */
+    QTAILQ_ENTRY(BlockDriverState) node_list;
+    /* element of the list of all BlockDriverStates (all_bdrv_states) */
+    QTAILQ_ENTRY(BlockDriverState) bs_list;
+    /* element of the list of monitor-owned BDS */
+    QTAILQ_ENTRY(BlockDriverState) monitor_list;
+    int refcnt;
+
+    /* operation blockers */
+    QLIST_HEAD(, BdrvOpBlocker) op_blockers[BLOCK_OP_TYPE_MAX];
+
+    /* The node that this node inherited default options from (and a reopen on
+     * which can affect this node by changing these defaults). This is always a
+     * parent node of this node. */
+    BlockDriverState *inherits_from;
+    QLIST_HEAD(, BdrvChild) children;
+    QLIST_HEAD(, BdrvChild) parents;
+
+    QDict *options;
+    QDict *explicit_options;
+    BlockdevDetectZeroesOptions detect_zeroes;
+
+    /* The error object in use for blocking operations on backing_hd */
+    Error *backing_blocker;
+
+    /* Protected by AioContext lock */
+
+    /* If we are reading a disk image, give its size in sectors.
+     * Generally read-only; it is written to by load_snapshot and
+     * save_snapshot, but the block layer is quiescent during those.
+     */
+    int64_t total_sectors;
+
+    /* threshold limit for writes, in bytes. "High water mark". */
+    uint64_t write_threshold_offset;
+
+    /* Writing to the list requires the BQL _and_ the dirty_bitmap_mutex.
+     * Reading from the list can be done with either the BQL or the
+     * dirty_bitmap_mutex. Modifying a bitmap only requires
+     * dirty_bitmap_mutex. */
+    QemuMutex dirty_bitmap_mutex;
+    QLIST_HEAD(, BdrvDirtyBitmap) dirty_bitmaps;
+
+    /* Offset after the highest byte written to */
+    Stat64 wr_highest_offset;
+
+    /* If true, copy read backing sectors into image. Can be >1 if more
+     * than one client has requested copy-on-read. Accessed with atomic
+     * ops.
+     */
+    int copy_on_read;
+
+    /* number of in-flight requests; overall and serialising.
+     * Accessed with atomic ops.
+     */
+    unsigned int in_flight;
+    unsigned int serialising_in_flight;
+
+    /* counter for nested bdrv_io_plug.
+     * Accessed with atomic ops.
+     */
+    unsigned io_plugged;
+
+    /* do we need to tell the guest if we have a volatile write cache? */
+    int enable_write_cache;
+
+    /* Accessed with atomic ops. */
+    int quiesce_counter;
+    int recursive_quiesce_counter;
+
+    unsigned int write_gen; /* Current data generation */
+
+    /* Protected by reqs_lock. */
+    CoMutex reqs_lock;
+    QLIST_HEAD(, BdrvTrackedRequest) tracked_requests;
+    CoQueue flush_queue; /* Serializing flush queue */
+    bool active_flush_req; /* Flush request in flight? */
+
+    /* Only read/written by whoever has set active_flush_req to true. */
+    unsigned int flushed_gen; /* Flushed write generation */
+
+    /* BdrvChild links to this node may never be frozen */
+    bool never_freeze;
+
+    /* Lock for block-status cache RCU writers */
+    CoMutex bsc_modify_lock;
+    /* Always non-NULL, but must only be dereferenced under an RCU read guard */
+    BdrvBlockStatusCache *block_status_cache;
+};
+
+struct BlockBackendRootState {
+    int open_flags;
+    BlockdevDetectZeroesOptions detect_zeroes;
+};
+
+typedef enum BlockMirrorBackingMode {
+    /* Reuse the existing backing chain from the source for the target.
+     * - sync=full: Set backing BDS to NULL.
+     * - sync=top: Use source's backing BDS.
+     * - sync=none: Use source as the backing BDS. */
+    MIRROR_SOURCE_BACKING_CHAIN,
+
+    /* Open the target's backing chain completely anew */
+    MIRROR_OPEN_BACKING_CHAIN,
+
+    /* Do not change the target's backing BDS after job completion */
+    MIRROR_LEAVE_BACKING_CHAIN,
+} BlockMirrorBackingMode;
+
+
+/* Essential block drivers which must always be statically linked into qemu, and
+ * which therefore can be accessed without using bdrv_find_format() */
+extern BlockDriver bdrv_file;
+extern BlockDriver bdrv_raw;
+extern BlockDriver bdrv_qcow2;
+
+int coroutine_fn bdrv_co_preadv(BdrvChild *child,
+    int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+    BdrvRequestFlags flags);
+int coroutine_fn bdrv_co_preadv_part(BdrvChild *child,
+    int64_t offset, int64_t bytes,
+    QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags);
+int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
+    int64_t offset, int64_t bytes, QEMUIOVector *qiov,
+    BdrvRequestFlags flags);
+int coroutine_fn bdrv_co_pwritev_part(BdrvChild *child,
+    int64_t offset, int64_t bytes,
+    QEMUIOVector *qiov, size_t qiov_offset, BdrvRequestFlags flags);
+
+static inline int coroutine_fn bdrv_co_pread(BdrvChild *child,
+    int64_t offset, unsigned int bytes, void *buf, BdrvRequestFlags flags)
+{
+    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
+
+    return bdrv_co_preadv(child, offset, bytes, &qiov, flags);
+}
+
+static inline int coroutine_fn bdrv_co_pwrite(BdrvChild *child,
+    int64_t offset, unsigned int bytes, void *buf, BdrvRequestFlags flags)
+{
+    QEMUIOVector qiov = QEMU_IOVEC_INIT_BUF(qiov, buf, bytes);
+
+    return bdrv_co_pwritev(child, offset, bytes, &qiov, flags);
+}
+
+extern unsigned int bdrv_drain_all_count;
+void bdrv_apply_subtree_drain(BdrvChild *child, BlockDriverState *new_parent);
+void bdrv_unapply_subtree_drain(BdrvChild *child, BlockDriverState *old_parent);
+
+bool coroutine_fn bdrv_make_request_serialising(BdrvTrackedRequest *req,
+                                                uint64_t align);
+BdrvTrackedRequest *coroutine_fn bdrv_co_get_self_request(BlockDriverState *bs);
+
+int get_tmp_filename(char *filename, int size);
+BlockDriver *bdrv_probe_all(const uint8_t *buf, int buf_size,
+ const char *filename);
+
+void bdrv_parse_filename_strip_prefix(const char *filename, const char *prefix,
+ QDict *options);
+
+bool bdrv_backing_overridden(BlockDriverState *bs);
+
+
+/**
+ * bdrv_add_aio_context_notifier:
+ *
+ * If a long-running job intends to be always run in the same AioContext as a
+ * certain BDS, it may use this function to be notified of changes regarding the
+ * association of the BDS to an AioContext.
+ *
+ * attached_aio_context() is called after the target BDS has been attached to a
+ * new AioContext; detach_aio_context() is called before the target BDS is being
+ * detached from its old AioContext.
+ */
+void bdrv_add_aio_context_notifier(BlockDriverState *bs,
+ void (*attached_aio_context)(AioContext *new_context, void *opaque),
+ void (*detach_aio_context)(void *opaque), void *opaque);
+
+/**
+ * bdrv_remove_aio_context_notifier:
+ *
+ * Unsubscribe from change notifications regarding the BDS's AioContext. The
+ * parameters given here have to be the same as those given to
+ * bdrv_add_aio_context_notifier().
+ */
+void bdrv_remove_aio_context_notifier(BlockDriverState *bs,
+ void (*aio_context_attached)(AioContext *,
+ void *),
+ void (*aio_context_detached)(void *),
+ void *opaque);
+
+/**
+ * bdrv_wakeup:
+ * @bs: The BlockDriverState for which an I/O operation has been completed.
+ *
+ * Wake up the main thread if it is waiting on BDRV_POLL_WHILE. During
+ * synchronous I/O on a BlockDriverState that is attached to another
+ * I/O thread, the main thread lets the I/O thread's event loop run,
+ * waiting for the I/O operation to complete. A bdrv_wakeup will wake
+ * up the main thread if necessary.
+ *
+ * Manual calls to bdrv_wakeup are rarely necessary, because
+ * bdrv_dec_in_flight already calls it.
+ */
+void bdrv_wakeup(BlockDriverState *bs);
+
+#ifdef _WIN32
+int is_windows_drive(const char *filename);
+#endif
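A hedged sketch of the AioContext notifier pair above in action: a long-running user tracks which context its BDS currently runs in and re-arms its work there. MyWorker and its fields are hypothetical placeholders:

    typedef struct MyWorker {
        AioContext *ctx;    /* context the BDS is currently attached to */
    } MyWorker;

    static void my_attached_aio_context(AioContext *new_context, void *opaque)
    {
        MyWorker *w = opaque;

        w->ctx = new_context;   /* resume scheduling work in the new context */
    }

    static void my_detach_aio_context(void *opaque)
    {
        MyWorker *w = opaque;

        w->ctx = NULL;          /* quiesce until re-attached */
    }

    /* Registration and the later symmetric teardown use identical arguments: */
    bdrv_add_aio_context_notifier(bs, my_attached_aio_context,
                                  my_detach_aio_context, w);
    bdrv_remove_aio_context_notifier(bs, my_attached_aio_context,
                                     my_detach_aio_context, w);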
+
+/**
+ * stream_start:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Block device to operate on.
+ * @base: Block device that will become the new base, or %NULL to
+ * flatten the whole backing file chain onto @bs.
+ * @backing_file_str: The file name that will be written to @bs as
+ * the new backing file if the job completes. Ignored if @base is %NULL.
+ * @bottom: The last node in the chain that should be streamed into
+ * @bs; an alternative to specifying @base.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_error: The action to take upon error.
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the stream job inserts into the graph above
+ * @bs. NULL means that a node name should be autogenerated.
+ * @errp: Error object.
+ *
+ * Start a streaming operation on @bs. Clusters that are unallocated
+ * in @bs, but allocated in any image between @base and @bs (both
+ * exclusive) will be written to @bs. At the end of a successful
+ * streaming job, the backing file of @bs will be changed to
+ * @backing_file_str in the written image and to @base in the live
+ * BlockDriverState.
+ */
+void stream_start(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *base, const char *backing_file_str,
+ BlockDriverState *bottom,
+ int creation_flags, int64_t speed,
+ BlockdevOnError on_error,
+ const char *filter_node_name,
+ Error **errp);
+
+/**
+ * commit_start:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Active block device.
+ * @top: Top block device to be committed.
+ * @base: Block device that will be written into, and become the new top.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_error: The action to take upon error.
+ * @backing_file_str: String to use as the backing file in @top's overlay
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the commit job inserts into the graph above @top. NULL means
+ * that a node name should be autogenerated.
+ * @errp: Error object.
+ *
+ */
+void commit_start(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *base, BlockDriverState *top,
+ int creation_flags, int64_t speed,
+ BlockdevOnError on_error, const char *backing_file_str,
+ const char *filter_node_name, Error **errp);
+/**
+ * commit_active_start:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Active block device to be committed.
+ * @base: Block device that will be written into, and become the new top.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @on_error: The action to take upon error.
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the commit job inserts into the graph above @bs. NULL means that
+ * a node name should be autogenerated.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @auto_complete: Automatically complete the job.
+ * @errp: Error object.
+ *
+ */
+BlockJob *commit_active_start(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *base, int creation_flags,
+ int64_t speed, BlockdevOnError on_error,
+ const char *filter_node_name,
+ BlockCompletionFunc *cb, void *opaque,
+ bool auto_complete, Error **errp);
+/*
+ * mirror_start:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Block device to operate on.
+ * @target: Block device to write to.
+ * @replaces: Block graph node name to replace once the mirror is done. Can
+ * only be used when full mirroring is selected.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @granularity: The chosen granularity for the dirty bitmap.
+ * @buf_size: The amount of data that can be in flight at one time.
+ * @mode: Whether to collapse all images in the chain to the target.
+ * @backing_mode: How to establish the target's backing chain after completion.
+ * @zero_target: Whether the target should be explicitly zero-initialized
+ * @on_source_error: The action to take upon error reading from the source.
+ * @on_target_error: The action to take upon error writing to the target.
+ * @unmap: Whether to unmap target where source sectors only contain zeroes.
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the mirror job inserts into the graph above @bs. NULL means that
+ * a node name should be autogenerated.
+ * @copy_mode: When to trigger writes to the target.
+ * @errp: Error object.
+ *
+ * Start a mirroring operation on @bs. Clusters that are allocated
+ * in @bs will be written to @target until the job is cancelled or
+ * manually completed. At the end of a successful mirroring job,
+ * @bs will be switched to read from @target.
+ */
+void mirror_start(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *target, const char *replaces,
+ int creation_flags, int64_t speed,
+ uint32_t granularity, int64_t buf_size,
+ MirrorSyncMode mode, BlockMirrorBackingMode backing_mode,
+ bool zero_target,
+ BlockdevOnError on_source_error,
+ BlockdevOnError on_target_error,
+ bool unmap, const char *filter_node_name,
+ MirrorCopyMode copy_mode, Error **errp);
+
+/*
+ * backup_job_create:
+ * @job_id: The id of the newly-created job, or %NULL to use the
+ * device name of @bs.
+ * @bs: Block device to operate on.
+ * @target: Block device to write to.
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @sync_mode: What parts of the disk image should be copied to the destination.
+ * @sync_bitmap: The dirty bitmap if sync_mode is 'bitmap' or 'incremental'
+ * @bitmap_mode: The bitmap synchronization policy to use.
+ * @compress: Whether to compress data written to @target.
+ * @filter_node_name: The node name that should be assigned to the filter
+ * driver that the backup job inserts into the graph above @bs. NULL means
+ * that a node name should be autogenerated.
+ * @perf: Performance options. All fields are assumed to be valid;
+ * the ".has_*" members are ignored.
+ * @on_source_error: The action to take upon error reading from the source.
+ * @on_target_error: The action to take upon error writing to the target.
+ * @creation_flags: Flags that control the behavior of the Job lifetime.
+ * See @BlockJobCreateFlags
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @txn: Transaction that this job is part of (may be NULL).
+ * @errp: Error object.
+ *
+ * Create a backup operation on @bs. Clusters in @bs are written to @target
+ * until the job is cancelled or manually completed.
+ */
+BlockJob *backup_job_create(const char *job_id, BlockDriverState *bs,
+ BlockDriverState *target, int64_t speed,
+ MirrorSyncMode sync_mode,
+ BdrvDirtyBitmap *sync_bitmap,
+ BitmapSyncMode bitmap_mode,
+ bool compress,
+ const char *filter_node_name,
+ BackupPerf *perf,
+ BlockdevOnError on_source_error,
+ BlockdevOnError on_target_error,
+ int creation_flags,
+ BlockCompletionFunc *cb, void *opaque,
+ JobTxn *txn, Error **errp);
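To make the long parameter list concrete, here is a hedged sketch of starting a full backup with defaults; the job id, BackupPerf settings and error policies are illustrative values, not requirements:

    static BlockJob *start_full_backup(BlockDriverState *bs,
                                       BlockDriverState *target, Error **errp)
    {
        BackupPerf perf = { .max_workers = 64 };    /* illustrative tuning */

        /* A full backup copies the whole device, so no bitmap is passed;
         * bitmap_mode is then irrelevant. */
        return backup_job_create("backup0", bs, target, 0 /* speed: unlimited */,
                                 MIRROR_SYNC_MODE_FULL,
                                 NULL /* sync_bitmap */,
                                 BITMAP_SYNC_MODE_ON_SUCCESS /* unused */,
                                 false /* compress */,
                                 NULL /* filter_node_name: autogenerate */,
                                 &perf,
                                 BLOCKDEV_ON_ERROR_REPORT,
                                 BLOCKDEV_ON_ERROR_REPORT,
                                 JOB_DEFAULT /* creation_flags */,
                                 NULL /* cb */, NULL /* opaque */,
                                 NULL /* txn */, errp);
    }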
+
+BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
+ const char *child_name,
+ const BdrvChildClass *child_class,
+ BdrvChildRole child_role,
+ uint64_t perm, uint64_t shared_perm,
+ void *opaque, Error **errp);
+void bdrv_root_unref_child(BdrvChild *child);
+
+void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
+ uint64_t *shared_perm);
+
+/**
+ * Sets a BdrvChild's permissions. Avoid if the parent is a BDS; use
+ * bdrv_child_refresh_perms() instead and make the parent's
+ * .bdrv_child_perm() implementation return the correct values.
+ */
+int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared,
+ Error **errp);
+
+/**
+ * Calls bs->drv->bdrv_child_perm() and updates the child's permission
+ * masks with the result.
+ * Drivers should invoke this function whenever an event occurs that
+ * makes their .bdrv_child_perm() implementation return different
+ * values than before, but which will not result in the block layer
+ * automatically refreshing the permissions.
+ */
+int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp);
+
+bool bdrv_recurse_can_replace(BlockDriverState *bs,
+ BlockDriverState *to_replace);
+
+/*
+ * Default implementation for BlockDriver.bdrv_child_perm() that can
+ * be used by block filters and image formats, as long as they use the
+ * child_of_bds child class and set an appropriate BdrvChildRole.
+ */
+void bdrv_default_perms(BlockDriverState *bs, BdrvChild *c,
+ BdrvChildRole role, BlockReopenQueue *reopen_queue,
+ uint64_t perm, uint64_t shared,
+ uint64_t *nperm, uint64_t *nshared);
+
+const char *bdrv_get_parent_name(const BlockDriverState *bs);
+void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp);
+bool blk_dev_has_removable_media(BlockBackend *blk);
+bool blk_dev_has_tray(BlockBackend *blk);
+void blk_dev_eject_request(BlockBackend *blk, bool force);
+bool blk_dev_is_tray_open(BlockBackend *blk);
+bool blk_dev_is_medium_locked(BlockBackend *blk);
+
+void bdrv_set_dirty(BlockDriverState *bs, int64_t offset, int64_t bytes);
+
+void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
+void bdrv_restore_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *backup);
+bool bdrv_dirty_bitmap_merge_internal(BdrvDirtyBitmap *dest,
+ const BdrvDirtyBitmap *src,
+ HBitmap **backup, bool lock);
+
+void bdrv_inc_in_flight(BlockDriverState *bs);
+void bdrv_dec_in_flight(BlockDriverState *bs);
+
+void blockdev_close_all_bdrv_states(void);
+
+int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, int64_t src_offset,
+ BdrvChild *dst, int64_t dst_offset,
+ int64_t bytes,
+ BdrvRequestFlags read_flags,
+ BdrvRequestFlags write_flags);
+int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, int64_t src_offset,
+ BdrvChild *dst, int64_t dst_offset,
+ int64_t bytes,
+ BdrvRequestFlags read_flags,
+ BdrvRequestFlags write_flags);
+
+int refresh_total_sectors(BlockDriverState *bs, int64_t hint);
+
+void bdrv_set_monitor_owned(BlockDriverState *bs);
+BlockDriverState *bds_tree_init(QDict *bs_opts, Error **errp);
+
+/**
+ * Simple implementation of bdrv_co_create_opts for protocol drivers
+ * which only support creation via opening a file
+ * (usually an existing raw storage device)
+ */
+int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,
+ const char *filename,
+ QemuOpts *opts,
+ Error **errp);
+extern QemuOptsList bdrv_create_opts_simple;
+
+BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node,
+ const char *name,
+ BlockDriverState **pbs,
+ Error **errp);
+BdrvDirtyBitmap *block_dirty_bitmap_merge(const char *node, const char *target,
+ BlockDirtyBitmapMergeSourceList *bms,
+ HBitmap **backup, Error **errp);
+BdrvDirtyBitmap *block_dirty_bitmap_remove(const char *node, const char *name,
+ bool release,
+ BlockDriverState **bitmap_bs,
+ Error **errp);
+
+BdrvChild *bdrv_cow_child(BlockDriverState *bs);
+BdrvChild *bdrv_filter_child(BlockDriverState *bs);
+BdrvChild *bdrv_filter_or_cow_child(BlockDriverState *bs);
+BdrvChild *bdrv_primary_child(BlockDriverState *bs);
+BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs);
+BlockDriverState *bdrv_skip_filters(BlockDriverState *bs);
+BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs);
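The accessors above compose into simple graph walks. A hedged sketch (assuming the usual invariant that bdrv_backing_chain_next() steps to the next COW node while skipping filters) that locates the base image below @bs:

    static BlockDriverState *find_base(BlockDriverState *bs)
    {
        BlockDriverState *base = bdrv_skip_filters(bs);

        /* bdrv_cow_bs(), defined just below, returns NULL once there is
         * no backing (COW) child any more. */
        while (bdrv_cow_bs(base)) {
            base = bdrv_backing_chain_next(base);
        }
        return base;
    }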
+
+static inline BlockDriverState *child_bs(BdrvChild *child)
+{
+ return child ? child->bs : NULL;
+}
+
+static inline BlockDriverState *bdrv_cow_bs(BlockDriverState *bs)
+{
+ return child_bs(bdrv_cow_child(bs));
+}
+
+static inline BlockDriverState *bdrv_filter_bs(BlockDriverState *bs)
+{
+ return child_bs(bdrv_filter_child(bs));
+}
+
+static inline BlockDriverState *bdrv_filter_or_cow_bs(BlockDriverState *bs)
+{
+ return child_bs(bdrv_filter_or_cow_child(bs));
+}
+
+static inline BlockDriverState *bdrv_primary_bs(BlockDriverState *bs)
+{
+ return child_bs(bdrv_primary_child(bs));
+}
+
+/**
+ * End all quiescent sections started by bdrv_drain_all_begin(). This is
+ * needed when deleting a BDS before bdrv_drain_all_end() is called.
+ *
+ * NOTE: this is an internal helper for bdrv_close() *only*. No one else
+ * should call it.
+ */
+void bdrv_drain_all_end_quiesce(BlockDriverState *bs);
+
+/**
+ * Check whether the given offset is in the cached block-status data
+ * region.
+ *
+ * If it is, and @pnum is not NULL, *pnum is set to
+ * `bsc.data_end - offset`, i.e. how many bytes, starting from
+ * @offset, are data (according to the cache).
+ * Otherwise, *pnum is not touched.
+ */
+bool bdrv_bsc_is_data(BlockDriverState *bs, int64_t offset, int64_t *pnum);
+
+/**
+ * If [offset, offset + bytes) overlaps with the currently cached
+ * block-status region, invalidate the cache.
+ *
+ * (To be used by I/O paths that cause data regions to be zero or
+ * holes.)
+ */
+void bdrv_bsc_invalidate_range(BlockDriverState *bs,
+ int64_t offset, int64_t bytes);
+
+/**
+ * Mark the range [offset, offset + bytes) as a data region.
+ */
+void bdrv_bsc_fill(BlockDriverState *bs, int64_t offset, int64_t bytes);
+
+#endif /* BLOCK_INT_H */
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
new file mode 100644
index 000000000..d200f33c1
--- /dev/null
+++ b/include/block/blockjob.h
@@ -0,0 +1,176 @@
+/*
+ * Declarations for long-running block device operations
+ *
+ * Copyright (c) 2011 IBM Corp.
+ * Copyright (c) 2012 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCKJOB_H
+#define BLOCKJOB_H
+
+#include "qemu/job.h"
+#include "block/block.h"
+#include "qemu/ratelimit.h"
+
+#define BLOCK_JOB_SLICE_TIME 100000000ULL /* ns */
+
+typedef struct BlockJobDriver BlockJobDriver;
+
+/**
+ * BlockJob:
+ *
+ * Long-running operation on a BlockDriverState.
+ */
+typedef struct BlockJob {
+ /** Data belonging to the generic Job infrastructure */
+ Job job;
+
+ /** The block device on which the job is operating. */
+ BlockBackend *blk;
+
+ /** Status that is published by the query-block-jobs QMP API */
+ BlockDeviceIoStatus iostatus;
+
+ /** Speed that was set with @block_job_set_speed. */
+ int64_t speed;
+
+ /** Rate limiting data structure for implementing @speed. */
+ RateLimit limit;
+
+ /** Block other operations when block job is running */
+ Error *blocker;
+
+ /** Called when a cancelled job is finalised. */
+ Notifier finalize_cancelled_notifier;
+
+ /** Called when a successfully completed job is finalised. */
+ Notifier finalize_completed_notifier;
+
+ /** Called when the job transitions to PENDING */
+ Notifier pending_notifier;
+
+ /** Called when the job transitions to READY */
+ Notifier ready_notifier;
+
+ /** Called when the job coroutine yields or terminates */
+ Notifier idle_notifier;
+
+ /** BlockDriverStates that are involved in this block job */
+ GSList *nodes;
+} BlockJob;
+
+/**
+ * block_job_next:
+ * @job: A block job, or %NULL.
+ *
+ * Get the next element from the list of block jobs after @job, or the
+ * first one if @job is %NULL.
+ *
+ * Returns the requested job, or %NULL if there are no more jobs left.
+ */
+BlockJob *block_job_next(BlockJob *job);
+
+/**
+ * block_job_get:
+ * @id: The id of the block job.
+ *
+ * Get the block job identified by @id (which must not be %NULL).
+ *
+ * Returns the requested job, or %NULL if it doesn't exist.
+ */
+BlockJob *block_job_get(const char *id);
+
+/**
+ * block_job_add_bdrv:
+ * @job: A block job
+ * @name: The name to assign to the new BdrvChild
+ * @bs: A BlockDriverState that is involved in @job
+ * @perm, @shared_perm: Permissions to request on the node
+ *
+ * Add @bs to the list of BlockDriverStates that are involved in
+ * @job. This means that all operations will be blocked on @bs while
+ * @job exists.
+ */
+int block_job_add_bdrv(BlockJob *job, const char *name, BlockDriverState *bs,
+ uint64_t perm, uint64_t shared_perm, Error **errp);
+
+/**
+ * block_job_remove_all_bdrv:
+ * @job: The block job
+ *
+ * Remove all BlockDriverStates from the list of nodes that are involved in the
+ * job. This removes the blockers added with block_job_add_bdrv().
+ */
+void block_job_remove_all_bdrv(BlockJob *job);
+
+/**
+ * block_job_has_bdrv:
+ * @job: The block job
+ * @bs: The node to search for
+ *
+ * Searches for @bs in the list of nodes that are involved in the
+ * job.
+ */
+bool block_job_has_bdrv(BlockJob *job, BlockDriverState *bs);
+
+/**
+ * block_job_set_speed:
+ * @job: The job to set the speed for.
+ * @speed: The new value
+ * @errp: Error object.
+ *
+ * Set a rate-limiting parameter for the job; the actual meaning may
+ * vary depending on the job type.
+ */
+bool block_job_set_speed(BlockJob *job, int64_t speed, Error **errp);
+
+/**
+ * block_job_query:
+ * @job: The job to get information about.
+ *
+ * Return information about a job.
+ */
+BlockJobInfo *block_job_query(BlockJob *job, Error **errp);
+
+/**
+ * block_job_iostatus_reset:
+ * @job: The job whose I/O status should be reset.
+ *
+ * Reset I/O status on @job and on BlockDriverState objects it uses,
+ * other than job->blk.
+ */
+void block_job_iostatus_reset(BlockJob *job);
+
+/**
+ * block_job_is_internal:
+ * @job: The job to determine if it is user-visible or not.
+ *
+ * Returns true if the job should not be visible to the management layer.
+ */
+bool block_job_is_internal(BlockJob *job);
+
+/**
+ * block_job_driver:
+ *
+ * Returns the driver associated with a block job.
+ */
+const BlockJobDriver *block_job_driver(BlockJob *job);
+
+#endif
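As a usage note, the iteration API above supports monitor-style listings. A hedged sketch; the output format is illustrative and error handling is elided:

    #include <inttypes.h>
    #include <stdio.h>

    static void dump_block_jobs(void)
    {
        BlockJob *job;

        /* block_job_next(NULL) yields the first job in the global list. */
        for (job = block_job_next(NULL); job; job = block_job_next(job)) {
            if (block_job_is_internal(job)) {
                continue;   /* hidden from the management layer */
            }
            printf("job %s: speed %" PRId64 " bytes/s\n",
                   job->job.id, job->speed);
        }
    }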
diff --git a/include/block/blockjob_int.h b/include/block/blockjob_int.h
new file mode 100644
index 000000000..6633d83da
--- /dev/null
+++ b/include/block/blockjob_int.h
@@ -0,0 +1,122 @@
+/*
+ * Declarations for long-running block device operations
+ *
+ * Copyright (c) 2011 IBM Corp.
+ * Copyright (c) 2012 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef BLOCKJOB_INT_H
+#define BLOCKJOB_INT_H
+
+#include "block/blockjob.h"
+#include "block/block.h"
+
+/**
+ * BlockJobDriver:
+ *
+ * A class type for block job drivers.
+ */
+struct BlockJobDriver {
+ /** Generic JobDriver callbacks and settings */
+ JobDriver job_driver;
+
+ /*
+ * Returns whether the job has pending requests for the child or will
+ * submit new requests before the next pause point. This callback is polled
+ * in the context of draining a job node after requesting that the job be
+ * paused, until all activity on the child has stopped.
+ */
+ bool (*drained_poll)(BlockJob *job);
+
+ /*
+ * If the callback is not NULL, it will be invoked before the job is
+ * resumed in a new AioContext. This is the place to move any resources
+ * besides job->blk to the new AioContext.
+ */
+ void (*attached_aio_context)(BlockJob *job, AioContext *new_context);
+
+ void (*set_speed)(BlockJob *job, int64_t speed);
+};
+
+/**
+ * block_job_create:
+ * @job_id: The id of the newly-created job, or %NULL to have one
+ * generated automatically.
+ * @driver: The class object for the newly-created job.
+ * @txn: The transaction this job belongs to, if any. %NULL otherwise.
+ * @bs: The block device on which the job will operate.
+ * @perm, @shared_perm: Permissions to request for @bs
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @flags: Creation flags for the Block Job. See @JobCreateFlags.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ * @errp: Error object.
+ *
+ * Create a new long-running block device job and return it. The job
+ * will call @cb asynchronously when the job completes. Note that
+ * @bs may have been closed by the time @cb is called. If
+ * this is the case, the job may be reported as either cancelled or
+ * completed.
+ *
+ * This function is not part of the public job interface; it should be
+ * called from a wrapper that is specific to the job type.
+ */
+void *block_job_create(const char *job_id, const BlockJobDriver *driver,
+ JobTxn *txn, BlockDriverState *bs, uint64_t perm,
+ uint64_t shared_perm, int64_t speed, int flags,
+ BlockCompletionFunc *cb, void *opaque, Error **errp);
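For orientation, a hedged sketch of a concrete driver built on this interface; MyJob, its in_flight counter and the choice of job type are hypothetical stand-ins, and block_job_free()/block_job_user_resume() are the helpers declared just below:

    typedef struct MyJob {
        BlockJob common;
        unsigned in_flight;     /* requests this job has outstanding */
    } MyJob;

    static bool my_job_drained_poll(BlockJob *job)
    {
        MyJob *s = container_of(job, MyJob, common);

        /* Report in-flight activity so the drain loop keeps polling. */
        return s->in_flight > 0;
    }

    static const BlockJobDriver my_job_driver = {
        .job_driver = {
            .instance_size = sizeof(MyJob),
            .job_type      = JOB_TYPE_COMMIT,   /* any existing job type */
            .free          = block_job_free,
            .user_resume   = block_job_user_resume,
        },
        .drained_poll = my_job_drained_poll,
    };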
+
+/**
+ * block_job_free:
+ * Callback to be used for JobDriver.free in all block jobs. Frees
+ * block-job-specific resources in @job.
+ */
+void block_job_free(Job *job);
+
+/**
+ * block_job_user_resume:
+ * Callback to be used for JobDriver.user_resume in all block jobs. Resets the
+ * iostatus when the user resumes @job.
+ */
+void block_job_user_resume(Job *job);
+
+/**
+ * block_job_ratelimit_get_delay:
+ *
+ * Calculate and return delay for the next request in ns. See the documentation
+ * of ratelimit_calculate_delay() for details.
+ */
+int64_t block_job_ratelimit_get_delay(BlockJob *job, uint64_t n);
+
+/**
+ * block_job_error_action:
+ * @job: The job to signal an error for.
+ * @on_err: The error action setting.
+ * @is_read: Whether the operation was a read.
+ * @error: The error that was reported.
+ *
+ * Report an I/O error for a block job and possibly stop the VM. Return the
+ * action that was selected based on @on_err and @error.
+ */
+BlockErrorAction block_job_error_action(BlockJob *job, BlockdevOnError on_err,
+ int is_read, int error);
+
+#endif
diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
new file mode 100644
index 000000000..40950ae3d
--- /dev/null
+++ b/include/block/dirty-bitmap.h
@@ -0,0 +1,121 @@
+#ifndef BLOCK_DIRTY_BITMAP_H
+#define BLOCK_DIRTY_BITMAP_H
+
+#include "qapi/qapi-types-block-core.h"
+#include "qemu/hbitmap.h"
+
+typedef enum BitmapCheckFlags {
+ BDRV_BITMAP_BUSY = 1,
+ BDRV_BITMAP_RO = 2,
+ BDRV_BITMAP_INCONSISTENT = 4,
+} BitmapCheckFlags;
+
+#define BDRV_BITMAP_DEFAULT (BDRV_BITMAP_BUSY | BDRV_BITMAP_RO | \
+ BDRV_BITMAP_INCONSISTENT)
+#define BDRV_BITMAP_ALLOW_RO (BDRV_BITMAP_BUSY | BDRV_BITMAP_INCONSISTENT)
+
+#define BDRV_BITMAP_MAX_NAME_SIZE 1023
+
+bool bdrv_supports_persistent_dirty_bitmap(BlockDriverState *bs);
+BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs,
+ uint32_t granularity,
+ const char *name,
+ Error **errp);
+int bdrv_dirty_bitmap_create_successor(BdrvDirtyBitmap *bitmap,
+ Error **errp);
+BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BdrvDirtyBitmap *bitmap,
+ Error **errp);
+BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BdrvDirtyBitmap *bitmap,
+ Error **errp);
+void bdrv_dirty_bitmap_enable_successor(BdrvDirtyBitmap *bitmap);
+BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs,
+ const char *name);
+int bdrv_dirty_bitmap_check(const BdrvDirtyBitmap *bitmap, uint32_t flags,
+ Error **errp);
+void bdrv_release_dirty_bitmap(BdrvDirtyBitmap *bitmap);
+void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs);
+int bdrv_remove_persistent_dirty_bitmap(BlockDriverState *bs, const char *name,
+ Error **errp);
+void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
+void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap);
+void bdrv_enable_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap);
+BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs);
+uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs);
+uint32_t bdrv_dirty_bitmap_granularity(const BdrvDirtyBitmap *bitmap);
+bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap);
+bool bdrv_dirty_bitmap_has_successor(BdrvDirtyBitmap *bitmap);
+const char *bdrv_dirty_bitmap_name(const BdrvDirtyBitmap *bitmap);
+int64_t bdrv_dirty_bitmap_size(const BdrvDirtyBitmap *bitmap);
+void
bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int64_t offset, int64_t bytes); +void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int64_t offset, int64_t bytes); +BdrvDirtyBitmapIter *bdrv_dirty_iter_new(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_iter_free(BdrvDirtyBitmapIter *iter); + +uint64_t bdrv_dirty_bitmap_serialization_size(const BdrvDirtyBitmap *bitmap, + uint64_t offset, uint64_t bytes); +uint64_t bdrv_dirty_bitmap_serialization_align(const BdrvDirtyBitmap *bitmap); +uint64_t bdrv_dirty_bitmap_serialization_coverage(int serialized_chunk_size, + const BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_serialize_part(const BdrvDirtyBitmap *bitmap, + uint8_t *buf, uint64_t offset, + uint64_t bytes); +void bdrv_dirty_bitmap_deserialize_part(BdrvDirtyBitmap *bitmap, + uint8_t *buf, uint64_t offset, + uint64_t bytes, bool finish); +void bdrv_dirty_bitmap_deserialize_zeroes(BdrvDirtyBitmap *bitmap, + uint64_t offset, uint64_t bytes, + bool finish); +void bdrv_dirty_bitmap_deserialize_ones(BdrvDirtyBitmap *bitmap, + uint64_t offset, uint64_t bytes, + bool finish); +void bdrv_dirty_bitmap_deserialize_finish(BdrvDirtyBitmap *bitmap); + +void bdrv_dirty_bitmap_set_readonly(BdrvDirtyBitmap *bitmap, bool value); +void bdrv_dirty_bitmap_set_persistence(BdrvDirtyBitmap *bitmap, + bool persistent); +void bdrv_dirty_bitmap_set_inconsistent(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_set_busy(BdrvDirtyBitmap *bitmap, bool busy); +void bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const BdrvDirtyBitmap *src, + HBitmap **backup, Error **errp); +void bdrv_dirty_bitmap_skip_store(BdrvDirtyBitmap *bitmap, bool skip); +bool bdrv_dirty_bitmap_get(BdrvDirtyBitmap *bitmap, int64_t offset); + +/* Functions that require manual locking. */ +void bdrv_dirty_bitmap_lock(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_unlock(BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_get_locked(BdrvDirtyBitmap *bitmap, int64_t offset); +void bdrv_set_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + int64_t offset, int64_t bytes); +void bdrv_reset_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + int64_t offset, int64_t bytes); +int64_t bdrv_dirty_iter_next(BdrvDirtyBitmapIter *iter); +void bdrv_set_dirty_iter(BdrvDirtyBitmapIter *hbi, int64_t offset); +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap); +void bdrv_dirty_bitmap_truncate(BlockDriverState *bs, int64_t bytes); +bool bdrv_dirty_bitmap_readonly(const BdrvDirtyBitmap *bitmap); +bool bdrv_has_readonly_bitmaps(BlockDriverState *bs); +bool bdrv_has_named_bitmaps(BlockDriverState *bs); +bool bdrv_dirty_bitmap_get_autoload(const BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_get_persistence(BdrvDirtyBitmap *bitmap); +bool bdrv_dirty_bitmap_inconsistent(const BdrvDirtyBitmap *bitmap); + +BdrvDirtyBitmap *bdrv_dirty_bitmap_first(BlockDriverState *bs); +BdrvDirtyBitmap *bdrv_dirty_bitmap_next(BdrvDirtyBitmap *bitmap); +#define FOR_EACH_DIRTY_BITMAP(bs, bitmap) \ +for (bitmap = bdrv_dirty_bitmap_first(bs); bitmap; \ + bitmap = bdrv_dirty_bitmap_next(bitmap)) + +char *bdrv_dirty_bitmap_sha256(const BdrvDirtyBitmap *bitmap, Error **errp); +int64_t bdrv_dirty_bitmap_next_dirty(BdrvDirtyBitmap *bitmap, int64_t offset, + int64_t bytes); +int64_t bdrv_dirty_bitmap_next_zero(BdrvDirtyBitmap *bitmap, int64_t offset, + int64_t bytes); +bool bdrv_dirty_bitmap_next_dirty_area(BdrvDirtyBitmap *bitmap, + int64_t start, int64_t end, int64_t max_dirty_count, + int64_t *dirty_start, int64_t *dirty_count); +BdrvDirtyBitmap 
*bdrv_reclaim_dirty_bitmap_locked(BdrvDirtyBitmap *bitmap, + Error **errp); + +#endif diff --git a/include/block/export.h b/include/block/export.h new file mode 100644 index 000000000..7feb02e10 --- /dev/null +++ b/include/block/export.h @@ -0,0 +1,89 @@ +/* + * Declarations for block exports + * + * Copyright (c) 2012, 2020 Red Hat, Inc. + * + * Authors: + * Paolo Bonzini <pbonzini@redhat.com> + * Kevin Wolf <kwolf@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#ifndef BLOCK_EXPORT_H +#define BLOCK_EXPORT_H + +#include "qapi/qapi-types-block-export.h" +#include "qemu/queue.h" + +typedef struct BlockExport BlockExport; + +typedef struct BlockExportDriver { + /* The export type that this driver services */ + BlockExportType type; + + /* + * The size of the driver-specific state that contains BlockExport as its + * first field. + */ + size_t instance_size; + + /* Creates and starts a new block export */ + int (*create)(BlockExport *, BlockExportOptions *, Error **); + + /* + * Frees a removed block export. This function is only called after all + * references have been dropped. + */ + void (*delete)(BlockExport *); + + /* + * Start to disconnect all clients and drop other references held + * internally by the export driver. When the function returns, there may + * still be active references while the export is in the process of + * shutting down. + */ + void (*request_shutdown)(BlockExport *); +} BlockExportDriver; + +struct BlockExport { + const BlockExportDriver *drv; + + /* Unique identifier for the export */ + char *id; + + /* + * Reference count for this block export. This includes strong references + * both from the owner (qemu-nbd or the monitor) and clients connected to + * the export. + */ + int refcount; + + /* + * True if one of the references in refcount belongs to the user. After the + * user has dropped their reference, they may not e.g. remove the same + * export a second time (which would decrease the refcount without having + * it incremented first). + */ + bool user_owned; + + /* The AioContext whose lock protects this BlockExport object. */ + AioContext *ctx; + + /* The block device to export */ + BlockBackend *blk; + + /* List entry for block_exports */ + QLIST_ENTRY(BlockExport) next; +}; + +BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp); +BlockExport *blk_exp_find(const char *id); +void blk_exp_ref(BlockExport *exp); +void blk_exp_unref(BlockExport *exp); +void blk_exp_request_shutdown(BlockExport *exp); +void blk_exp_close_all(void); +void blk_exp_close_all_type(BlockExportType type); + +#endif diff --git a/include/block/fuse.h b/include/block/fuse.h new file mode 100644 index 000000000..ffa91fe36 --- /dev/null +++ b/include/block/fuse.h @@ -0,0 +1,30 @@ +/* + * Present a block device as a raw image through FUSE + * + * Copyright (c) 2020 Max Reitz <mreitz@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 or later of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef BLOCK_FUSE_H +#define BLOCK_FUSE_H + +#ifdef CONFIG_FUSE + +#include "block/export.h" + +extern const BlockExportDriver blk_exp_fuse; + +#endif /* CONFIG_FUSE */ + +#endif diff --git a/include/block/nbd.h b/include/block/nbd.h new file mode 100644 index 000000000..78d101b77 --- /dev/null +++ b/include/block/nbd.h @@ -0,0 +1,427 @@ +/* + * Copyright (C) 2016-2020 Red Hat, Inc. + * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws> + * + * Network Block Device + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef NBD_H +#define NBD_H + +#include "block/export.h" +#include "io/channel-socket.h" +#include "crypto/tlscreds.h" +#include "qapi/error.h" + +extern const BlockExportDriver blk_exp_nbd; + +/* Handshake phase structs - this struct is passed on the wire */ + +struct NBDOption { + uint64_t magic; /* NBD_OPTS_MAGIC */ + uint32_t option; /* NBD_OPT_* */ + uint32_t length; +} QEMU_PACKED; +typedef struct NBDOption NBDOption; + +struct NBDOptionReply { + uint64_t magic; /* NBD_REP_MAGIC */ + uint32_t option; /* NBD_OPT_* */ + uint32_t type; /* NBD_REP_* */ + uint32_t length; +} QEMU_PACKED; +typedef struct NBDOptionReply NBDOptionReply; + +typedef struct NBDOptionReplyMetaContext { + NBDOptionReply h; /* h.type = NBD_REP_META_CONTEXT, h.length > 4 */ + uint32_t context_id; + /* metadata context name follows */ +} QEMU_PACKED NBDOptionReplyMetaContext; + +/* Transmission phase structs + * + * Note: these are _NOT_ the same as the network representation of an NBD + * request and reply! + */ +struct NBDRequest { + uint64_t handle; + uint64_t from; + uint32_t len; + uint16_t flags; /* NBD_CMD_FLAG_* */ + uint16_t type; /* NBD_CMD_* */ +}; +typedef struct NBDRequest NBDRequest; + +typedef struct NBDSimpleReply { + uint32_t magic; /* NBD_SIMPLE_REPLY_MAGIC */ + uint32_t error; + uint64_t handle; +} QEMU_PACKED NBDSimpleReply; + +/* Header of all structured replies */ +typedef struct NBDStructuredReplyChunk { + uint32_t magic; /* NBD_STRUCTURED_REPLY_MAGIC */ + uint16_t flags; /* combination of NBD_REPLY_FLAG_* */ + uint16_t type; /* NBD_REPLY_TYPE_* */ + uint64_t handle; /* request handle */ + uint32_t length; /* length of payload */ +} QEMU_PACKED NBDStructuredReplyChunk; + +typedef union NBDReply { + NBDSimpleReply simple; + NBDStructuredReplyChunk structured; + struct { + /* @magic and @handle fields have the same offset and size both in + * simple reply and structured reply chunk, so let them be accessible + * without ".simple." or ".structured." 
specification
+ */
+ uint32_t magic;
+ uint32_t _skip;
+ uint64_t handle;
+ } QEMU_PACKED;
+} NBDReply;
+
+/* Header of chunk for NBD_REPLY_TYPE_OFFSET_DATA */
+typedef struct NBDStructuredReadData {
+ NBDStructuredReplyChunk h; /* h.length >= 9 */
+ uint64_t offset;
+ /* At least one byte of data payload follows, calculated from h.length */
+} QEMU_PACKED NBDStructuredReadData;
+
+/* Complete chunk for NBD_REPLY_TYPE_OFFSET_HOLE */
+typedef struct NBDStructuredReadHole {
+ NBDStructuredReplyChunk h; /* h.length == 12 */
+ uint64_t offset;
+ uint32_t length;
+} QEMU_PACKED NBDStructuredReadHole;
+
+/* Header of all NBD_REPLY_TYPE_ERROR* errors */
+typedef struct NBDStructuredError {
+ NBDStructuredReplyChunk h; /* h.length >= 6 */
+ uint32_t error;
+ uint16_t message_length;
+} QEMU_PACKED NBDStructuredError;
+
+/* Header of NBD_REPLY_TYPE_BLOCK_STATUS */
+typedef struct NBDStructuredMeta {
+ NBDStructuredReplyChunk h; /* h.length >= 12 (at least one extent) */
+ uint32_t context_id;
+ /* extents follow */
+} QEMU_PACKED NBDStructuredMeta;
+
+/* Extent chunk for NBD_REPLY_TYPE_BLOCK_STATUS */
+typedef struct NBDExtent {
+ uint32_t length;
+ uint32_t flags; /* NBD_STATE_* */
+} QEMU_PACKED NBDExtent;
+
+/* Transmission (export) flags: sent from server to client during handshake,
+ but describe what will happen during transmission */
+enum {
+ NBD_FLAG_HAS_FLAGS_BIT = 0, /* Flags are there */
+ NBD_FLAG_READ_ONLY_BIT = 1, /* Device is read-only */
+ NBD_FLAG_SEND_FLUSH_BIT = 2, /* Send FLUSH */
+ NBD_FLAG_SEND_FUA_BIT = 3, /* Send FUA (Force Unit Access) */
+ NBD_FLAG_ROTATIONAL_BIT = 4, /* Use elevator algorithm -
+ rotational media */
+ NBD_FLAG_SEND_TRIM_BIT = 5, /* Send TRIM (discard) */
+ NBD_FLAG_SEND_WRITE_ZEROES_BIT = 6, /* Send WRITE_ZEROES */
+ NBD_FLAG_SEND_DF_BIT = 7, /* Send DF (Do not Fragment) */
+ NBD_FLAG_CAN_MULTI_CONN_BIT = 8, /* Multi-client cache consistent */
+ NBD_FLAG_SEND_RESIZE_BIT = 9, /* Send resize */
+ NBD_FLAG_SEND_CACHE_BIT = 10, /* Send CACHE (prefetch) */
+ NBD_FLAG_SEND_FAST_ZERO_BIT = 11, /* FAST_ZERO flag for WRITE_ZEROES */
+};
+
+#define NBD_FLAG_HAS_FLAGS (1 << NBD_FLAG_HAS_FLAGS_BIT)
+#define NBD_FLAG_READ_ONLY (1 << NBD_FLAG_READ_ONLY_BIT)
+#define NBD_FLAG_SEND_FLUSH (1 << NBD_FLAG_SEND_FLUSH_BIT)
+#define NBD_FLAG_SEND_FUA (1 << NBD_FLAG_SEND_FUA_BIT)
+#define NBD_FLAG_ROTATIONAL (1 << NBD_FLAG_ROTATIONAL_BIT)
+#define NBD_FLAG_SEND_TRIM (1 << NBD_FLAG_SEND_TRIM_BIT)
+#define NBD_FLAG_SEND_WRITE_ZEROES (1 << NBD_FLAG_SEND_WRITE_ZEROES_BIT)
+#define NBD_FLAG_SEND_DF (1 << NBD_FLAG_SEND_DF_BIT)
+#define NBD_FLAG_CAN_MULTI_CONN (1 << NBD_FLAG_CAN_MULTI_CONN_BIT)
+#define NBD_FLAG_SEND_RESIZE (1 << NBD_FLAG_SEND_RESIZE_BIT)
+#define NBD_FLAG_SEND_CACHE (1 << NBD_FLAG_SEND_CACHE_BIT)
+#define NBD_FLAG_SEND_FAST_ZERO (1 << NBD_FLAG_SEND_FAST_ZERO_BIT)
+
+/* New-style handshake (global) flags, sent from server to client, and
+ control what will happen during handshake phase. */
+#define NBD_FLAG_FIXED_NEWSTYLE (1 << 0) /* Fixed newstyle protocol. */
+#define NBD_FLAG_NO_ZEROES (1 << 1) /* End handshake without zeroes. */
+
+/* New-style client flags, sent from client to server to control what happens
+ during handshake phase. */
+#define NBD_FLAG_C_FIXED_NEWSTYLE (1 << 0) /* Fixed newstyle protocol. */
+#define NBD_FLAG_C_NO_ZEROES (1 << 1) /* End handshake without zeroes. */
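A hedged sketch of how a client consumes the flag words above; @eflags stands for the 16-bit transmission flags received during the handshake, and NBD_CMD_FLAG_FUA is defined further below:

    static bool nbd_can_trim(uint16_t eflags)
    {
        /* A server that cannot TRIM simply leaves the bit clear. */
        return (eflags & NBD_FLAG_HAS_FLAGS) && (eflags & NBD_FLAG_SEND_TRIM);
    }

    static uint16_t nbd_write_request_flags(uint16_t eflags, bool want_fua)
    {
        /* FUA may only be requested if the server advertised it. */
        return (want_fua && (eflags & NBD_FLAG_SEND_FUA)) ? NBD_CMD_FLAG_FUA : 0;
    }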
+
+/* Option requests. */
+#define NBD_OPT_EXPORT_NAME (1)
+#define NBD_OPT_ABORT (2)
+#define NBD_OPT_LIST (3)
+/* #define NBD_OPT_PEEK_EXPORT (4) not in use */
+#define NBD_OPT_STARTTLS (5)
+#define NBD_OPT_INFO (6)
+#define NBD_OPT_GO (7)
+#define NBD_OPT_STRUCTURED_REPLY (8)
+#define NBD_OPT_LIST_META_CONTEXT (9)
+#define NBD_OPT_SET_META_CONTEXT (10)
+
+/* Option reply types. */
+#define NBD_REP_ERR(value) ((UINT32_C(1) << 31) | (value))
+
+#define NBD_REP_ACK (1) /* Data sending finished. */
+#define NBD_REP_SERVER (2) /* Export description. */
+#define NBD_REP_INFO (3) /* NBD_OPT_INFO/GO. */
+#define NBD_REP_META_CONTEXT (4) /* NBD_OPT_{LIST,SET}_META_CONTEXT */
+
+#define NBD_REP_ERR_UNSUP NBD_REP_ERR(1) /* Unknown option */
+#define NBD_REP_ERR_POLICY NBD_REP_ERR(2) /* Server denied */
+#define NBD_REP_ERR_INVALID NBD_REP_ERR(3) /* Invalid length */
+#define NBD_REP_ERR_PLATFORM NBD_REP_ERR(4) /* Not compiled in */
+#define NBD_REP_ERR_TLS_REQD NBD_REP_ERR(5) /* TLS required */
+#define NBD_REP_ERR_UNKNOWN NBD_REP_ERR(6) /* Export unknown */
+#define NBD_REP_ERR_SHUTDOWN NBD_REP_ERR(7) /* Server shutting down */
+#define NBD_REP_ERR_BLOCK_SIZE_REQD NBD_REP_ERR(8) /* Need INFO_BLOCK_SIZE */
+
+/* Info types, used during NBD_REP_INFO */
+#define NBD_INFO_EXPORT 0
+#define NBD_INFO_NAME 1
+#define NBD_INFO_DESCRIPTION 2
+#define NBD_INFO_BLOCK_SIZE 3
+
+/* Request flags, sent from client to server during transmission phase */
+#define NBD_CMD_FLAG_FUA (1 << 0) /* 'force unit access' during write */
+#define NBD_CMD_FLAG_NO_HOLE (1 << 1) /* don't punch hole on zero run */
+#define NBD_CMD_FLAG_DF (1 << 2) /* don't fragment structured read */
+#define NBD_CMD_FLAG_REQ_ONE (1 << 3) /* only one extent in BLOCK_STATUS
+ * reply chunk */
+#define NBD_CMD_FLAG_FAST_ZERO (1 << 4) /* fail if WRITE_ZEROES is not fast */
+
+/* Supported request types */
+enum {
+ NBD_CMD_READ = 0,
+ NBD_CMD_WRITE = 1,
+ NBD_CMD_DISC = 2,
+ NBD_CMD_FLUSH = 3,
+ NBD_CMD_TRIM = 4,
+ NBD_CMD_CACHE = 5,
+ NBD_CMD_WRITE_ZEROES = 6,
+ NBD_CMD_BLOCK_STATUS = 7,
+};
+
+#define NBD_DEFAULT_PORT 10809
+
+/* Maximum size of a single READ/WRITE data buffer */
+#define NBD_MAX_BUFFER_SIZE (32 * 1024 * 1024)
+
+/*
+ * Maximum size of a protocol string (export name, metadata context name,
+ * etc.). Use malloc rather than stack allocation for storage of a
+ * string.
+ */
+#define NBD_MAX_STRING_SIZE 4096
+
+/* Two types of reply structures */
+#define NBD_SIMPLE_REPLY_MAGIC 0x67446698
+#define NBD_STRUCTURED_REPLY_MAGIC 0x668e33ef
+
+/* Structured reply flags */
+#define NBD_REPLY_FLAG_DONE (1 << 0) /* This reply-chunk is last */
+
+/* Structured reply types */
+#define NBD_REPLY_ERR(value) ((1 << 15) | (value))
+
+#define NBD_REPLY_TYPE_NONE 0
+#define NBD_REPLY_TYPE_OFFSET_DATA 1
+#define NBD_REPLY_TYPE_OFFSET_HOLE 2
+#define NBD_REPLY_TYPE_BLOCK_STATUS 5
+#define NBD_REPLY_TYPE_ERROR NBD_REPLY_ERR(1)
+#define NBD_REPLY_TYPE_ERROR_OFFSET NBD_REPLY_ERR(2)
+
+/* Extent flags for base:allocation in NBD_REPLY_TYPE_BLOCK_STATUS */
+#define NBD_STATE_HOLE (1 << 0)
+#define NBD_STATE_ZERO (1 << 1)
+
+/* Extent flags for qemu:dirty-bitmap in NBD_REPLY_TYPE_BLOCK_STATUS */
+#define NBD_STATE_DIRTY (1 << 0)
+
+/* No flags needed for qemu:allocation-depth in NBD_REPLY_TYPE_BLOCK_STATUS */
+
+static inline bool nbd_reply_type_is_error(int type)
+{
+ return type & (1 << 15);
+}
+
+/* NBD errors are based on errno numbers, so there is a 1:1 mapping,
+ * but only a limited set of errno values is specified in the protocol.
+ * Everything else is squashed to EINVAL. + */ +#define NBD_SUCCESS 0 +#define NBD_EPERM 1 +#define NBD_EIO 5 +#define NBD_ENOMEM 12 +#define NBD_EINVAL 22 +#define NBD_ENOSPC 28 +#define NBD_EOVERFLOW 75 +#define NBD_ENOTSUP 95 +#define NBD_ESHUTDOWN 108 + +/* Details collected by NBD_OPT_EXPORT_NAME and NBD_OPT_GO */ +struct NBDExportInfo { + /* Set by client before nbd_receive_negotiate() */ + bool request_sizes; + char *x_dirty_bitmap; + + /* Set by client before nbd_receive_negotiate(), or by server results + * during nbd_receive_export_list() */ + char *name; /* must be non-NULL */ + + /* In-out fields, set by client before nbd_receive_negotiate() and + * updated by server results during nbd_receive_negotiate() */ + bool structured_reply; + bool base_allocation; /* base:allocation context for NBD_CMD_BLOCK_STATUS */ + + /* Set by server results during nbd_receive_negotiate() and + * nbd_receive_export_list() */ + uint64_t size; + uint16_t flags; + uint32_t min_block; + uint32_t opt_block; + uint32_t max_block; + + uint32_t context_id; + + /* Set by server results during nbd_receive_export_list() */ + char *description; + int n_contexts; + char **contexts; +}; +typedef struct NBDExportInfo NBDExportInfo; + +int nbd_receive_negotiate(AioContext *aio_context, QIOChannel *ioc, + QCryptoTLSCreds *tlscreds, + const char *hostname, QIOChannel **outioc, + NBDExportInfo *info, Error **errp); +void nbd_free_export_list(NBDExportInfo *info, int count); +int nbd_receive_export_list(QIOChannel *ioc, QCryptoTLSCreds *tlscreds, + const char *hostname, NBDExportInfo **info, + Error **errp); +int nbd_init(int fd, QIOChannelSocket *sioc, NBDExportInfo *info, + Error **errp); +int nbd_send_request(QIOChannel *ioc, NBDRequest *request); +int coroutine_fn nbd_receive_reply(BlockDriverState *bs, QIOChannel *ioc, + NBDReply *reply, Error **errp); +int nbd_client(int fd); +int nbd_disconnect(int fd); +int nbd_errno_to_system_errno(int err); + +typedef struct NBDExport NBDExport; +typedef struct NBDClient NBDClient; + +void nbd_export_set_on_eject_blk(BlockExport *exp, BlockBackend *blk); + +AioContext *nbd_export_aio_context(NBDExport *exp); +NBDExport *nbd_export_find(const char *name); + +void nbd_client_new(QIOChannelSocket *sioc, + QCryptoTLSCreds *tlscreds, + const char *tlsauthz, + void (*close_fn)(NBDClient *, bool)); +void nbd_client_get(NBDClient *client); +void nbd_client_put(NBDClient *client); + +void nbd_server_is_qemu_nbd(bool value); +bool nbd_server_is_running(void); +void nbd_server_start(SocketAddress *addr, const char *tls_creds, + const char *tls_authz, uint32_t max_connections, + Error **errp); +void nbd_server_start_options(NbdServerOptions *arg, Error **errp); + +/* nbd_read + * Reads @size bytes from @ioc. Returns 0 on success. + */ +static inline int nbd_read(QIOChannel *ioc, void *buffer, size_t size, + const char *desc, Error **errp) +{ + ERRP_GUARD(); + int ret = qio_channel_read_all(ioc, buffer, size, errp) < 0 ? -EIO : 0; + + if (ret < 0) { + if (desc) { + error_prepend(errp, "Failed to read %s: ", desc); + } + return ret; + } + + return 0; +} + +#define DEF_NBD_READ_N(bits) \ +static inline int nbd_read##bits(QIOChannel *ioc, \ + uint##bits##_t *val, \ + const char *desc, Error **errp) \ +{ \ + int ret = nbd_read(ioc, val, sizeof(*val), desc, errp); \ + if (ret < 0) { \ + return ret; \ + } \ + *val = be##bits##_to_cpu(*val); \ + return 0; \ +} + +DEF_NBD_READ_N(16) /* Defines nbd_read16(). */ +DEF_NBD_READ_N(32) /* Defines nbd_read32(). 
*/ +DEF_NBD_READ_N(64) /* Defines nbd_read64(). */ + +#undef DEF_NBD_READ_N + +static inline bool nbd_reply_is_simple(NBDReply *reply) +{ + return reply->magic == NBD_SIMPLE_REPLY_MAGIC; +} + +static inline bool nbd_reply_is_structured(NBDReply *reply) +{ + return reply->magic == NBD_STRUCTURED_REPLY_MAGIC; +} + +const char *nbd_reply_type_lookup(uint16_t type); +const char *nbd_opt_lookup(uint32_t opt); +const char *nbd_rep_lookup(uint32_t rep); +const char *nbd_info_lookup(uint16_t info); +const char *nbd_cmd_lookup(uint16_t info); +const char *nbd_err_lookup(int err); + +/* nbd/client-connection.c */ +typedef struct NBDClientConnection NBDClientConnection; + +void nbd_client_connection_enable_retry(NBDClientConnection *conn); + +NBDClientConnection *nbd_client_connection_new(const SocketAddress *saddr, + bool do_negotiation, + const char *export_name, + const char *x_dirty_bitmap, + QCryptoTLSCreds *tlscreds); +void nbd_client_connection_release(NBDClientConnection *conn); + +QIOChannel *coroutine_fn +nbd_co_establish_connection(NBDClientConnection *conn, NBDExportInfo *info, + bool blocking, Error **errp); + +void coroutine_fn nbd_co_establish_connection_cancel(NBDClientConnection *conn); + +#endif diff --git a/include/block/nvme.h b/include/block/nvme.h new file mode 100644 index 000000000..e3bd47bf7 --- /dev/null +++ b/include/block/nvme.h @@ -0,0 +1,1501 @@ +#ifndef BLOCK_NVME_H +#define BLOCK_NVME_H + +typedef struct QEMU_PACKED NvmeBar { + uint64_t cap; + uint32_t vs; + uint32_t intms; + uint32_t intmc; + uint32_t cc; + uint8_t rsvd24[4]; + uint32_t csts; + uint32_t nssr; + uint32_t aqa; + uint64_t asq; + uint64_t acq; + uint32_t cmbloc; + uint32_t cmbsz; + uint32_t bpinfo; + uint32_t bprsel; + uint64_t bpmbl; + uint64_t cmbmsc; + uint32_t cmbsts; + uint8_t rsvd92[3492]; + uint32_t pmrcap; + uint32_t pmrctl; + uint32_t pmrsts; + uint32_t pmrebs; + uint32_t pmrswtp; + uint32_t pmrmscl; + uint32_t pmrmscu; + uint8_t css[484]; +} NvmeBar; + +enum NvmeBarRegs { + NVME_REG_CAP = offsetof(NvmeBar, cap), + NVME_REG_VS = offsetof(NvmeBar, vs), + NVME_REG_INTMS = offsetof(NvmeBar, intms), + NVME_REG_INTMC = offsetof(NvmeBar, intmc), + NVME_REG_CC = offsetof(NvmeBar, cc), + NVME_REG_CSTS = offsetof(NvmeBar, csts), + NVME_REG_NSSR = offsetof(NvmeBar, nssr), + NVME_REG_AQA = offsetof(NvmeBar, aqa), + NVME_REG_ASQ = offsetof(NvmeBar, asq), + NVME_REG_ACQ = offsetof(NvmeBar, acq), + NVME_REG_CMBLOC = offsetof(NvmeBar, cmbloc), + NVME_REG_CMBSZ = offsetof(NvmeBar, cmbsz), + NVME_REG_BPINFO = offsetof(NvmeBar, bpinfo), + NVME_REG_BPRSEL = offsetof(NvmeBar, bprsel), + NVME_REG_BPMBL = offsetof(NvmeBar, bpmbl), + NVME_REG_CMBMSC = offsetof(NvmeBar, cmbmsc), + NVME_REG_CMBSTS = offsetof(NvmeBar, cmbsts), + NVME_REG_PMRCAP = offsetof(NvmeBar, pmrcap), + NVME_REG_PMRCTL = offsetof(NvmeBar, pmrctl), + NVME_REG_PMRSTS = offsetof(NvmeBar, pmrsts), + NVME_REG_PMREBS = offsetof(NvmeBar, pmrebs), + NVME_REG_PMRSWTP = offsetof(NvmeBar, pmrswtp), + NVME_REG_PMRMSCL = offsetof(NvmeBar, pmrmscl), + NVME_REG_PMRMSCU = offsetof(NvmeBar, pmrmscu), +}; + +enum NvmeCapShift { + CAP_MQES_SHIFT = 0, + CAP_CQR_SHIFT = 16, + CAP_AMS_SHIFT = 17, + CAP_TO_SHIFT = 24, + CAP_DSTRD_SHIFT = 32, + CAP_NSSRS_SHIFT = 36, + CAP_CSS_SHIFT = 37, + CAP_MPSMIN_SHIFT = 48, + CAP_MPSMAX_SHIFT = 52, + CAP_PMRS_SHIFT = 56, + CAP_CMBS_SHIFT = 57, +}; + +enum NvmeCapMask { + CAP_MQES_MASK = 0xffff, + CAP_CQR_MASK = 0x1, + CAP_AMS_MASK = 0x3, + CAP_TO_MASK = 0xff, + CAP_DSTRD_MASK = 0xf, + CAP_NSSRS_MASK = 0x1, + CAP_CSS_MASK = 0xff, + 
CAP_MPSMIN_MASK = 0xf, + CAP_MPSMAX_MASK = 0xf, + CAP_PMRS_MASK = 0x1, + CAP_CMBS_MASK = 0x1, +}; + +#define NVME_CAP_MQES(cap) (((cap) >> CAP_MQES_SHIFT) & CAP_MQES_MASK) +#define NVME_CAP_CQR(cap) (((cap) >> CAP_CQR_SHIFT) & CAP_CQR_MASK) +#define NVME_CAP_AMS(cap) (((cap) >> CAP_AMS_SHIFT) & CAP_AMS_MASK) +#define NVME_CAP_TO(cap) (((cap) >> CAP_TO_SHIFT) & CAP_TO_MASK) +#define NVME_CAP_DSTRD(cap) (((cap) >> CAP_DSTRD_SHIFT) & CAP_DSTRD_MASK) +#define NVME_CAP_NSSRS(cap) (((cap) >> CAP_NSSRS_SHIFT) & CAP_NSSRS_MASK) +#define NVME_CAP_CSS(cap) (((cap) >> CAP_CSS_SHIFT) & CAP_CSS_MASK) +#define NVME_CAP_MPSMIN(cap)(((cap) >> CAP_MPSMIN_SHIFT) & CAP_MPSMIN_MASK) +#define NVME_CAP_MPSMAX(cap)(((cap) >> CAP_MPSMAX_SHIFT) & CAP_MPSMAX_MASK) +#define NVME_CAP_PMRS(cap) (((cap) >> CAP_PMRS_SHIFT) & CAP_PMRS_MASK) +#define NVME_CAP_CMBS(cap) (((cap) >> CAP_CMBS_SHIFT) & CAP_CMBS_MASK) + +#define NVME_CAP_SET_MQES(cap, val) (cap |= (uint64_t)(val & CAP_MQES_MASK) \ + << CAP_MQES_SHIFT) +#define NVME_CAP_SET_CQR(cap, val) (cap |= (uint64_t)(val & CAP_CQR_MASK) \ + << CAP_CQR_SHIFT) +#define NVME_CAP_SET_AMS(cap, val) (cap |= (uint64_t)(val & CAP_AMS_MASK) \ + << CAP_AMS_SHIFT) +#define NVME_CAP_SET_TO(cap, val) (cap |= (uint64_t)(val & CAP_TO_MASK) \ + << CAP_TO_SHIFT) +#define NVME_CAP_SET_DSTRD(cap, val) (cap |= (uint64_t)(val & CAP_DSTRD_MASK) \ + << CAP_DSTRD_SHIFT) +#define NVME_CAP_SET_NSSRS(cap, val) (cap |= (uint64_t)(val & CAP_NSSRS_MASK) \ + << CAP_NSSRS_SHIFT) +#define NVME_CAP_SET_CSS(cap, val) (cap |= (uint64_t)(val & CAP_CSS_MASK) \ + << CAP_CSS_SHIFT) +#define NVME_CAP_SET_MPSMIN(cap, val) (cap |= (uint64_t)(val & CAP_MPSMIN_MASK)\ + << CAP_MPSMIN_SHIFT) +#define NVME_CAP_SET_MPSMAX(cap, val) (cap |= (uint64_t)(val & CAP_MPSMAX_MASK)\ + << CAP_MPSMAX_SHIFT) +#define NVME_CAP_SET_PMRS(cap, val) (cap |= (uint64_t)(val & CAP_PMRS_MASK) \ + << CAP_PMRS_SHIFT) +#define NVME_CAP_SET_CMBS(cap, val) (cap |= (uint64_t)(val & CAP_CMBS_MASK) \ + << CAP_CMBS_SHIFT) + +enum NvmeCapCss { + NVME_CAP_CSS_NVM = 1 << 0, + NVME_CAP_CSS_CSI_SUPP = 1 << 6, + NVME_CAP_CSS_ADMIN_ONLY = 1 << 7, +}; + +enum NvmeCcShift { + CC_EN_SHIFT = 0, + CC_CSS_SHIFT = 4, + CC_MPS_SHIFT = 7, + CC_AMS_SHIFT = 11, + CC_SHN_SHIFT = 14, + CC_IOSQES_SHIFT = 16, + CC_IOCQES_SHIFT = 20, +}; + +enum NvmeCcMask { + CC_EN_MASK = 0x1, + CC_CSS_MASK = 0x7, + CC_MPS_MASK = 0xf, + CC_AMS_MASK = 0x7, + CC_SHN_MASK = 0x3, + CC_IOSQES_MASK = 0xf, + CC_IOCQES_MASK = 0xf, +}; + +#define NVME_CC_EN(cc) ((cc >> CC_EN_SHIFT) & CC_EN_MASK) +#define NVME_CC_CSS(cc) ((cc >> CC_CSS_SHIFT) & CC_CSS_MASK) +#define NVME_CC_MPS(cc) ((cc >> CC_MPS_SHIFT) & CC_MPS_MASK) +#define NVME_CC_AMS(cc) ((cc >> CC_AMS_SHIFT) & CC_AMS_MASK) +#define NVME_CC_SHN(cc) ((cc >> CC_SHN_SHIFT) & CC_SHN_MASK) +#define NVME_CC_IOSQES(cc) ((cc >> CC_IOSQES_SHIFT) & CC_IOSQES_MASK) +#define NVME_CC_IOCQES(cc) ((cc >> CC_IOCQES_SHIFT) & CC_IOCQES_MASK) + +enum NvmeCcCss { + NVME_CC_CSS_NVM = 0x0, + NVME_CC_CSS_CSI = 0x6, + NVME_CC_CSS_ADMIN_ONLY = 0x7, +}; + +#define NVME_SET_CC_EN(cc, val) \ + (cc |= (uint32_t)((val) & CC_EN_MASK) << CC_EN_SHIFT) +#define NVME_SET_CC_CSS(cc, val) \ + (cc |= (uint32_t)((val) & CC_CSS_MASK) << CC_CSS_SHIFT) +#define NVME_SET_CC_MPS(cc, val) \ + (cc |= (uint32_t)((val) & CC_MPS_MASK) << CC_MPS_SHIFT) +#define NVME_SET_CC_AMS(cc, val) \ + (cc |= (uint32_t)((val) & CC_AMS_MASK) << CC_AMS_SHIFT) +#define NVME_SET_CC_SHN(cc, val) \ + (cc |= (uint32_t)((val) & CC_SHN_MASK) << CC_SHN_SHIFT) +#define NVME_SET_CC_IOSQES(cc, val) \ + (cc |= 
(uint32_t)((val) & CC_IOSQES_MASK) << CC_IOSQES_SHIFT)
+#define NVME_SET_CC_IOCQES(cc, val) \
+ (cc |= (uint32_t)((val) & CC_IOCQES_MASK) << CC_IOCQES_SHIFT)
+
+enum NvmeCstsShift {
+ CSTS_RDY_SHIFT = 0,
+ CSTS_CFS_SHIFT = 1,
+ CSTS_SHST_SHIFT = 2,
+ CSTS_NSSRO_SHIFT = 4,
+};
+
+enum NvmeCstsMask {
+ CSTS_RDY_MASK = 0x1,
+ CSTS_CFS_MASK = 0x1,
+ CSTS_SHST_MASK = 0x3,
+ CSTS_NSSRO_MASK = 0x1,
+};
+
+enum NvmeCsts {
+ NVME_CSTS_READY = 1 << CSTS_RDY_SHIFT,
+ NVME_CSTS_FAILED = 1 << CSTS_CFS_SHIFT,
+ NVME_CSTS_SHST_NORMAL = 0 << CSTS_SHST_SHIFT,
+ NVME_CSTS_SHST_PROGRESS = 1 << CSTS_SHST_SHIFT,
+ NVME_CSTS_SHST_COMPLETE = 2 << CSTS_SHST_SHIFT,
+ NVME_CSTS_NSSRO = 1 << CSTS_NSSRO_SHIFT,
+};
+
+#define NVME_CSTS_RDY(csts) ((csts >> CSTS_RDY_SHIFT) & CSTS_RDY_MASK)
+#define NVME_CSTS_CFS(csts) ((csts >> CSTS_CFS_SHIFT) & CSTS_CFS_MASK)
+#define NVME_CSTS_SHST(csts) ((csts >> CSTS_SHST_SHIFT) & CSTS_SHST_MASK)
+#define NVME_CSTS_NSSRO(csts) ((csts >> CSTS_NSSRO_SHIFT) & CSTS_NSSRO_MASK)
+
+enum NvmeAqaShift {
+ AQA_ASQS_SHIFT = 0,
+ AQA_ACQS_SHIFT = 16,
+};
+
+enum NvmeAqaMask {
+ AQA_ASQS_MASK = 0xfff,
+ AQA_ACQS_MASK = 0xfff,
+};
+
+#define NVME_AQA_ASQS(aqa) ((aqa >> AQA_ASQS_SHIFT) & AQA_ASQS_MASK)
+#define NVME_AQA_ACQS(aqa) ((aqa >> AQA_ACQS_SHIFT) & AQA_ACQS_MASK)
+
+enum NvmeCmblocShift {
+ CMBLOC_BIR_SHIFT = 0,
+ CMBLOC_CQMMS_SHIFT = 3,
+ CMBLOC_CQPDS_SHIFT = 4,
+ CMBLOC_CDPMLS_SHIFT = 5,
+ CMBLOC_CDPCILS_SHIFT = 6,
+ CMBLOC_CDMMMS_SHIFT = 7,
+ CMBLOC_CQDA_SHIFT = 8,
+ CMBLOC_OFST_SHIFT = 12,
+};
+
+enum NvmeCmblocMask {
+ CMBLOC_BIR_MASK = 0x7,
+ CMBLOC_CQMMS_MASK = 0x1,
+ CMBLOC_CQPDS_MASK = 0x1,
+ CMBLOC_CDPMLS_MASK = 0x1,
+ CMBLOC_CDPCILS_MASK = 0x1,
+ CMBLOC_CDMMMS_MASK = 0x1,
+ CMBLOC_CQDA_MASK = 0x1,
+ CMBLOC_OFST_MASK = 0xfffff,
+};
+
+#define NVME_CMBLOC_BIR(cmbloc) \
+ ((cmbloc >> CMBLOC_BIR_SHIFT) & CMBLOC_BIR_MASK)
+#define NVME_CMBLOC_CQMMS(cmbloc) \
+ ((cmbloc >> CMBLOC_CQMMS_SHIFT) & CMBLOC_CQMMS_MASK)
+#define NVME_CMBLOC_CQPDS(cmbloc) \
+ ((cmbloc >> CMBLOC_CQPDS_SHIFT) & CMBLOC_CQPDS_MASK)
+#define NVME_CMBLOC_CDPMLS(cmbloc) \
+ ((cmbloc >> CMBLOC_CDPMLS_SHIFT) & CMBLOC_CDPMLS_MASK)
+#define NVME_CMBLOC_CDPCILS(cmbloc) \
+ ((cmbloc >> CMBLOC_CDPCILS_SHIFT) & CMBLOC_CDPCILS_MASK)
+#define NVME_CMBLOC_CDMMMS(cmbloc) \
+ ((cmbloc >> CMBLOC_CDMMMS_SHIFT) & CMBLOC_CDMMMS_MASK)
+#define NVME_CMBLOC_CQDA(cmbloc) \
+ ((cmbloc >> CMBLOC_CQDA_SHIFT) & CMBLOC_CQDA_MASK)
+#define NVME_CMBLOC_OFST(cmbloc) \
+ ((cmbloc >> CMBLOC_OFST_SHIFT) & CMBLOC_OFST_MASK)
+
+#define NVME_CMBLOC_SET_BIR(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_BIR_MASK) << CMBLOC_BIR_SHIFT)
+#define NVME_CMBLOC_SET_CQMMS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CQMMS_MASK) << CMBLOC_CQMMS_SHIFT)
+#define NVME_CMBLOC_SET_CQPDS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CQPDS_MASK) << CMBLOC_CQPDS_SHIFT)
+#define NVME_CMBLOC_SET_CDPMLS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CDPMLS_MASK) << CMBLOC_CDPMLS_SHIFT)
+#define NVME_CMBLOC_SET_CDPCILS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CDPCILS_MASK) << CMBLOC_CDPCILS_SHIFT)
+#define NVME_CMBLOC_SET_CDMMMS(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CDMMMS_MASK) << CMBLOC_CDMMMS_SHIFT)
+#define NVME_CMBLOC_SET_CQDA(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_CQDA_MASK) << CMBLOC_CQDA_SHIFT)
+#define NVME_CMBLOC_SET_OFST(cmbloc, val) \
+ (cmbloc |= (uint64_t)(val & CMBLOC_OFST_MASK) << CMBLOC_OFST_SHIFT)
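As a point of reference, a hedged sketch of how the shift/mask accessors compose when programming the controller configuration register; the page and queue-entry sizes are illustrative:

    uint32_t cc = 0;

    NVME_SET_CC_CSS(cc, NVME_CC_CSS_NVM);   /* select the NVM command set */
    NVME_SET_CC_MPS(cc, 0);                 /* 2^(12 + 0) = 4 KiB pages */
    NVME_SET_CC_IOSQES(cc, 6);              /* 2^6 = 64-byte SQ entries */
    NVME_SET_CC_IOCQES(cc, 4);              /* 2^4 = 16-byte CQ entries */
    NVME_SET_CC_EN(cc, 1);                  /* finally, enable the controller */

    /* The matching getters reverse the encoding, e.g. NVME_CC_EN(cc) == 1;
     * readiness is then polled via NVME_CSTS_RDY() on the CSTS register. */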
<< CMBMSC_CRE_SHIFT) + +enum NvmeCmbszShift { + CMBSZ_SQS_SHIFT = 0, + CMBSZ_CQS_SHIFT = 1, + CMBSZ_LISTS_SHIFT = 2, + CMBSZ_RDS_SHIFT = 3, + CMBSZ_WDS_SHIFT = 4, + CMBSZ_SZU_SHIFT = 8, + CMBSZ_SZ_SHIFT = 12, +}; + +enum NvmeCmbszMask { + CMBSZ_SQS_MASK = 0x1, + CMBSZ_CQS_MASK = 0x1, + CMBSZ_LISTS_MASK = 0x1, + CMBSZ_RDS_MASK = 0x1, + CMBSZ_WDS_MASK = 0x1, + CMBSZ_SZU_MASK = 0xf, + CMBSZ_SZ_MASK = 0xfffff, +}; + +#define NVME_CMBSZ_SQS(cmbsz) ((cmbsz >> CMBSZ_SQS_SHIFT) & CMBSZ_SQS_MASK) +#define NVME_CMBSZ_CQS(cmbsz) ((cmbsz >> CMBSZ_CQS_SHIFT) & CMBSZ_CQS_MASK) +#define NVME_CMBSZ_LISTS(cmbsz)((cmbsz >> CMBSZ_LISTS_SHIFT) & CMBSZ_LISTS_MASK) +#define NVME_CMBSZ_RDS(cmbsz) ((cmbsz >> CMBSZ_RDS_SHIFT) & CMBSZ_RDS_MASK) +#define NVME_CMBSZ_WDS(cmbsz) ((cmbsz >> CMBSZ_WDS_SHIFT) & CMBSZ_WDS_MASK) +#define NVME_CMBSZ_SZU(cmbsz) ((cmbsz >> CMBSZ_SZU_SHIFT) & CMBSZ_SZU_MASK) +#define NVME_CMBSZ_SZ(cmbsz) ((cmbsz >> CMBSZ_SZ_SHIFT) & CMBSZ_SZ_MASK) + +#define NVME_CMBSZ_SET_SQS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_SQS_MASK) << CMBSZ_SQS_SHIFT) +#define NVME_CMBSZ_SET_CQS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_CQS_MASK) << CMBSZ_CQS_SHIFT) +#define NVME_CMBSZ_SET_LISTS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_LISTS_MASK) << CMBSZ_LISTS_SHIFT) +#define NVME_CMBSZ_SET_RDS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_RDS_MASK) << CMBSZ_RDS_SHIFT) +#define NVME_CMBSZ_SET_WDS(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_WDS_MASK) << CMBSZ_WDS_SHIFT) +#define NVME_CMBSZ_SET_SZU(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_SZU_MASK) << CMBSZ_SZU_SHIFT) +#define NVME_CMBSZ_SET_SZ(cmbsz, val) \ + (cmbsz |= (uint64_t)(val & CMBSZ_SZ_MASK) << CMBSZ_SZ_SHIFT) + +#define NVME_CMBSZ_GETSIZE(cmbsz) \ + (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz)))) + +enum NvmeCmbmscShift { + CMBMSC_CRE_SHIFT = 0, + CMBMSC_CMSE_SHIFT = 1, + CMBMSC_CBA_SHIFT = 12, +}; + +enum NvmeCmbmscMask { + CMBMSC_CRE_MASK = 0x1, + CMBMSC_CMSE_MASK = 0x1, + CMBMSC_CBA_MASK = ((1ULL << 52) - 1), +}; + +#define NVME_CMBMSC_CRE(cmbmsc) \ + ((cmbmsc >> CMBMSC_CRE_SHIFT) & CMBMSC_CRE_MASK) +#define NVME_CMBMSC_CMSE(cmbmsc) \ + ((cmbmsc >> CMBMSC_CMSE_SHIFT) & CMBMSC_CMSE_MASK) +#define NVME_CMBMSC_CBA(cmbmsc) \ + ((cmbmsc >> CMBMSC_CBA_SHIFT) & CMBMSC_CBA_MASK) + + +#define NVME_CMBMSC_SET_CRE(cmbmsc, val) \ + (cmbmsc |= (uint64_t)(val & CMBMSC_CRE_MASK) << CMBMSC_CRE_SHIFT) +#define NVME_CMBMSC_SET_CMSE(cmbmsc, val) \ + (cmbmsc |= (uint64_t)(val & CMBMSC_CMSE_MASK) << CMBMSC_CMSE_SHIFT) +#define NVME_CMBMSC_SET_CBA(cmbmsc, val) \ + (cmbmsc |= (uint64_t)(val & CMBMSC_CBA_MASK) << CMBMSC_CBA_SHIFT) + +enum NvmeCmbstsShift { + CMBSTS_CBAI_SHIFT = 0, +}; +enum NvmeCmbstsMask { + CMBSTS_CBAI_MASK = 0x1, +}; + +#define NVME_CMBSTS_CBAI(cmbsts) \ + ((cmbsts >> CMBSTS_CBAI_SHIFT) & CMBSTS_CBAI_MASK) + +#define NVME_CMBSTS_SET_CBAI(cmbsts, val) \ + (cmbsts |= (uint64_t)(val & CMBSTS_CBAI_MASK) << CMBSTS_CBAI_SHIFT) + +enum NvmePmrcapShift { + PMRCAP_RDS_SHIFT = 3, + PMRCAP_WDS_SHIFT = 4, + PMRCAP_BIR_SHIFT = 5, + PMRCAP_PMRTU_SHIFT = 8, + PMRCAP_PMRWBM_SHIFT = 10, + PMRCAP_PMRTO_SHIFT = 16, + PMRCAP_CMSS_SHIFT = 24, +}; + +enum NvmePmrcapMask { + PMRCAP_RDS_MASK = 0x1, + PMRCAP_WDS_MASK = 0x1, + PMRCAP_BIR_MASK = 0x7, + PMRCAP_PMRTU_MASK = 0x3, + PMRCAP_PMRWBM_MASK = 0xf, + PMRCAP_PMRTO_MASK = 0xff, + PMRCAP_CMSS_MASK = 0x1, +}; + +#define NVME_PMRCAP_RDS(pmrcap) \ + ((pmrcap >> PMRCAP_RDS_SHIFT) & PMRCAP_RDS_MASK) +#define NVME_PMRCAP_WDS(pmrcap) \ + ((pmrcap >> PMRCAP_WDS_SHIFT) & 
PMRCAP_WDS_MASK) +#define NVME_PMRCAP_BIR(pmrcap) \ + ((pmrcap >> PMRCAP_BIR_SHIFT) & PMRCAP_BIR_MASK) +#define NVME_PMRCAP_PMRTU(pmrcap) \ + ((pmrcap >> PMRCAP_PMRTU_SHIFT) & PMRCAP_PMRTU_MASK) +#define NVME_PMRCAP_PMRWBM(pmrcap) \ + ((pmrcap >> PMRCAP_PMRWBM_SHIFT) & PMRCAP_PMRWBM_MASK) +#define NVME_PMRCAP_PMRTO(pmrcap) \ + ((pmrcap >> PMRCAP_PMRTO_SHIFT) & PMRCAP_PMRTO_MASK) +#define NVME_PMRCAP_CMSS(pmrcap) \ + ((pmrcap >> PMRCAP_CMSS_SHIFT) & PMRCAP_CMSS_MASK) + +#define NVME_PMRCAP_SET_RDS(pmrcap, val) \ + (pmrcap |= (uint64_t)(val & PMRCAP_RDS_MASK) << PMRCAP_RDS_SHIFT) +#define NVME_PMRCAP_SET_WDS(pmrcap, val) \ + (pmrcap |= (uint64_t)(val & PMRCAP_WDS_MASK) << PMRCAP_WDS_SHIFT) +#define NVME_PMRCAP_SET_BIR(pmrcap, val) \ + (pmrcap |= (uint64_t)(val & PMRCAP_BIR_MASK) << PMRCAP_BIR_SHIFT) +#define NVME_PMRCAP_SET_PMRTU(pmrcap, val) \ + (pmrcap |= (uint64_t)(val & PMRCAP_PMRTU_MASK) << PMRCAP_PMRTU_SHIFT) +#define NVME_PMRCAP_SET_PMRWBM(pmrcap, val) \ + (pmrcap |= (uint64_t)(val & PMRCAP_PMRWBM_MASK) << PMRCAP_PMRWBM_SHIFT) +#define NVME_PMRCAP_SET_PMRTO(pmrcap, val) \ + (pmrcap |= (uint64_t)(val & PMRCAP_PMRTO_MASK) << PMRCAP_PMRTO_SHIFT) +#define NVME_PMRCAP_SET_CMSS(pmrcap, val) \ + (pmrcap |= (uint64_t)(val & PMRCAP_CMSS_MASK) << PMRCAP_CMSS_SHIFT) + +enum NvmePmrctlShift { + PMRCTL_EN_SHIFT = 0, +}; + +enum NvmePmrctlMask { + PMRCTL_EN_MASK = 0x1, +}; + +#define NVME_PMRCTL_EN(pmrctl) ((pmrctl >> PMRCTL_EN_SHIFT) & PMRCTL_EN_MASK) + +#define NVME_PMRCTL_SET_EN(pmrctl, val) \ + (pmrctl |= (uint64_t)(val & PMRCTL_EN_MASK) << PMRCTL_EN_SHIFT) + +enum NvmePmrstsShift { + PMRSTS_ERR_SHIFT = 0, + PMRSTS_NRDY_SHIFT = 8, + PMRSTS_HSTS_SHIFT = 9, + PMRSTS_CBAI_SHIFT = 12, +}; + +enum NvmePmrstsMask { + PMRSTS_ERR_MASK = 0xff, + PMRSTS_NRDY_MASK = 0x1, + PMRSTS_HSTS_MASK = 0x7, + PMRSTS_CBAI_MASK = 0x1, +}; + +#define NVME_PMRSTS_ERR(pmrsts) \ + ((pmrsts >> PMRSTS_ERR_SHIFT) & PMRSTS_ERR_MASK) +#define NVME_PMRSTS_NRDY(pmrsts) \ + ((pmrsts >> PMRSTS_NRDY_SHIFT) & PMRSTS_NRDY_MASK) +#define NVME_PMRSTS_HSTS(pmrsts) \ + ((pmrsts >> PMRSTS_HSTS_SHIFT) & PMRSTS_HSTS_MASK) +#define NVME_PMRSTS_CBAI(pmrsts) \ + ((pmrsts >> PMRSTS_CBAI_SHIFT) & PMRSTS_CBAI_MASK) + +#define NVME_PMRSTS_SET_ERR(pmrsts, val) \ + (pmrsts |= (uint64_t)(val & PMRSTS_ERR_MASK) << PMRSTS_ERR_SHIFT) +#define NVME_PMRSTS_SET_NRDY(pmrsts, val) \ + (pmrsts |= (uint64_t)(val & PMRSTS_NRDY_MASK) << PMRSTS_NRDY_SHIFT) +#define NVME_PMRSTS_SET_HSTS(pmrsts, val) \ + (pmrsts |= (uint64_t)(val & PMRSTS_HSTS_MASK) << PMRSTS_HSTS_SHIFT) +#define NVME_PMRSTS_SET_CBAI(pmrsts, val) \ + (pmrsts |= (uint64_t)(val & PMRSTS_CBAI_MASK) << PMRSTS_CBAI_SHIFT) + +enum NvmePmrebsShift { + PMREBS_PMRSZU_SHIFT = 0, + PMREBS_RBB_SHIFT = 4, + PMREBS_PMRWBZ_SHIFT = 8, +}; + +enum NvmePmrebsMask { + PMREBS_PMRSZU_MASK = 0xf, + PMREBS_RBB_MASK = 0x1, + PMREBS_PMRWBZ_MASK = 0xffffff, +}; + +#define NVME_PMREBS_PMRSZU(pmrebs) \ + ((pmrebs >> PMREBS_PMRSZU_SHIFT) & PMREBS_PMRSZU_MASK) +#define NVME_PMREBS_RBB(pmrebs) \ + ((pmrebs >> PMREBS_RBB_SHIFT) & PMREBS_RBB_MASK) +#define NVME_PMREBS_PMRWBZ(pmrebs) \ + ((pmrebs >> PMREBS_PMRWBZ_SHIFT) & PMREBS_PMRWBZ_MASK) + +#define NVME_PMREBS_SET_PMRSZU(pmrebs, val) \ + (pmrebs |= (uint64_t)(val & PMREBS_PMRSZU_MASK) << PMREBS_PMRSZU_SHIFT) +#define NVME_PMREBS_SET_RBB(pmrebs, val) \ + (pmrebs |= (uint64_t)(val & PMREBS_RBB_MASK) << PMREBS_RBB_SHIFT) +#define NVME_PMREBS_SET_PMRWBZ(pmrebs, val) \ + (pmrebs |= (uint64_t)(val & PMREBS_PMRWBZ_MASK) << PMREBS_PMRWBZ_SHIFT) + +enum NvmePmrswtpShift { + 
PMRSWTP_PMRSWTU_SHIFT = 0, + PMRSWTP_PMRSWTV_SHIFT = 8, +}; + +enum NvmePmrswtpMask { + PMRSWTP_PMRSWTU_MASK = 0xf, + PMRSWTP_PMRSWTV_MASK = 0xffffff, +}; + +#define NVME_PMRSWTP_PMRSWTU(pmrswtp) \ + ((pmrswtp >> PMRSWTP_PMRSWTU_SHIFT) & PMRSWTP_PMRSWTU_MASK) +#define NVME_PMRSWTP_PMRSWTV(pmrswtp) \ + ((pmrswtp >> PMRSWTP_PMRSWTV_SHIFT) & PMRSWTP_PMRSWTV_MASK) + +#define NVME_PMRSWTP_SET_PMRSWTU(pmrswtp, val) \ + (pmrswtp |= (uint64_t)(val & PMRSWTP_PMRSWTU_MASK) << PMRSWTP_PMRSWTU_SHIFT) +#define NVME_PMRSWTP_SET_PMRSWTV(pmrswtp, val) \ + (pmrswtp |= (uint64_t)(val & PMRSWTP_PMRSWTV_MASK) << PMRSWTP_PMRSWTV_SHIFT) + +enum NvmePmrmsclShift { + PMRMSCL_CMSE_SHIFT = 1, + PMRMSCL_CBA_SHIFT = 12, +}; + +enum NvmePmrmsclMask { + PMRMSCL_CMSE_MASK = 0x1, + PMRMSCL_CBA_MASK = 0xfffff, +}; + +#define NVME_PMRMSCL_CMSE(pmrmscl) \ + ((pmrmscl >> PMRMSCL_CMSE_SHIFT) & PMRMSCL_CMSE_MASK) +#define NVME_PMRMSCL_CBA(pmrmscl) \ + ((pmrmscl >> PMRMSCL_CBA_SHIFT) & PMRMSCL_CBA_MASK) + +#define NVME_PMRMSCL_SET_CMSE(pmrmscl, val) \ + (pmrmscl |= (uint32_t)(val & PMRMSCL_CMSE_MASK) << PMRMSCL_CMSE_SHIFT) +#define NVME_PMRMSCL_SET_CBA(pmrmscl, val) \ + (pmrmscl |= (uint32_t)(val & PMRMSCL_CBA_MASK) << PMRMSCL_CBA_SHIFT) + +enum NvmeSglDescriptorType { + NVME_SGL_DESCR_TYPE_DATA_BLOCK = 0x0, + NVME_SGL_DESCR_TYPE_BIT_BUCKET = 0x1, + NVME_SGL_DESCR_TYPE_SEGMENT = 0x2, + NVME_SGL_DESCR_TYPE_LAST_SEGMENT = 0x3, + NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK = 0x4, + + NVME_SGL_DESCR_TYPE_VENDOR_SPECIFIC = 0xf, +}; + +enum NvmeSglDescriptorSubtype { + NVME_SGL_DESCR_SUBTYPE_ADDRESS = 0x0, +}; + +typedef struct QEMU_PACKED NvmeSglDescriptor { + uint64_t addr; + uint32_t len; + uint8_t rsvd[3]; + uint8_t type; +} NvmeSglDescriptor; + +#define NVME_SGL_TYPE(type) ((type >> 4) & 0xf) +#define NVME_SGL_SUBTYPE(type) (type & 0xf) + +typedef union NvmeCmdDptr { + struct { + uint64_t prp1; + uint64_t prp2; + }; + + NvmeSglDescriptor sgl; +} NvmeCmdDptr; + +enum NvmePsdt { + NVME_PSDT_PRP = 0x0, + NVME_PSDT_SGL_MPTR_CONTIGUOUS = 0x1, + NVME_PSDT_SGL_MPTR_SGL = 0x2, +}; + +typedef struct QEMU_PACKED NvmeCmd { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint64_t res1; + uint64_t mptr; + NvmeCmdDptr dptr; + uint32_t cdw10; + uint32_t cdw11; + uint32_t cdw12; + uint32_t cdw13; + uint32_t cdw14; + uint32_t cdw15; +} NvmeCmd; + +#define NVME_CMD_FLAGS_FUSE(flags) (flags & 0x3) +#define NVME_CMD_FLAGS_PSDT(flags) ((flags >> 6) & 0x3) + +enum NvmeAdminCommands { + NVME_ADM_CMD_DELETE_SQ = 0x00, + NVME_ADM_CMD_CREATE_SQ = 0x01, + NVME_ADM_CMD_GET_LOG_PAGE = 0x02, + NVME_ADM_CMD_DELETE_CQ = 0x04, + NVME_ADM_CMD_CREATE_CQ = 0x05, + NVME_ADM_CMD_IDENTIFY = 0x06, + NVME_ADM_CMD_ABORT = 0x08, + NVME_ADM_CMD_SET_FEATURES = 0x09, + NVME_ADM_CMD_GET_FEATURES = 0x0a, + NVME_ADM_CMD_ASYNC_EV_REQ = 0x0c, + NVME_ADM_CMD_ACTIVATE_FW = 0x10, + NVME_ADM_CMD_DOWNLOAD_FW = 0x11, + NVME_ADM_CMD_NS_ATTACHMENT = 0x15, + NVME_ADM_CMD_FORMAT_NVM = 0x80, + NVME_ADM_CMD_SECURITY_SEND = 0x81, + NVME_ADM_CMD_SECURITY_RECV = 0x82, +}; + +enum NvmeIoCommands { + NVME_CMD_FLUSH = 0x00, + NVME_CMD_WRITE = 0x01, + NVME_CMD_READ = 0x02, + NVME_CMD_WRITE_UNCOR = 0x04, + NVME_CMD_COMPARE = 0x05, + NVME_CMD_WRITE_ZEROES = 0x08, + NVME_CMD_DSM = 0x09, + NVME_CMD_VERIFY = 0x0c, + NVME_CMD_COPY = 0x19, + NVME_CMD_ZONE_MGMT_SEND = 0x79, + NVME_CMD_ZONE_MGMT_RECV = 0x7a, + NVME_CMD_ZONE_APPEND = 0x7d, +}; + +typedef struct QEMU_PACKED NvmeDeleteQ { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t rsvd1[9]; + uint16_t qid; + uint16_t rsvd10; 
+ uint32_t rsvd11[5]; +} NvmeDeleteQ; + +typedef struct QEMU_PACKED NvmeCreateCq { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t rsvd1[5]; + uint64_t prp1; + uint64_t rsvd8; + uint16_t cqid; + uint16_t qsize; + uint16_t cq_flags; + uint16_t irq_vector; + uint32_t rsvd12[4]; +} NvmeCreateCq; + +#define NVME_CQ_FLAGS_PC(cq_flags) (cq_flags & 0x1) +#define NVME_CQ_FLAGS_IEN(cq_flags) ((cq_flags >> 1) & 0x1) + +enum NvmeFlagsCq { + NVME_CQ_PC = 1, + NVME_CQ_IEN = 2, +}; + +typedef struct QEMU_PACKED NvmeCreateSq { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t rsvd1[5]; + uint64_t prp1; + uint64_t rsvd8; + uint16_t sqid; + uint16_t qsize; + uint16_t sq_flags; + uint16_t cqid; + uint32_t rsvd12[4]; +} NvmeCreateSq; + +#define NVME_SQ_FLAGS_PC(sq_flags) (sq_flags & 0x1) +#define NVME_SQ_FLAGS_QPRIO(sq_flags) ((sq_flags >> 1) & 0x3) + +enum NvmeFlagsSq { + NVME_SQ_PC = 1, + + NVME_SQ_PRIO_URGENT = 0, + NVME_SQ_PRIO_HIGH = 1, + NVME_SQ_PRIO_NORMAL = 2, + NVME_SQ_PRIO_LOW = 3, +}; + +typedef struct QEMU_PACKED NvmeIdentify { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint64_t rsvd2[2]; + uint64_t prp1; + uint64_t prp2; + uint8_t cns; + uint8_t rsvd10; + uint16_t ctrlid; + uint16_t nvmsetid; + uint8_t rsvd11; + uint8_t csi; + uint32_t rsvd12[4]; +} NvmeIdentify; + +typedef struct QEMU_PACKED NvmeRwCmd { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint64_t rsvd2; + uint64_t mptr; + NvmeCmdDptr dptr; + uint64_t slba; + uint16_t nlb; + uint16_t control; + uint32_t dsmgmt; + uint32_t reftag; + uint16_t apptag; + uint16_t appmask; +} NvmeRwCmd; + +enum { + NVME_RW_LR = 1 << 15, + NVME_RW_FUA = 1 << 14, + NVME_RW_DSM_FREQ_UNSPEC = 0, + NVME_RW_DSM_FREQ_TYPICAL = 1, + NVME_RW_DSM_FREQ_RARE = 2, + NVME_RW_DSM_FREQ_READS = 3, + NVME_RW_DSM_FREQ_WRITES = 4, + NVME_RW_DSM_FREQ_RW = 5, + NVME_RW_DSM_FREQ_ONCE = 6, + NVME_RW_DSM_FREQ_PREFETCH = 7, + NVME_RW_DSM_FREQ_TEMP = 8, + NVME_RW_DSM_LATENCY_NONE = 0 << 4, + NVME_RW_DSM_LATENCY_IDLE = 1 << 4, + NVME_RW_DSM_LATENCY_NORM = 2 << 4, + NVME_RW_DSM_LATENCY_LOW = 3 << 4, + NVME_RW_DSM_SEQ_REQ = 1 << 6, + NVME_RW_DSM_COMPRESSED = 1 << 7, + NVME_RW_PIREMAP = 1 << 9, + NVME_RW_PRINFO_PRACT = 1 << 13, + NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12, + NVME_RW_PRINFO_PRCHK_APP = 1 << 11, + NVME_RW_PRINFO_PRCHK_REF = 1 << 10, + NVME_RW_PRINFO_PRCHK_MASK = 7 << 10, + +}; + +#define NVME_RW_PRINFO(control) ((control >> 10) & 0xf) + +enum { + NVME_PRINFO_PRACT = 1 << 3, + NVME_PRINFO_PRCHK_GUARD = 1 << 2, + NVME_PRINFO_PRCHK_APP = 1 << 1, + NVME_PRINFO_PRCHK_REF = 1 << 0, + NVME_PRINFO_PRCHK_MASK = 7 << 0, +}; + +typedef struct QEMU_PACKED NvmeDsmCmd { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint64_t rsvd2[2]; + NvmeCmdDptr dptr; + uint32_t nr; + uint32_t attributes; + uint32_t rsvd12[4]; +} NvmeDsmCmd; + +enum { + NVME_DSMGMT_IDR = 1 << 0, + NVME_DSMGMT_IDW = 1 << 1, + NVME_DSMGMT_AD = 1 << 2, +}; + +typedef struct QEMU_PACKED NvmeDsmRange { + uint32_t cattr; + uint32_t nlb; + uint64_t slba; +} NvmeDsmRange; + +enum { + NVME_COPY_FORMAT_0 = 0x0, +}; + +typedef struct QEMU_PACKED NvmeCopyCmd { + uint8_t opcode; + uint8_t flags; + uint16_t cid; + uint32_t nsid; + uint32_t rsvd2[4]; + NvmeCmdDptr dptr; + uint64_t sdlba; + uint8_t nr; + uint8_t control[3]; + uint16_t rsvd13; + uint16_t dspec; + uint32_t reftag; + uint16_t apptag; + uint16_t appmask; +} NvmeCopyCmd; + +typedef struct QEMU_PACKED NvmeCopySourceRange { + uint8_t rsvd0[8]; + uint64_t slba; + uint16_t nlb; + 
uint8_t rsvd18[6]; + uint32_t reftag; + uint16_t apptag; + uint16_t appmask; +} NvmeCopySourceRange; + +enum NvmeAsyncEventRequest { + NVME_AER_TYPE_ERROR = 0, + NVME_AER_TYPE_SMART = 1, + NVME_AER_TYPE_NOTICE = 2, + NVME_AER_TYPE_IO_SPECIFIC = 6, + NVME_AER_TYPE_VENDOR_SPECIFIC = 7, + NVME_AER_INFO_ERR_INVALID_DB_REGISTER = 0, + NVME_AER_INFO_ERR_INVALID_DB_VALUE = 1, + NVME_AER_INFO_ERR_DIAG_FAIL = 2, + NVME_AER_INFO_ERR_PERS_INTERNAL_ERR = 3, + NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR = 4, + NVME_AER_INFO_ERR_FW_IMG_LOAD_ERR = 5, + NVME_AER_INFO_SMART_RELIABILITY = 0, + NVME_AER_INFO_SMART_TEMP_THRESH = 1, + NVME_AER_INFO_SMART_SPARE_THRESH = 2, + NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED = 0, +}; + +typedef struct QEMU_PACKED NvmeAerResult { + uint8_t event_type; + uint8_t event_info; + uint8_t log_page; + uint8_t resv; +} NvmeAerResult; + +typedef struct QEMU_PACKED NvmeZonedResult { + uint64_t slba; +} NvmeZonedResult; + +typedef struct QEMU_PACKED NvmeCqe { + uint32_t result; + uint32_t dw1; + uint16_t sq_head; + uint16_t sq_id; + uint16_t cid; + uint16_t status; +} NvmeCqe; + +enum NvmeStatusCodes { + NVME_SUCCESS = 0x0000, + NVME_INVALID_OPCODE = 0x0001, + NVME_INVALID_FIELD = 0x0002, + NVME_CID_CONFLICT = 0x0003, + NVME_DATA_TRAS_ERROR = 0x0004, + NVME_POWER_LOSS_ABORT = 0x0005, + NVME_INTERNAL_DEV_ERROR = 0x0006, + NVME_CMD_ABORT_REQ = 0x0007, + NVME_CMD_ABORT_SQ_DEL = 0x0008, + NVME_CMD_ABORT_FAILED_FUSE = 0x0009, + NVME_CMD_ABORT_MISSING_FUSE = 0x000a, + NVME_INVALID_NSID = 0x000b, + NVME_CMD_SEQ_ERROR = 0x000c, + NVME_INVALID_SGL_SEG_DESCR = 0x000d, + NVME_INVALID_NUM_SGL_DESCRS = 0x000e, + NVME_DATA_SGL_LEN_INVALID = 0x000f, + NVME_MD_SGL_LEN_INVALID = 0x0010, + NVME_SGL_DESCR_TYPE_INVALID = 0x0011, + NVME_INVALID_USE_OF_CMB = 0x0012, + NVME_INVALID_PRP_OFFSET = 0x0013, + NVME_CMD_SET_CMB_REJECTED = 0x002b, + NVME_INVALID_CMD_SET = 0x002c, + NVME_LBA_RANGE = 0x0080, + NVME_CAP_EXCEEDED = 0x0081, + NVME_NS_NOT_READY = 0x0082, + NVME_NS_RESV_CONFLICT = 0x0083, + NVME_FORMAT_IN_PROGRESS = 0x0084, + NVME_INVALID_CQID = 0x0100, + NVME_INVALID_QID = 0x0101, + NVME_MAX_QSIZE_EXCEEDED = 0x0102, + NVME_ACL_EXCEEDED = 0x0103, + NVME_RESERVED = 0x0104, + NVME_AER_LIMIT_EXCEEDED = 0x0105, + NVME_INVALID_FW_SLOT = 0x0106, + NVME_INVALID_FW_IMAGE = 0x0107, + NVME_INVALID_IRQ_VECTOR = 0x0108, + NVME_INVALID_LOG_ID = 0x0109, + NVME_INVALID_FORMAT = 0x010a, + NVME_FW_REQ_RESET = 0x010b, + NVME_INVALID_QUEUE_DEL = 0x010c, + NVME_FID_NOT_SAVEABLE = 0x010d, + NVME_FEAT_NOT_CHANGEABLE = 0x010e, + NVME_FEAT_NOT_NS_SPEC = 0x010f, + NVME_FW_REQ_SUSYSTEM_RESET = 0x0110, + NVME_NS_ALREADY_ATTACHED = 0x0118, + NVME_NS_PRIVATE = 0x0119, + NVME_NS_NOT_ATTACHED = 0x011a, + NVME_NS_CTRL_LIST_INVALID = 0x011c, + NVME_CONFLICTING_ATTRS = 0x0180, + NVME_INVALID_PROT_INFO = 0x0181, + NVME_WRITE_TO_RO = 0x0182, + NVME_CMD_SIZE_LIMIT = 0x0183, + NVME_ZONE_BOUNDARY_ERROR = 0x01b8, + NVME_ZONE_FULL = 0x01b9, + NVME_ZONE_READ_ONLY = 0x01ba, + NVME_ZONE_OFFLINE = 0x01bb, + NVME_ZONE_INVALID_WRITE = 0x01bc, + NVME_ZONE_TOO_MANY_ACTIVE = 0x01bd, + NVME_ZONE_TOO_MANY_OPEN = 0x01be, + NVME_ZONE_INVAL_TRANSITION = 0x01bf, + NVME_WRITE_FAULT = 0x0280, + NVME_UNRECOVERED_READ = 0x0281, + NVME_E2E_GUARD_ERROR = 0x0282, + NVME_E2E_APP_ERROR = 0x0283, + NVME_E2E_REF_ERROR = 0x0284, + NVME_CMP_FAILURE = 0x0285, + NVME_ACCESS_DENIED = 0x0286, + NVME_DULB = 0x0287, + NVME_MORE = 0x2000, + NVME_DNR = 0x4000, + NVME_NO_COMPLETE = 0xffff, +}; + +typedef struct QEMU_PACKED NvmeFwSlotInfoLog { + uint8_t afi; + uint8_t reserved1[7]; + uint8_t 
frs1[8]; + uint8_t frs2[8]; + uint8_t frs3[8]; + uint8_t frs4[8]; + uint8_t frs5[8]; + uint8_t frs6[8]; + uint8_t frs7[8]; + uint8_t reserved2[448]; +} NvmeFwSlotInfoLog; + +typedef struct QEMU_PACKED NvmeErrorLog { + uint64_t error_count; + uint16_t sqid; + uint16_t cid; + uint16_t status_field; + uint16_t param_error_location; + uint64_t lba; + uint32_t nsid; + uint8_t vs; + uint8_t resv[35]; +} NvmeErrorLog; + +typedef struct QEMU_PACKED NvmeSmartLog { + uint8_t critical_warning; + uint16_t temperature; + uint8_t available_spare; + uint8_t available_spare_threshold; + uint8_t percentage_used; + uint8_t reserved1[26]; + uint64_t data_units_read[2]; + uint64_t data_units_written[2]; + uint64_t host_read_commands[2]; + uint64_t host_write_commands[2]; + uint64_t controller_busy_time[2]; + uint64_t power_cycles[2]; + uint64_t power_on_hours[2]; + uint64_t unsafe_shutdowns[2]; + uint64_t media_errors[2]; + uint64_t number_of_error_log_entries[2]; + uint8_t reserved2[320]; +} NvmeSmartLog; + +#define NVME_SMART_WARN_MAX 6 +enum NvmeSmartWarn { + NVME_SMART_SPARE = 1 << 0, + NVME_SMART_TEMPERATURE = 1 << 1, + NVME_SMART_RELIABILITY = 1 << 2, + NVME_SMART_MEDIA_READ_ONLY = 1 << 3, + NVME_SMART_FAILED_VOLATILE_MEDIA = 1 << 4, + NVME_SMART_PMR_UNRELIABLE = 1 << 5, +}; + +typedef struct NvmeEffectsLog { + uint32_t acs[256]; + uint32_t iocs[256]; + uint8_t resv[2048]; +} NvmeEffectsLog; + +enum { + NVME_CMD_EFF_CSUPP = 1 << 0, + NVME_CMD_EFF_LBCC = 1 << 1, + NVME_CMD_EFF_NCC = 1 << 2, + NVME_CMD_EFF_NIC = 1 << 3, + NVME_CMD_EFF_CCC = 1 << 4, + NVME_CMD_EFF_CSE_MASK = 3 << 16, + NVME_CMD_EFF_UUID_SEL = 1 << 19, +}; + +enum NvmeLogIdentifier { + NVME_LOG_ERROR_INFO = 0x01, + NVME_LOG_SMART_INFO = 0x02, + NVME_LOG_FW_SLOT_INFO = 0x03, + NVME_LOG_CHANGED_NSLIST = 0x04, + NVME_LOG_CMD_EFFECTS = 0x05, +}; + +typedef struct QEMU_PACKED NvmePSD { + uint16_t mp; + uint16_t reserved; + uint32_t enlat; + uint32_t exlat; + uint8_t rrt; + uint8_t rrl; + uint8_t rwt; + uint8_t rwl; + uint8_t resv[16]; +} NvmePSD; + +#define NVME_CONTROLLER_LIST_SIZE 2048 +#define NVME_IDENTIFY_DATA_SIZE 4096 + +enum NvmeIdCns { + NVME_ID_CNS_NS = 0x00, + NVME_ID_CNS_CTRL = 0x01, + NVME_ID_CNS_NS_ACTIVE_LIST = 0x02, + NVME_ID_CNS_NS_DESCR_LIST = 0x03, + NVME_ID_CNS_CS_NS = 0x05, + NVME_ID_CNS_CS_CTRL = 0x06, + NVME_ID_CNS_CS_NS_ACTIVE_LIST = 0x07, + NVME_ID_CNS_NS_PRESENT_LIST = 0x10, + NVME_ID_CNS_NS_PRESENT = 0x11, + NVME_ID_CNS_NS_ATTACHED_CTRL_LIST = 0x12, + NVME_ID_CNS_CTRL_LIST = 0x13, + NVME_ID_CNS_CS_NS_PRESENT_LIST = 0x1a, + NVME_ID_CNS_CS_NS_PRESENT = 0x1b, + NVME_ID_CNS_IO_COMMAND_SET = 0x1c, +}; + +typedef struct QEMU_PACKED NvmeIdCtrl { + uint16_t vid; + uint16_t ssvid; + uint8_t sn[20]; + uint8_t mn[40]; + uint8_t fr[8]; + uint8_t rab; + uint8_t ieee[3]; + uint8_t cmic; + uint8_t mdts; + uint16_t cntlid; + uint32_t ver; + uint32_t rtd3r; + uint32_t rtd3e; + uint32_t oaes; + uint32_t ctratt; + uint8_t rsvd100[11]; + uint8_t cntrltype; + uint8_t fguid[16]; + uint8_t rsvd128[128]; + uint16_t oacs; + uint8_t acl; + uint8_t aerl; + uint8_t frmw; + uint8_t lpa; + uint8_t elpe; + uint8_t npss; + uint8_t avscc; + uint8_t apsta; + uint16_t wctemp; + uint16_t cctemp; + uint16_t mtfa; + uint32_t hmpre; + uint32_t hmmin; + uint8_t tnvmcap[16]; + uint8_t unvmcap[16]; + uint32_t rpmbs; + uint16_t edstt; + uint8_t dsto; + uint8_t fwug; + uint16_t kas; + uint16_t hctma; + uint16_t mntmt; + uint16_t mxtmt; + uint32_t sanicap; + uint8_t rsvd332[180]; + uint8_t sqes; + uint8_t cqes; + uint16_t maxcmd; + uint32_t nn; + uint16_t oncs; + 
uint16_t fuses; + uint8_t fna; + uint8_t vwc; + uint16_t awun; + uint16_t awupf; + uint8_t nvscc; + uint8_t rsvd531; + uint16_t acwu; + uint16_t ocfs; + uint32_t sgls; + uint8_t rsvd540[228]; + uint8_t subnqn[256]; + uint8_t rsvd1024[1024]; + NvmePSD psd[32]; + uint8_t vs[1024]; +} NvmeIdCtrl; + +typedef struct NvmeIdCtrlZoned { + uint8_t zasl; + uint8_t rsvd1[4095]; +} NvmeIdCtrlZoned; + +typedef struct NvmeIdCtrlNvm { + uint8_t vsl; + uint8_t wzsl; + uint8_t wusl; + uint8_t dmrl; + uint32_t dmrsl; + uint64_t dmsl; + uint8_t rsvd16[4080]; +} NvmeIdCtrlNvm; + +enum NvmeIdCtrlOaes { + NVME_OAES_NS_ATTR = 1 << 8, +}; + +enum NvmeIdCtrlOacs { + NVME_OACS_SECURITY = 1 << 0, + NVME_OACS_FORMAT = 1 << 1, + NVME_OACS_FW = 1 << 2, + NVME_OACS_NS_MGMT = 1 << 3, +}; + +enum NvmeIdCtrlOncs { + NVME_ONCS_COMPARE = 1 << 0, + NVME_ONCS_WRITE_UNCORR = 1 << 1, + NVME_ONCS_DSM = 1 << 2, + NVME_ONCS_WRITE_ZEROES = 1 << 3, + NVME_ONCS_FEATURES = 1 << 4, + NVME_ONCS_RESRVATIONS = 1 << 5, + NVME_ONCS_TIMESTAMP = 1 << 6, + NVME_ONCS_VERIFY = 1 << 7, + NVME_ONCS_COPY = 1 << 8, +}; + +enum NvmeIdCtrlOcfs { + NVME_OCFS_COPY_FORMAT_0 = 1 << 0, +}; + +enum NvmeIdctrlVwc { + NVME_VWC_PRESENT = 1 << 0, + NVME_VWC_NSID_BROADCAST_NO_SUPPORT = 0 << 1, + NVME_VWC_NSID_BROADCAST_RESERVED = 1 << 1, + NVME_VWC_NSID_BROADCAST_CTRL_SPEC = 2 << 1, + NVME_VWC_NSID_BROADCAST_SUPPORT = 3 << 1, +}; + +enum NvmeIdCtrlFrmw { + NVME_FRMW_SLOT1_RO = 1 << 0, +}; + +enum NvmeIdCtrlLpa { + NVME_LPA_NS_SMART = 1 << 0, + NVME_LPA_CSE = 1 << 1, + NVME_LPA_EXTENDED = 1 << 2, +}; + +enum NvmeIdCtrlCmic { + NVME_CMIC_MULTI_CTRL = 1 << 1, +}; + +enum NvmeNsAttachmentOperation { + NVME_NS_ATTACHMENT_ATTACH = 0x0, + NVME_NS_ATTACHMENT_DETACH = 0x1, +}; + +#define NVME_CTRL_SQES_MIN(sqes) ((sqes) & 0xf) +#define NVME_CTRL_SQES_MAX(sqes) (((sqes) >> 4) & 0xf) +#define NVME_CTRL_CQES_MIN(cqes) ((cqes) & 0xf) +#define NVME_CTRL_CQES_MAX(cqes) (((cqes) >> 4) & 0xf) + +#define NVME_CTRL_SGLS_SUPPORT_MASK (0x3 << 0) +#define NVME_CTRL_SGLS_SUPPORT_NO_ALIGN (0x1 << 0) +#define NVME_CTRL_SGLS_SUPPORT_DWORD_ALIGN (0x1 << 1) +#define NVME_CTRL_SGLS_KEYED (0x1 << 2) +#define NVME_CTRL_SGLS_BITBUCKET (0x1 << 16) +#define NVME_CTRL_SGLS_MPTR_CONTIGUOUS (0x1 << 17) +#define NVME_CTRL_SGLS_EXCESS_LENGTH (0x1 << 18) +#define NVME_CTRL_SGLS_MPTR_SGL (0x1 << 19) +#define NVME_CTRL_SGLS_ADDR_OFFSET (0x1 << 20) + +#define NVME_ARB_AB(arb) (arb & 0x7) +#define NVME_ARB_AB_NOLIMIT 0x7 +#define NVME_ARB_LPW(arb) ((arb >> 8) & 0xff) +#define NVME_ARB_MPW(arb) ((arb >> 16) & 0xff) +#define NVME_ARB_HPW(arb) ((arb >> 24) & 0xff) + +#define NVME_INTC_THR(intc) (intc & 0xff) +#define NVME_INTC_TIME(intc) ((intc >> 8) & 0xff) + +#define NVME_INTVC_NOCOALESCING (0x1 << 16) + +#define NVME_TEMP_THSEL(temp) ((temp >> 20) & 0x3) +#define NVME_TEMP_THSEL_OVER 0x0 +#define NVME_TEMP_THSEL_UNDER 0x1 + +#define NVME_TEMP_TMPSEL(temp) ((temp >> 16) & 0xf) +#define NVME_TEMP_TMPSEL_COMPOSITE 0x0 + +#define NVME_TEMP_TMPTH(temp) (temp & 0xffff) + +#define NVME_AEC_SMART(aec) (aec & 0xff) +#define NVME_AEC_NS_ATTR(aec) ((aec >> 8) & 0x1) +#define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1) + +#define NVME_ERR_REC_TLER(err_rec) (err_rec & 0xffff) +#define NVME_ERR_REC_DULBE(err_rec) (err_rec & 0x10000) + +enum NvmeFeatureIds { + NVME_ARBITRATION = 0x1, + NVME_POWER_MANAGEMENT = 0x2, + NVME_LBA_RANGE_TYPE = 0x3, + NVME_TEMPERATURE_THRESHOLD = 0x4, + NVME_ERROR_RECOVERY = 0x5, + NVME_VOLATILE_WRITE_CACHE = 0x6, + NVME_NUMBER_OF_QUEUES = 0x7, + NVME_INTERRUPT_COALESCING = 0x8, + 
NVME_INTERRUPT_VECTOR_CONF = 0x9, + NVME_WRITE_ATOMICITY = 0xa, + NVME_ASYNCHRONOUS_EVENT_CONF = 0xb, + NVME_TIMESTAMP = 0xe, + NVME_COMMAND_SET_PROFILE = 0x19, + NVME_SOFTWARE_PROGRESS_MARKER = 0x80, + NVME_FID_MAX = 0x100, +}; + +typedef enum NvmeFeatureCap { + NVME_FEAT_CAP_SAVE = 1 << 0, + NVME_FEAT_CAP_NS = 1 << 1, + NVME_FEAT_CAP_CHANGE = 1 << 2, +} NvmeFeatureCap; + +typedef enum NvmeGetFeatureSelect { + NVME_GETFEAT_SELECT_CURRENT = 0x0, + NVME_GETFEAT_SELECT_DEFAULT = 0x1, + NVME_GETFEAT_SELECT_SAVED = 0x2, + NVME_GETFEAT_SELECT_CAP = 0x3, +} NvmeGetFeatureSelect; + +#define NVME_GETSETFEAT_FID_MASK 0xff +#define NVME_GETSETFEAT_FID(dw10) (dw10 & NVME_GETSETFEAT_FID_MASK) + +#define NVME_GETFEAT_SELECT_SHIFT 8 +#define NVME_GETFEAT_SELECT_MASK 0x7 +#define NVME_GETFEAT_SELECT(dw10) \ + ((dw10 >> NVME_GETFEAT_SELECT_SHIFT) & NVME_GETFEAT_SELECT_MASK) + +#define NVME_SETFEAT_SAVE_SHIFT 31 +#define NVME_SETFEAT_SAVE_MASK 0x1 +#define NVME_SETFEAT_SAVE(dw10) \ + ((dw10 >> NVME_SETFEAT_SAVE_SHIFT) & NVME_SETFEAT_SAVE_MASK) + +typedef struct QEMU_PACKED NvmeRangeType { + uint8_t type; + uint8_t attributes; + uint8_t rsvd2[14]; + uint64_t slba; + uint64_t nlb; + uint8_t guid[16]; + uint8_t rsvd48[16]; +} NvmeRangeType; + +typedef struct QEMU_PACKED NvmeLBAF { + uint16_t ms; + uint8_t ds; + uint8_t rp; +} NvmeLBAF; + +typedef struct QEMU_PACKED NvmeLBAFE { + uint64_t zsze; + uint8_t zdes; + uint8_t rsvd9[7]; +} NvmeLBAFE; + +#define NVME_NSID_BROADCAST 0xffffffff + +typedef struct QEMU_PACKED NvmeIdNs { + uint64_t nsze; + uint64_t ncap; + uint64_t nuse; + uint8_t nsfeat; + uint8_t nlbaf; + uint8_t flbas; + uint8_t mc; + uint8_t dpc; + uint8_t dps; + uint8_t nmic; + uint8_t rescap; + uint8_t fpi; + uint8_t dlfeat; + uint16_t nawun; + uint16_t nawupf; + uint16_t nacwu; + uint16_t nabsn; + uint16_t nabo; + uint16_t nabspf; + uint16_t noiob; + uint8_t nvmcap[16]; + uint16_t npwg; + uint16_t npwa; + uint16_t npdg; + uint16_t npda; + uint16_t nows; + uint16_t mssrl; + uint32_t mcl; + uint8_t msrc; + uint8_t rsvd81[23]; + uint8_t nguid[16]; + uint64_t eui64; + NvmeLBAF lbaf[16]; + uint8_t rsvd192[192]; + uint8_t vs[3712]; +} NvmeIdNs; + +typedef struct QEMU_PACKED NvmeIdNsDescr { + uint8_t nidt; + uint8_t nidl; + uint8_t rsvd2[2]; +} NvmeIdNsDescr; + +enum NvmeNsIdentifierLength { + NVME_NIDL_EUI64 = 8, + NVME_NIDL_NGUID = 16, + NVME_NIDL_UUID = 16, + NVME_NIDL_CSI = 1, +}; + +enum NvmeNsIdentifierType { + NVME_NIDT_EUI64 = 0x01, + NVME_NIDT_NGUID = 0x02, + NVME_NIDT_UUID = 0x03, + NVME_NIDT_CSI = 0x04, +}; + +enum NvmeIdNsNmic { + NVME_NMIC_NS_SHARED = 1 << 0, +}; + +enum NvmeCsi { + NVME_CSI_NVM = 0x00, + NVME_CSI_ZONED = 0x02, +}; + +#define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi))) + +typedef struct QEMU_PACKED NvmeIdNsZoned { + uint16_t zoc; + uint16_t ozcs; + uint32_t mar; + uint32_t mor; + uint32_t rrl; + uint32_t frl; + uint8_t rsvd20[2796]; + NvmeLBAFE lbafe[16]; + uint8_t rsvd3072[768]; + uint8_t vs[256]; +} NvmeIdNsZoned; + +/*Deallocate Logical Block Features*/ +#define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat) ((dlfeat) & 0x10) +#define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat) ((dlfeat) & 0x08) + +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR(dlfeat) ((dlfeat) & 0x7) +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_UNDEFINED 0 +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ZEROES 1 +#define NVME_ID_NS_DLFEAT_READ_BEHAVIOR_ONES 2 + + +#define NVME_ID_NS_NSFEAT_THIN(nsfeat) ((nsfeat & 0x1)) +#define NVME_ID_NS_NSFEAT_DULBE(nsfeat) ((nsfeat >> 2) & 0x1) +#define NVME_ID_NS_FLBAS_EXTENDED(flbas) 
((flbas >> 4) & 0x1) +#define NVME_ID_NS_FLBAS_INDEX(flbas) ((flbas & 0xf)) +#define NVME_ID_NS_MC_SEPARATE(mc) ((mc >> 1) & 0x1) +#define NVME_ID_NS_MC_EXTENDED(mc) ((mc & 0x1)) +#define NVME_ID_NS_DPC_LAST_EIGHT(dpc) ((dpc >> 4) & 0x1) +#define NVME_ID_NS_DPC_FIRST_EIGHT(dpc) ((dpc >> 3) & 0x1) +#define NVME_ID_NS_DPC_TYPE_3(dpc) ((dpc >> 2) & 0x1) +#define NVME_ID_NS_DPC_TYPE_2(dpc) ((dpc >> 1) & 0x1) +#define NVME_ID_NS_DPC_TYPE_1(dpc) ((dpc & 0x1)) +#define NVME_ID_NS_DPC_TYPE_MASK 0x7 + +enum NvmeIdNsDps { + NVME_ID_NS_DPS_TYPE_NONE = 0, + NVME_ID_NS_DPS_TYPE_1 = 1, + NVME_ID_NS_DPS_TYPE_2 = 2, + NVME_ID_NS_DPS_TYPE_3 = 3, + NVME_ID_NS_DPS_TYPE_MASK = 0x7, + NVME_ID_NS_DPS_FIRST_EIGHT = 8, +}; + +enum NvmeIdNsFlbas { + NVME_ID_NS_FLBAS_EXTENDED = 1 << 4, +}; + +enum NvmeIdNsMc { + NVME_ID_NS_MC_EXTENDED = 1 << 0, + NVME_ID_NS_MC_SEPARATE = 1 << 1, +}; + +#define NVME_ID_NS_DPS_TYPE(dps) (dps & NVME_ID_NS_DPS_TYPE_MASK) + +typedef struct NvmeDifTuple { + uint16_t guard; + uint16_t apptag; + uint32_t reftag; +} NvmeDifTuple; + +enum NvmeZoneAttr { + NVME_ZA_FINISHED_BY_CTLR = 1 << 0, + NVME_ZA_FINISH_RECOMMENDED = 1 << 1, + NVME_ZA_RESET_RECOMMENDED = 1 << 2, + NVME_ZA_ZD_EXT_VALID = 1 << 7, +}; + +typedef struct QEMU_PACKED NvmeZoneReportHeader { + uint64_t nr_zones; + uint8_t rsvd[56]; +} NvmeZoneReportHeader; + +enum NvmeZoneReceiveAction { + NVME_ZONE_REPORT = 0, + NVME_ZONE_REPORT_EXTENDED = 1, +}; + +enum NvmeZoneReportType { + NVME_ZONE_REPORT_ALL = 0, + NVME_ZONE_REPORT_EMPTY = 1, + NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2, + NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3, + NVME_ZONE_REPORT_CLOSED = 4, + NVME_ZONE_REPORT_FULL = 5, + NVME_ZONE_REPORT_READ_ONLY = 6, + NVME_ZONE_REPORT_OFFLINE = 7, +}; + +enum NvmeZoneType { + NVME_ZONE_TYPE_RESERVED = 0x00, + NVME_ZONE_TYPE_SEQ_WRITE = 0x02, +}; + +enum NvmeZoneSendAction { + NVME_ZONE_ACTION_RSD = 0x00, + NVME_ZONE_ACTION_CLOSE = 0x01, + NVME_ZONE_ACTION_FINISH = 0x02, + NVME_ZONE_ACTION_OPEN = 0x03, + NVME_ZONE_ACTION_RESET = 0x04, + NVME_ZONE_ACTION_OFFLINE = 0x05, + NVME_ZONE_ACTION_SET_ZD_EXT = 0x10, +}; + +typedef struct QEMU_PACKED NvmeZoneDescr { + uint8_t zt; + uint8_t zs; + uint8_t za; + uint8_t rsvd3[5]; + uint64_t zcap; + uint64_t zslba; + uint64_t wp; + uint8_t rsvd32[32]; +} NvmeZoneDescr; + +typedef enum NvmeZoneState { + NVME_ZONE_STATE_RESERVED = 0x00, + NVME_ZONE_STATE_EMPTY = 0x01, + NVME_ZONE_STATE_IMPLICITLY_OPEN = 0x02, + NVME_ZONE_STATE_EXPLICITLY_OPEN = 0x03, + NVME_ZONE_STATE_CLOSED = 0x04, + NVME_ZONE_STATE_READ_ONLY = 0x0d, + NVME_ZONE_STATE_FULL = 0x0e, + NVME_ZONE_STATE_OFFLINE = 0x0f, +} NvmeZoneState; + +static inline void _nvme_check_size(void) +{ + QEMU_BUILD_BUG_ON(sizeof(NvmeBar) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeAerResult) != 4); + QEMU_BUILD_BUG_ON(sizeof(NvmeZonedResult) != 8); + QEMU_BUILD_BUG_ON(sizeof(NvmeCqe) != 16); + QEMU_BUILD_BUG_ON(sizeof(NvmeDsmRange) != 16); + QEMU_BUILD_BUG_ON(sizeof(NvmeCopySourceRange) != 32); + QEMU_BUILD_BUG_ON(sizeof(NvmeCmd) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeDeleteQ) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeCreateCq) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeCreateSq) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdentify) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeRwCmd) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeDsmCmd) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeCopyCmd) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeRangeType) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeErrorLog) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeFwSlotInfoLog) != 512); + QEMU_BUILD_BUG_ON(sizeof(NvmeSmartLog) != 512); + 
QEMU_BUILD_BUG_ON(sizeof(NvmeEffectsLog) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrl) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlZoned) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdCtrlNvm) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeLBAF) != 4); + QEMU_BUILD_BUG_ON(sizeof(NvmeLBAFE) != 16); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdNs) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsZoned) != 4096); + QEMU_BUILD_BUG_ON(sizeof(NvmeSglDescriptor) != 16); + QEMU_BUILD_BUG_ON(sizeof(NvmeIdNsDescr) != 4); + QEMU_BUILD_BUG_ON(sizeof(NvmeZoneDescr) != 64); + QEMU_BUILD_BUG_ON(sizeof(NvmeDifTuple) != 8); +} +#endif diff --git a/include/block/qapi.h b/include/block/qapi.h new file mode 100644 index 000000000..22c7807c8 --- /dev/null +++ b/include/block/qapi.h @@ -0,0 +1,45 @@ +/* + * Block layer qmp and info dump related functions + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef BLOCK_QAPI_H +#define BLOCK_QAPI_H + +#include "block/block.h" +#include "block/snapshot.h" + +BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, + BlockDriverState *bs, + bool flat, + Error **errp); +int bdrv_query_snapshot_info_list(BlockDriverState *bs, + SnapshotInfoList **p_list, + Error **errp); +void bdrv_query_image_info(BlockDriverState *bs, + ImageInfo **p_info, + Error **errp); + +void bdrv_snapshot_dump(QEMUSnapshotInfo *sn); +void bdrv_image_info_specific_dump(ImageInfoSpecific *info_spec); +void bdrv_image_info_dump(ImageInfo *info); +#endif diff --git a/include/block/qdict.h b/include/block/qdict.h new file mode 100644 index 000000000..ced2acfb9 --- /dev/null +++ b/include/block/qdict.h @@ -0,0 +1,32 @@ +/* + * Special QDict functions used by the block layer + * + * Copyright (c) 2013-2018 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU LGPL, version 2.1 or later. + * See the COPYING.LIB file in the top-level directory. 
+ */ + +#ifndef BLOCK_QDICT_H +#define BLOCK_QDICT_H + +#include "qapi/qmp/qdict.h" + +void qdict_copy_default(QDict *dst, QDict *src, const char *key); +void qdict_set_default_str(QDict *dst, const char *key, const char *val); + +void qdict_join(QDict *dest, QDict *src, bool overwrite); + +void qdict_extract_subqdict(QDict *src, QDict **dst, const char *start); +void qdict_array_split(QDict *src, QList **dst); +int qdict_array_entries(QDict *src, const char *subqdict); + +typedef struct QDictRenames { + const char *from; + const char *to; +} QDictRenames; +bool qdict_rename_keys(QDict *qdict, const QDictRenames *renames, Error **errp); + +Visitor *qobject_input_visitor_new_flat_confused(QDict *qdict, + Error **errp); +#endif diff --git a/include/block/raw-aio.h b/include/block/raw-aio.h new file mode 100644 index 000000000..21fc10c4c --- /dev/null +++ b/include/block/raw-aio.h @@ -0,0 +1,90 @@ +/* + * Declarations for AIO in the raw protocol + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#ifndef QEMU_RAW_AIO_H +#define QEMU_RAW_AIO_H + +#include "block/aio.h" +#include "qemu/coroutine.h" +#include "qemu/iov.h" + +/* AIO request types */ +#define QEMU_AIO_READ 0x0001 +#define QEMU_AIO_WRITE 0x0002 +#define QEMU_AIO_IOCTL 0x0004 +#define QEMU_AIO_FLUSH 0x0008 +#define QEMU_AIO_DISCARD 0x0010 +#define QEMU_AIO_WRITE_ZEROES 0x0020 +#define QEMU_AIO_COPY_RANGE 0x0040 +#define QEMU_AIO_TRUNCATE 0x0080 +#define QEMU_AIO_TYPE_MASK \ + (QEMU_AIO_READ | \ + QEMU_AIO_WRITE | \ + QEMU_AIO_IOCTL | \ + QEMU_AIO_FLUSH | \ + QEMU_AIO_DISCARD | \ + QEMU_AIO_WRITE_ZEROES | \ + QEMU_AIO_COPY_RANGE | \ + QEMU_AIO_TRUNCATE) + +/* AIO flags */ +#define QEMU_AIO_MISALIGNED 0x1000 +#define QEMU_AIO_BLKDEV 0x2000 +#define QEMU_AIO_NO_FALLBACK 0x4000 + + +/* linux-aio.c - Linux native implementation */ +#ifdef CONFIG_LINUX_AIO +typedef struct LinuxAioState LinuxAioState; +LinuxAioState *laio_init(Error **errp); +void laio_cleanup(LinuxAioState *s); +int coroutine_fn laio_co_submit(BlockDriverState *bs, LinuxAioState *s, int fd, + uint64_t offset, QEMUIOVector *qiov, int type, + uint64_t dev_max_batch); +void laio_detach_aio_context(LinuxAioState *s, AioContext *old_context); +void laio_attach_aio_context(LinuxAioState *s, AioContext *new_context); +void laio_io_plug(BlockDriverState *bs, LinuxAioState *s); +void laio_io_unplug(BlockDriverState *bs, LinuxAioState *s, + uint64_t dev_max_batch); +#endif +/* io_uring.c - Linux io_uring implementation */ +#ifdef CONFIG_LINUX_IO_URING +typedef struct LuringState LuringState; +LuringState *luring_init(Error **errp); +void luring_cleanup(LuringState *s); +int coroutine_fn luring_co_submit(BlockDriverState *bs, LuringState *s, int fd, + uint64_t offset, QEMUIOVector *qiov, int type); +void luring_detach_aio_context(LuringState *s, AioContext *old_context); +void luring_attach_aio_context(LuringState *s, AioContext *new_context); +void luring_io_plug(BlockDriverState *bs, LuringState *s); +void luring_io_unplug(BlockDriverState *bs, LuringState *s); +#endif + +#ifdef _WIN32 +typedef struct QEMUWin32AIOState QEMUWin32AIOState; +QEMUWin32AIOState *win32_aio_init(void); +void win32_aio_cleanup(QEMUWin32AIOState *aio); +int win32_aio_attach(QEMUWin32AIOState *aio, 
HANDLE hfile);
+BlockAIOCB *win32_aio_submit(BlockDriverState *bs,
+        QEMUWin32AIOState *aio, HANDLE hfile,
+        uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
+        BlockCompletionFunc *cb, void *opaque, int type);
+void win32_aio_detach_aio_context(QEMUWin32AIOState *aio,
+                                  AioContext *old_context);
+void win32_aio_attach_aio_context(QEMUWin32AIOState *aio,
+                                  AioContext *new_context);
+#endif
+
+#endif /* QEMU_RAW_AIO_H */
diff --git a/include/block/replication.h b/include/block/replication.h
new file mode 100644
index 000000000..21931b4f0
--- /dev/null
+++ b/include/block/replication.h
@@ -0,0 +1,175 @@
+/*
+ * Replication filter
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2016 FUJITSU LIMITED
+ *
+ * Author:
+ *   Changlong Xie <xiecl.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef REPLICATION_H
+#define REPLICATION_H
+
+#include "qapi/qapi-types-block-core.h"
+#include "qemu/module.h"
+#include "qemu/queue.h"
+
+typedef struct ReplicationOps ReplicationOps;
+typedef struct ReplicationState ReplicationState;
+
+/**
+ * SECTION:block/replication.h
+ * @title:Base Replication System
+ * @short_description: interfaces for handling replication
+ *
+ * The replication model provides a framework for handling replication.
+ *
+ * <example>
+ * <title>How to use replication interfaces</title>
+ * <programlisting>
+ * #include "block/replication.h"
+ *
+ * typedef struct BDRVReplicationState {
+ *     ReplicationState *rs;
+ * } BDRVReplicationState;
+ *
+ * static void replication_start(ReplicationState *rs, ReplicationMode mode,
+ *                               Error **errp);
+ * static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
+ * static void replication_get_error(ReplicationState *rs, Error **errp);
+ * static void replication_stop(ReplicationState *rs, bool failover,
+ *                              Error **errp);
+ *
+ * static ReplicationOps replication_ops = {
+ *     .start = replication_start,
+ *     .checkpoint = replication_do_checkpoint,
+ *     .get_error = replication_get_error,
+ *     .stop = replication_stop,
+ * };
+ *
+ * static int replication_open(BlockDriverState *bs, QDict *options,
+ *                             int flags, Error **errp)
+ * {
+ *     BDRVReplicationState *s = bs->opaque;
+ *     s->rs = replication_new(bs, &replication_ops);
+ *     return 0;
+ * }
+ *
+ * static void replication_close(BlockDriverState *bs)
+ * {
+ *     BDRVReplicationState *s = bs->opaque;
+ *     replication_remove(s->rs);
+ * }
+ *
+ * BlockDriver bdrv_replication = {
+ *     .format_name = "replication",
+ *     .instance_size = sizeof(BDRVReplicationState),
+ *
+ *     .bdrv_open = replication_open,
+ *     .bdrv_close = replication_close,
+ * };
+ *
+ * static void bdrv_replication_init(void)
+ * {
+ *     bdrv_register(&bdrv_replication);
+ * }
+ *
+ * block_init(bdrv_replication_init);
+ * </programlisting>
+ * </example>
+ *
+ * The example above shows how to use the replication interfaces. During
+ * migration, the replication_(start/stop/do_checkpoint/get_error)_all
+ * helpers then apply the corresponding operation to every registered
+ * ReplicationState.
+ */
+
+/**
+ * ReplicationState:
+ * @opaque: opaque pointer value passed to this ReplicationState
+ * @ops: replication operations of this ReplicationState
+ * @node: list entry that links this state into the @replication_states QLIST
+ */
+struct ReplicationState {
+    void *opaque;
+    ReplicationOps *ops;
+    QLIST_ENTRY(ReplicationState) node;
+};
+
+/**
+ * ReplicationOps:
+ * @start: callback to start replication
+ * @stop: callback to stop replication
+ * @checkpoint: callback to perform a checkpoint
+ * @get_error: callback to check whether an error occurred during replication
+ */
+struct ReplicationOps {
+    void (*start)(ReplicationState *rs, ReplicationMode mode, Error **errp);
+    void (*stop)(ReplicationState *rs, bool failover, Error **errp);
+    void (*checkpoint)(ReplicationState *rs, Error **errp);
+    void (*get_error)(ReplicationState *rs, Error **errp);
+};
+
+/**
+ * replication_new:
+ * @opaque: opaque pointer value passed to ReplicationState
+ * @ops: replication operations of the new ReplicationState
+ *
+ * Called to create a new ReplicationState instance, and then insert it
+ * into the @replication_states QLIST
+ *
+ * Returns: the new ReplicationState instance
+ */
+ReplicationState *replication_new(void *opaque, ReplicationOps *ops);
+
+/**
+ * replication_remove:
+ * @rs: the ReplicationState instance to remove
+ *
+ * Called to remove a ReplicationState instance, and then delete it from
+ * the @replication_states QLIST
+ */
+void replication_remove(ReplicationState *rs);
+
+/**
+ * replication_start_all:
+ * @mode: replication mode, either "primary" or "secondary"
+ * @errp: returns an error if this function fails
+ *
+ * Start replication; called in the migration/checkpoint thread
+ *
+ * Note: the caller of this function MUST make sure the VM is stopped
+ */
+void replication_start_all(ReplicationMode mode, Error **errp);
+
+/**
+ * replication_do_checkpoint_all:
+ * @errp: returns an error if this function fails
+ *
+ * This interface is called after all VM state is transferred to the
+ * Secondary QEMU
+ */
+void replication_do_checkpoint_all(Error **errp);
+
+/**
+ * replication_get_error_all:
+ * @errp: returns an error if this function fails
+ *
+ * This interface is called to check whether an error occurred during
+ * replication
+ */
+void replication_get_error_all(Error **errp);
+
+/**
+ * replication_stop_all:
+ * @failover: boolean value that indicates whether failover should be done
+ * @errp: returns an error if this function fails
+ *
+ * It is called on failover. If this API is used for anything other than
+ * failover (e.g. to shut down the guest), the VM should be stopped before
+ * calling it.
+ */
+void replication_stop_all(bool failover, Error **errp);
+
+#endif /* REPLICATION_H */
diff --git a/include/block/snapshot.h b/include/block/snapshot.h
new file mode 100644
index 000000000..940345692
--- /dev/null
+++ b/include/block/snapshot.h
@@ -0,0 +1,102 @@
+/*
+ * Block layer snapshot related functions
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef SNAPSHOT_H
+#define SNAPSHOT_H
+
+#include "qapi/qapi-builtin-types.h"
+
+#define SNAPSHOT_OPT_BASE "snapshot."
+#define SNAPSHOT_OPT_ID "snapshot.id"
+#define SNAPSHOT_OPT_NAME "snapshot.name"
+
+extern QemuOptsList internal_snapshot_opts;
+
+typedef struct QEMUSnapshotInfo {
+    char id_str[128]; /* unique snapshot id */
+    /* the following fields are informative. They are not needed for
+       the consistency of the snapshot */
+    char name[256]; /* user chosen name */
+    uint64_t vm_state_size; /* VM state info size */
+    uint32_t date_sec; /* UTC date of the snapshot */
+    uint32_t date_nsec;
+    uint64_t vm_clock_nsec; /* VM clock relative to boot */
+    uint64_t icount; /* record/replay step */
+} QEMUSnapshotInfo;
+
+int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info,
+                       const char *name);
+bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs,
+                                       const char *id,
+                                       const char *name,
+                                       QEMUSnapshotInfo *sn_info,
+                                       Error **errp);
+int bdrv_can_snapshot(BlockDriverState *bs);
+int bdrv_snapshot_create(BlockDriverState *bs,
+                         QEMUSnapshotInfo *sn_info);
+int bdrv_snapshot_goto(BlockDriverState *bs,
+                       const char *snapshot_id,
+                       Error **errp);
+int bdrv_snapshot_delete(BlockDriverState *bs,
+                         const char *snapshot_id,
+                         const char *name,
+                         Error **errp);
+int bdrv_snapshot_list(BlockDriverState *bs,
+                       QEMUSnapshotInfo **psn_info);
+int bdrv_snapshot_load_tmp(BlockDriverState *bs,
+                           const char *snapshot_id,
+                           const char *name,
+                           Error **errp);
+int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs,
+                                         const char *id_or_name,
+                                         Error **errp);
+
+
+/* Group operations. All block drivers are involved.
+ * These functions will properly handle dataplane (taking aio_context_acquire
+ * when appropriate for the relevant block drivers). */
+
+bool bdrv_all_can_snapshot(bool has_devices, strList *devices,
+                           Error **errp);
+int bdrv_all_delete_snapshot(const char *name,
+                             bool has_devices, strList *devices,
+                             Error **errp);
+int bdrv_all_goto_snapshot(const char *name,
+                           bool has_devices, strList *devices,
+                           Error **errp);
+int bdrv_all_has_snapshot(const char *name,
+                          bool has_devices, strList *devices,
+                          Error **errp);
+int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
+                             BlockDriverState *vm_state_bs,
+                             uint64_t vm_state_size,
+                             bool has_devices,
+                             strList *devices,
+                             Error **errp);
+
+BlockDriverState *bdrv_all_find_vmstate_bs(const char *vmstate_bs,
+                                           bool has_devices, strList *devices,
+                                           Error **errp);
+
+#endif
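As a usage sketch for the group snapshot operations above: this is an editorial illustration, not part of the patch; the example_group_snapshot name is invented, and vm_state_size is 0 only because no VM state is saved here.

#include "qemu/osdep.h"
#include "qemu/cutils.h"
#include "block/snapshot.h"

/* Create a snapshot named @name on all snapshottable devices, storing the
 * (empty) VM state in the device chosen by bdrv_all_find_vmstate_bs(). */
static int example_group_snapshot(const char *name, Error **errp)
{
    QEMUSnapshotInfo sn = {};
    BlockDriverState *vm_state_bs;

    if (!bdrv_all_can_snapshot(false, NULL, errp)) {
        return -1;                  /* some device cannot take snapshots */
    }

    vm_state_bs = bdrv_all_find_vmstate_bs(NULL, false, NULL, errp);
    if (!vm_state_bs) {
        return -1;
    }

    pstrcpy(sn.name, sizeof(sn.name), name);
    /* has_devices=false, devices=NULL: operate on all block devices. */
    return bdrv_all_create_snapshot(&sn, vm_state_bs, 0, false, NULL, errp);
}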
diff --git a/include/block/thread-pool.h b/include/block/thread-pool.h
new file mode 100644
index 000000000..7dd7d730a
--- /dev/null
+++ b/include/block/thread-pool.h
@@ -0,0 +1,37 @@
+/*
+ * QEMU block layer thread pool
+ *
+ * Copyright IBM, Corp. 2008
+ * Copyright Red Hat, Inc. 2012
+ *
+ * Authors:
+ *  Anthony Liguori <aliguori@us.ibm.com>
+ *  Paolo Bonzini <pbonzini@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#ifndef QEMU_THREAD_POOL_H
+#define QEMU_THREAD_POOL_H
+
+#include "block/block.h"
+
+typedef int ThreadPoolFunc(void *opaque);
+
+typedef struct ThreadPool ThreadPool;
+
+ThreadPool *thread_pool_new(struct AioContext *ctx);
+void thread_pool_free(ThreadPool *pool);
+
+BlockAIOCB *thread_pool_submit_aio(ThreadPool *pool,
+                                   ThreadPoolFunc *func, void *arg,
+                                   BlockCompletionFunc *cb, void *opaque);
+int coroutine_fn thread_pool_submit_co(ThreadPool *pool,
+                                       ThreadPoolFunc *func, void *arg);
+void thread_pool_submit(ThreadPool *pool, ThreadPoolFunc *func, void *arg);
+
+#endif
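A similar sketch for the thread pool just above, again an editorial illustration rather than part of the patch: the example_* names are invented, and the doubling "work" stands in for a real blocking operation such as a syscall.

#include "qemu/osdep.h"
#include "block/thread-pool.h"

/* Runs in a worker thread; must not touch AioContext state. */
static int example_blocking_work(void *opaque)
{
    int *arg = opaque;

    return *arg * 2;    /* stand-in for a blocking syscall */
}

/* Runs back in the pool's AioContext once the worker returns; @ret is
 * example_blocking_work()'s return value. */
static void example_work_done(void *opaque, int ret)
{
}

static void example_submit(AioContext *ctx, int *arg)
{
    ThreadPool *pool = thread_pool_new(ctx);

    thread_pool_submit_aio(pool, example_blocking_work, arg,
                           example_work_done, NULL);
    /* ... later, once all work has completed: thread_pool_free(pool). */
}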
diff --git a/include/block/throttle-groups.h b/include/block/throttle-groups.h
new file mode 100644
index 000000000..9541b3243
--- /dev/null
+++ b/include/block/throttle-groups.h
@@ -0,0 +1,91 @@
+/*
+ * QEMU block throttling group infrastructure
+ *
+ * Copyright (C) Nodalink, EURL. 2014
+ * Copyright (C) Igalia, S.L. 2015
+ *
+ * Authors:
+ *   BenoƮt Canet <benoit.canet@nodalink.com>
+ *   Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 or
+ * (at your option) version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef THROTTLE_GROUPS_H
+#define THROTTLE_GROUPS_H
+
+#include "qemu/throttle.h"
+#include "block/block_int.h"
+#include "qom/object.h"
+
+/* The ThrottleGroupMember structure indicates membership in a ThrottleGroup
+ * and holds related data.
+ */
+
+typedef struct ThrottleGroupMember {
+    AioContext *aio_context;
+    /* throttled_reqs_lock protects the CoQueues for throttled requests. */
+    CoMutex throttled_reqs_lock;
+    CoQueue throttled_reqs[2];
+
+    /* Nonzero if the I/O limits are currently being ignored; generally
+     * it is zero. Accessed with atomic operations.
+     */
+    unsigned int io_limits_disabled;
+
+    /* Number of pending throttle_group_restart_queue_entry() coroutines.
+     * Accessed with atomic operations.
+     */
+    unsigned int restart_pending;
+
+    /* The following fields are protected by the ThrottleGroup lock.
+     * See the ThrottleGroup documentation for details.
+     * throttle_state tells us if I/O limits are configured. */
+    ThrottleState *throttle_state;
+    ThrottleTimers throttle_timers;
+    unsigned pending_reqs[2];
+    QLIST_ENTRY(ThrottleGroupMember) round_robin;
+
+} ThrottleGroupMember;
+
+#define TYPE_THROTTLE_GROUP "throttle-group"
+OBJECT_DECLARE_SIMPLE_TYPE(ThrottleGroup, THROTTLE_GROUP)
+
+const char *throttle_group_get_name(ThrottleGroupMember *tgm);
+
+ThrottleState *throttle_group_incref(const char *name);
+void throttle_group_unref(ThrottleState *ts);
+
+void throttle_group_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg);
+void throttle_group_get_config(ThrottleGroupMember *tgm, ThrottleConfig *cfg);
+
+void throttle_group_register_tgm(ThrottleGroupMember *tgm,
+                                 const char *groupname,
+                                 AioContext *ctx);
+void throttle_group_unregister_tgm(ThrottleGroupMember *tgm);
+void throttle_group_restart_tgm(ThrottleGroupMember *tgm);
+
+void coroutine_fn throttle_group_co_io_limits_intercept(ThrottleGroupMember *tgm,
+                                                        int64_t bytes,
+                                                        bool is_write);
+void throttle_group_attach_aio_context(ThrottleGroupMember *tgm,
+                                       AioContext *new_context);
+void throttle_group_detach_aio_context(ThrottleGroupMember *tgm);
+/*
+ * throttle_group_exists() must be called under the global
+ * mutex.
+ */
+bool throttle_group_exists(const char *name);
+
+#endif
diff --git a/include/block/write-threshold.h b/include/block/write-threshold.h
new file mode 100644
index 000000000..f50f923e7
--- /dev/null
+++ b/include/block/write-threshold.h
@@ -0,0 +1,47 @@
+/*
+ * QEMU System Emulator block write threshold notification
+ *
+ * Copyright Red Hat, Inc. 2014
+ *
+ * Authors:
+ *  Francesco Romani <fromani@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#ifndef BLOCK_WRITE_THRESHOLD_H
+#define BLOCK_WRITE_THRESHOLD_H
+
+#include "qemu/typedefs.h"
+
+/*
+ * bdrv_write_threshold_set:
+ *
+ * Set the write threshold for block devices, in bytes.
+ * Notify when a write exceeds the threshold, meaning the device
+ * is becoming full, so it can be transparently resized.
+ * To be used with thin-provisioned block devices.
+ *
+ * Use threshold_bytes == 0 to disable.
+ */
+void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes);
+
+/*
+ * bdrv_write_threshold_get
+ *
+ * Get the configured write threshold, in bytes.
+ * Zero means no threshold configured.
+ */
+uint64_t bdrv_write_threshold_get(const BlockDriverState *bs);
+
+/*
+ * bdrv_write_threshold_check_write
+ *
+ * Check whether the specified request exceeds the write threshold.
+ * If so, send a corresponding event and disable write threshold checking.
+ */
+void bdrv_write_threshold_check_write(BlockDriverState *bs, int64_t offset,
+                                      int64_t bytes);
+
+#endif
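Finally, a sketch of how the write-threshold API above is intended to be driven. This too is an editorial illustration, not part of the patch: a real caller would be the block layer's write path, the example_* names are invented, and the 1 GiB figure is arbitrary.

#include "qemu/osdep.h"
#include "block/write-threshold.h"

/* Arm a 1 GiB threshold on a thin-provisioned image; passing 0 instead
 * would disable the notification again. */
static void example_arm_threshold(BlockDriverState *bs)
{
    bdrv_write_threshold_set(bs, 1ULL << 30);
}

/* Called for each write request: if a threshold is armed and the range
 * [offset, offset + bytes) crosses it, the corresponding event is emitted
 * and the threshold is disarmed until it is set again. */
static void example_on_write(BlockDriverState *bs, int64_t offset,
                             int64_t bytes)
{
    if (bdrv_write_threshold_get(bs) > 0) {
        bdrv_write_threshold_check_write(bs, offset, bytes);
    }
}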