Diffstat (limited to 'block/export')
-rw-r--r--  block/export/export.c                 364
-rw-r--r--  block/export/fuse.c                   812
-rw-r--r--  block/export/meson.build                7
-rw-r--r--  block/export/vhost-user-blk-server.c  533
-rw-r--r--  block/export/vhost-user-blk-server.h   19
5 files changed, 1735 insertions, 0 deletions
diff --git a/block/export/export.c b/block/export/export.c
new file mode 100644
index 000000000..6d3b9964c
--- /dev/null
+++ b/block/export/export.c
@@ -0,0 +1,364 @@
+/*
+ * Common block export infrastructure
+ *
+ * Copyright (c) 2012, 2020 Red Hat, Inc.
+ *
+ * Authors:
+ * Paolo Bonzini <pbonzini@redhat.com>
+ * Kevin Wolf <kwolf@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "block/block.h"
+#include "sysemu/block-backend.h"
+#include "sysemu/iothread.h"
+#include "block/export.h"
+#include "block/fuse.h"
+#include "block/nbd.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-block-export.h"
+#include "qapi/qapi-events-block-export.h"
+#include "qemu/id.h"
+#ifdef CONFIG_VHOST_USER_BLK_SERVER
+#include "vhost-user-blk-server.h"
+#endif
+
+static const BlockExportDriver *blk_exp_drivers[] = {
+ &blk_exp_nbd,
+#ifdef CONFIG_VHOST_USER_BLK_SERVER
+ &blk_exp_vhost_user_blk,
+#endif
+#ifdef CONFIG_FUSE
+ &blk_exp_fuse,
+#endif
+};
+
+/* Only accessed from the main thread */
+static QLIST_HEAD(, BlockExport) block_exports =
+ QLIST_HEAD_INITIALIZER(block_exports);
+
+BlockExport *blk_exp_find(const char *id)
+{
+ BlockExport *exp;
+
+ QLIST_FOREACH(exp, &block_exports, next) {
+ if (strcmp(id, exp->id) == 0) {
+ return exp;
+ }
+ }
+
+ return NULL;
+}
+
+static const BlockExportDriver *blk_exp_find_driver(BlockExportType type)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(blk_exp_drivers); i++) {
+ if (blk_exp_drivers[i]->type == type) {
+ return blk_exp_drivers[i];
+ }
+ }
+ return NULL;
+}
+
+BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp)
+{
+ bool fixed_iothread = export->has_fixed_iothread && export->fixed_iothread;
+ const BlockExportDriver *drv;
+ BlockExport *exp = NULL;
+ BlockDriverState *bs;
+ BlockBackend *blk = NULL;
+ AioContext *ctx;
+ uint64_t perm;
+ int ret;
+
+ if (!id_wellformed(export->id)) {
+ error_setg(errp, "Invalid block export id");
+ return NULL;
+ }
+ if (blk_exp_find(export->id)) {
+ error_setg(errp, "Block export id '%s' is already in use", export->id);
+ return NULL;
+ }
+
+ drv = blk_exp_find_driver(export->type);
+ if (!drv) {
+ error_setg(errp, "No driver found for the requested export type");
+ return NULL;
+ }
+
+ bs = bdrv_lookup_bs(NULL, export->node_name, errp);
+ if (!bs) {
+ return NULL;
+ }
+
+ if (!export->has_writable) {
+ export->writable = false;
+ }
+ if (bdrv_is_read_only(bs) && export->writable) {
+ error_setg(errp, "Cannot export read-only node as writable");
+ return NULL;
+ }
+
+ ctx = bdrv_get_aio_context(bs);
+ aio_context_acquire(ctx);
+
+ if (export->has_iothread) {
+ IOThread *iothread;
+ AioContext *new_ctx;
+ Error **set_context_errp;
+
+ iothread = iothread_by_id(export->iothread);
+ if (!iothread) {
+ error_setg(errp, "iothread \"%s\" not found", export->iothread);
+ goto fail;
+ }
+
+ new_ctx = iothread_get_aio_context(iothread);
+
+ /* Ignore errors with fixed-iothread=false */
+ set_context_errp = fixed_iothread ? errp : NULL;
+ ret = bdrv_try_set_aio_context(bs, new_ctx, set_context_errp);
+ if (ret == 0) {
+ aio_context_release(ctx);
+ aio_context_acquire(new_ctx);
+ ctx = new_ctx;
+ } else if (fixed_iothread) {
+ goto fail;
+ }
+ }
+
+ /*
+ * Block exports are used for non-shared storage migration. Make sure
+ * that BDRV_O_INACTIVE is cleared and the image is ready for write
+ * access since the export could be available before migration handover.
+ * ctx was acquired in the caller.
+ */
+ bdrv_invalidate_cache(bs, NULL);
+
+ perm = BLK_PERM_CONSISTENT_READ;
+ if (export->writable) {
+ perm |= BLK_PERM_WRITE;
+ }
+
+ blk = blk_new(ctx, perm, BLK_PERM_ALL);
+
+ if (!fixed_iothread) {
+ blk_set_allow_aio_context_change(blk, true);
+ }
+
+ ret = blk_insert_bs(blk, bs, errp);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ if (!export->has_writethrough) {
+ export->writethrough = false;
+ }
+ blk_set_enable_write_cache(blk, !export->writethrough);
+
+ assert(drv->instance_size >= sizeof(BlockExport));
+ exp = g_malloc0(drv->instance_size);
+ *exp = (BlockExport) {
+ .drv = drv,
+ .refcount = 1,
+ .user_owned = true,
+ .id = g_strdup(export->id),
+ .ctx = ctx,
+ .blk = blk,
+ };
+
+ ret = drv->create(exp, export, errp);
+ if (ret < 0) {
+ goto fail;
+ }
+
+ assert(exp->blk != NULL);
+
+ QLIST_INSERT_HEAD(&block_exports, exp, next);
+
+ aio_context_release(ctx);
+ return exp;
+
+fail:
+ blk_unref(blk);
+ aio_context_release(ctx);
+ if (exp) {
+ g_free(exp->id);
+ g_free(exp);
+ }
+ return NULL;
+}
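+
+/*
+ * Illustrative only (hypothetical id and node-name): exports reach this
+ * function through the QMP command block-export-add, handled by
+ * qmp_block_export_add() below, e.g.:
+ *
+ *   -> { "execute": "block-export-add",
+ *        "arguments": { "type": "nbd", "id": "exp0",
+ *                       "node-name": "disk0", "writable": true } }
+ *   <- { "return": {} }
+ */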
+
+/* Callers must hold exp->ctx lock */
+void blk_exp_ref(BlockExport *exp)
+{
+ assert(exp->refcount > 0);
+ exp->refcount++;
+}
+
+/* Runs in the main thread */
+static void blk_exp_delete_bh(void *opaque)
+{
+ BlockExport *exp = opaque;
+ AioContext *aio_context = exp->ctx;
+
+ aio_context_acquire(aio_context);
+
+ assert(exp->refcount == 0);
+ QLIST_REMOVE(exp, next);
+ exp->drv->delete(exp);
+ blk_unref(exp->blk);
+ qapi_event_send_block_export_deleted(exp->id);
+ g_free(exp->id);
+ g_free(exp);
+
+ aio_context_release(aio_context);
+}
+
+/* Callers must hold exp->ctx lock */
+void blk_exp_unref(BlockExport *exp)
+{
+ assert(exp->refcount > 0);
+ if (--exp->refcount == 0) {
+ /* Touch the block_exports list only in the main thread */
+ aio_bh_schedule_oneshot(qemu_get_aio_context(), blk_exp_delete_bh,
+ exp);
+ }
+}
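+
+/*
+ * A sketch of the intended reference pattern (not itself part of this
+ * patch): export drivers hold a reference while they are busy with a
+ * request, e.g.
+ *
+ *   blk_exp_ref(exp);     // before processing a client request
+ *   ...serve the request...
+ *   blk_exp_unref(exp);   // may schedule blk_exp_delete_bh()
+ *
+ * as read_from_fuse_export() in fuse.c does around request processing.
+ */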
+
+/*
+ * Drops the user reference to the export and requests that all client
+ * connections and other internally held references start to shut down. When
+ * the function returns, there may still be active references while the export
+ * is in the process of shutting down.
+ *
+ * Acquires exp->ctx internally. Callers must *not* hold the lock.
+ */
+void blk_exp_request_shutdown(BlockExport *exp)
+{
+ AioContext *aio_context = exp->ctx;
+
+ aio_context_acquire(aio_context);
+
+ /*
+ * If the user doesn't own the export any more, it is already shutting
+ * down. We must not call .request_shutdown and decrease the refcount a
+ * second time.
+ */
+ if (!exp->user_owned) {
+ goto out;
+ }
+
+ exp->drv->request_shutdown(exp);
+
+ assert(exp->user_owned);
+ exp->user_owned = false;
+ blk_exp_unref(exp);
+
+out:
+ aio_context_release(aio_context);
+}
+
+/*
+ * Returns whether a block export of the given type exists.
+ * type == BLOCK_EXPORT_TYPE__MAX checks for an export of any type.
+ */
+static bool blk_exp_has_type(BlockExportType type)
+{
+ BlockExport *exp;
+
+ if (type == BLOCK_EXPORT_TYPE__MAX) {
+ return !QLIST_EMPTY(&block_exports);
+ }
+
+ QLIST_FOREACH(exp, &block_exports, next) {
+ if (exp->drv->type == type) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/* type == BLOCK_EXPORT_TYPE__MAX for all types */
+void blk_exp_close_all_type(BlockExportType type)
+{
+ BlockExport *exp, *next;
+
+ assert(in_aio_context_home_thread(qemu_get_aio_context()));
+
+ QLIST_FOREACH_SAFE(exp, &block_exports, next, next) {
+ if (type != BLOCK_EXPORT_TYPE__MAX && exp->drv->type != type) {
+ continue;
+ }
+ blk_exp_request_shutdown(exp);
+ }
+
+ AIO_WAIT_WHILE(NULL, blk_exp_has_type(type));
+}
+
+void blk_exp_close_all(void)
+{
+ blk_exp_close_all_type(BLOCK_EXPORT_TYPE__MAX);
+}
+
+void qmp_block_export_add(BlockExportOptions *export, Error **errp)
+{
+ blk_exp_add(export, errp);
+}
+
+void qmp_block_export_del(const char *id,
+ bool has_mode, BlockExportRemoveMode mode,
+ Error **errp)
+{
+ ERRP_GUARD();
+ BlockExport *exp;
+
+ exp = blk_exp_find(id);
+ if (exp == NULL) {
+ error_setg(errp, "Export '%s' is not found", id);
+ return;
+ }
+ if (!exp->user_owned) {
+ error_setg(errp, "Export '%s' is already shutting down", id);
+ return;
+ }
+
+ if (!has_mode) {
+ mode = BLOCK_EXPORT_REMOVE_MODE_SAFE;
+ }
+ if (mode == BLOCK_EXPORT_REMOVE_MODE_SAFE && exp->refcount > 1) {
+ error_setg(errp, "export '%s' still in use", exp->id);
+ error_append_hint(errp, "Use mode='hard' to force client "
+ "disconnect\n");
+ return;
+ }
+
+ blk_exp_request_shutdown(exp);
+}
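+
+/*
+ * Example (hypothetical id): with mode='safe' (the default), removal
+ * fails while clients still hold references; mode='hard' forces it:
+ *
+ *   -> { "execute": "block-export-del",
+ *        "arguments": { "id": "exp0", "mode": "hard" } }
+ *   <- { "return": {} }
+ */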
+
+BlockExportInfoList *qmp_query_block_exports(Error **errp)
+{
+ BlockExportInfoList *head = NULL, **tail = &head;
+ BlockExport *exp;
+
+ QLIST_FOREACH(exp, &block_exports, next) {
+ BlockExportInfo *info = g_new(BlockExportInfo, 1);
+ *info = (BlockExportInfo) {
+ .id = g_strdup(exp->id),
+ .type = exp->drv->type,
+ .node_name = g_strdup(bdrv_get_node_name(blk_bs(exp->blk))),
+ .shutting_down = !exp->user_owned,
+ };
+
+ QAPI_LIST_APPEND(tail, info);
+ }
+
+ return head;
+}
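+
+/*
+ * Example output (illustrative values only):
+ *
+ *   -> { "execute": "query-block-exports" }
+ *   <- { "return": [ { "id": "exp0", "type": "nbd",
+ *                      "node-name": "disk0",
+ *                      "shutting-down": false } ] }
+ */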
diff --git a/block/export/fuse.c b/block/export/fuse.c
new file mode 100644
index 000000000..823c126d2
--- /dev/null
+++ b/block/export/fuse.c
@@ -0,0 +1,812 @@
+/*
+ * Present a block device as a raw image through FUSE
+ *
+ * Copyright (c) 2020 Max Reitz <mreitz@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 or later of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define FUSE_USE_VERSION 31
+
+#include "qemu/osdep.h"
+#include "block/aio.h"
+#include "block/block.h"
+#include "block/export.h"
+#include "block/fuse.h"
+#include "block/qapi.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-block.h"
+#include "sysemu/block-backend.h"
+
+#include <fuse.h>
+#include <fuse_lowlevel.h>
+
+#if defined(CONFIG_FALLOCATE_ZERO_RANGE)
+#include <linux/falloc.h>
+#endif
+
+#ifdef __linux__
+#include <linux/fs.h>
+#endif
+
+/* Prevent overly long bounce buffer allocations */
+#define FUSE_MAX_BOUNCE_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 64 * 1024 * 1024))
+
+
+typedef struct FuseExport {
+ BlockExport common;
+
+ struct fuse_session *fuse_session;
+ struct fuse_buf fuse_buf;
+ bool mounted, fd_handler_set_up;
+
+ char *mountpoint;
+ bool writable;
+ bool growable;
+ /* Whether allow_other was used as a mount option or not */
+ bool allow_other;
+
+ mode_t st_mode;
+ uid_t st_uid;
+ gid_t st_gid;
+} FuseExport;
+
+static GHashTable *exports;
+static const struct fuse_lowlevel_ops fuse_ops;
+
+static void fuse_export_shutdown(BlockExport *exp);
+static void fuse_export_delete(BlockExport *exp);
+
+static void init_exports_table(void);
+
+static int setup_fuse_export(FuseExport *exp, const char *mountpoint,
+ bool allow_other, Error **errp);
+static void read_from_fuse_export(void *opaque);
+
+static bool is_regular_file(const char *path, Error **errp);
+
+
+static int fuse_export_create(BlockExport *blk_exp,
+ BlockExportOptions *blk_exp_args,
+ Error **errp)
+{
+ FuseExport *exp = container_of(blk_exp, FuseExport, common);
+ BlockExportOptionsFuse *args = &blk_exp_args->u.fuse;
+ int ret;
+
+ assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE);
+
+ /* For growable exports, take the RESIZE permission */
+ if (args->growable) {
+ uint64_t blk_perm, blk_shared_perm;
+
+ blk_get_perm(exp->common.blk, &blk_perm, &blk_shared_perm);
+
+ ret = blk_set_perm(exp->common.blk, blk_perm | BLK_PERM_RESIZE,
+ blk_shared_perm, errp);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ init_exports_table();
+
+ /*
+ * It is important to do this check before calling is_regular_file() --
+ * that function will do a stat(), which we would have to handle if we
+ * already exported something on @mountpoint. But we cannot, because
+ * we are currently caught up here.
+ * (Note that ideally we would want to resolve relative paths here,
+ * but bdrv_make_absolute_filename() might do the wrong thing for
+ * paths that contain colons, and realpath() would resolve symlinks,
+ * which we do not want: The mount point is not going to be the
+ * symlink's destination, but the link itself.)
+ * So this will not catch all potential clashes, but hopefully at
+ * least the most common one of specifying exactly the same path
+ * string twice.
+ */
+ if (g_hash_table_contains(exports, args->mountpoint)) {
+ error_setg(errp, "There already is a FUSE export on '%s'",
+ args->mountpoint);
+ ret = -EEXIST;
+ goto fail;
+ }
+
+ if (!is_regular_file(args->mountpoint, errp)) {
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ exp->mountpoint = g_strdup(args->mountpoint);
+ exp->writable = blk_exp_args->writable;
+ exp->growable = args->growable;
+
+ /* set default */
+ if (!args->has_allow_other) {
+ args->allow_other = FUSE_EXPORT_ALLOW_OTHER_AUTO;
+ }
+
+ exp->st_mode = S_IFREG | S_IRUSR;
+ if (exp->writable) {
+ exp->st_mode |= S_IWUSR;
+ }
+ exp->st_uid = getuid();
+ exp->st_gid = getgid();
+
+ if (args->allow_other == FUSE_EXPORT_ALLOW_OTHER_AUTO) {
+ /* Ignore errors on our first attempt */
+ ret = setup_fuse_export(exp, args->mountpoint, true, NULL);
+ exp->allow_other = ret == 0;
+ if (ret < 0) {
+ ret = setup_fuse_export(exp, args->mountpoint, false, errp);
+ }
+ } else {
+ exp->allow_other = args->allow_other == FUSE_EXPORT_ALLOW_OTHER_ON;
+ ret = setup_fuse_export(exp, args->mountpoint, exp->allow_other, errp);
+ }
+ if (ret < 0) {
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ fuse_export_delete(blk_exp);
+ return ret;
+}
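+
+/*
+ * Example (hypothetical paths and ids): the mount point must be an
+ * existing regular file; the export then appears there as a raw image:
+ *
+ *   $ touch /tmp/disk.raw
+ *   -> { "execute": "block-export-add",
+ *        "arguments": { "type": "fuse", "id": "exp0",
+ *                       "node-name": "disk0",
+ *                       "mountpoint": "/tmp/disk.raw",
+ *                       "writable": true } }
+ */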
+
+/**
+ * Allocates the global @exports hash table.
+ */
+static void init_exports_table(void)
+{
+ if (exports) {
+ return;
+ }
+
+ exports = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL);
+}
+
+/**
+ * Create exp->fuse_session and mount it.
+ */
+static int setup_fuse_export(FuseExport *exp, const char *mountpoint,
+ bool allow_other, Error **errp)
+{
+ const char *fuse_argv[4];
+ char *mount_opts;
+ struct fuse_args fuse_args;
+ int ret;
+
+ /*
+ * max_read needs to match what fuse_init() sets.
+ * max_write need not be supplied.
+ */
+ mount_opts = g_strdup_printf("max_read=%zu,default_permissions%s",
+ FUSE_MAX_BOUNCE_BYTES,
+ allow_other ? ",allow_other" : "");
+
+ fuse_argv[0] = ""; /* Dummy program name */
+ fuse_argv[1] = "-o";
+ fuse_argv[2] = mount_opts;
+ fuse_argv[3] = NULL;
+ fuse_args = (struct fuse_args)FUSE_ARGS_INIT(3, (char **)fuse_argv);
+
+ exp->fuse_session = fuse_session_new(&fuse_args, &fuse_ops,
+ sizeof(fuse_ops), exp);
+ g_free(mount_opts);
+ if (!exp->fuse_session) {
+ error_setg(errp, "Failed to set up FUSE session");
+ ret = -EIO;
+ goto fail;
+ }
+
+ ret = fuse_session_mount(exp->fuse_session, mountpoint);
+ if (ret < 0) {
+ error_setg(errp, "Failed to mount FUSE session to export");
+ ret = -EIO;
+ goto fail;
+ }
+ exp->mounted = true;
+
+ g_hash_table_insert(exports, g_strdup(mountpoint), NULL);
+
+ aio_set_fd_handler(exp->common.ctx,
+ fuse_session_fd(exp->fuse_session), true,
+ read_from_fuse_export, NULL, NULL, exp);
+ exp->fd_handler_set_up = true;
+
+ return 0;
+
+fail:
+ fuse_export_shutdown(&exp->common);
+ return ret;
+}
+
+/**
+ * Callback to be invoked when the FUSE session FD can be read from.
+ * (This is basically the FUSE event loop.)
+ */
+static void read_from_fuse_export(void *opaque)
+{
+ FuseExport *exp = opaque;
+ int ret;
+
+ blk_exp_ref(&exp->common);
+
+ do {
+ ret = fuse_session_receive_buf(exp->fuse_session, &exp->fuse_buf);
+ } while (ret == -EINTR);
+ if (ret < 0) {
+ goto out;
+ }
+
+ fuse_session_process_buf(exp->fuse_session, &exp->fuse_buf);
+
+out:
+ blk_exp_unref(&exp->common);
+}
+
+static void fuse_export_shutdown(BlockExport *blk_exp)
+{
+ FuseExport *exp = container_of(blk_exp, FuseExport, common);
+
+ if (exp->fuse_session) {
+ fuse_session_exit(exp->fuse_session);
+
+ if (exp->fd_handler_set_up) {
+ aio_set_fd_handler(exp->common.ctx,
+ fuse_session_fd(exp->fuse_session), true,
+ NULL, NULL, NULL, NULL);
+ exp->fd_handler_set_up = false;
+ }
+ }
+
+ if (exp->mountpoint) {
+ /*
+ * Safe to drop now, because we will not handle any requests
+ * for this export anymore anyway.
+ */
+ g_hash_table_remove(exports, exp->mountpoint);
+ }
+}
+
+static void fuse_export_delete(BlockExport *blk_exp)
+{
+ FuseExport *exp = container_of(blk_exp, FuseExport, common);
+
+ if (exp->fuse_session) {
+ if (exp->mounted) {
+ fuse_session_unmount(exp->fuse_session);
+ }
+
+ fuse_session_destroy(exp->fuse_session);
+ }
+
+ free(exp->fuse_buf.mem);
+ g_free(exp->mountpoint);
+}
+
+/**
+ * Check whether @path points to a regular file. If not, put an
+ * appropriate message into *errp.
+ */
+static bool is_regular_file(const char *path, Error **errp)
+{
+ struct stat statbuf;
+ int ret;
+
+ ret = stat(path, &statbuf);
+ if (ret < 0) {
+ error_setg_errno(errp, errno, "Failed to stat '%s'", path);
+ return false;
+ }
+
+ if (!S_ISREG(statbuf.st_mode)) {
+ error_setg(errp, "'%s' is not a regular file", path);
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * A chance to change some parameters supplied to FUSE_INIT.
+ */
+static void fuse_init(void *userdata, struct fuse_conn_info *conn)
+{
+ /*
+ * MIN_NON_ZERO() would not be wrong here, but what we set here
+ * must equal what has been passed to fuse_session_new().
+ * Therefore, as long as max_read must be passed as a mount option
+ * (which libfuse claims will be changed at some point), we have
+ * to set max_read to a fixed value here.
+ */
+ conn->max_read = FUSE_MAX_BOUNCE_BYTES;
+
+ conn->max_write = MIN_NON_ZERO(BDRV_REQUEST_MAX_BYTES, conn->max_write);
+}
+
+/**
+ * Let clients look up files. Always return ENOENT because we only
+ * care about the mountpoint itself.
+ */
+static void fuse_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
+{
+ fuse_reply_err(req, ENOENT);
+}
+
+/**
+ * Let clients get file attributes (i.e., stat() the file).
+ */
+static void fuse_getattr(fuse_req_t req, fuse_ino_t inode,
+ struct fuse_file_info *fi)
+{
+ struct stat statbuf;
+ int64_t length, allocated_blocks;
+ time_t now = time(NULL);
+ FuseExport *exp = fuse_req_userdata(req);
+
+ length = blk_getlength(exp->common.blk);
+ if (length < 0) {
+ fuse_reply_err(req, -length);
+ return;
+ }
+
+ allocated_blocks = bdrv_get_allocated_file_size(blk_bs(exp->common.blk));
+ if (allocated_blocks <= 0) {
+ allocated_blocks = DIV_ROUND_UP(length, 512);
+ } else {
+ allocated_blocks = DIV_ROUND_UP(allocated_blocks, 512);
+ }
+
+ statbuf = (struct stat) {
+ .st_ino = inode,
+ .st_mode = exp->st_mode,
+ .st_nlink = 1,
+ .st_uid = exp->st_uid,
+ .st_gid = exp->st_gid,
+ .st_size = length,
+ .st_blksize = blk_bs(exp->common.blk)->bl.request_alignment,
+ .st_blocks = allocated_blocks,
+ .st_atime = now,
+ .st_mtime = now,
+ .st_ctime = now,
+ };
+
+ fuse_reply_attr(req, &statbuf, 1.);
+}
+
+static int fuse_do_truncate(const FuseExport *exp, int64_t size,
+ bool req_zero_write, PreallocMode prealloc)
+{
+ uint64_t blk_perm, blk_shared_perm;
+ BdrvRequestFlags truncate_flags = 0;
+ int ret;
+
+ if (req_zero_write) {
+ truncate_flags |= BDRV_REQ_ZERO_WRITE;
+ }
+
+ /* Growable exports have a permanent RESIZE permission */
+ if (!exp->growable) {
+ blk_get_perm(exp->common.blk, &blk_perm, &blk_shared_perm);
+
+ ret = blk_set_perm(exp->common.blk, blk_perm | BLK_PERM_RESIZE,
+ blk_shared_perm, NULL);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ ret = blk_truncate(exp->common.blk, size, true, prealloc,
+ truncate_flags, NULL);
+
+ if (!exp->growable) {
+ /* Must succeed, because we are only giving up the RESIZE permission */
+ blk_set_perm(exp->common.blk, blk_perm, blk_shared_perm, &error_abort);
+ }
+
+ return ret;
+}
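+
+/*
+ * For instance (hypothetical path), a client-side
+ * "truncate -s 2G /tmp/disk.raw" arrives via fuse_setattr() with
+ * FUSE_SET_ATTR_SIZE and ends up here (writable exports only).
+ */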
+
+/**
+ * Let clients set file attributes. Only resizing and changing
+ * permissions (st_mode, st_uid, st_gid) are allowed.
+ * Changing permissions is only allowed as far as it will actually
+ * permit access: read-only exports cannot be given +w, and exports
+ * without allow_other can neither be given a different UID or GID
+ * nor non-owner access.
+ */
+static void fuse_setattr(fuse_req_t req, fuse_ino_t inode, struct stat *statbuf,
+ int to_set, struct fuse_file_info *fi)
+{
+ FuseExport *exp = fuse_req_userdata(req);
+ int supported_attrs;
+ int ret;
+
+ supported_attrs = FUSE_SET_ATTR_SIZE | FUSE_SET_ATTR_MODE;
+ if (exp->allow_other) {
+ supported_attrs |= FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID;
+ }
+
+ if (to_set & ~supported_attrs) {
+ fuse_reply_err(req, ENOTSUP);
+ return;
+ }
+
+ /* Do some argument checks first before committing to anything */
+ if (to_set & FUSE_SET_ATTR_MODE) {
+ /*
+ * Without allow_other, non-owners can never access the export, so do
+ * not allow setting permissions for them
+ */
+ if (!exp->allow_other &&
+ (statbuf->st_mode & (S_IRWXG | S_IRWXO)) != 0)
+ {
+ fuse_reply_err(req, EPERM);
+ return;
+ }
+
+ /* +w for read-only exports makes no sense, disallow it */
+ if (!exp->writable &&
+ (statbuf->st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0)
+ {
+ fuse_reply_err(req, EROFS);
+ return;
+ }
+ }
+
+ if (to_set & FUSE_SET_ATTR_SIZE) {
+ if (!exp->writable) {
+ fuse_reply_err(req, EACCES);
+ return;
+ }
+
+ ret = fuse_do_truncate(exp, statbuf->st_size, true, PREALLOC_MODE_OFF);
+ if (ret < 0) {
+ fuse_reply_err(req, -ret);
+ return;
+ }
+ }
+
+ if (to_set & FUSE_SET_ATTR_MODE) {
+ /* Ignore FUSE-supplied file type, only change the mode */
+ exp->st_mode = (statbuf->st_mode & 07777) | S_IFREG;
+ }
+
+ if (to_set & FUSE_SET_ATTR_UID) {
+ exp->st_uid = statbuf->st_uid;
+ }
+
+ if (to_set & FUSE_SET_ATTR_GID) {
+ exp->st_gid = statbuf->st_gid;
+ }
+
+ fuse_getattr(req, inode, fi);
+}
+
+/**
+ * Let clients open a file (i.e., the exported image).
+ */
+static void fuse_open(fuse_req_t req, fuse_ino_t inode,
+ struct fuse_file_info *fi)
+{
+ fuse_reply_open(req, fi);
+}
+
+/**
+ * Handle client reads from the exported image.
+ */
+static void fuse_read(fuse_req_t req, fuse_ino_t inode,
+ size_t size, off_t offset, struct fuse_file_info *fi)
+{
+ FuseExport *exp = fuse_req_userdata(req);
+ int64_t length;
+ void *buf;
+ int ret;
+
+ /* Limited by max_read, should not happen */
+ if (size > FUSE_MAX_BOUNCE_BYTES) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ /**
+ * Clients will expect short reads at EOF, so we have to limit
+ * offset+size to the image length.
+ */
+ length = blk_getlength(exp->common.blk);
+ if (length < 0) {
+ fuse_reply_err(req, -length);
+ return;
+ }
+
+ if (offset + size > length) {
+ size = length - offset;
+ }
+
+ buf = qemu_try_blockalign(blk_bs(exp->common.blk), size);
+ if (!buf) {
+ fuse_reply_err(req, ENOMEM);
+ return;
+ }
+
+ ret = blk_pread(exp->common.blk, offset, buf, size);
+ if (ret >= 0) {
+ fuse_reply_buf(req, buf, size);
+ } else {
+ fuse_reply_err(req, -ret);
+ }
+
+ qemu_vfree(buf);
+}
+
+/**
+ * Handle client writes to the exported image.
+ */
+static void fuse_write(fuse_req_t req, fuse_ino_t inode, const char *buf,
+ size_t size, off_t offset, struct fuse_file_info *fi)
+{
+ FuseExport *exp = fuse_req_userdata(req);
+ int64_t length;
+ int ret;
+
+ /* Limited by max_write, should not happen */
+ if (size > BDRV_REQUEST_MAX_BYTES) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ if (!exp->writable) {
+ fuse_reply_err(req, EACCES);
+ return;
+ }
+
+ /**
+ * Clients will expect short writes at EOF, so we have to limit
+ * offset+size to the image length.
+ */
+ length = blk_getlength(exp->common.blk);
+ if (length < 0) {
+ fuse_reply_err(req, -length);
+ return;
+ }
+
+ if (offset + size > length) {
+ if (exp->growable) {
+ ret = fuse_do_truncate(exp, offset + size, true, PREALLOC_MODE_OFF);
+ if (ret < 0) {
+ fuse_reply_err(req, -ret);
+ return;
+ }
+ } else {
+ size = length - offset;
+ }
+ }
+
+ ret = blk_pwrite(exp->common.blk, offset, buf, size, 0);
+ if (ret >= 0) {
+ fuse_reply_write(req, size);
+ } else {
+ fuse_reply_err(req, -ret);
+ }
+}
+
+/**
+ * Let clients perform various fallocate() operations.
+ */
+static void fuse_fallocate(fuse_req_t req, fuse_ino_t inode, int mode,
+ off_t offset, off_t length,
+ struct fuse_file_info *fi)
+{
+ FuseExport *exp = fuse_req_userdata(req);
+ int64_t blk_len;
+ int ret;
+
+ if (!exp->writable) {
+ fuse_reply_err(req, EACCES);
+ return;
+ }
+
+ blk_len = blk_getlength(exp->common.blk);
+ if (blk_len < 0) {
+ fuse_reply_err(req, -blk_len);
+ return;
+ }
+
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ length = MIN(length, blk_len - offset);
+ }
+
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ do {
+ int size = MIN(length, BDRV_REQUEST_MAX_BYTES);
+
+ ret = blk_pdiscard(exp->common.blk, offset, size);
+ offset += size;
+ length -= size;
+ } while (ret == 0 && length > 0);
+ }
+#ifdef CONFIG_FALLOCATE_ZERO_RANGE
+ else if (mode & FALLOC_FL_ZERO_RANGE) {
+ if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + length > blk_len) {
+ /* No need for zeroes, we are going to write them ourselves */
+ ret = fuse_do_truncate(exp, offset + length, false,
+ PREALLOC_MODE_OFF);
+ if (ret < 0) {
+ fuse_reply_err(req, -ret);
+ return;
+ }
+ }
+
+ do {
+ int size = MIN(length, BDRV_REQUEST_MAX_BYTES);
+
+ ret = blk_pwrite_zeroes(exp->common.blk,
+ offset, size, 0);
+ offset += size;
+ length -= size;
+ } while (ret == 0 && length > 0);
+ }
+#endif /* CONFIG_FALLOCATE_ZERO_RANGE */
+ else if (!mode) {
+ /* We can only fallocate at the EOF with a truncate */
+ if (offset < blk_len) {
+ fuse_reply_err(req, EOPNOTSUPP);
+ return;
+ }
+
+ if (offset > blk_len) {
+ /* No preallocation needed here */
+ ret = fuse_do_truncate(exp, offset, true, PREALLOC_MODE_OFF);
+ if (ret < 0) {
+ fuse_reply_err(req, -ret);
+ return;
+ }
+ }
+
+ ret = fuse_do_truncate(exp, offset + length, true,
+ PREALLOC_MODE_FALLOC);
+ } else {
+ ret = -EOPNOTSUPP;
+ }
+
+ fuse_reply_err(req, ret < 0 ? -ret : 0);
+}
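+
+/*
+ * Example (hypothetical path): punching a hole from a client maps to
+ * the blk_pdiscard() loop above:
+ *
+ *   $ fallocate --punch-hole --offset 0 --length 1M /tmp/disk.raw
+ *
+ * (fallocate(1) implies FALLOC_FL_KEEP_SIZE when punching holes.)
+ */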
+
+/**
+ * Let clients fsync the exported image.
+ */
+static void fuse_fsync(fuse_req_t req, fuse_ino_t inode, int datasync,
+ struct fuse_file_info *fi)
+{
+ FuseExport *exp = fuse_req_userdata(req);
+ int ret;
+
+ ret = blk_flush(exp->common.blk);
+ fuse_reply_err(req, ret < 0 ? -ret : 0);
+}
+
+/**
+ * Called before an FD to the exported image is closed. (libfuse
+ * notes this to be a way to return last-minute errors.)
+ */
+static void fuse_flush(fuse_req_t req, fuse_ino_t inode,
+ struct fuse_file_info *fi)
+{
+ fuse_fsync(req, inode, 1, fi);
+}
+
+#ifdef CONFIG_FUSE_LSEEK
+/**
+ * Let clients inquire allocation status.
+ */
+static void fuse_lseek(fuse_req_t req, fuse_ino_t inode, off_t offset,
+ int whence, struct fuse_file_info *fi)
+{
+ FuseExport *exp = fuse_req_userdata(req);
+
+ if (whence != SEEK_HOLE && whence != SEEK_DATA) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ while (true) {
+ int64_t pnum;
+ int ret;
+
+ ret = bdrv_block_status_above(blk_bs(exp->common.blk), NULL,
+ offset, INT64_MAX, &pnum, NULL, NULL);
+ if (ret < 0) {
+ fuse_reply_err(req, -ret);
+ return;
+ }
+
+ if (!pnum && (ret & BDRV_BLOCK_EOF)) {
+ int64_t blk_len;
+
+ /*
+ * If blk_getlength() rounds (e.g. by sectors), then the
+ * export length will be rounded, too. However,
+ * bdrv_block_status_above() may return EOF at unaligned
+ * offsets. We must not let this become visible and thus
+ * always simulate a hole between @offset (the real EOF)
+ * and @blk_len (the client-visible EOF).
+ */
+
+ blk_len = blk_getlength(exp->common.blk);
+ if (blk_len < 0) {
+ fuse_reply_err(req, -blk_len);
+ return;
+ }
+
+ if (offset > blk_len || whence == SEEK_DATA) {
+ fuse_reply_err(req, ENXIO);
+ } else {
+ fuse_reply_lseek(req, offset);
+ }
+ return;
+ }
+
+ if (ret & BDRV_BLOCK_DATA) {
+ if (whence == SEEK_DATA) {
+ fuse_reply_lseek(req, offset);
+ return;
+ }
+ } else {
+ if (whence == SEEK_HOLE) {
+ fuse_reply_lseek(req, offset);
+ return;
+ }
+ }
+
+ /* Safety check against infinite loops */
+ if (!pnum) {
+ fuse_reply_err(req, ENXIO);
+ return;
+ }
+
+ offset += pnum;
+ }
+}
+#endif
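+
+/*
+ * Example (hypothetical path): clients can then map allocation with
+ * SEEK_DATA/SEEK_HOLE, e.g. "xfs_io -c 'seek -a 0' /tmp/disk.raw",
+ * each step translating to a bdrv_block_status_above() query above.
+ */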
+
+static const struct fuse_lowlevel_ops fuse_ops = {
+ .init = fuse_init,
+ .lookup = fuse_lookup,
+ .getattr = fuse_getattr,
+ .setattr = fuse_setattr,
+ .open = fuse_open,
+ .read = fuse_read,
+ .write = fuse_write,
+ .fallocate = fuse_fallocate,
+ .flush = fuse_flush,
+ .fsync = fuse_fsync,
+#ifdef CONFIG_FUSE_LSEEK
+ .lseek = fuse_lseek,
+#endif
+};
+
+const BlockExportDriver blk_exp_fuse = {
+ .type = BLOCK_EXPORT_TYPE_FUSE,
+ .instance_size = sizeof(FuseExport),
+ .create = fuse_export_create,
+ .delete = fuse_export_delete,
+ .request_shutdown = fuse_export_shutdown,
+};
diff --git a/block/export/meson.build b/block/export/meson.build
new file mode 100644
index 000000000..0a08e384c
--- /dev/null
+++ b/block/export/meson.build
@@ -0,0 +1,7 @@
+blockdev_ss.add(files('export.c'))
+
+if have_vhost_user_blk_server
+ blockdev_ss.add(files('vhost-user-blk-server.c'))
+endif
+
+blockdev_ss.add(when: fuse, if_true: files('fuse.c'))
diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c
new file mode 100644
index 000000000..186256333
--- /dev/null
+++ b/block/export/vhost-user-blk-server.c
@@ -0,0 +1,533 @@
+/*
+ * Sharing QEMU block devices via the vhost-user protocol
+ *
+ * Parts of the code are based on nbd/server.c.
+ *
+ * Copyright (c) Coiby Xu <coiby.xu@gmail.com>.
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "subprojects/libvhost-user/libvhost-user.h" /* only for the type definitions */
+#include "standard-headers/linux/virtio_blk.h"
+#include "qemu/vhost-user-server.h"
+#include "vhost-user-blk-server.h"
+#include "qapi/error.h"
+#include "qom/object_interfaces.h"
+#include "sysemu/block-backend.h"
+#include "util/block-helpers.h"
+
+/*
+ * Sector units are 512 bytes regardless of the
+ * virtio_blk_config->blk_size value.
+ */
+#define VIRTIO_BLK_SECTOR_BITS 9
+#define VIRTIO_BLK_SECTOR_SIZE (1ull << VIRTIO_BLK_SECTOR_BITS)
+
+enum {
+ VHOST_USER_BLK_NUM_QUEUES_DEFAULT = 1,
+ VHOST_USER_BLK_MAX_DISCARD_SECTORS = 32768,
+ VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS = 32768,
+};
+struct virtio_blk_inhdr {
+ unsigned char status;
+};
+
+typedef struct VuBlkReq {
+ VuVirtqElement elem;
+ int64_t sector_num;
+ size_t size;
+ struct virtio_blk_inhdr *in;
+ struct virtio_blk_outhdr out;
+ VuServer *server;
+ struct VuVirtq *vq;
+} VuBlkReq;
+
+/* vhost user block device */
+typedef struct {
+ BlockExport export;
+ VuServer vu_server;
+ uint32_t blk_size;
+ QIOChannelSocket *sioc;
+ struct virtio_blk_config blkcfg;
+ bool writable;
+} VuBlkExport;
+
+static void vu_blk_req_complete(VuBlkReq *req)
+{
+ VuDev *vu_dev = &req->server->vu_dev;
+
+ /* IO size with 1 extra status byte */
+ vu_queue_push(vu_dev, req->vq, &req->elem, req->size + 1);
+ vu_queue_notify(vu_dev, req->vq);
+
+ free(req);
+}
+
+static bool vu_blk_sect_range_ok(VuBlkExport *vexp, uint64_t sector,
+ size_t size)
+{
+ uint64_t nb_sectors;
+ uint64_t total_sectors;
+
+ if (size % VIRTIO_BLK_SECTOR_SIZE) {
+ return false;
+ }
+
+ nb_sectors = size >> VIRTIO_BLK_SECTOR_BITS;
+
+ QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != VIRTIO_BLK_SECTOR_SIZE);
+ if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
+ return false;
+ }
+ if ((sector << VIRTIO_BLK_SECTOR_BITS) % vexp->blk_size) {
+ return false;
+ }
+ blk_get_geometry(vexp->export.blk, &total_sectors);
+ if (sector > total_sectors || nb_sectors > total_sectors - sector) {
+ return false;
+ }
+ return true;
+}
+
+static int coroutine_fn
+vu_blk_discard_write_zeroes(VuBlkExport *vexp, struct iovec *iov,
+ uint32_t iovcnt, uint32_t type)
+{
+ BlockBackend *blk = vexp->export.blk;
+ struct virtio_blk_discard_write_zeroes desc;
+ ssize_t size;
+ uint64_t sector;
+ uint32_t num_sectors;
+ uint32_t max_sectors;
+ uint32_t flags;
+ int bytes;
+
+ /* Only one desc is currently supported */
+ if (unlikely(iov_size(iov, iovcnt) > sizeof(desc))) {
+ return VIRTIO_BLK_S_UNSUPP;
+ }
+
+ size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc));
+ if (unlikely(size != sizeof(desc))) {
+ error_report("Invalid size %zd, expected %zu", size, sizeof(desc));
+ return VIRTIO_BLK_S_IOERR;
+ }
+
+ sector = le64_to_cpu(desc.sector);
+ num_sectors = le32_to_cpu(desc.num_sectors);
+ flags = le32_to_cpu(desc.flags);
+ max_sectors = (type == VIRTIO_BLK_T_WRITE_ZEROES) ?
+ VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS :
+ VHOST_USER_BLK_MAX_DISCARD_SECTORS;
+
+ /* This check ensures that 'bytes' fits in an int */
+ if (unlikely(num_sectors > max_sectors)) {
+ return VIRTIO_BLK_S_IOERR;
+ }
+
+ bytes = num_sectors << VIRTIO_BLK_SECTOR_BITS;
+
+ if (unlikely(!vu_blk_sect_range_ok(vexp, sector, bytes))) {
+ return VIRTIO_BLK_S_IOERR;
+ }
+
+ /*
+ * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for discard
+ * and write zeroes commands if any unknown flag is set.
+ */
+ if (unlikely(flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
+ return VIRTIO_BLK_S_UNSUPP;
+ }
+
+ if (type == VIRTIO_BLK_T_WRITE_ZEROES) {
+ int blk_flags = 0;
+
+ if (flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) {
+ blk_flags |= BDRV_REQ_MAY_UNMAP;
+ }
+
+ if (blk_co_pwrite_zeroes(blk, sector << VIRTIO_BLK_SECTOR_BITS,
+ bytes, blk_flags) == 0) {
+ return VIRTIO_BLK_S_OK;
+ }
+ } else if (type == VIRTIO_BLK_T_DISCARD) {
+ /*
+ * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for
+ * discard commands if the unmap flag is set.
+ */
+ if (unlikely(flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) {
+ return VIRTIO_BLK_S_UNSUPP;
+ }
+
+ if (blk_co_pdiscard(blk, sector << VIRTIO_BLK_SECTOR_BITS,
+ bytes) == 0) {
+ return VIRTIO_BLK_S_OK;
+ }
+ }
+
+ return VIRTIO_BLK_S_IOERR;
+}
+
+static void coroutine_fn vu_blk_virtio_process_req(void *opaque)
+{
+ VuBlkReq *req = opaque;
+ VuServer *server = req->server;
+ VuVirtqElement *elem = &req->elem;
+ uint32_t type;
+
+ VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
+ BlockBackend *blk = vexp->export.blk;
+
+ struct iovec *in_iov = elem->in_sg;
+ struct iovec *out_iov = elem->out_sg;
+ unsigned in_num = elem->in_num;
+ unsigned out_num = elem->out_num;
+
+ /* refer to hw/block/virtio-blk.c */
+ if (elem->out_num < 1 || elem->in_num < 1) {
+ error_report("virtio-blk request missing headers");
+ goto err;
+ }
+
+ if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out,
+ sizeof(req->out)) != sizeof(req->out))) {
+ error_report("virtio-blk request outhdr too short");
+ goto err;
+ }
+
+ iov_discard_front(&out_iov, &out_num, sizeof(req->out));
+
+ if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) {
+ error_report("virtio-blk request inhdr too short");
+ goto err;
+ }
+
+ /* We always touch the last byte, so just see how big in_iov is. */
+ req->in = (void *)in_iov[in_num - 1].iov_base
+ + in_iov[in_num - 1].iov_len
+ - sizeof(struct virtio_blk_inhdr);
+ iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr));
+
+ type = le32_to_cpu(req->out.type);
+ switch (type & ~VIRTIO_BLK_T_BARRIER) {
+ case VIRTIO_BLK_T_IN:
+ case VIRTIO_BLK_T_OUT: {
+ QEMUIOVector qiov;
+ int64_t offset;
+ ssize_t ret = 0;
+ bool is_write = type & VIRTIO_BLK_T_OUT;
+ req->sector_num = le64_to_cpu(req->out.sector);
+
+ if (is_write && !vexp->writable) {
+ req->in->status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+
+ if (is_write) {
+ qemu_iovec_init_external(&qiov, out_iov, out_num);
+ } else {
+ qemu_iovec_init_external(&qiov, in_iov, in_num);
+ }
+
+ if (unlikely(!vu_blk_sect_range_ok(vexp,
+ req->sector_num,
+ qiov.size))) {
+ req->in->status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+
+ offset = req->sector_num << VIRTIO_BLK_SECTOR_BITS;
+
+ if (is_write) {
+ ret = blk_co_pwritev(blk, offset, qiov.size, &qiov, 0);
+ } else {
+ ret = blk_co_preadv(blk, offset, qiov.size, &qiov, 0);
+ }
+ if (ret >= 0) {
+ req->in->status = VIRTIO_BLK_S_OK;
+ } else {
+ req->in->status = VIRTIO_BLK_S_IOERR;
+ }
+ break;
+ }
+ case VIRTIO_BLK_T_FLUSH:
+ if (blk_co_flush(blk) == 0) {
+ req->in->status = VIRTIO_BLK_S_OK;
+ } else {
+ req->in->status = VIRTIO_BLK_S_IOERR;
+ }
+ break;
+ case VIRTIO_BLK_T_GET_ID: {
+ size_t size = MIN(iov_size(&elem->in_sg[0], in_num),
+ VIRTIO_BLK_ID_BYTES);
+ snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk");
+ req->in->status = VIRTIO_BLK_S_OK;
+ req->size = elem->in_sg[0].iov_len;
+ break;
+ }
+ case VIRTIO_BLK_T_DISCARD:
+ case VIRTIO_BLK_T_WRITE_ZEROES: {
+ if (!vexp->writable) {
+ req->in->status = VIRTIO_BLK_S_IOERR;
+ break;
+ }
+
+ req->in->status = vu_blk_discard_write_zeroes(vexp, out_iov, out_num,
+ type);
+ break;
+ }
+ default:
+ req->in->status = VIRTIO_BLK_S_UNSUPP;
+ break;
+ }
+
+ vu_blk_req_complete(req);
+ return;
+
+err:
+ free(req);
+}
+
+static void vu_blk_process_vq(VuDev *vu_dev, int idx)
+{
+ VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+ VuVirtq *vq = vu_get_queue(vu_dev, idx);
+
+ while (1) {
+ VuBlkReq *req;
+
+ req = vu_queue_pop(vu_dev, vq, sizeof(VuBlkReq));
+ if (!req) {
+ break;
+ }
+
+ req->server = server;
+ req->vq = vq;
+
+ Coroutine *co =
+ qemu_coroutine_create(vu_blk_virtio_process_req, req);
+ qemu_coroutine_enter(co);
+ }
+}
+
+static void vu_blk_queue_set_started(VuDev *vu_dev, int idx, bool started)
+{
+ VuVirtq *vq;
+
+ assert(vu_dev);
+
+ vq = vu_get_queue(vu_dev, idx);
+ vu_set_queue_handler(vu_dev, vq, started ? vu_blk_process_vq : NULL);
+}
+
+static uint64_t vu_blk_get_features(VuDev *dev)
+{
+ uint64_t features;
+ VuServer *server = container_of(dev, VuServer, vu_dev);
+ VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
+ features = 1ull << VIRTIO_BLK_F_SIZE_MAX |
+ 1ull << VIRTIO_BLK_F_SEG_MAX |
+ 1ull << VIRTIO_BLK_F_TOPOLOGY |
+ 1ull << VIRTIO_BLK_F_BLK_SIZE |
+ 1ull << VIRTIO_BLK_F_FLUSH |
+ 1ull << VIRTIO_BLK_F_DISCARD |
+ 1ull << VIRTIO_BLK_F_WRITE_ZEROES |
+ 1ull << VIRTIO_BLK_F_CONFIG_WCE |
+ 1ull << VIRTIO_BLK_F_MQ |
+ 1ull << VIRTIO_F_VERSION_1 |
+ 1ull << VIRTIO_RING_F_INDIRECT_DESC |
+ 1ull << VIRTIO_RING_F_EVENT_IDX |
+ 1ull << VHOST_USER_F_PROTOCOL_FEATURES;
+
+ if (!vexp->writable) {
+ features |= 1ull << VIRTIO_BLK_F_RO;
+ }
+
+ return features;
+}
+
+static uint64_t vu_blk_get_protocol_features(VuDev *dev)
+{
+ return 1ull << VHOST_USER_PROTOCOL_F_CONFIG;
+}
+
+static int
+vu_blk_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len)
+{
+ VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+ VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
+
+ if (len > sizeof(struct virtio_blk_config)) {
+ return -1;
+ }
+
+ memcpy(config, &vexp->blkcfg, len);
+ return 0;
+}
+
+static int
+vu_blk_set_config(VuDev *vu_dev, const uint8_t *data,
+ uint32_t offset, uint32_t size, uint32_t flags)
+{
+ VuServer *server = container_of(vu_dev, VuServer, vu_dev);
+ VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server);
+ uint8_t wce;
+
+ /* don't support live migration */
+ if (flags != VHOST_SET_CONFIG_TYPE_MASTER) {
+ return -EINVAL;
+ }
+
+ if (offset != offsetof(struct virtio_blk_config, wce) ||
+ size != 1) {
+ return -EINVAL;
+ }
+
+ wce = *data;
+ vexp->blkcfg.wce = wce;
+ blk_set_enable_write_cache(vexp->export.blk, wce);
+ return 0;
+}
+
+/*
+ * When the client disconnects, it sends a VHOST_USER_NONE request,
+ * and vu_process_message() would simply call exit(), causing the VM
+ * to exit abruptly.
+ * To avoid this, handle the VHOST_USER_NONE request here, before it
+ * reaches vu_process_message().
+ */
+static int vu_blk_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply)
+{
+ if (vmsg->request == VHOST_USER_NONE) {
+ dev->panic(dev, "disconnect");
+ return true;
+ }
+ return false;
+}
+
+static const VuDevIface vu_blk_iface = {
+ .get_features = vu_blk_get_features,
+ .queue_set_started = vu_blk_queue_set_started,
+ .get_protocol_features = vu_blk_get_protocol_features,
+ .get_config = vu_blk_get_config,
+ .set_config = vu_blk_set_config,
+ .process_msg = vu_blk_process_msg,
+};
+
+static void blk_aio_attached(AioContext *ctx, void *opaque)
+{
+ VuBlkExport *vexp = opaque;
+
+ vexp->export.ctx = ctx;
+ vhost_user_server_attach_aio_context(&vexp->vu_server, ctx);
+}
+
+static void blk_aio_detach(void *opaque)
+{
+ VuBlkExport *vexp = opaque;
+
+ vhost_user_server_detach_aio_context(&vexp->vu_server);
+ vexp->export.ctx = NULL;
+}
+
+static void
+vu_blk_initialize_config(BlockDriverState *bs,
+ struct virtio_blk_config *config,
+ uint32_t blk_size,
+ uint16_t num_queues)
+{
+ config->capacity =
+ cpu_to_le64(bdrv_getlength(bs) >> VIRTIO_BLK_SECTOR_BITS);
+ config->blk_size = cpu_to_le32(blk_size);
+ config->size_max = cpu_to_le32(0);
+ config->seg_max = cpu_to_le32(128 - 2);
+ config->min_io_size = cpu_to_le16(1);
+ config->opt_io_size = cpu_to_le32(1);
+ config->num_queues = cpu_to_le16(num_queues);
+ config->max_discard_sectors =
+ cpu_to_le32(VHOST_USER_BLK_MAX_DISCARD_SECTORS);
+ config->max_discard_seg = cpu_to_le32(1);
+ config->discard_sector_alignment =
+ cpu_to_le32(blk_size >> VIRTIO_BLK_SECTOR_BITS);
+ config->max_write_zeroes_sectors
+ = cpu_to_le32(VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS);
+ config->max_write_zeroes_seg = cpu_to_le32(1);
+}
+
+static void vu_blk_exp_request_shutdown(BlockExport *exp)
+{
+ VuBlkExport *vexp = container_of(exp, VuBlkExport, export);
+
+ vhost_user_server_stop(&vexp->vu_server);
+}
+
+static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts,
+ Error **errp)
+{
+ VuBlkExport *vexp = container_of(exp, VuBlkExport, export);
+ BlockExportOptionsVhostUserBlk *vu_opts = &opts->u.vhost_user_blk;
+ Error *local_err = NULL;
+ uint64_t logical_block_size;
+ uint16_t num_queues = VHOST_USER_BLK_NUM_QUEUES_DEFAULT;
+
+ vexp->writable = opts->writable;
+ vexp->blkcfg.wce = 0;
+
+ if (vu_opts->has_logical_block_size) {
+ logical_block_size = vu_opts->logical_block_size;
+ } else {
+ logical_block_size = VIRTIO_BLK_SECTOR_SIZE;
+ }
+ check_block_size(exp->id, "logical-block-size", logical_block_size,
+ &local_err);
+ if (local_err) {
+ error_propagate(errp, local_err);
+ return -EINVAL;
+ }
+ vexp->blk_size = logical_block_size;
+ blk_set_guest_block_size(exp->blk, logical_block_size);
+
+ if (vu_opts->has_num_queues) {
+ num_queues = vu_opts->num_queues;
+ }
+ if (num_queues == 0) {
+ error_setg(errp, "num-queues must be greater than 0");
+ return -EINVAL;
+ }
+
+ vu_blk_initialize_config(blk_bs(exp->blk), &vexp->blkcfg,
+ logical_block_size, num_queues);
+
+ blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
+ vexp);
+
+ if (!vhost_user_server_start(&vexp->vu_server, vu_opts->addr, exp->ctx,
+ num_queues, &vu_blk_iface, errp)) {
+ blk_remove_aio_context_notifier(exp->blk, blk_aio_attached,
+ blk_aio_detach, vexp);
+ return -EADDRNOTAVAIL;
+ }
+
+ return 0;
+}
+
+static void vu_blk_exp_delete(BlockExport *exp)
+{
+ VuBlkExport *vexp = container_of(exp, VuBlkExport, export);
+
+ blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach,
+ vexp);
+}
+
+const BlockExportDriver blk_exp_vhost_user_blk = {
+ .type = BLOCK_EXPORT_TYPE_VHOST_USER_BLK,
+ .instance_size = sizeof(VuBlkExport),
+ .create = vu_blk_exp_create,
+ .delete = vu_blk_exp_delete,
+ .request_shutdown = vu_blk_exp_request_shutdown,
+};
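+
+/*
+ * Example (hypothetical socket path): exporting a node to an external
+ * vhost-user-blk frontend over a UNIX socket:
+ *
+ *   -> { "execute": "block-export-add",
+ *        "arguments": { "type": "vhost-user-blk", "id": "exp0",
+ *                       "node-name": "disk0",
+ *                       "addr": { "type": "unix",
+ *                                 "path": "/tmp/vu-blk.sock" } } }
+ */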
diff --git a/block/export/vhost-user-blk-server.h b/block/export/vhost-user-blk-server.h
new file mode 100644
index 000000000..fcf46fc8a
--- /dev/null
+++ b/block/export/vhost-user-blk-server.h
@@ -0,0 +1,19 @@
+/*
+ * Sharing QEMU block devices via the vhost-user protocol
+ *
+ * Copyright (c) Coiby Xu <coiby.xu@gmail.com>.
+ * Copyright (c) 2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#ifndef VHOST_USER_BLK_SERVER_H
+#define VHOST_USER_BLK_SERVER_H
+
+#include "block/export.h"
+
+/* For block/export/export.c */
+extern const BlockExportDriver blk_exp_vhost_user_blk;
+
+#endif /* VHOST_USER_BLK_SERVER_H */