Diffstat (limited to 'block/export')
-rw-r--r-- | block/export/export.c                | 364
-rw-r--r-- | block/export/fuse.c                  | 812
-rw-r--r-- | block/export/meson.build             |   7
-rw-r--r-- | block/export/vhost-user-blk-server.c | 533
-rw-r--r-- | block/export/vhost-user-blk-server.h |  19
5 files changed, 1735 insertions, 0 deletions
diff --git a/block/export/export.c b/block/export/export.c new file mode 100644 index 000000000..6d3b9964c --- /dev/null +++ b/block/export/export.c @@ -0,0 +1,364 @@ +/* + * Common block export infrastructure + * + * Copyright (c) 2012, 2020 Red Hat, Inc. + * + * Authors: + * Paolo Bonzini <pbonzini@redhat.com> + * Kevin Wolf <kwolf@redhat.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#include "qemu/osdep.h" + +#include "block/block.h" +#include "sysemu/block-backend.h" +#include "sysemu/iothread.h" +#include "block/export.h" +#include "block/fuse.h" +#include "block/nbd.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-block-export.h" +#include "qapi/qapi-events-block-export.h" +#include "qemu/id.h" +#ifdef CONFIG_VHOST_USER_BLK_SERVER +#include "vhost-user-blk-server.h" +#endif + +static const BlockExportDriver *blk_exp_drivers[] = { + &blk_exp_nbd, +#ifdef CONFIG_VHOST_USER_BLK_SERVER + &blk_exp_vhost_user_blk, +#endif +#ifdef CONFIG_FUSE + &blk_exp_fuse, +#endif +}; + +/* Only accessed from the main thread */ +static QLIST_HEAD(, BlockExport) block_exports = + QLIST_HEAD_INITIALIZER(block_exports); + +BlockExport *blk_exp_find(const char *id) +{ + BlockExport *exp; + + QLIST_FOREACH(exp, &block_exports, next) { + if (strcmp(id, exp->id) == 0) { + return exp; + } + } + + return NULL; +} + +static const BlockExportDriver *blk_exp_find_driver(BlockExportType type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(blk_exp_drivers); i++) { + if (blk_exp_drivers[i]->type == type) { + return blk_exp_drivers[i]; + } + } + return NULL; +} + +BlockExport *blk_exp_add(BlockExportOptions *export, Error **errp) +{ + bool fixed_iothread = export->has_fixed_iothread && export->fixed_iothread; + const BlockExportDriver *drv; + BlockExport *exp = NULL; + BlockDriverState *bs; + BlockBackend *blk = NULL; + AioContext *ctx; + uint64_t perm; + int ret; + + if (!id_wellformed(export->id)) { + error_setg(errp, "Invalid block export id"); + return NULL; + } + if (blk_exp_find(export->id)) { + error_setg(errp, "Block export id '%s' is already in use", export->id); + return NULL; + } + + drv = blk_exp_find_driver(export->type); + if (!drv) { + error_setg(errp, "No driver found for the requested export type"); + return NULL; + } + + bs = bdrv_lookup_bs(NULL, export->node_name, errp); + if (!bs) { + return NULL; + } + + if (!export->has_writable) { + export->writable = false; + } + if (bdrv_is_read_only(bs) && export->writable) { + error_setg(errp, "Cannot export read-only node as writable"); + return NULL; + } + + ctx = bdrv_get_aio_context(bs); + aio_context_acquire(ctx); + + if (export->has_iothread) { + IOThread *iothread; + AioContext *new_ctx; + Error **set_context_errp; + + iothread = iothread_by_id(export->iothread); + if (!iothread) { + error_setg(errp, "iothread \"%s\" not found", export->iothread); + goto fail; + } + + new_ctx = iothread_get_aio_context(iothread); + + /* Ignore errors with fixed-iothread=false */ + set_context_errp = fixed_iothread ? errp : NULL; + ret = bdrv_try_set_aio_context(bs, new_ctx, set_context_errp); + if (ret == 0) { + aio_context_release(ctx); + aio_context_acquire(new_ctx); + ctx = new_ctx; + } else if (fixed_iothread) { + goto fail; + } + } + + /* + * Block exports are used for non-shared storage migration. Make sure + * that BDRV_O_INACTIVE is cleared and the image is ready for write + * access since the export could be available before migration handover. 
+ * ctx was acquired in the caller. + */ + bdrv_invalidate_cache(bs, NULL); + + perm = BLK_PERM_CONSISTENT_READ; + if (export->writable) { + perm |= BLK_PERM_WRITE; + } + + blk = blk_new(ctx, perm, BLK_PERM_ALL); + + if (!fixed_iothread) { + blk_set_allow_aio_context_change(blk, true); + } + + ret = blk_insert_bs(blk, bs, errp); + if (ret < 0) { + goto fail; + } + + if (!export->has_writethrough) { + export->writethrough = false; + } + blk_set_enable_write_cache(blk, !export->writethrough); + + assert(drv->instance_size >= sizeof(BlockExport)); + exp = g_malloc0(drv->instance_size); + *exp = (BlockExport) { + .drv = drv, + .refcount = 1, + .user_owned = true, + .id = g_strdup(export->id), + .ctx = ctx, + .blk = blk, + }; + + ret = drv->create(exp, export, errp); + if (ret < 0) { + goto fail; + } + + assert(exp->blk != NULL); + + QLIST_INSERT_HEAD(&block_exports, exp, next); + + aio_context_release(ctx); + return exp; + +fail: + blk_unref(blk); + aio_context_release(ctx); + if (exp) { + g_free(exp->id); + g_free(exp); + } + return NULL; +} + +/* Callers must hold exp->ctx lock */ +void blk_exp_ref(BlockExport *exp) +{ + assert(exp->refcount > 0); + exp->refcount++; +} + +/* Runs in the main thread */ +static void blk_exp_delete_bh(void *opaque) +{ + BlockExport *exp = opaque; + AioContext *aio_context = exp->ctx; + + aio_context_acquire(aio_context); + + assert(exp->refcount == 0); + QLIST_REMOVE(exp, next); + exp->drv->delete(exp); + blk_unref(exp->blk); + qapi_event_send_block_export_deleted(exp->id); + g_free(exp->id); + g_free(exp); + + aio_context_release(aio_context); +} + +/* Callers must hold exp->ctx lock */ +void blk_exp_unref(BlockExport *exp) +{ + assert(exp->refcount > 0); + if (--exp->refcount == 0) { + /* Touch the block_exports list only in the main thread */ + aio_bh_schedule_oneshot(qemu_get_aio_context(), blk_exp_delete_bh, + exp); + } +} + +/* + * Drops the user reference to the export and requests that all client + * connections and other internally held references start to shut down. When + * the function returns, there may still be active references while the export + * is in the process of shutting down. + * + * Acquires exp->ctx internally. Callers must *not* hold the lock. + */ +void blk_exp_request_shutdown(BlockExport *exp) +{ + AioContext *aio_context = exp->ctx; + + aio_context_acquire(aio_context); + + /* + * If the user doesn't own the export any more, it is already shutting + * down. We must not call .request_shutdown and decrease the refcount a + * second time. + */ + if (!exp->user_owned) { + goto out; + } + + exp->drv->request_shutdown(exp); + + assert(exp->user_owned); + exp->user_owned = false; + blk_exp_unref(exp); + +out: + aio_context_release(aio_context); +} + +/* + * Returns whether a block export of the given type exists. + * type == BLOCK_EXPORT_TYPE__MAX checks for an export of any type. 
+ */ +static bool blk_exp_has_type(BlockExportType type) +{ + BlockExport *exp; + + if (type == BLOCK_EXPORT_TYPE__MAX) { + return !QLIST_EMPTY(&block_exports); + } + + QLIST_FOREACH(exp, &block_exports, next) { + if (exp->drv->type == type) { + return true; + } + } + + return false; +} + +/* type == BLOCK_EXPORT_TYPE__MAX for all types */ +void blk_exp_close_all_type(BlockExportType type) +{ + BlockExport *exp, *next; + + assert(in_aio_context_home_thread(qemu_get_aio_context())); + + QLIST_FOREACH_SAFE(exp, &block_exports, next, next) { + if (type != BLOCK_EXPORT_TYPE__MAX && exp->drv->type != type) { + continue; + } + blk_exp_request_shutdown(exp); + } + + AIO_WAIT_WHILE(NULL, blk_exp_has_type(type)); +} + +void blk_exp_close_all(void) +{ + blk_exp_close_all_type(BLOCK_EXPORT_TYPE__MAX); +} + +void qmp_block_export_add(BlockExportOptions *export, Error **errp) +{ + blk_exp_add(export, errp); +} + +void qmp_block_export_del(const char *id, + bool has_mode, BlockExportRemoveMode mode, + Error **errp) +{ + ERRP_GUARD(); + BlockExport *exp; + + exp = blk_exp_find(id); + if (exp == NULL) { + error_setg(errp, "Export '%s' is not found", id); + return; + } + if (!exp->user_owned) { + error_setg(errp, "Export '%s' is already shutting down", id); + return; + } + + if (!has_mode) { + mode = BLOCK_EXPORT_REMOVE_MODE_SAFE; + } + if (mode == BLOCK_EXPORT_REMOVE_MODE_SAFE && exp->refcount > 1) { + error_setg(errp, "export '%s' still in use", exp->id); + error_append_hint(errp, "Use mode='hard' to force client " + "disconnect\n"); + return; + } + + blk_exp_request_shutdown(exp); +} + +BlockExportInfoList *qmp_query_block_exports(Error **errp) +{ + BlockExportInfoList *head = NULL, **tail = &head; + BlockExport *exp; + + QLIST_FOREACH(exp, &block_exports, next) { + BlockExportInfo *info = g_new(BlockExportInfo, 1); + *info = (BlockExportInfo) { + .id = g_strdup(exp->id), + .type = exp->drv->type, + .node_name = g_strdup(bdrv_get_node_name(blk_bs(exp->blk))), + .shutting_down = !exp->user_owned, + }; + + QAPI_LIST_APPEND(tail, info); + } + + return head; +} diff --git a/block/export/fuse.c b/block/export/fuse.c new file mode 100644 index 000000000..823c126d2 --- /dev/null +++ b/block/export/fuse.c @@ -0,0 +1,812 @@ +/* + * Present a block device as a raw image through FUSE + * + * Copyright (c) 2020 Max Reitz <mreitz@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 or later of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. 
+ */ + +#define FUSE_USE_VERSION 31 + +#include "qemu/osdep.h" +#include "block/aio.h" +#include "block/block.h" +#include "block/export.h" +#include "block/fuse.h" +#include "block/qapi.h" +#include "qapi/error.h" +#include "qapi/qapi-commands-block.h" +#include "sysemu/block-backend.h" + +#include <fuse.h> +#include <fuse_lowlevel.h> + +#if defined(CONFIG_FALLOCATE_ZERO_RANGE) +#include <linux/falloc.h> +#endif + +#ifdef __linux__ +#include <linux/fs.h> +#endif + +/* Prevent overly long bounce buffer allocations */ +#define FUSE_MAX_BOUNCE_BYTES (MIN(BDRV_REQUEST_MAX_BYTES, 64 * 1024 * 1024)) + + +typedef struct FuseExport { + BlockExport common; + + struct fuse_session *fuse_session; + struct fuse_buf fuse_buf; + bool mounted, fd_handler_set_up; + + char *mountpoint; + bool writable; + bool growable; + /* Whether allow_other was used as a mount option or not */ + bool allow_other; + + mode_t st_mode; + uid_t st_uid; + gid_t st_gid; +} FuseExport; + +static GHashTable *exports; +static const struct fuse_lowlevel_ops fuse_ops; + +static void fuse_export_shutdown(BlockExport *exp); +static void fuse_export_delete(BlockExport *exp); + +static void init_exports_table(void); + +static int setup_fuse_export(FuseExport *exp, const char *mountpoint, + bool allow_other, Error **errp); +static void read_from_fuse_export(void *opaque); + +static bool is_regular_file(const char *path, Error **errp); + + +static int fuse_export_create(BlockExport *blk_exp, + BlockExportOptions *blk_exp_args, + Error **errp) +{ + FuseExport *exp = container_of(blk_exp, FuseExport, common); + BlockExportOptionsFuse *args = &blk_exp_args->u.fuse; + int ret; + + assert(blk_exp_args->type == BLOCK_EXPORT_TYPE_FUSE); + + /* For growable exports, take the RESIZE permission */ + if (args->growable) { + uint64_t blk_perm, blk_shared_perm; + + blk_get_perm(exp->common.blk, &blk_perm, &blk_shared_perm); + + ret = blk_set_perm(exp->common.blk, blk_perm | BLK_PERM_RESIZE, + blk_shared_perm, errp); + if (ret < 0) { + return ret; + } + } + + init_exports_table(); + + /* + * It is important to do this check before calling is_regular_file() -- + * that function will do a stat(), which we would have to handle if we + * already exported something on @mountpoint. But we cannot, because + * we are currently caught up here. + * (Note that ideally we would want to resolve relative paths here, + * but bdrv_make_absolute_filename() might do the wrong thing for + * paths that contain colons, and realpath() would resolve symlinks, + * which we do not want: The mount point is not going to be the + * symlink's destination, but the link itself.) + * So this will not catch all potential clashes, but hopefully at + * least the most common one of specifying exactly the same path + * string twice. 
+ */ + if (g_hash_table_contains(exports, args->mountpoint)) { + error_setg(errp, "There already is a FUSE export on '%s'", + args->mountpoint); + ret = -EEXIST; + goto fail; + } + + if (!is_regular_file(args->mountpoint, errp)) { + ret = -EINVAL; + goto fail; + } + + exp->mountpoint = g_strdup(args->mountpoint); + exp->writable = blk_exp_args->writable; + exp->growable = args->growable; + + /* set default */ + if (!args->has_allow_other) { + args->allow_other = FUSE_EXPORT_ALLOW_OTHER_AUTO; + } + + exp->st_mode = S_IFREG | S_IRUSR; + if (exp->writable) { + exp->st_mode |= S_IWUSR; + } + exp->st_uid = getuid(); + exp->st_gid = getgid(); + + if (args->allow_other == FUSE_EXPORT_ALLOW_OTHER_AUTO) { + /* Ignore errors on our first attempt */ + ret = setup_fuse_export(exp, args->mountpoint, true, NULL); + exp->allow_other = ret == 0; + if (ret < 0) { + ret = setup_fuse_export(exp, args->mountpoint, false, errp); + } + } else { + exp->allow_other = args->allow_other == FUSE_EXPORT_ALLOW_OTHER_ON; + ret = setup_fuse_export(exp, args->mountpoint, exp->allow_other, errp); + } + if (ret < 0) { + goto fail; + } + + return 0; + +fail: + fuse_export_delete(blk_exp); + return ret; +} + +/** + * Allocates the global @exports hash table. + */ +static void init_exports_table(void) +{ + if (exports) { + return; + } + + exports = g_hash_table_new_full(g_str_hash, g_str_equal, g_free, NULL); +} + +/** + * Create exp->fuse_session and mount it. + */ +static int setup_fuse_export(FuseExport *exp, const char *mountpoint, + bool allow_other, Error **errp) +{ + const char *fuse_argv[4]; + char *mount_opts; + struct fuse_args fuse_args; + int ret; + + /* + * max_read needs to match what fuse_init() sets. + * max_write need not be supplied. + */ + mount_opts = g_strdup_printf("max_read=%zu,default_permissions%s", + FUSE_MAX_BOUNCE_BYTES, + allow_other ? ",allow_other" : ""); + + fuse_argv[0] = ""; /* Dummy program name */ + fuse_argv[1] = "-o"; + fuse_argv[2] = mount_opts; + fuse_argv[3] = NULL; + fuse_args = (struct fuse_args)FUSE_ARGS_INIT(3, (char **)fuse_argv); + + exp->fuse_session = fuse_session_new(&fuse_args, &fuse_ops, + sizeof(fuse_ops), exp); + g_free(mount_opts); + if (!exp->fuse_session) { + error_setg(errp, "Failed to set up FUSE session"); + ret = -EIO; + goto fail; + } + + ret = fuse_session_mount(exp->fuse_session, mountpoint); + if (ret < 0) { + error_setg(errp, "Failed to mount FUSE session to export"); + ret = -EIO; + goto fail; + } + exp->mounted = true; + + g_hash_table_insert(exports, g_strdup(mountpoint), NULL); + + aio_set_fd_handler(exp->common.ctx, + fuse_session_fd(exp->fuse_session), true, + read_from_fuse_export, NULL, NULL, exp); + exp->fd_handler_set_up = true; + + return 0; + +fail: + fuse_export_shutdown(&exp->common); + return ret; +} + +/** + * Callback to be invoked when the FUSE session FD can be read from. + * (This is basically the FUSE event loop.) 
+ */ +static void read_from_fuse_export(void *opaque) +{ + FuseExport *exp = opaque; + int ret; + + blk_exp_ref(&exp->common); + + do { + ret = fuse_session_receive_buf(exp->fuse_session, &exp->fuse_buf); + } while (ret == -EINTR); + if (ret < 0) { + goto out; + } + + fuse_session_process_buf(exp->fuse_session, &exp->fuse_buf); + +out: + blk_exp_unref(&exp->common); +} + +static void fuse_export_shutdown(BlockExport *blk_exp) +{ + FuseExport *exp = container_of(blk_exp, FuseExport, common); + + if (exp->fuse_session) { + fuse_session_exit(exp->fuse_session); + + if (exp->fd_handler_set_up) { + aio_set_fd_handler(exp->common.ctx, + fuse_session_fd(exp->fuse_session), true, + NULL, NULL, NULL, NULL); + exp->fd_handler_set_up = false; + } + } + + if (exp->mountpoint) { + /* + * Safe to drop now, because we will not handle any requests + * for this export anymore anyway. + */ + g_hash_table_remove(exports, exp->mountpoint); + } +} + +static void fuse_export_delete(BlockExport *blk_exp) +{ + FuseExport *exp = container_of(blk_exp, FuseExport, common); + + if (exp->fuse_session) { + if (exp->mounted) { + fuse_session_unmount(exp->fuse_session); + } + + fuse_session_destroy(exp->fuse_session); + } + + free(exp->fuse_buf.mem); + g_free(exp->mountpoint); +} + +/** + * Check whether @path points to a regular file. If not, put an + * appropriate message into *errp. + */ +static bool is_regular_file(const char *path, Error **errp) +{ + struct stat statbuf; + int ret; + + ret = stat(path, &statbuf); + if (ret < 0) { + error_setg_errno(errp, errno, "Failed to stat '%s'", path); + return false; + } + + if (!S_ISREG(statbuf.st_mode)) { + error_setg(errp, "'%s' is not a regular file", path); + return false; + } + + return true; +} + +/** + * A chance to set change some parameters supplied to FUSE_INIT. + */ +static void fuse_init(void *userdata, struct fuse_conn_info *conn) +{ + /* + * MIN_NON_ZERO() would not be wrong here, but what we set here + * must equal what has been passed to fuse_session_new(). + * Therefore, as long as max_read must be passed as a mount option + * (which libfuse claims will be changed at some point), we have + * to set max_read to a fixed value here. + */ + conn->max_read = FUSE_MAX_BOUNCE_BYTES; + + conn->max_write = MIN_NON_ZERO(BDRV_REQUEST_MAX_BYTES, conn->max_write); +} + +/** + * Let clients look up files. Always return ENOENT because we only + * care about the mountpoint itself. + */ +static void fuse_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) +{ + fuse_reply_err(req, ENOENT); +} + +/** + * Let clients get file attributes (i.e., stat() the file). 
+ */ +static void fuse_getattr(fuse_req_t req, fuse_ino_t inode, + struct fuse_file_info *fi) +{ + struct stat statbuf; + int64_t length, allocated_blocks; + time_t now = time(NULL); + FuseExport *exp = fuse_req_userdata(req); + + length = blk_getlength(exp->common.blk); + if (length < 0) { + fuse_reply_err(req, -length); + return; + } + + allocated_blocks = bdrv_get_allocated_file_size(blk_bs(exp->common.blk)); + if (allocated_blocks <= 0) { + allocated_blocks = DIV_ROUND_UP(length, 512); + } else { + allocated_blocks = DIV_ROUND_UP(allocated_blocks, 512); + } + + statbuf = (struct stat) { + .st_ino = inode, + .st_mode = exp->st_mode, + .st_nlink = 1, + .st_uid = exp->st_uid, + .st_gid = exp->st_gid, + .st_size = length, + .st_blksize = blk_bs(exp->common.blk)->bl.request_alignment, + .st_blocks = allocated_blocks, + .st_atime = now, + .st_mtime = now, + .st_ctime = now, + }; + + fuse_reply_attr(req, &statbuf, 1.); +} + +static int fuse_do_truncate(const FuseExport *exp, int64_t size, + bool req_zero_write, PreallocMode prealloc) +{ + uint64_t blk_perm, blk_shared_perm; + BdrvRequestFlags truncate_flags = 0; + int ret; + + if (req_zero_write) { + truncate_flags |= BDRV_REQ_ZERO_WRITE; + } + + /* Growable exports have a permanent RESIZE permission */ + if (!exp->growable) { + blk_get_perm(exp->common.blk, &blk_perm, &blk_shared_perm); + + ret = blk_set_perm(exp->common.blk, blk_perm | BLK_PERM_RESIZE, + blk_shared_perm, NULL); + if (ret < 0) { + return ret; + } + } + + ret = blk_truncate(exp->common.blk, size, true, prealloc, + truncate_flags, NULL); + + if (!exp->growable) { + /* Must succeed, because we are only giving up the RESIZE permission */ + blk_set_perm(exp->common.blk, blk_perm, blk_shared_perm, &error_abort); + } + + return ret; +} + +/** + * Let clients set file attributes. Only resizing and changing + * permissions (st_mode, st_uid, st_gid) is allowed. + * Changing permissions is only allowed as far as it will actually + * permit access: Read-only exports cannot be given +w, and exports + * without allow_other cannot be given a different UID or GID, and + * they cannot be given non-owner access. 
+ */ +static void fuse_setattr(fuse_req_t req, fuse_ino_t inode, struct stat *statbuf, + int to_set, struct fuse_file_info *fi) +{ + FuseExport *exp = fuse_req_userdata(req); + int supported_attrs; + int ret; + + supported_attrs = FUSE_SET_ATTR_SIZE | FUSE_SET_ATTR_MODE; + if (exp->allow_other) { + supported_attrs |= FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID; + } + + if (to_set & ~supported_attrs) { + fuse_reply_err(req, ENOTSUP); + return; + } + + /* Do some argument checks first before committing to anything */ + if (to_set & FUSE_SET_ATTR_MODE) { + /* + * Without allow_other, non-owners can never access the export, so do + * not allow setting permissions for them + */ + if (!exp->allow_other && + (statbuf->st_mode & (S_IRWXG | S_IRWXO)) != 0) + { + fuse_reply_err(req, EPERM); + return; + } + + /* +w for read-only exports makes no sense, disallow it */ + if (!exp->writable && + (statbuf->st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) + { + fuse_reply_err(req, EROFS); + return; + } + } + + if (to_set & FUSE_SET_ATTR_SIZE) { + if (!exp->writable) { + fuse_reply_err(req, EACCES); + return; + } + + ret = fuse_do_truncate(exp, statbuf->st_size, true, PREALLOC_MODE_OFF); + if (ret < 0) { + fuse_reply_err(req, -ret); + return; + } + } + + if (to_set & FUSE_SET_ATTR_MODE) { + /* Ignore FUSE-supplied file type, only change the mode */ + exp->st_mode = (statbuf->st_mode & 07777) | S_IFREG; + } + + if (to_set & FUSE_SET_ATTR_UID) { + exp->st_uid = statbuf->st_uid; + } + + if (to_set & FUSE_SET_ATTR_GID) { + exp->st_gid = statbuf->st_gid; + } + + fuse_getattr(req, inode, fi); +} + +/** + * Let clients open a file (i.e., the exported image). + */ +static void fuse_open(fuse_req_t req, fuse_ino_t inode, + struct fuse_file_info *fi) +{ + fuse_reply_open(req, fi); +} + +/** + * Handle client reads from the exported image. + */ +static void fuse_read(fuse_req_t req, fuse_ino_t inode, + size_t size, off_t offset, struct fuse_file_info *fi) +{ + FuseExport *exp = fuse_req_userdata(req); + int64_t length; + void *buf; + int ret; + + /* Limited by max_read, should not happen */ + if (size > FUSE_MAX_BOUNCE_BYTES) { + fuse_reply_err(req, EINVAL); + return; + } + + /** + * Clients will expect short reads at EOF, so we have to limit + * offset+size to the image length. + */ + length = blk_getlength(exp->common.blk); + if (length < 0) { + fuse_reply_err(req, -length); + return; + } + + if (offset + size > length) { + size = length - offset; + } + + buf = qemu_try_blockalign(blk_bs(exp->common.blk), size); + if (!buf) { + fuse_reply_err(req, ENOMEM); + return; + } + + ret = blk_pread(exp->common.blk, offset, buf, size); + if (ret >= 0) { + fuse_reply_buf(req, buf, size); + } else { + fuse_reply_err(req, -ret); + } + + qemu_vfree(buf); +} + +/** + * Handle client writes to the exported image. + */ +static void fuse_write(fuse_req_t req, fuse_ino_t inode, const char *buf, + size_t size, off_t offset, struct fuse_file_info *fi) +{ + FuseExport *exp = fuse_req_userdata(req); + int64_t length; + int ret; + + /* Limited by max_write, should not happen */ + if (size > BDRV_REQUEST_MAX_BYTES) { + fuse_reply_err(req, EINVAL); + return; + } + + if (!exp->writable) { + fuse_reply_err(req, EACCES); + return; + } + + /** + * Clients will expect short writes at EOF, so we have to limit + * offset+size to the image length. 
+ */ + length = blk_getlength(exp->common.blk); + if (length < 0) { + fuse_reply_err(req, -length); + return; + } + + if (offset + size > length) { + if (exp->growable) { + ret = fuse_do_truncate(exp, offset + size, true, PREALLOC_MODE_OFF); + if (ret < 0) { + fuse_reply_err(req, -ret); + return; + } + } else { + size = length - offset; + } + } + + ret = blk_pwrite(exp->common.blk, offset, buf, size, 0); + if (ret >= 0) { + fuse_reply_write(req, size); + } else { + fuse_reply_err(req, -ret); + } +} + +/** + * Let clients perform various fallocate() operations. + */ +static void fuse_fallocate(fuse_req_t req, fuse_ino_t inode, int mode, + off_t offset, off_t length, + struct fuse_file_info *fi) +{ + FuseExport *exp = fuse_req_userdata(req); + int64_t blk_len; + int ret; + + if (!exp->writable) { + fuse_reply_err(req, EACCES); + return; + } + + blk_len = blk_getlength(exp->common.blk); + if (blk_len < 0) { + fuse_reply_err(req, -blk_len); + return; + } + + if (mode & FALLOC_FL_KEEP_SIZE) { + length = MIN(length, blk_len - offset); + } + + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (!(mode & FALLOC_FL_KEEP_SIZE)) { + fuse_reply_err(req, EINVAL); + return; + } + + do { + int size = MIN(length, BDRV_REQUEST_MAX_BYTES); + + ret = blk_pdiscard(exp->common.blk, offset, size); + offset += size; + length -= size; + } while (ret == 0 && length > 0); + } +#ifdef CONFIG_FALLOCATE_ZERO_RANGE + else if (mode & FALLOC_FL_ZERO_RANGE) { + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + length > blk_len) { + /* No need for zeroes, we are going to write them ourselves */ + ret = fuse_do_truncate(exp, offset + length, false, + PREALLOC_MODE_OFF); + if (ret < 0) { + fuse_reply_err(req, -ret); + return; + } + } + + do { + int size = MIN(length, BDRV_REQUEST_MAX_BYTES); + + ret = blk_pwrite_zeroes(exp->common.blk, + offset, size, 0); + offset += size; + length -= size; + } while (ret == 0 && length > 0); + } +#endif /* CONFIG_FALLOCATE_ZERO_RANGE */ + else if (!mode) { + /* We can only fallocate at the EOF with a truncate */ + if (offset < blk_len) { + fuse_reply_err(req, EOPNOTSUPP); + return; + } + + if (offset > blk_len) { + /* No preallocation needed here */ + ret = fuse_do_truncate(exp, offset, true, PREALLOC_MODE_OFF); + if (ret < 0) { + fuse_reply_err(req, -ret); + return; + } + } + + ret = fuse_do_truncate(exp, offset + length, true, + PREALLOC_MODE_FALLOC); + } else { + ret = -EOPNOTSUPP; + } + + fuse_reply_err(req, ret < 0 ? -ret : 0); +} + +/** + * Let clients fsync the exported image. + */ +static void fuse_fsync(fuse_req_t req, fuse_ino_t inode, int datasync, + struct fuse_file_info *fi) +{ + FuseExport *exp = fuse_req_userdata(req); + int ret; + + ret = blk_flush(exp->common.blk); + fuse_reply_err(req, ret < 0 ? -ret : 0); +} + +/** + * Called before an FD to the exported image is closed. (libfuse + * notes this to be a way to return last-minute errors.) + */ +static void fuse_flush(fuse_req_t req, fuse_ino_t inode, + struct fuse_file_info *fi) +{ + fuse_fsync(req, inode, 1, fi); +} + +#ifdef CONFIG_FUSE_LSEEK +/** + * Let clients inquire allocation status. 
+ */ +static void fuse_lseek(fuse_req_t req, fuse_ino_t inode, off_t offset, + int whence, struct fuse_file_info *fi) +{ + FuseExport *exp = fuse_req_userdata(req); + + if (whence != SEEK_HOLE && whence != SEEK_DATA) { + fuse_reply_err(req, EINVAL); + return; + } + + while (true) { + int64_t pnum; + int ret; + + ret = bdrv_block_status_above(blk_bs(exp->common.blk), NULL, + offset, INT64_MAX, &pnum, NULL, NULL); + if (ret < 0) { + fuse_reply_err(req, -ret); + return; + } + + if (!pnum && (ret & BDRV_BLOCK_EOF)) { + int64_t blk_len; + + /* + * If blk_getlength() rounds (e.g. by sectors), then the + * export length will be rounded, too. However, + * bdrv_block_status_above() may return EOF at unaligned + * offsets. We must not let this become visible and thus + * always simulate a hole between @offset (the real EOF) + * and @blk_len (the client-visible EOF). + */ + + blk_len = blk_getlength(exp->common.blk); + if (blk_len < 0) { + fuse_reply_err(req, -blk_len); + return; + } + + if (offset > blk_len || whence == SEEK_DATA) { + fuse_reply_err(req, ENXIO); + } else { + fuse_reply_lseek(req, offset); + } + return; + } + + if (ret & BDRV_BLOCK_DATA) { + if (whence == SEEK_DATA) { + fuse_reply_lseek(req, offset); + return; + } + } else { + if (whence == SEEK_HOLE) { + fuse_reply_lseek(req, offset); + return; + } + } + + /* Safety check against infinite loops */ + if (!pnum) { + fuse_reply_err(req, ENXIO); + return; + } + + offset += pnum; + } +} +#endif + +static const struct fuse_lowlevel_ops fuse_ops = { + .init = fuse_init, + .lookup = fuse_lookup, + .getattr = fuse_getattr, + .setattr = fuse_setattr, + .open = fuse_open, + .read = fuse_read, + .write = fuse_write, + .fallocate = fuse_fallocate, + .flush = fuse_flush, + .fsync = fuse_fsync, +#ifdef CONFIG_FUSE_LSEEK + .lseek = fuse_lseek, +#endif +}; + +const BlockExportDriver blk_exp_fuse = { + .type = BLOCK_EXPORT_TYPE_FUSE, + .instance_size = sizeof(FuseExport), + .create = fuse_export_create, + .delete = fuse_export_delete, + .request_shutdown = fuse_export_shutdown, +}; diff --git a/block/export/meson.build b/block/export/meson.build new file mode 100644 index 000000000..0a08e384c --- /dev/null +++ b/block/export/meson.build @@ -0,0 +1,7 @@ +blockdev_ss.add(files('export.c')) + +if have_vhost_user_blk_server + blockdev_ss.add(files('vhost-user-blk-server.c')) +endif + +blockdev_ss.add(when: fuse, if_true: files('fuse.c')) diff --git a/block/export/vhost-user-blk-server.c b/block/export/vhost-user-blk-server.c new file mode 100644 index 000000000..186256333 --- /dev/null +++ b/block/export/vhost-user-blk-server.c @@ -0,0 +1,533 @@ +/* + * Sharing QEMU block devices via vhost-user protocal + * + * Parts of the code based on nbd/server.c. + * + * Copyright (c) Coiby Xu <coiby.xu@gmail.com>. + * Copyright (c) 2020 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ +#include "qemu/osdep.h" +#include "block/block.h" +#include "subprojects/libvhost-user/libvhost-user.h" /* only for the type definitions */ +#include "standard-headers/linux/virtio_blk.h" +#include "qemu/vhost-user-server.h" +#include "vhost-user-blk-server.h" +#include "qapi/error.h" +#include "qom/object_interfaces.h" +#include "sysemu/block-backend.h" +#include "util/block-helpers.h" + +/* + * Sector units are 512 bytes regardless of the + * virtio_blk_config->blk_size value. 
+ */ +#define VIRTIO_BLK_SECTOR_BITS 9 +#define VIRTIO_BLK_SECTOR_SIZE (1ull << VIRTIO_BLK_SECTOR_BITS) + +enum { + VHOST_USER_BLK_NUM_QUEUES_DEFAULT = 1, + VHOST_USER_BLK_MAX_DISCARD_SECTORS = 32768, + VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS = 32768, +}; +struct virtio_blk_inhdr { + unsigned char status; +}; + +typedef struct VuBlkReq { + VuVirtqElement elem; + int64_t sector_num; + size_t size; + struct virtio_blk_inhdr *in; + struct virtio_blk_outhdr out; + VuServer *server; + struct VuVirtq *vq; +} VuBlkReq; + +/* vhost user block device */ +typedef struct { + BlockExport export; + VuServer vu_server; + uint32_t blk_size; + QIOChannelSocket *sioc; + struct virtio_blk_config blkcfg; + bool writable; +} VuBlkExport; + +static void vu_blk_req_complete(VuBlkReq *req) +{ + VuDev *vu_dev = &req->server->vu_dev; + + /* IO size with 1 extra status byte */ + vu_queue_push(vu_dev, req->vq, &req->elem, req->size + 1); + vu_queue_notify(vu_dev, req->vq); + + free(req); +} + +static bool vu_blk_sect_range_ok(VuBlkExport *vexp, uint64_t sector, + size_t size) +{ + uint64_t nb_sectors; + uint64_t total_sectors; + + if (size % VIRTIO_BLK_SECTOR_SIZE) { + return false; + } + + nb_sectors = size >> VIRTIO_BLK_SECTOR_BITS; + + QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != VIRTIO_BLK_SECTOR_SIZE); + if (nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return false; + } + if ((sector << VIRTIO_BLK_SECTOR_BITS) % vexp->blk_size) { + return false; + } + blk_get_geometry(vexp->export.blk, &total_sectors); + if (sector > total_sectors || nb_sectors > total_sectors - sector) { + return false; + } + return true; +} + +static int coroutine_fn +vu_blk_discard_write_zeroes(VuBlkExport *vexp, struct iovec *iov, + uint32_t iovcnt, uint32_t type) +{ + BlockBackend *blk = vexp->export.blk; + struct virtio_blk_discard_write_zeroes desc; + ssize_t size; + uint64_t sector; + uint32_t num_sectors; + uint32_t max_sectors; + uint32_t flags; + int bytes; + + /* Only one desc is currently supported */ + if (unlikely(iov_size(iov, iovcnt) > sizeof(desc))) { + return VIRTIO_BLK_S_UNSUPP; + } + + size = iov_to_buf(iov, iovcnt, 0, &desc, sizeof(desc)); + if (unlikely(size != sizeof(desc))) { + error_report("Invalid size %zd, expected %zu", size, sizeof(desc)); + return VIRTIO_BLK_S_IOERR; + } + + sector = le64_to_cpu(desc.sector); + num_sectors = le32_to_cpu(desc.num_sectors); + flags = le32_to_cpu(desc.flags); + max_sectors = (type == VIRTIO_BLK_T_WRITE_ZEROES) ? + VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS : + VHOST_USER_BLK_MAX_DISCARD_SECTORS; + + /* This check ensures that 'bytes' fits in an int */ + if (unlikely(num_sectors > max_sectors)) { + return VIRTIO_BLK_S_IOERR; + } + + bytes = num_sectors << VIRTIO_BLK_SECTOR_BITS; + + if (unlikely(!vu_blk_sect_range_ok(vexp, sector, bytes))) { + return VIRTIO_BLK_S_IOERR; + } + + /* + * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for discard + * and write zeroes commands if any unknown flag is set. + */ + if (unlikely(flags & ~VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) { + return VIRTIO_BLK_S_UNSUPP; + } + + if (type == VIRTIO_BLK_T_WRITE_ZEROES) { + int blk_flags = 0; + + if (flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP) { + blk_flags |= BDRV_REQ_MAY_UNMAP; + } + + if (blk_co_pwrite_zeroes(blk, sector << VIRTIO_BLK_SECTOR_BITS, + bytes, blk_flags) == 0) { + return VIRTIO_BLK_S_OK; + } + } else if (type == VIRTIO_BLK_T_DISCARD) { + /* + * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP for + * discard commands if the unmap flag is set. 
+ */ + if (unlikely(flags & VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP)) { + return VIRTIO_BLK_S_UNSUPP; + } + + if (blk_co_pdiscard(blk, sector << VIRTIO_BLK_SECTOR_BITS, + bytes) == 0) { + return VIRTIO_BLK_S_OK; + } + } + + return VIRTIO_BLK_S_IOERR; +} + +static void coroutine_fn vu_blk_virtio_process_req(void *opaque) +{ + VuBlkReq *req = opaque; + VuServer *server = req->server; + VuVirtqElement *elem = &req->elem; + uint32_t type; + + VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server); + BlockBackend *blk = vexp->export.blk; + + struct iovec *in_iov = elem->in_sg; + struct iovec *out_iov = elem->out_sg; + unsigned in_num = elem->in_num; + unsigned out_num = elem->out_num; + + /* refer to hw/block/virtio_blk.c */ + if (elem->out_num < 1 || elem->in_num < 1) { + error_report("virtio-blk request missing headers"); + goto err; + } + + if (unlikely(iov_to_buf(out_iov, out_num, 0, &req->out, + sizeof(req->out)) != sizeof(req->out))) { + error_report("virtio-blk request outhdr too short"); + goto err; + } + + iov_discard_front(&out_iov, &out_num, sizeof(req->out)); + + if (in_iov[in_num - 1].iov_len < sizeof(struct virtio_blk_inhdr)) { + error_report("virtio-blk request inhdr too short"); + goto err; + } + + /* We always touch the last byte, so just see how big in_iov is. */ + req->in = (void *)in_iov[in_num - 1].iov_base + + in_iov[in_num - 1].iov_len + - sizeof(struct virtio_blk_inhdr); + iov_discard_back(in_iov, &in_num, sizeof(struct virtio_blk_inhdr)); + + type = le32_to_cpu(req->out.type); + switch (type & ~VIRTIO_BLK_T_BARRIER) { + case VIRTIO_BLK_T_IN: + case VIRTIO_BLK_T_OUT: { + QEMUIOVector qiov; + int64_t offset; + ssize_t ret = 0; + bool is_write = type & VIRTIO_BLK_T_OUT; + req->sector_num = le64_to_cpu(req->out.sector); + + if (is_write && !vexp->writable) { + req->in->status = VIRTIO_BLK_S_IOERR; + break; + } + + if (is_write) { + qemu_iovec_init_external(&qiov, out_iov, out_num); + } else { + qemu_iovec_init_external(&qiov, in_iov, in_num); + } + + if (unlikely(!vu_blk_sect_range_ok(vexp, + req->sector_num, + qiov.size))) { + req->in->status = VIRTIO_BLK_S_IOERR; + break; + } + + offset = req->sector_num << VIRTIO_BLK_SECTOR_BITS; + + if (is_write) { + ret = blk_co_pwritev(blk, offset, qiov.size, &qiov, 0); + } else { + ret = blk_co_preadv(blk, offset, qiov.size, &qiov, 0); + } + if (ret >= 0) { + req->in->status = VIRTIO_BLK_S_OK; + } else { + req->in->status = VIRTIO_BLK_S_IOERR; + } + break; + } + case VIRTIO_BLK_T_FLUSH: + if (blk_co_flush(blk) == 0) { + req->in->status = VIRTIO_BLK_S_OK; + } else { + req->in->status = VIRTIO_BLK_S_IOERR; + } + break; + case VIRTIO_BLK_T_GET_ID: { + size_t size = MIN(iov_size(&elem->in_sg[0], in_num), + VIRTIO_BLK_ID_BYTES); + snprintf(elem->in_sg[0].iov_base, size, "%s", "vhost_user_blk"); + req->in->status = VIRTIO_BLK_S_OK; + req->size = elem->in_sg[0].iov_len; + break; + } + case VIRTIO_BLK_T_DISCARD: + case VIRTIO_BLK_T_WRITE_ZEROES: { + if (!vexp->writable) { + req->in->status = VIRTIO_BLK_S_IOERR; + break; + } + + req->in->status = vu_blk_discard_write_zeroes(vexp, out_iov, out_num, + type); + break; + } + default: + req->in->status = VIRTIO_BLK_S_UNSUPP; + break; + } + + vu_blk_req_complete(req); + return; + +err: + free(req); +} + +static void vu_blk_process_vq(VuDev *vu_dev, int idx) +{ + VuServer *server = container_of(vu_dev, VuServer, vu_dev); + VuVirtq *vq = vu_get_queue(vu_dev, idx); + + while (1) { + VuBlkReq *req; + + req = vu_queue_pop(vu_dev, vq, sizeof(VuBlkReq)); + if (!req) { + break; + } + + req->server = 
server; + req->vq = vq; + + Coroutine *co = + qemu_coroutine_create(vu_blk_virtio_process_req, req); + qemu_coroutine_enter(co); + } +} + +static void vu_blk_queue_set_started(VuDev *vu_dev, int idx, bool started) +{ + VuVirtq *vq; + + assert(vu_dev); + + vq = vu_get_queue(vu_dev, idx); + vu_set_queue_handler(vu_dev, vq, started ? vu_blk_process_vq : NULL); +} + +static uint64_t vu_blk_get_features(VuDev *dev) +{ + uint64_t features; + VuServer *server = container_of(dev, VuServer, vu_dev); + VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server); + features = 1ull << VIRTIO_BLK_F_SIZE_MAX | + 1ull << VIRTIO_BLK_F_SEG_MAX | + 1ull << VIRTIO_BLK_F_TOPOLOGY | + 1ull << VIRTIO_BLK_F_BLK_SIZE | + 1ull << VIRTIO_BLK_F_FLUSH | + 1ull << VIRTIO_BLK_F_DISCARD | + 1ull << VIRTIO_BLK_F_WRITE_ZEROES | + 1ull << VIRTIO_BLK_F_CONFIG_WCE | + 1ull << VIRTIO_BLK_F_MQ | + 1ull << VIRTIO_F_VERSION_1 | + 1ull << VIRTIO_RING_F_INDIRECT_DESC | + 1ull << VIRTIO_RING_F_EVENT_IDX | + 1ull << VHOST_USER_F_PROTOCOL_FEATURES; + + if (!vexp->writable) { + features |= 1ull << VIRTIO_BLK_F_RO; + } + + return features; +} + +static uint64_t vu_blk_get_protocol_features(VuDev *dev) +{ + return 1ull << VHOST_USER_PROTOCOL_F_CONFIG; +} + +static int +vu_blk_get_config(VuDev *vu_dev, uint8_t *config, uint32_t len) +{ + VuServer *server = container_of(vu_dev, VuServer, vu_dev); + VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server); + + if (len > sizeof(struct virtio_blk_config)) { + return -1; + } + + memcpy(config, &vexp->blkcfg, len); + return 0; +} + +static int +vu_blk_set_config(VuDev *vu_dev, const uint8_t *data, + uint32_t offset, uint32_t size, uint32_t flags) +{ + VuServer *server = container_of(vu_dev, VuServer, vu_dev); + VuBlkExport *vexp = container_of(server, VuBlkExport, vu_server); + uint8_t wce; + + /* don't support live migration */ + if (flags != VHOST_SET_CONFIG_TYPE_MASTER) { + return -EINVAL; + } + + if (offset != offsetof(struct virtio_blk_config, wce) || + size != 1) { + return -EINVAL; + } + + wce = *data; + vexp->blkcfg.wce = wce; + blk_set_enable_write_cache(vexp->export.blk, wce); + return 0; +} + +/* + * When the client disconnects, it sends a VHOST_USER_NONE request + * and vu_process_message will simple call exit which cause the VM + * to exit abruptly. + * To avoid this issue, process VHOST_USER_NONE request ahead + * of vu_process_message. 
+ * + */ +static int vu_blk_process_msg(VuDev *dev, VhostUserMsg *vmsg, int *do_reply) +{ + if (vmsg->request == VHOST_USER_NONE) { + dev->panic(dev, "disconnect"); + return true; + } + return false; +} + +static const VuDevIface vu_blk_iface = { + .get_features = vu_blk_get_features, + .queue_set_started = vu_blk_queue_set_started, + .get_protocol_features = vu_blk_get_protocol_features, + .get_config = vu_blk_get_config, + .set_config = vu_blk_set_config, + .process_msg = vu_blk_process_msg, +}; + +static void blk_aio_attached(AioContext *ctx, void *opaque) +{ + VuBlkExport *vexp = opaque; + + vexp->export.ctx = ctx; + vhost_user_server_attach_aio_context(&vexp->vu_server, ctx); +} + +static void blk_aio_detach(void *opaque) +{ + VuBlkExport *vexp = opaque; + + vhost_user_server_detach_aio_context(&vexp->vu_server); + vexp->export.ctx = NULL; +} + +static void +vu_blk_initialize_config(BlockDriverState *bs, + struct virtio_blk_config *config, + uint32_t blk_size, + uint16_t num_queues) +{ + config->capacity = + cpu_to_le64(bdrv_getlength(bs) >> VIRTIO_BLK_SECTOR_BITS); + config->blk_size = cpu_to_le32(blk_size); + config->size_max = cpu_to_le32(0); + config->seg_max = cpu_to_le32(128 - 2); + config->min_io_size = cpu_to_le16(1); + config->opt_io_size = cpu_to_le32(1); + config->num_queues = cpu_to_le16(num_queues); + config->max_discard_sectors = + cpu_to_le32(VHOST_USER_BLK_MAX_DISCARD_SECTORS); + config->max_discard_seg = cpu_to_le32(1); + config->discard_sector_alignment = + cpu_to_le32(blk_size >> VIRTIO_BLK_SECTOR_BITS); + config->max_write_zeroes_sectors + = cpu_to_le32(VHOST_USER_BLK_MAX_WRITE_ZEROES_SECTORS); + config->max_write_zeroes_seg = cpu_to_le32(1); +} + +static void vu_blk_exp_request_shutdown(BlockExport *exp) +{ + VuBlkExport *vexp = container_of(exp, VuBlkExport, export); + + vhost_user_server_stop(&vexp->vu_server); +} + +static int vu_blk_exp_create(BlockExport *exp, BlockExportOptions *opts, + Error **errp) +{ + VuBlkExport *vexp = container_of(exp, VuBlkExport, export); + BlockExportOptionsVhostUserBlk *vu_opts = &opts->u.vhost_user_blk; + Error *local_err = NULL; + uint64_t logical_block_size; + uint16_t num_queues = VHOST_USER_BLK_NUM_QUEUES_DEFAULT; + + vexp->writable = opts->writable; + vexp->blkcfg.wce = 0; + + if (vu_opts->has_logical_block_size) { + logical_block_size = vu_opts->logical_block_size; + } else { + logical_block_size = VIRTIO_BLK_SECTOR_SIZE; + } + check_block_size(exp->id, "logical-block-size", logical_block_size, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + return -EINVAL; + } + vexp->blk_size = logical_block_size; + blk_set_guest_block_size(exp->blk, logical_block_size); + + if (vu_opts->has_num_queues) { + num_queues = vu_opts->num_queues; + } + if (num_queues == 0) { + error_setg(errp, "num-queues must be greater than 0"); + return -EINVAL; + } + + vu_blk_initialize_config(blk_bs(exp->blk), &vexp->blkcfg, + logical_block_size, num_queues); + + blk_add_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, + vexp); + + if (!vhost_user_server_start(&vexp->vu_server, vu_opts->addr, exp->ctx, + num_queues, &vu_blk_iface, errp)) { + blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, + blk_aio_detach, vexp); + return -EADDRNOTAVAIL; + } + + return 0; +} + +static void vu_blk_exp_delete(BlockExport *exp) +{ + VuBlkExport *vexp = container_of(exp, VuBlkExport, export); + + blk_remove_aio_context_notifier(exp->blk, blk_aio_attached, blk_aio_detach, + vexp); +} + +const BlockExportDriver 
blk_exp_vhost_user_blk = { + .type = BLOCK_EXPORT_TYPE_VHOST_USER_BLK, + .instance_size = sizeof(VuBlkExport), + .create = vu_blk_exp_create, + .delete = vu_blk_exp_delete, + .request_shutdown = vu_blk_exp_request_shutdown, +}; diff --git a/block/export/vhost-user-blk-server.h b/block/export/vhost-user-blk-server.h new file mode 100644 index 000000000..fcf46fc8a --- /dev/null +++ b/block/export/vhost-user-blk-server.h @@ -0,0 +1,19 @@ +/* + * Sharing QEMU block devices via vhost-user protocal + * + * Copyright (c) Coiby Xu <coiby.xu@gmail.com>. + * Copyright (c) 2020 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or + * later. See the COPYING file in the top-level directory. + */ + +#ifndef VHOST_USER_BLK_SERVER_H +#define VHOST_USER_BLK_SERVER_H + +#include "block/export.h" + +/* For block/export/export.c */ +extern const BlockExportDriver blk_exp_vhost_user_blk; + +#endif /* VHOST_USER_BLK_SERVER_H */ |
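
For illustration, a minimal sketch of how this new interface is driven (not part of the patch above): the QMP command block-export-add lands in qmp_block_export_add(), which simply forwards its BlockExportOptions to blk_exp_add(). The sketch below builds such an options struct by hand for a FUSE export; the export id, node name and mountpoint are made-up placeholders, the include path for the generated QAPI types is assumed, and the field names follow the QAPI-generated structs referenced throughout export.c and fuse.c.

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "qapi/qapi-types-block-export.h"   /* generated QAPI types (assumed path) */
#include "block/export.h"

/* Hypothetical helper, only to show the call into blk_exp_add() */
static void example_add_fuse_export(Error **errp)
{
    BlockExportOptions opts = {
        .id        = (char *)"fuse-export0",        /* must be well-formed and unused */
        .type      = BLOCK_EXPORT_TYPE_FUSE,
        .node_name = (char *)"disk0",               /* existing node-name to export */
        .has_writable = true,
        .writable  = true,                          /* rejected for read-only nodes */
        .u.fuse = {
            .mountpoint = (char *)"/tmp/vm-image",  /* must be an existing regular file */
            .growable   = false,                    /* no BLK_PERM_RESIZE taken */
        },
    };

    /* Returns the new BlockExport, or NULL with *errp set (bad id, bad node, ...) */
    blk_exp_add(&opts, errp);
}

A vhost-user-blk export would be set up the same way, filling opts.u.vhost_user_blk with the socket addr plus the optional logical-block-size and num-queues fields that vu_blk_exp_create() validates.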