author     Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>   2023-10-10 11:40:56 +0000
committer  Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>   2023-10-10 11:40:56 +0000
commit     e02cda008591317b1625707ff8e115a4841aa889 (patch)
tree       aee302e3cf8b59ec2d32ec481be3d1afddfc8968 /hw/block/dataplane
parent     cc668e6b7e0ffd8c9d130513d12053cf5eda1d3b (diff)
Introduce Virtio-loopback epsilon release:

The epsilon release introduces a new compatibility layer which makes the
virtio-loopback design work with QEMU and the rust-vmm vhost-user backend
without requiring any changes.
Signed-off-by: Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>
Change-Id: I52e57563e08a7d0bdc002f8e928ee61ba0c53dd9
Diffstat (limited to 'hw/block/dataplane')
-rw-r--r--   hw/block/dataplane/meson.build   |   2
-rw-r--r--   hw/block/dataplane/trace-events  |   5
-rw-r--r--   hw/block/dataplane/trace.h       |   1
-rw-r--r--   hw/block/dataplane/virtio-blk.c  | 369
-rw-r--r--   hw/block/dataplane/virtio-blk.h  |  31
-rw-r--r--   hw/block/dataplane/xen-block.c   | 828
-rw-r--r--   hw/block/dataplane/xen-block.h   |  30
7 files changed, 1266 insertions, 0 deletions
diff --git a/hw/block/dataplane/meson.build b/hw/block/dataplane/meson.build
new file mode 100644
index 000000000..12c6a264f
--- /dev/null
+++ b/hw/block/dataplane/meson.build
@@ -0,0 +1,2 @@
+specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c'))
+specific_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c'))
diff --git a/hw/block/dataplane/trace-events b/hw/block/dataplane/trace-events
new file mode 100644
index 000000000..38fc3e750
--- /dev/null
+++ b/hw/block/dataplane/trace-events
@@ -0,0 +1,5 @@
+# See docs/devel/tracing.rst for syntax documentation.
+
+# virtio-blk.c
+virtio_blk_data_plane_start(void *s) "dataplane %p"
+virtio_blk_data_plane_stop(void *s) "dataplane %p"
diff --git a/hw/block/dataplane/trace.h b/hw/block/dataplane/trace.h
new file mode 100644
index 000000000..240cc5983
--- /dev/null
+++ b/hw/block/dataplane/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-hw_block_dataplane.h"
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
new file mode 100644
index 000000000..ee5a5352d
--- /dev/null
+++ b/hw/block/dataplane/virtio-blk.c
@@ -0,0 +1,369 @@
+/*
+ * Dedicated thread for virtio-blk I/O processing
+ *
+ * Copyright 2012 IBM, Corp.
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ *   Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "trace.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+#include "qemu/thread.h"
+#include "qemu/error-report.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/virtio-blk.h"
+#include "virtio-blk.h"
+#include "block/aio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "qom/object_interfaces.h"
+
+struct VirtIOBlockDataPlane {
+    bool starting;
+    bool stopping;
+
+    VirtIOBlkConf *conf;
+    VirtIODevice *vdev;
+    QEMUBH *bh;                     /* bh for guest notification */
+    unsigned long *batch_notify_vqs;
+    bool batch_notifications;
+
+    /* Note that these EventNotifiers are assigned by value.  This is
+     * fine as long as you do not call event_notifier_cleanup on them
+     * (because you don't own the file descriptor or handle; you just
+     * use it).
+     */
+    IOThread *iothread;
+    AioContext *ctx;
+};
+
+/* Raise an interrupt to signal guest, if necessary */
+void virtio_blk_data_plane_notify(VirtIOBlockDataPlane *s, VirtQueue *vq)
+{
+    if (s->batch_notifications) {
+        set_bit(virtio_get_queue_index(vq), s->batch_notify_vqs);
+        qemu_bh_schedule(s->bh);
+    } else {
+        virtio_notify_irqfd(s->vdev, vq);
+    }
+}
+
+static void notify_guest_bh(void *opaque)
+{
+    VirtIOBlockDataPlane *s = opaque;
+    unsigned nvqs = s->conf->num_queues;
+    unsigned long bitmap[BITS_TO_LONGS(nvqs)];
+    unsigned j;
+
+    memcpy(bitmap, s->batch_notify_vqs, sizeof(bitmap));
+    memset(s->batch_notify_vqs, 0, sizeof(bitmap));
+
+    for (j = 0; j < nvqs; j += BITS_PER_LONG) {
+        unsigned long bits = bitmap[j / BITS_PER_LONG];
+
+        while (bits != 0) {
+            unsigned i = j + ctzl(bits);
+            VirtQueue *vq = virtio_get_queue(s->vdev, i);
+
+            virtio_notify_irqfd(s->vdev, vq);
+
+            bits &= bits - 1; /* clear right-most bit */
+        }
+    }
+}
+
+/* Context: QEMU global mutex held */
+bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
+                                  VirtIOBlockDataPlane **dataplane,
+                                  Error **errp)
+{
+    VirtIOBlockDataPlane *s;
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+    *dataplane = NULL;
+
+    if (conf->iothread) {
+        if (!k->set_guest_notifiers || !k->ioeventfd_assign) {
+            error_setg(errp,
+                       "device is incompatible with iothread "
+                       "(transport does not support notifiers)");
+            return false;
+        }
+        if (!virtio_device_ioeventfd_enabled(vdev)) {
+            error_setg(errp, "ioeventfd is required for iothread");
+            return false;
+        }
+
+        /* If dataplane is (re-)enabled while the guest is running there could
+         * be block jobs that can conflict.
+         */
+        if (blk_op_is_blocked(conf->conf.blk, BLOCK_OP_TYPE_DATAPLANE, errp)) {
+            error_prepend(errp, "cannot start virtio-blk dataplane: ");
+            return false;
+        }
+    }
+    /* Don't try if transport does not support notifiers. */
+    if (!virtio_device_ioeventfd_enabled(vdev)) {
+        return false;
+    }
+
+    s = g_new0(VirtIOBlockDataPlane, 1);
+    s->vdev = vdev;
+    s->conf = conf;
+
+    if (conf->iothread) {
+        s->iothread = conf->iothread;
+        object_ref(OBJECT(s->iothread));
+        s->ctx = iothread_get_aio_context(s->iothread);
+    } else {
+        s->ctx = qemu_get_aio_context();
+    }
+    s->bh = aio_bh_new(s->ctx, notify_guest_bh, s);
+    s->batch_notify_vqs = bitmap_new(conf->num_queues);
+
+    *dataplane = s;
+
+    return true;
+}
+
+/* Context: QEMU global mutex held */
+void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
+{
+    VirtIOBlock *vblk;
+
+    if (!s) {
+        return;
+    }
+
+    vblk = VIRTIO_BLK(s->vdev);
+    assert(!vblk->dataplane_started);
+    g_free(s->batch_notify_vqs);
+    qemu_bh_delete(s->bh);
+    if (s->iothread) {
+        object_unref(OBJECT(s->iothread));
+    }
+    g_free(s);
+}
+
+static bool virtio_blk_data_plane_handle_output(VirtIODevice *vdev,
+                                                VirtQueue *vq)
+{
+    VirtIOBlock *s = (VirtIOBlock *)vdev;
+
+    assert(s->dataplane);
+    assert(s->dataplane_started);
+
+    return virtio_blk_handle_vq(s, vq);
+}
+
+/* Context: QEMU global mutex held */
+int virtio_blk_data_plane_start(VirtIODevice *vdev)
+{
+    VirtIOBlock *vblk = VIRTIO_BLK(vdev);
+    VirtIOBlockDataPlane *s = vblk->dataplane;
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vblk)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    AioContext *old_context;
+    unsigned i;
+    unsigned nvqs = s->conf->num_queues;
+    Error *local_err = NULL;
+    int r;
+
+    if (vblk->dataplane_started || s->starting) {
+        return 0;
+    }
+
+    s->starting = true;
+
+    if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
+        s->batch_notifications = true;
+    } else {
+        s->batch_notifications = false;
+    }
+
+    /* Set up guest notifier (irq) */
+    r = k->set_guest_notifiers(qbus->parent, nvqs, true);
+    if (r != 0) {
+        error_report("virtio-blk failed to set guest notifier (%d), "
+                     "ensure -accel kvm is set.", r);
+        goto fail_guest_notifiers;
+    }
+
+    /*
+     * Batch all the host notifiers in a single transaction to avoid
+     * quadratic time complexity in address_space_update_ioeventfds().
+     */
+    memory_region_transaction_begin();
+
+    /* Set up virtqueue notify */
+    for (i = 0; i < nvqs; i++) {
+        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, true);
+        if (r != 0) {
+            int j = i;
+
+            fprintf(stderr, "virtio-blk failed to set host notifier (%d)\n", r);
+            while (i--) {
+                virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
+            }
+
+            /*
+             * The transaction expects the ioeventfds to be open when it
+             * commits. Do it now, before the cleanup loop.
+             */
+            memory_region_transaction_commit();
+
+            while (j--) {
+                virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), j);
+            }
+            goto fail_host_notifiers;
+        }
+    }
+
+    memory_region_transaction_commit();
+
+    s->starting = false;
+    vblk->dataplane_started = true;
+    trace_virtio_blk_data_plane_start(s);
+
+    old_context = blk_get_aio_context(s->conf->conf.blk);
+    aio_context_acquire(old_context);
+    r = blk_set_aio_context(s->conf->conf.blk, s->ctx, &local_err);
+    aio_context_release(old_context);
+    if (r < 0) {
+        error_report_err(local_err);
+        goto fail_aio_context;
+    }
+
+    /* Process queued requests before the ones in vring */
+    virtio_blk_process_queued_requests(vblk, false);
+
+    /* Kick right away to begin processing requests already in vring */
+    for (i = 0; i < nvqs; i++) {
+        VirtQueue *vq = virtio_get_queue(s->vdev, i);
+
+        event_notifier_set(virtio_queue_get_host_notifier(vq));
+    }
+
+    /* Get this show started by hooking up our callbacks */
+    aio_context_acquire(s->ctx);
+    for (i = 0; i < nvqs; i++) {
+        VirtQueue *vq = virtio_get_queue(s->vdev, i);
+
+        virtio_queue_aio_set_host_notifier_handler(vq, s->ctx,
+                virtio_blk_data_plane_handle_output);
+    }
+    aio_context_release(s->ctx);
+    return 0;
+
+  fail_aio_context:
+    memory_region_transaction_begin();
+
+    for (i = 0; i < nvqs; i++) {
+        virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
+    }
+
+    memory_region_transaction_commit();
+
+    for (i = 0; i < nvqs; i++) {
+        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i);
+    }
+  fail_host_notifiers:
+    k->set_guest_notifiers(qbus->parent, nvqs, false);
+  fail_guest_notifiers:
+    /*
+     * If we failed to set up the guest notifiers queued requests will be
+     * processed on the main context.
+     */
+    virtio_blk_process_queued_requests(vblk, false);
+    vblk->dataplane_disabled = true;
+    s->starting = false;
+    vblk->dataplane_started = true;
+    return -ENOSYS;
+}
+
+/* Stop notifications for new requests from guest.
+ *
+ * Context: BH in IOThread
+ */
+static void virtio_blk_data_plane_stop_bh(void *opaque)
+{
+    VirtIOBlockDataPlane *s = opaque;
+    unsigned i;
+
+    for (i = 0; i < s->conf->num_queues; i++) {
+        VirtQueue *vq = virtio_get_queue(s->vdev, i);
+
+        virtio_queue_aio_set_host_notifier_handler(vq, s->ctx, NULL);
+    }
+}
+
+/* Context: QEMU global mutex held */
+void virtio_blk_data_plane_stop(VirtIODevice *vdev)
+{
+    VirtIOBlock *vblk = VIRTIO_BLK(vdev);
+    VirtIOBlockDataPlane *s = vblk->dataplane;
+    BusState *qbus = qdev_get_parent_bus(DEVICE(vblk));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    unsigned i;
+    unsigned nvqs = s->conf->num_queues;
+
+    if (!vblk->dataplane_started || s->stopping) {
+        return;
+    }
+
+    /* Better luck next time. */
+    if (vblk->dataplane_disabled) {
+        vblk->dataplane_disabled = false;
+        vblk->dataplane_started = false;
+        return;
+    }
+    s->stopping = true;
+    trace_virtio_blk_data_plane_stop(s);
+
+    aio_context_acquire(s->ctx);
+    aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s);
+
+    /* Drain and try to switch bs back to the QEMU main loop. If other users
+     * keep the BlockBackend in the iothread, that's ok */
+    blk_set_aio_context(s->conf->conf.blk, qemu_get_aio_context(), NULL);
+
+    aio_context_release(s->ctx);
+
+    /*
+     * Batch all the host notifiers in a single transaction to avoid
+     * quadratic time complexity in address_space_update_ioeventfds().
+     */
+    memory_region_transaction_begin();
+
+    for (i = 0; i < nvqs; i++) {
+        virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
+    }
+
+    /*
+     * The transaction expects the ioeventfds to be open when it
+     * commits. Do it now, before the cleanup loop.
+     */
+    memory_region_transaction_commit();
+
+    for (i = 0; i < nvqs; i++) {
+        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i);
+    }
+
+    qemu_bh_cancel(s->bh);
+    notify_guest_bh(s); /* final chance to notify guest */
+
+    /* Clean up guest notifier (irq) */
+    k->set_guest_notifiers(qbus->parent, nvqs, false);
+
+    vblk->dataplane_started = false;
+    s->stopping = false;
+}
diff --git a/hw/block/dataplane/virtio-blk.h b/hw/block/dataplane/virtio-blk.h
new file mode 100644
index 000000000..5e18bb99a
--- /dev/null
+++ b/hw/block/dataplane/virtio-blk.h
@@ -0,0 +1,31 @@
+/*
+ * Dedicated thread for virtio-blk I/O processing
+ *
+ * Copyright 2012 IBM, Corp.
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ *   Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef HW_DATAPLANE_VIRTIO_BLK_H
+#define HW_DATAPLANE_VIRTIO_BLK_H
+
+#include "hw/virtio/virtio.h"
+
+typedef struct VirtIOBlockDataPlane VirtIOBlockDataPlane;
+
+bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
+                                  VirtIOBlockDataPlane **dataplane,
+                                  Error **errp);
+void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s);
+void virtio_blk_data_plane_notify(VirtIOBlockDataPlane *s, VirtQueue *vq);
+
+int virtio_blk_data_plane_start(VirtIODevice *vdev);
+void virtio_blk_data_plane_stop(VirtIODevice *vdev);
+
+#endif /* HW_DATAPLANE_VIRTIO_BLK_H */
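The notify_guest_bh() loop in virtio-blk.c above uses a classic bitmap-walk
idiom: scan the pending-notification bitmap one word at a time, locate the
lowest set bit with ctzl(), and clear it with bits &= bits - 1. Here is a
minimal standalone sketch of that idiom in plain C; __builtin_ctzl stands in
for QEMU's ctzl(), and the queue indices in main() are made up for the demo:

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* Visit every set bit (pending virtqueue index) in ascending order. */
static void visit_set_bits(const unsigned long *bitmap, unsigned nbits)
{
    for (unsigned j = 0; j < nbits; j += BITS_PER_LONG) {
        unsigned long bits = bitmap[j / BITS_PER_LONG];

        while (bits != 0) {
            unsigned i = j + __builtin_ctzl(bits); /* index of lowest set bit */
            printf("notify virtqueue %u\n", i);
            bits &= bits - 1;                      /* clear lowest set bit */
        }
    }
}

int main(void)
{
    unsigned long pending[1] = { (1UL << 0) | (1UL << 3) | (1UL << 7) };

    visit_set_bits(pending, 8);   /* prints queues 0, 3 and 7 */
    return 0;
}

Clearing the lowest set bit instead of re-scanning keeps the walk linear in
the number of pending queues rather than in the bitmap width.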
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
new file mode 100644
index 000000000..860787580
--- /dev/null
+++ b/hw/block/dataplane/xen-block.c
@@ -0,0 +1,828 @@
+/*
+ * Copyright (c) 2018 Citrix Systems Inc.
+ * (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "qapi/error.h"
+#include "hw/xen/xen_common.h"
+#include "hw/block/xen_blkif.h"
+#include "sysemu/block-backend.h"
+#include "sysemu/iothread.h"
+#include "xen-block.h"
+
+typedef struct XenBlockRequest {
+    blkif_request_t req;
+    int16_t status;
+    off_t start;
+    QEMUIOVector v;
+    void *buf;
+    size_t size;
+    int presync;
+    int aio_inflight;
+    int aio_errors;
+    XenBlockDataPlane *dataplane;
+    QLIST_ENTRY(XenBlockRequest) list;
+    BlockAcctCookie acct;
+} XenBlockRequest;
+
+struct XenBlockDataPlane {
+    XenDevice *xendev;
+    XenEventChannel *event_channel;
+    unsigned int *ring_ref;
+    unsigned int nr_ring_ref;
+    void *sring;
+    int protocol;
+    blkif_back_rings_t rings;
+    int more_work;
+    QLIST_HEAD(inflight_head, XenBlockRequest) inflight;
+    QLIST_HEAD(freelist_head, XenBlockRequest) freelist;
+    int requests_total;
+    int requests_inflight;
+    unsigned int max_requests;
+    BlockBackend *blk;
+    unsigned int sector_size;
+    QEMUBH *bh;
+    IOThread *iothread;
+    AioContext *ctx;
+};
+
+static int xen_block_send_response(XenBlockRequest *request);
+
+static void reset_request(XenBlockRequest *request)
+{
+    memset(&request->req, 0, sizeof(request->req));
+    request->status = 0;
+    request->start = 0;
+    request->size = 0;
+    request->presync = 0;
+
+    request->aio_inflight = 0;
+    request->aio_errors = 0;
+
+    request->dataplane = NULL;
+    memset(&request->list, 0, sizeof(request->list));
+    memset(&request->acct, 0, sizeof(request->acct));
+
+    qemu_iovec_reset(&request->v);
+}
+
+static XenBlockRequest *xen_block_start_request(XenBlockDataPlane *dataplane)
+{
+    XenBlockRequest *request = NULL;
+
+    if (QLIST_EMPTY(&dataplane->freelist)) {
+        if (dataplane->requests_total >= dataplane->max_requests) {
+            goto out;
+        }
+        /* allocate new struct */
+        request = g_malloc0(sizeof(*request));
+        request->dataplane = dataplane;
+        /*
+         * We cannot need more pages per request than this, and since we
+         * re-use requests, allocate the memory once here. It will be freed
+         * in xen_block_dataplane_destroy() when the request list is freed.
+         */
+        request->buf = qemu_memalign(XC_PAGE_SIZE,
+                                     BLKIF_MAX_SEGMENTS_PER_REQUEST *
+                                     XC_PAGE_SIZE);
+        dataplane->requests_total++;
+        qemu_iovec_init(&request->v, 1);
+    } else {
+        /* get one from freelist */
+        request = QLIST_FIRST(&dataplane->freelist);
+        QLIST_REMOVE(request, list);
+    }
+    QLIST_INSERT_HEAD(&dataplane->inflight, request, list);
+    dataplane->requests_inflight++;
+
+out:
+    return request;
+}
+
+static void xen_block_complete_request(XenBlockRequest *request)
+{
+    XenBlockDataPlane *dataplane = request->dataplane;
+
+    if (xen_block_send_response(request)) {
+        Error *local_err = NULL;
+
+        xen_device_notify_event_channel(dataplane->xendev,
+                                        dataplane->event_channel,
+                                        &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+        }
+    }
+
+    QLIST_REMOVE(request, list);
+    dataplane->requests_inflight--;
+    reset_request(request);
+    request->dataplane = dataplane;
+    QLIST_INSERT_HEAD(&dataplane->freelist, request, list);
+}
+
+/*
+ * translate request into iovec + start offset
+ * do sanity checks along the way
+ */
+static int xen_block_parse_request(XenBlockRequest *request)
+{
+    XenBlockDataPlane *dataplane = request->dataplane;
+    size_t len;
+    int i;
+
+    switch (request->req.operation) {
+    case BLKIF_OP_READ:
+        break;
+    case BLKIF_OP_FLUSH_DISKCACHE:
+        request->presync = 1;
+        if (!request->req.nr_segments) {
+            return 0;
+        }
+        /* fall through */
+    case BLKIF_OP_WRITE:
+        break;
+    case BLKIF_OP_DISCARD:
+        return 0;
+    default:
+        error_report("error: unknown operation (%d)", request->req.operation);
+        goto err;
+    };
+
+    if (request->req.operation != BLKIF_OP_READ &&
+        !blk_is_writable(dataplane->blk)) {
+        error_report("error: write req for ro device");
+        goto err;
+    }
+
+    request->start = request->req.sector_number * dataplane->sector_size;
+    for (i = 0; i < request->req.nr_segments; i++) {
+        if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+            error_report("error: nr_segments too big");
+            goto err;
+        }
+        if (request->req.seg[i].first_sect > request->req.seg[i].last_sect) {
+            error_report("error: first > last sector");
+            goto err;
+        }
+        if (request->req.seg[i].last_sect * dataplane->sector_size >=
+            XC_PAGE_SIZE) {
+            error_report("error: page crossing");
+            goto err;
+        }
+
+        len = (request->req.seg[i].last_sect -
+               request->req.seg[i].first_sect + 1) * dataplane->sector_size;
+        request->size += len;
+    }
+    if (request->start + request->size > blk_getlength(dataplane->blk)) {
+        error_report("error: access beyond end of file");
+        goto err;
+    }
+    return 0;
+
+err:
+    request->status = BLKIF_RSP_ERROR;
+    return -1;
+}
+
+static int xen_block_copy_request(XenBlockRequest *request)
+{
+    XenBlockDataPlane *dataplane = request->dataplane;
+    XenDevice *xendev = dataplane->xendev;
+    XenDeviceGrantCopySegment segs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    int i, count;
+    bool to_domain = (request->req.operation == BLKIF_OP_READ);
+    void *virt = request->buf;
+    Error *local_err = NULL;
+
+    if (request->req.nr_segments == 0) {
+        return 0;
+    }
+
+    count = request->req.nr_segments;
+
+    for (i = 0; i < count; i++) {
+        if (to_domain) {
+            segs[i].dest.foreign.ref = request->req.seg[i].gref;
+            segs[i].dest.foreign.offset = request->req.seg[i].first_sect *
+                dataplane->sector_size;
+            segs[i].source.virt = virt;
+        } else {
+            segs[i].source.foreign.ref = request->req.seg[i].gref;
+            segs[i].source.foreign.offset = request->req.seg[i].first_sect *
+                dataplane->sector_size;
+            segs[i].dest.virt = virt;
+        }
+        segs[i].len = (request->req.seg[i].last_sect -
+                       request->req.seg[i].first_sect + 1) *
+            dataplane->sector_size;
+        virt += segs[i].len;
+    }
+
+    xen_device_copy_grant_refs(xendev, to_domain, segs, count, &local_err);
+
+    if (local_err) {
+        error_reportf_err(local_err, "failed to copy data: ");
+
+        request->aio_errors++;
+        return -1;
+    }
+
+    return 0;
+}
+
+static int xen_block_do_aio(XenBlockRequest *request);
+
+static void xen_block_complete_aio(void *opaque, int ret)
+{
+    XenBlockRequest *request = opaque;
+    XenBlockDataPlane *dataplane = request->dataplane;
+
+    aio_context_acquire(dataplane->ctx);
+
+    if (ret != 0) {
+        error_report("%s I/O error",
+                     request->req.operation == BLKIF_OP_READ ?
+                     "read" : "write");
+        request->aio_errors++;
+    }
+
+    request->aio_inflight--;
+    if (request->presync) {
+        request->presync = 0;
+        xen_block_do_aio(request);
+        goto done;
+    }
+    if (request->aio_inflight > 0) {
+        goto done;
+    }
+
+    switch (request->req.operation) {
+    case BLKIF_OP_READ:
+        /* in case of failure request->aio_errors is increased */
+        if (ret == 0) {
+            xen_block_copy_request(request);
+        }
+        break;
+    case BLKIF_OP_WRITE:
+    case BLKIF_OP_FLUSH_DISKCACHE:
+    default:
+        break;
+    }
+
+    request->status = request->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
+
+    switch (request->req.operation) {
+    case BLKIF_OP_WRITE:
+    case BLKIF_OP_FLUSH_DISKCACHE:
+        if (!request->req.nr_segments) {
+            break;
+        }
+        /* fall through */
+    case BLKIF_OP_READ:
+        if (request->status == BLKIF_RSP_OKAY) {
+            block_acct_done(blk_get_stats(dataplane->blk), &request->acct);
+        } else {
+            block_acct_failed(blk_get_stats(dataplane->blk), &request->acct);
+        }
+        break;
+    case BLKIF_OP_DISCARD:
+    default:
+        break;
+    }
+
+    xen_block_complete_request(request);
+
+    if (dataplane->more_work) {
+        qemu_bh_schedule(dataplane->bh);
+    }
+
+done:
+    aio_context_release(dataplane->ctx);
+}
+
+static bool xen_block_split_discard(XenBlockRequest *request,
+                                    blkif_sector_t sector_number,
+                                    uint64_t nr_sectors)
+{
+    XenBlockDataPlane *dataplane = request->dataplane;
+    int64_t byte_offset;
+    int byte_chunk;
+    uint64_t byte_remaining;
+    uint64_t sec_start = sector_number;
+    uint64_t sec_count = nr_sectors;
+
+    /* Wrap around, or overflowing byte limit? */
+    if (sec_start + sec_count < sec_count ||
+        sec_start + sec_count > INT64_MAX / dataplane->sector_size) {
+        return false;
+    }
+
+    byte_offset = sec_start * dataplane->sector_size;
+    byte_remaining = sec_count * dataplane->sector_size;
+
+    do {
+        byte_chunk = byte_remaining > BDRV_REQUEST_MAX_BYTES ?
+            BDRV_REQUEST_MAX_BYTES : byte_remaining;
+        request->aio_inflight++;
+        blk_aio_pdiscard(dataplane->blk, byte_offset, byte_chunk,
+                         xen_block_complete_aio, request);
+        byte_remaining -= byte_chunk;
+        byte_offset += byte_chunk;
+    } while (byte_remaining > 0);
+
+    return true;
+}
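xen_block_split_discard() above first rejects sector ranges whose byte
conversion would wrap or exceed INT64_MAX, then issues the discard in chunks
of at most BDRV_REQUEST_MAX_BYTES. A standalone model of the same chunking
logic; the 512-byte sector size and the 1 MiB MAX_CHUNK (standing in for
BDRV_REQUEST_MAX_BYTES) are assumptions for the demo:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define SECTOR_SIZE 512u
#define MAX_CHUNK   (1u << 20)   /* illustrative cap per discard request */

static bool split_discard(uint64_t sector_number, uint64_t nr_sectors)
{
    /* Reject wrap-around and ranges that overflow a signed byte offset. */
    if (sector_number + nr_sectors < nr_sectors ||
        sector_number + nr_sectors > INT64_MAX / SECTOR_SIZE) {
        return false;
    }

    int64_t byte_offset = sector_number * SECTOR_SIZE;
    uint64_t byte_remaining = nr_sectors * SECTOR_SIZE;

    do {
        uint64_t chunk = byte_remaining > MAX_CHUNK ? MAX_CHUNK
                                                    : byte_remaining;
        /* Stand-in for blk_aio_pdiscard() on each chunk. */
        printf("discard offset=%lld len=%llu\n",
               (long long)byte_offset, (unsigned long long)chunk);
        byte_offset += chunk;
        byte_remaining -= chunk;
    } while (byte_remaining > 0);

    return true;
}

int main(void)
{
    split_discard(0, 5000);   /* ~2.4 MiB -> three chunks */
    return 0;
}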
+
+static int xen_block_do_aio(XenBlockRequest *request)
+{
+    XenBlockDataPlane *dataplane = request->dataplane;
+
+    if (request->req.nr_segments &&
+        (request->req.operation == BLKIF_OP_WRITE ||
+         request->req.operation == BLKIF_OP_FLUSH_DISKCACHE) &&
+        xen_block_copy_request(request)) {
+        goto err;
+    }
+
+    request->aio_inflight++;
+    if (request->presync) {
+        blk_aio_flush(request->dataplane->blk, xen_block_complete_aio,
+                      request);
+        return 0;
+    }
+
+    switch (request->req.operation) {
+    case BLKIF_OP_READ:
+        qemu_iovec_add(&request->v, request->buf, request->size);
+        block_acct_start(blk_get_stats(dataplane->blk), &request->acct,
+                         request->v.size, BLOCK_ACCT_READ);
+        request->aio_inflight++;
+        blk_aio_preadv(dataplane->blk, request->start, &request->v, 0,
+                       xen_block_complete_aio, request);
+        break;
+    case BLKIF_OP_WRITE:
+    case BLKIF_OP_FLUSH_DISKCACHE:
+        if (!request->req.nr_segments) {
+            break;
+        }
+
+        qemu_iovec_add(&request->v, request->buf, request->size);
+        block_acct_start(blk_get_stats(dataplane->blk), &request->acct,
+                         request->v.size,
+                         request->req.operation == BLKIF_OP_WRITE ?
+                         BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
+        request->aio_inflight++;
+        blk_aio_pwritev(dataplane->blk, request->start, &request->v, 0,
+                        xen_block_complete_aio, request);
+        break;
+    case BLKIF_OP_DISCARD:
+    {
+        struct blkif_request_discard *req = (void *)&request->req;
+
+        if (!xen_block_split_discard(request, req->sector_number,
+                                     req->nr_sectors)) {
+            goto err;
+        }
+        break;
+    }
+    default:
+        /* unknown operation (shouldn't happen -- parse catches this) */
+        goto err;
+    }
+
+    xen_block_complete_aio(request, 0);
+
+    return 0;
+
+err:
+    request->status = BLKIF_RSP_ERROR;
+    xen_block_complete_request(request);
+    return -1;
+}
+
+static int xen_block_send_response(XenBlockRequest *request)
+{
+    XenBlockDataPlane *dataplane = request->dataplane;
+    int send_notify = 0;
+    int have_requests = 0;
+    blkif_response_t *resp;
+
+    /* Place on the response ring for the relevant domain. */
+    switch (dataplane->protocol) {
+    case BLKIF_PROTOCOL_NATIVE:
+        resp = (blkif_response_t *)RING_GET_RESPONSE(
+            &dataplane->rings.native,
+            dataplane->rings.native.rsp_prod_pvt);
+        break;
+    case BLKIF_PROTOCOL_X86_32:
+        resp = (blkif_response_t *)RING_GET_RESPONSE(
+            &dataplane->rings.x86_32_part,
+            dataplane->rings.x86_32_part.rsp_prod_pvt);
+        break;
+    case BLKIF_PROTOCOL_X86_64:
+        resp = (blkif_response_t *)RING_GET_RESPONSE(
+            &dataplane->rings.x86_64_part,
+            dataplane->rings.x86_64_part.rsp_prod_pvt);
+        break;
+    default:
+        return 0;
+    }
+
+    resp->id = request->req.id;
+    resp->operation = request->req.operation;
+    resp->status = request->status;
+
+    dataplane->rings.common.rsp_prod_pvt++;
+
+    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&dataplane->rings.common,
+                                         send_notify);
+    if (dataplane->rings.common.rsp_prod_pvt ==
+        dataplane->rings.common.req_cons) {
+        /*
+         * Tail check for pending requests. Allows frontend to avoid
+         * notifications if requests are already in flight (lower
+         * overheads and promotes batching).
+         */
+        RING_FINAL_CHECK_FOR_REQUESTS(&dataplane->rings.common,
+                                      have_requests);
+    } else if (RING_HAS_UNCONSUMED_REQUESTS(&dataplane->rings.common)) {
+        have_requests = 1;
+    }
+
+    if (have_requests) {
+        dataplane->more_work++;
+    }
+    return send_notify;
+}
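The tail check in xen_block_send_response() closes a notification race: the
backend only re-arms frontend notifications once the ring looks empty, and
must then look again, because a request published in the meantime would
otherwise never trigger an event. A simplified single-threaded model of what
RING_FINAL_CHECK_FOR_REQUESTS() does; the toy_ring struct is a stand-in for
the shared blkif ring, and the real macro in Xen's ring.h also issues a
memory barrier between publishing req_event and re-reading req_prod:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Toy stand-in for the shared blkif ring indices. */
struct toy_ring {
    uint32_t req_prod;   /* written by the frontend */
    uint32_t req_cons;   /* written by the backend (us) */
    uint32_t req_event;  /* frontend notifies when req_prod passes this */
};

/* Report whether work remains, re-arming frontend notifications only
 * when the ring appears empty, then re-checking for racing requests. */
static bool final_check_for_requests(struct toy_ring *r)
{
    if (r->req_prod != r->req_cons) {
        return true;                 /* more requests already queued */
    }
    r->req_event = r->req_cons + 1;  /* notify me about the next request */
    /* (memory barrier here in the real macro) */
    return r->req_prod != r->req_cons;
}

int main(void)
{
    struct toy_ring r = { .req_prod = 5, .req_cons = 5, .req_event = 0 };

    /* Ring drained: returns 0 (no work) and arms req_event for index 6. */
    printf("work left: %d, req_event now %u\n",
           final_check_for_requests(&r), r.req_event);
    return 0;
}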
+
+static int xen_block_get_request(XenBlockDataPlane *dataplane,
+                                 XenBlockRequest *request, RING_IDX rc)
+{
+    switch (dataplane->protocol) {
+    case BLKIF_PROTOCOL_NATIVE: {
+        blkif_request_t *req =
+            RING_GET_REQUEST(&dataplane->rings.native, rc);
+
+        memcpy(&request->req, req, sizeof(request->req));
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_32: {
+        blkif_x86_32_request_t *req =
+            RING_GET_REQUEST(&dataplane->rings.x86_32_part, rc);
+
+        blkif_get_x86_32_req(&request->req, req);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_64: {
+        blkif_x86_64_request_t *req =
+            RING_GET_REQUEST(&dataplane->rings.x86_64_part, rc);
+
+        blkif_get_x86_64_req(&request->req, req);
+        break;
+    }
+    }
+    /* Prevent the compiler from accessing the on-ring fields instead. */
+    barrier();
+    return 0;
+}
+
+/*
+ * Threshold of in-flight requests above which we will start using
+ * blk_io_plug()/blk_io_unplug() to batch requests.
+ */
+#define IO_PLUG_THRESHOLD 1
+
+static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
+{
+    RING_IDX rc, rp;
+    XenBlockRequest *request;
+    int inflight_atstart = dataplane->requests_inflight;
+    int batched = 0;
+    bool done_something = false;
+
+    dataplane->more_work = 0;
+
+    rc = dataplane->rings.common.req_cons;
+    rp = dataplane->rings.common.sring->req_prod;
+    xen_rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+    /*
+     * If there were more than IO_PLUG_THRESHOLD requests in flight
+     * when we got here, this is an indication that the bottleneck
+     * is below us, so it's worth beginning to batch up I/O requests
+     * rather than submitting them immediately. The maximum number
+     * of requests we're willing to batch is the number already in
+     * flight, so it can grow up to max_requests when the bottleneck
+     * is below us.
+     */
+    if (inflight_atstart > IO_PLUG_THRESHOLD) {
+        blk_io_plug(dataplane->blk);
+    }
+    while (rc != rp) {
+        /* pull request from ring */
+        if (RING_REQUEST_CONS_OVERFLOW(&dataplane->rings.common, rc)) {
+            break;
+        }
+        request = xen_block_start_request(dataplane);
+        if (request == NULL) {
+            dataplane->more_work++;
+            break;
+        }
+        xen_block_get_request(dataplane, request, rc);
+        dataplane->rings.common.req_cons = ++rc;
+        done_something = true;
+
+        /* parse them */
+        if (xen_block_parse_request(request) != 0) {
+            switch (request->req.operation) {
+            case BLKIF_OP_READ:
+                block_acct_invalid(blk_get_stats(dataplane->blk),
+                                   BLOCK_ACCT_READ);
+                break;
+            case BLKIF_OP_WRITE:
+                block_acct_invalid(blk_get_stats(dataplane->blk),
+                                   BLOCK_ACCT_WRITE);
+                break;
+            case BLKIF_OP_FLUSH_DISKCACHE:
+                block_acct_invalid(blk_get_stats(dataplane->blk),
+                                   BLOCK_ACCT_FLUSH);
+            default:
+                break;
+            };
+
+            xen_block_complete_request(request);
+            continue;
+        }
+
+        if (inflight_atstart > IO_PLUG_THRESHOLD &&
+            batched >= inflight_atstart) {
+            blk_io_unplug(dataplane->blk);
+        }
+        xen_block_do_aio(request);
+        if (inflight_atstart > IO_PLUG_THRESHOLD) {
+            if (batched >= inflight_atstart) {
+                blk_io_plug(dataplane->blk);
+                batched = 0;
+            } else {
+                batched++;
+            }
+        }
+    }
+    if (inflight_atstart > IO_PLUG_THRESHOLD) {
+        blk_io_unplug(dataplane->blk);
+    }
+
+    return done_something;
+}
+
+static void xen_block_dataplane_bh(void *opaque)
+{
+    XenBlockDataPlane *dataplane = opaque;
+
+    aio_context_acquire(dataplane->ctx);
+    xen_block_handle_requests(dataplane);
+    aio_context_release(dataplane->ctx);
+}
+
+static bool xen_block_dataplane_event(void *opaque)
+{
+    XenBlockDataPlane *dataplane = opaque;
+
+    return xen_block_handle_requests(dataplane);
+}
+
+XenBlockDataPlane *xen_block_dataplane_create(XenDevice *xendev,
+                                              BlockBackend *blk,
+                                              unsigned int sector_size,
+                                              IOThread *iothread)
+{
+    XenBlockDataPlane *dataplane = g_new0(XenBlockDataPlane, 1);
+
+    dataplane->xendev = xendev;
+    dataplane->blk = blk;
+    dataplane->sector_size = sector_size;
+
+    QLIST_INIT(&dataplane->inflight);
+    QLIST_INIT(&dataplane->freelist);
+
+    if (iothread) {
+        dataplane->iothread = iothread;
+        object_ref(OBJECT(dataplane->iothread));
+        dataplane->ctx = iothread_get_aio_context(dataplane->iothread);
+    } else {
+        dataplane->ctx = qemu_get_aio_context();
+    }
+    dataplane->bh = aio_bh_new(dataplane->ctx, xen_block_dataplane_bh,
+                               dataplane);
+
+    return dataplane;
+}
+
+void xen_block_dataplane_destroy(XenBlockDataPlane *dataplane)
+{
+    XenBlockRequest *request;
+
+    if (!dataplane) {
+        return;
+    }
+
+    while (!QLIST_EMPTY(&dataplane->freelist)) {
+        request = QLIST_FIRST(&dataplane->freelist);
+        QLIST_REMOVE(request, list);
+        qemu_iovec_destroy(&request->v);
+        qemu_vfree(request->buf);
+        g_free(request);
+    }
+
+    qemu_bh_delete(dataplane->bh);
+    if (dataplane->iothread) {
+        object_unref(OBJECT(dataplane->iothread));
+    }
+
+    g_free(dataplane);
+}
+
+void xen_block_dataplane_stop(XenBlockDataPlane *dataplane)
+{
+    XenDevice *xendev;
+
+    if (!dataplane) {
+        return;
+    }
+
+    xendev = dataplane->xendev;
+
+    aio_context_acquire(dataplane->ctx);
+    if (dataplane->event_channel) {
+        /* Only reason for failure is a NULL channel */
+        xen_device_set_event_channel_context(xendev, dataplane->event_channel,
+                                             qemu_get_aio_context(),
+                                             &error_abort);
+    }
+    /* Xen doesn't have multiple users for nodes, so this can't fail */
+    blk_set_aio_context(dataplane->blk, qemu_get_aio_context(), &error_abort);
+    aio_context_release(dataplane->ctx);
+
+    /*
+     * Now that the context has been moved onto the main thread, cancel
+     * further processing.
+     */
+    qemu_bh_cancel(dataplane->bh);
+
+    if (dataplane->event_channel) {
+        Error *local_err = NULL;
+
+        xen_device_unbind_event_channel(xendev, dataplane->event_channel,
+                                        &local_err);
+        dataplane->event_channel = NULL;
+
+        if (local_err) {
+            error_report_err(local_err);
+        }
+    }
+
+    if (dataplane->sring) {
+        Error *local_err = NULL;
+
+        xen_device_unmap_grant_refs(xendev, dataplane->sring,
+                                    dataplane->nr_ring_ref, &local_err);
+        dataplane->sring = NULL;
+
+        if (local_err) {
+            error_report_err(local_err);
+        }
+    }
+
+    g_free(dataplane->ring_ref);
+    dataplane->ring_ref = NULL;
+}
+
+void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
+                               const unsigned int ring_ref[],
+                               unsigned int nr_ring_ref,
+                               unsigned int event_channel,
+                               unsigned int protocol,
+                               Error **errp)
+{
+    ERRP_GUARD();
+    XenDevice *xendev = dataplane->xendev;
+    AioContext *old_context;
+    unsigned int ring_size;
+    unsigned int i;
+
+    dataplane->nr_ring_ref = nr_ring_ref;
+    dataplane->ring_ref = g_new(unsigned int, nr_ring_ref);
+
+    for (i = 0; i < nr_ring_ref; i++) {
+        dataplane->ring_ref[i] = ring_ref[i];
+    }
+
+    dataplane->protocol = protocol;
+
+    ring_size = XC_PAGE_SIZE * dataplane->nr_ring_ref;
+    switch (dataplane->protocol) {
+    case BLKIF_PROTOCOL_NATIVE:
+    {
+        dataplane->max_requests = __CONST_RING_SIZE(blkif, ring_size);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_32:
+    {
+        dataplane->max_requests = __CONST_RING_SIZE(blkif_x86_32, ring_size);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_64:
+    {
+        dataplane->max_requests = __CONST_RING_SIZE(blkif_x86_64, ring_size);
+        break;
+    }
+    default:
+        error_setg(errp, "unknown protocol %u", dataplane->protocol);
+        return;
+    }
+
+    xen_device_set_max_grant_refs(xendev, dataplane->nr_ring_ref,
+                                  errp);
+    if (*errp) {
+        goto stop;
+    }
+
+    dataplane->sring = xen_device_map_grant_refs(xendev,
+                                                 dataplane->ring_ref,
+                                                 dataplane->nr_ring_ref,
+                                                 PROT_READ | PROT_WRITE,
+                                                 errp);
+    if (*errp) {
+        goto stop;
+    }
+
+    switch (dataplane->protocol) {
+    case BLKIF_PROTOCOL_NATIVE:
+    {
+        blkif_sring_t *sring_native = dataplane->sring;
+
+        BACK_RING_INIT(&dataplane->rings.native, sring_native, ring_size);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_32:
+    {
+        blkif_x86_32_sring_t *sring_x86_32 = dataplane->sring;
+
+        BACK_RING_INIT(&dataplane->rings.x86_32_part, sring_x86_32,
+                       ring_size);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_64:
+    {
+        blkif_x86_64_sring_t *sring_x86_64 = dataplane->sring;
+
+        BACK_RING_INIT(&dataplane->rings.x86_64_part, sring_x86_64,
+                       ring_size);
+        break;
+    }
+    }
+
+    dataplane->event_channel =
+        xen_device_bind_event_channel(xendev, event_channel,
+                                      xen_block_dataplane_event, dataplane,
+                                      errp);
+    if (*errp) {
+        goto stop;
+    }
+
+    old_context = blk_get_aio_context(dataplane->blk);
+    aio_context_acquire(old_context);
+    /* If other users keep the BlockBackend in the iothread, that's ok */
+    blk_set_aio_context(dataplane->blk, dataplane->ctx, NULL);
+    aio_context_release(old_context);
+
+    /* Only reason for failure is a NULL channel */
+    aio_context_acquire(dataplane->ctx);
+    xen_device_set_event_channel_context(xendev, dataplane->event_channel,
+                                         dataplane->ctx, &error_abort);
+    aio_context_release(dataplane->ctx);
+
+    return;
+
+stop:
+    xen_block_dataplane_stop(dataplane);
+}
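The IO_PLUG_THRESHOLD policy in xen_block_handle_requests() above is easiest
to see in isolation: submissions are batched only when more than
IO_PLUG_THRESHOLD requests were already in flight at entry, and a batch is
flushed once it reaches that in-flight count. A runnable sketch of the same
control flow, with plug()/unplug()/submit() as hypothetical stand-ins for
blk_io_plug()/blk_io_unplug()/blk_aio_*():

#include <stdio.h>

#define IO_PLUG_THRESHOLD 1

static void plug(void)    { puts("plug: start queueing submissions"); }
static void unplug(void)  { puts("unplug: flush queued submissions"); }
static void submit(int n) { printf("submit request %d\n", n); }

static void handle_requests(int inflight_atstart, int nreq)
{
    int batched = 0;

    if (inflight_atstart > IO_PLUG_THRESHOLD) {
        plug();
    }
    for (int n = 0; n < nreq; n++) {
        /* Flush a full batch before this submission... */
        if (inflight_atstart > IO_PLUG_THRESHOLD &&
            batched >= inflight_atstart) {
            unplug();
        }
        submit(n);
        /* ...and start a new batch after it. */
        if (inflight_atstart > IO_PLUG_THRESHOLD) {
            if (batched >= inflight_atstart) {
                plug();
                batched = 0;
            } else {
                batched++;
            }
        }
    }
    if (inflight_atstart > IO_PLUG_THRESHOLD) {
        unplug();
    }
}

int main(void)
{
    handle_requests(4, 10);   /* 4 in flight at entry, 10 new requests */
    return 0;
}

Running it shows a flush after every four submissions: the batch size is
capped by the in-flight count observed at entry, exactly as the comment in
the source describes.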
diff --git a/hw/block/dataplane/xen-block.h b/hw/block/dataplane/xen-block.h
new file mode 100644
index 000000000..76dcd51c3
--- /dev/null
+++ b/hw/block/dataplane/xen-block.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018 Citrix Systems Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef HW_BLOCK_DATAPLANE_XEN_BLOCK_H
+#define HW_BLOCK_DATAPLANE_XEN_BLOCK_H
+
+#include "hw/block/block.h"
+#include "hw/xen/xen-bus.h"
+#include "sysemu/iothread.h"
+
+typedef struct XenBlockDataPlane XenBlockDataPlane;
+
+XenBlockDataPlane *xen_block_dataplane_create(XenDevice *xendev,
+                                              BlockBackend *blk,
+                                              unsigned int sector_size,
+                                              IOThread *iothread);
+void xen_block_dataplane_destroy(XenBlockDataPlane *dataplane);
+void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
+                               const unsigned int ring_ref[],
+                               unsigned int nr_ring_ref,
+                               unsigned int event_channel,
+                               unsigned int protocol,
+                               Error **errp);
+void xen_block_dataplane_stop(XenBlockDataPlane *dataplane);
+
+#endif /* HW_BLOCK_DATAPLANE_XEN_BLOCK_H */
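For orientation, this is roughly how a caller might drive the interface
declared in xen-block.h above. It is an in-tree sketch, not a standalone
program: the 512-byte sector size and the abbreviated error handling are
assumptions, and in the real device the ring_ref, event_channel and protocol
values come from the frontend via xenstore:

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "xen-block.h"

static void example_lifecycle(XenDevice *xendev, BlockBackend *blk,
                              IOThread *iothread,
                              const unsigned int ring_ref[],
                              unsigned int nr_ring_ref,
                              unsigned int event_channel,
                              unsigned int protocol)
{
    Error *local_err = NULL;
    XenBlockDataPlane *dp;

    dp = xen_block_dataplane_create(xendev, blk, 512, iothread);

    xen_block_dataplane_start(dp, ring_ref, nr_ring_ref, event_channel,
                              protocol, &local_err);
    if (local_err) {
        /* start already rolled itself back via its stop path */
        error_report_err(local_err);
    }

    /* ... device runs; requests are served from the IOThread ... */

    xen_block_dataplane_stop(dp);
    xen_block_dataplane_destroy(dp);
}

Stop and destroy are safe to call even after a failed start, since start
unwinds through xen_block_dataplane_stop() and leaves the freed pointers
NULLed.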