aboutsummaryrefslogtreecommitdiffstats
path: root/hw/block/dataplane
diff options
context:
space:
mode:
authorTimos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>2023-10-10 11:40:56 +0000
committerTimos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>2023-10-10 11:40:56 +0000
commite02cda008591317b1625707ff8e115a4841aa889 (patch)
treeaee302e3cf8b59ec2d32ec481be3d1afddfc8968 /hw/block/dataplane
parentcc668e6b7e0ffd8c9d130513d12053cf5eda1d3b (diff)
Introduce Virtio-loopback epsilon release:
The Epsilon release introduces a new compatibility layer which makes the virtio-loopback design work with QEMU and rust-vmm vhost-user backends without requiring any changes. Signed-off-by: Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com> Change-Id: I52e57563e08a7d0bdc002f8e928ee61ba0c53dd9
Diffstat (limited to 'hw/block/dataplane')
-rw-r--r--hw/block/dataplane/meson.build2
-rw-r--r--hw/block/dataplane/trace-events5
-rw-r--r--hw/block/dataplane/trace.h1
-rw-r--r--hw/block/dataplane/virtio-blk.c369
-rw-r--r--hw/block/dataplane/virtio-blk.h31
-rw-r--r--hw/block/dataplane/xen-block.c828
-rw-r--r--hw/block/dataplane/xen-block.h30
7 files changed, 1266 insertions, 0 deletions
diff --git a/hw/block/dataplane/meson.build b/hw/block/dataplane/meson.build
new file mode 100644
index 000000000..12c6a264f
--- /dev/null
+++ b/hw/block/dataplane/meson.build
@@ -0,0 +1,2 @@
+specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c'))
+specific_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c'))
diff --git a/hw/block/dataplane/trace-events b/hw/block/dataplane/trace-events
new file mode 100644
index 000000000..38fc3e750
--- /dev/null
+++ b/hw/block/dataplane/trace-events
@@ -0,0 +1,5 @@
+# See docs/devel/tracing.rst for syntax documentation.
+
+# virtio-blk.c
+virtio_blk_data_plane_start(void *s) "dataplane %p"
+virtio_blk_data_plane_stop(void *s) "dataplane %p"
diff --git a/hw/block/dataplane/trace.h b/hw/block/dataplane/trace.h
new file mode 100644
index 000000000..240cc5983
--- /dev/null
+++ b/hw/block/dataplane/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-hw_block_dataplane.h"
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
new file mode 100644
index 000000000..ee5a5352d
--- /dev/null
+++ b/hw/block/dataplane/virtio-blk.c
@@ -0,0 +1,369 @@
+/*
+ * Dedicated thread for virtio-blk I/O processing
+ *
+ * Copyright 2012 IBM, Corp.
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "trace.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+#include "qemu/thread.h"
+#include "qemu/error-report.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/virtio-blk.h"
+#include "virtio-blk.h"
+#include "block/aio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "qom/object_interfaces.h"
+
+/* Per-device state for virtio-blk dataplane I/O processing. */
+struct VirtIOBlockDataPlane {
+ /* True while virtio_blk_data_plane_start() is in progress (re-entry guard) */
+ bool starting;
+ /* True while virtio_blk_data_plane_stop() is in progress (re-entry guard) */
+ bool stopping;
+
+ VirtIOBlkConf *conf;
+ VirtIODevice *vdev;
+ QEMUBH *bh; /* bh for guest notification */
+ /* Bitmap, one bit per virtqueue index, of queues awaiting a notification */
+ unsigned long *batch_notify_vqs;
+ /* When true, notifications are deferred to @bh instead of sent directly */
+ bool batch_notifications;
+
+ /* Note that these EventNotifiers are assigned by value. This is
+ * fine as long as you do not call event_notifier_cleanup on them
+ * (because you don't own the file descriptor or handle; you just
+ * use it).
+ */
+ /* NOTE(review): no EventNotifier member is visible in this struct;
+ * the note above may be left over from an earlier layout - confirm.
+ */
+ /* Referenced in create when conf->iothread is set, NULL otherwise */
+ IOThread *iothread;
+ /* AioContext of @iothread, or the main loop context when it is NULL */
+ AioContext *ctx;
+};
+
+/*
+ * Tell the guest that @vq has completed work.
+ *
+ * Without batching the irqfd notification is raised immediately. With
+ * batching the queue index is recorded in the pending bitmap and the
+ * notification bottom half is scheduled to deliver it later.
+ */
+void virtio_blk_data_plane_notify(VirtIOBlockDataPlane *s, VirtQueue *vq)
+{
+    if (!s->batch_notifications) {
+        virtio_notify_irqfd(s->vdev, vq);
+        return;
+    }
+
+    set_bit(virtio_get_queue_index(vq), s->batch_notify_vqs);
+    qemu_bh_schedule(s->bh);
+}
+
+/*
+ * Bottom half that delivers the batched guest notifications.
+ *
+ * @opaque is the VirtIOBlockDataPlane. The pending bitmap is copied to a
+ * local snapshot and cleared, then virtio_notify_irqfd() is called once for
+ * every queue whose bit was set in the snapshot.
+ */
+static void notify_guest_bh(void *opaque)
+{
+ VirtIOBlockDataPlane *s = opaque;
+ unsigned nvqs = s->conf->num_queues;
+ /* Local snapshot sized for nvqs bits (variable-length array) */
+ unsigned long bitmap[BITS_TO_LONGS(nvqs)];
+ unsigned j;
+
+ /* Take the snapshot, then reset the shared bitmap for new requests */
+ memcpy(bitmap, s->batch_notify_vqs, sizeof(bitmap));
+ memset(s->batch_notify_vqs, 0, sizeof(bitmap));
+
+ /* Walk the snapshot one unsigned long word at a time */
+ for (j = 0; j < nvqs; j += BITS_PER_LONG) {
+ unsigned long bits = bitmap[j / BITS_PER_LONG];
+
+ while (bits != 0) {
+ /* Queue index = word base + position of the lowest set bit */
+ unsigned i = j + ctzl(bits);
+ VirtQueue *vq = virtio_get_queue(s->vdev, i);
+
+ virtio_notify_irqfd(s->vdev, vq);
+
+ bits &= bits - 1; /* clear right-most bit */
+ }
+ }
+}
+
+/*
+ * Allocate the dataplane state for @vdev.
+ *
+ * On success the new object is stored in *@dataplane and true is returned.
+ * On failure *@dataplane is left NULL and false is returned. @errp is set
+ * for the failures detected while an iothread is configured; the final
+ * ioeventfd check returns false without setting @errp.
+ *
+ * Context: QEMU global mutex held
+ */
+bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
+ VirtIOBlockDataPlane **dataplane,
+ Error **errp)
+{
+ VirtIOBlockDataPlane *s;
+ BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+ *dataplane = NULL;
+
+ /* An explicitly requested iothread makes missing prerequisites an error */
+ if (conf->iothread) {
+ if (!k->set_guest_notifiers || !k->ioeventfd_assign) {
+ error_setg(errp,
+ "device is incompatible with iothread "
+ "(transport does not support notifiers)");
+ return false;
+ }
+ if (!virtio_device_ioeventfd_enabled(vdev)) {
+ error_setg(errp, "ioeventfd is required for iothread");
+ return false;
+ }
+
+ /* If dataplane is (re-)enabled while the guest is running there could
+ * be block jobs that can conflict.
+ */
+ if (blk_op_is_blocked(conf->conf.blk, BLOCK_OP_TYPE_DATAPLANE, errp)) {
+ error_prepend(errp, "cannot start virtio-blk dataplane: ");
+ return false;
+ }
+ }
+ /* Don't try if transport does not support notifiers. */
+ if (!virtio_device_ioeventfd_enabled(vdev)) {
+ return false;
+ }
+
+ s = g_new0(VirtIOBlockDataPlane, 1);
+ s->vdev = vdev;
+ s->conf = conf;
+
+ /* Use the iothread's AioContext if configured, else the main loop's */
+ if (conf->iothread) {
+ s->iothread = conf->iothread;
+ /* Reference dropped in virtio_blk_data_plane_destroy() */
+ object_ref(OBJECT(s->iothread));
+ s->ctx = iothread_get_aio_context(s->iothread);
+ } else {
+ s->ctx = qemu_get_aio_context();
+ }
+ s->bh = aio_bh_new(s->ctx, notify_guest_bh, s);
+ s->batch_notify_vqs = bitmap_new(conf->num_queues);
+
+ *dataplane = s;
+
+ return true;
+}
+
+/*
+ * Release a dataplane object created by virtio_blk_data_plane_create().
+ *
+ * A NULL @s is accepted and ignored. The dataplane must already be stopped.
+ * The notification bitmap, the bottom half and the iothread reference (if
+ * one was taken) are released before the object itself is freed.
+ *
+ * Context: QEMU global mutex held
+ */
+void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
+{
+    VirtIOBlock *blkdev;
+
+    if (s == NULL) {
+        return;
+    }
+
+    blkdev = VIRTIO_BLK(s->vdev);
+    assert(!blkdev->dataplane_started);
+
+    g_free(s->batch_notify_vqs);
+    qemu_bh_delete(s->bh);
+    if (s->iothread != NULL) {
+        object_unref(OBJECT(s->iothread));
+    }
+    g_free(s);
+}
+
+/*
+ * Host notifier handler installed while the dataplane is running: forwards
+ * the kicked queue to virtio_blk_handle_vq() and returns its result.
+ */
+static bool virtio_blk_data_plane_handle_output(VirtIODevice *vdev,
+                                                VirtQueue *vq)
+{
+    VirtIOBlock *vblk = (VirtIOBlock *)vdev;
+
+    assert(vblk->dataplane);
+    assert(vblk->dataplane_started);
+
+    return virtio_blk_handle_vq(vblk, vq);
+}
+
+/*
+ * Start dataplane processing for @vdev.
+ *
+ * Sets up the guest notifiers and the per-queue host notifiers, moves the
+ * BlockBackend to the dataplane AioContext, processes already queued
+ * requests, kicks every queue once and finally installs the host notifier
+ * handlers.
+ *
+ * Returns 0 on success, or when the dataplane is already started or is
+ * currently starting. On failure everything set up so far is undone, queued
+ * requests are processed, the device is marked dataplane_disabled (with
+ * dataplane_started left true) and -ENOSYS is returned.
+ *
+ * Context: QEMU global mutex held
+ */
+int virtio_blk_data_plane_start(VirtIODevice *vdev)
+{
+    VirtIOBlock *vblk = VIRTIO_BLK(vdev);
+    VirtIOBlockDataPlane *s = vblk->dataplane;
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vblk)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    AioContext *old_context;
+    unsigned i;
+    unsigned nvqs = s->conf->num_queues;
+    Error *local_err = NULL;
+    int r;
+
+    if (vblk->dataplane_started || s->starting) {
+        return 0;
+    }
+
+    s->starting = true;
+
+    /* Batch notifications only when the guest did not negotiate EVENT_IDX */
+    s->batch_notifications =
+        !virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
+
+    /* Set up guest notifier (irq) */
+    r = k->set_guest_notifiers(qbus->parent, nvqs, true);
+    if (r != 0) {
+        error_report("virtio-blk failed to set guest notifier (%d), "
+                     "ensure -accel kvm is set.", r);
+        goto fail_guest_notifiers;
+    }
+
+    /*
+     * Batch all the host notifiers in a single transaction to avoid
+     * quadratic time complexity in address_space_update_ioeventfds().
+     */
+    memory_region_transaction_begin();
+
+    /* Set up virtqueue notify */
+    for (i = 0; i < nvqs; i++) {
+        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, true);
+        if (r != 0) {
+            /* Number of queues whose notifier was successfully enabled */
+            unsigned j = i;
+
+            /* Same reporting channel as the other failures in here */
+            error_report("virtio-blk failed to set host notifier (%d)", r);
+            while (i--) {
+                virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
+            }
+
+            /*
+             * The transaction expects the ioeventfds to be open when it
+             * commits. Do it now, before the cleanup loop.
+             */
+            memory_region_transaction_commit();
+
+            while (j--) {
+                virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), j);
+            }
+            goto fail_host_notifiers;
+        }
+    }
+
+    memory_region_transaction_commit();
+
+    s->starting = false;
+    vblk->dataplane_started = true;
+    trace_virtio_blk_data_plane_start(s);
+
+    /* Move the BlockBackend into the dataplane AioContext */
+    old_context = blk_get_aio_context(s->conf->conf.blk);
+    aio_context_acquire(old_context);
+    r = blk_set_aio_context(s->conf->conf.blk, s->ctx, &local_err);
+    aio_context_release(old_context);
+    if (r < 0) {
+        error_report_err(local_err);
+        goto fail_aio_context;
+    }
+
+    /* Process queued requests before the ones in vring */
+    virtio_blk_process_queued_requests(vblk, false);
+
+    /* Kick right away to begin processing requests already in vring */
+    for (i = 0; i < nvqs; i++) {
+        VirtQueue *vq = virtio_get_queue(s->vdev, i);
+
+        event_notifier_set(virtio_queue_get_host_notifier(vq));
+    }
+
+    /* Get this show started by hooking up our callbacks */
+    aio_context_acquire(s->ctx);
+    for (i = 0; i < nvqs; i++) {
+        VirtQueue *vq = virtio_get_queue(s->vdev, i);
+
+        virtio_queue_aio_set_host_notifier_handler(vq, s->ctx,
+                virtio_blk_data_plane_handle_output);
+    }
+    aio_context_release(s->ctx);
+    return 0;
+
+  fail_aio_context:
+    /* Undo every host notifier, again batched in one transaction */
+    memory_region_transaction_begin();
+
+    for (i = 0; i < nvqs; i++) {
+        virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
+    }
+
+    memory_region_transaction_commit();
+
+    for (i = 0; i < nvqs; i++) {
+        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i);
+    }
+  fail_host_notifiers:
+    k->set_guest_notifiers(qbus->parent, nvqs, false);
+  fail_guest_notifiers:
+    /*
+     * If we failed to set up the guest notifiers queued requests will be
+     * processed on the main context.
+     */
+    virtio_blk_process_queued_requests(vblk, false);
+    vblk->dataplane_disabled = true;
+    s->starting = false;
+    vblk->dataplane_started = true;
+    return -ENOSYS;
+}
+
+/*
+ * Detach the host notifier handler from every virtqueue so that no new
+ * guest requests are picked up.
+ *
+ * Context: BH in IOThread
+ */
+static void virtio_blk_data_plane_stop_bh(void *opaque)
+{
+    VirtIOBlockDataPlane *s = opaque;
+    unsigned idx = 0;
+
+    while (idx < s->conf->num_queues) {
+        VirtQueue *queue = virtio_get_queue(s->vdev, idx);
+
+        virtio_queue_aio_set_host_notifier_handler(queue, s->ctx, NULL);
+        idx++;
+    }
+}
+
+/*
+ * Stop dataplane processing for @vdev.
+ *
+ * Does nothing if the dataplane is not started or is already stopping. If
+ * the previous start failed (dataplane_disabled), only the disabled/started
+ * flags are cleared. Otherwise the host notifier handlers are removed, the
+ * BlockBackend is moved back to the main loop context, host notifiers are
+ * torn down, any pending batched notification is delivered and the guest
+ * notifiers are released.
+ *
+ * Context: QEMU global mutex held
+ */
+void virtio_blk_data_plane_stop(VirtIODevice *vdev)
+{
+ VirtIOBlock *vblk = VIRTIO_BLK(vdev);
+ VirtIOBlockDataPlane *s = vblk->dataplane;
+ BusState *qbus = qdev_get_parent_bus(DEVICE(vblk));
+ VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ unsigned i;
+ unsigned nvqs = s->conf->num_queues;
+
+ if (!vblk->dataplane_started || s->stopping) {
+ return;
+ }
+
+ /* Better luck next time. */
+ if (vblk->dataplane_disabled) {
+ vblk->dataplane_disabled = false;
+ vblk->dataplane_started = false;
+ return;
+ }
+ s->stopping = true;
+ trace_virtio_blk_data_plane_stop(s);
+
+ aio_context_acquire(s->ctx);
+ /* Run the handler removal as a one-shot BH in s->ctx and wait for it */
+ aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s);
+
+ /* Drain and try to switch bs back to the QEMU main loop. If other users
+ * keep the BlockBackend in the iothread, that's ok */
+ blk_set_aio_context(s->conf->conf.blk, qemu_get_aio_context(), NULL);
+
+ aio_context_release(s->ctx);
+
+ /*
+ * Batch all the host notifiers in a single transaction to avoid
+ * quadratic time complexity in address_space_update_ioeventfds().
+ */
+ memory_region_transaction_begin();
+
+ for (i = 0; i < nvqs; i++) {
+ virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
+ }
+
+ /*
+ * The transaction expects the ioeventfds to be open when it
+ * commits. Do it now, before the cleanup loop.
+ */
+ memory_region_transaction_commit();
+
+ for (i = 0; i < nvqs; i++) {
+ virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i);
+ }
+
+ /* Run the notification BH by hand instead of leaving it scheduled */
+ qemu_bh_cancel(s->bh);
+ notify_guest_bh(s); /* final chance to notify guest */
+
+ /* Clean up guest notifier (irq) */
+ k->set_guest_notifiers(qbus->parent, nvqs, false);
+
+ vblk->dataplane_started = false;
+ s->stopping = false;
+}
diff --git a/hw/block/dataplane/virtio-blk.h b/hw/block/dataplane/virtio-blk.h
new file mode 100644
index 000000000..5e18bb99a
--- /dev/null
+++ b/hw/block/dataplane/virtio-blk.h
@@ -0,0 +1,31 @@
+/*
+ * Dedicated thread for virtio-blk I/O processing
+ *
+ * Copyright 2012 IBM, Corp.
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef HW_DATAPLANE_VIRTIO_BLK_H
+#define HW_DATAPLANE_VIRTIO_BLK_H
+
+#include "hw/virtio/virtio.h"
+
+/* Opaque dataplane state; the layout is private to the implementation. */
+typedef struct VirtIOBlockDataPlane VirtIOBlockDataPlane;
+
+/*
+ * Create the dataplane state for @vdev and store it in *@dataplane.
+ * Returns true on success, false otherwise.
+ */
+bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
+ VirtIOBlockDataPlane **dataplane,
+ Error **errp);
+/* Free a dataplane object. */
+void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s);
+/* Raise an interrupt to signal the guest about @vq, if necessary. */
+void virtio_blk_data_plane_notify(VirtIOBlockDataPlane *s, VirtQueue *vq);
+
+/* Start dataplane processing for @vdev; returns an int status code. */
+int virtio_blk_data_plane_start(VirtIODevice *vdev);
+/* Stop dataplane processing for @vdev. */
+void virtio_blk_data_plane_stop(VirtIODevice *vdev);
+
+#endif /* HW_DATAPLANE_VIRTIO_BLK_H */
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
new file mode 100644
index 000000000..860787580
--- /dev/null
+++ b/hw/block/dataplane/xen-block.c
@@ -0,0 +1,828 @@
+/*
+ * Copyright (c) 2018 Citrix Systems Inc.
+ * (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "qapi/error.h"
+#include "hw/xen/xen_common.h"
+#include "hw/block/xen_blkif.h"
+#include "sysemu/block-backend.h"
+#include "sysemu/iothread.h"
+#include "xen-block.h"
+
+/* One block request taken from the shared ring, plus its processing state. */
+typedef struct XenBlockRequest {
+ blkif_request_t req; /* private copy of the request read from the ring */
+ int16_t status; /* BLKIF_RSP_* value placed in the response */
+ off_t start; /* byte offset: sector_number * sector_size */
+ QEMUIOVector v; /* I/O vector handed to the block layer */
+ void *buf; /* bounce buffer, allocated once per request object */
+ size_t size; /* total byte length summed over all segments */
+ int presync; /* non-zero: issue a flush before the data I/O */
+ int aio_inflight; /* completions still expected for this request */
+ int aio_errors; /* number of failed I/O or grant-copy operations */
+ XenBlockDataPlane *dataplane; /* owning dataplane */
+ QLIST_ENTRY(XenBlockRequest) list; /* link in inflight list or freelist */
+ BlockAcctCookie acct; /* block accounting cookie */
+} XenBlockRequest;
+
+/* State of one Xen block backend dataplane instance. */
+struct XenBlockDataPlane {
+ XenDevice *xendev;
+ XenEventChannel *event_channel; /* bound in start, unbound in stop */
+ unsigned int *ring_ref; /* private copy of the ring grant references */
+ unsigned int nr_ring_ref; /* number of entries in ring_ref */
+ void *sring; /* mapped shared ring, NULL while unmapped */
+ int protocol; /* BLKIF_PROTOCOL_* selecting the ring layout */
+ blkif_back_rings_t rings;
+ int more_work; /* non-zero when further ring requests are pending */
+ QLIST_HEAD(inflight_head, XenBlockRequest) inflight;
+ QLIST_HEAD(freelist_head, XenBlockRequest) freelist;
+ int requests_total; /* request objects allocated so far */
+ int requests_inflight; /* request objects currently on the inflight list */
+ unsigned int max_requests; /* upper bound for requests_total */
+ BlockBackend *blk;
+ unsigned int sector_size; /* bytes per sector used for offset maths */
+ QEMUBH *bh; /* bottom half that re-runs request handling */
+ IOThread *iothread; /* referenced in create if given, else NULL */
+ AioContext *ctx; /* iothread context, or the main loop context */
+};
+
+static int xen_block_send_response(XenBlockRequest *request);
+
+/*
+ * Return @request to a pristine state so it can be reused.
+ *
+ * Everything except the bounce buffer pointer is cleared, including the
+ * dataplane back-pointer, and the I/O vector is emptied.
+ */
+static void reset_request(XenBlockRequest *request)
+{
+    qemu_iovec_reset(&request->v);
+
+    memset(&request->req, 0, sizeof(request->req));
+    memset(&request->list, 0, sizeof(request->list));
+    memset(&request->acct, 0, sizeof(request->acct));
+
+    request->dataplane = NULL;
+
+    request->status = 0;
+    request->start = 0;
+    request->size = 0;
+    request->presync = 0;
+    request->aio_inflight = 0;
+    request->aio_errors = 0;
+}
+
+/*
+ * Obtain a request object and put it on the inflight list.
+ *
+ * A recycled object from the freelist is preferred. If the freelist is
+ * empty a new object is allocated, unless max_requests objects already
+ * exist, in which case NULL is returned.
+ */
+static XenBlockRequest *xen_block_start_request(XenBlockDataPlane *dataplane)
+{
+    XenBlockRequest *request;
+
+    if (!QLIST_EMPTY(&dataplane->freelist)) {
+        /* Recycle the first free object */
+        request = QLIST_FIRST(&dataplane->freelist);
+        QLIST_REMOVE(request, list);
+    } else if (dataplane->requests_total < dataplane->max_requests) {
+        request = g_malloc0(sizeof(*request));
+        request->dataplane = dataplane;
+        /*
+         * A request never needs more pages than this, and request objects
+         * are re-used, so the buffer is allocated once here. It is freed
+         * in xen_block_dataplane_destroy() when the request list is freed.
+         */
+        request->buf = qemu_memalign(XC_PAGE_SIZE,
+                                     BLKIF_MAX_SEGMENTS_PER_REQUEST *
+                                     XC_PAGE_SIZE);
+        dataplane->requests_total++;
+        qemu_iovec_init(&request->v, 1);
+    } else {
+        /* Allocation limit reached and nothing to recycle */
+        return NULL;
+    }
+
+    QLIST_INSERT_HEAD(&dataplane->inflight, request, list);
+    dataplane->requests_inflight++;
+
+    return request;
+}
+
+/*
+ * Finish @request: publish its response on the ring, notify the frontend
+ * through the event channel when xen_block_send_response() asks for it,
+ * then move the request from the inflight list to the freelist.
+ */
+static void xen_block_complete_request(XenBlockRequest *request)
+{
+ XenBlockDataPlane *dataplane = request->dataplane;
+
+ if (xen_block_send_response(request)) {
+ Error *local_err = NULL;
+
+ xen_device_notify_event_channel(dataplane->xendev,
+ dataplane->event_channel,
+ &local_err);
+ if (local_err) {
+ error_report_err(local_err);
+ }
+ }
+
+ QLIST_REMOVE(request, list);
+ dataplane->requests_inflight--;
+ reset_request(request);
+ /* reset_request() cleared the back-pointer; restore it for reuse */
+ request->dataplane = dataplane;
+ QLIST_INSERT_HEAD(&dataplane->freelist, request, list);
+}
+
+/*
+ * Translate request into start offset + total size and do sanity checks
+ * along the way.
+ *
+ * Discards, and flushes without segments, are accepted without further
+ * checks. For everything else the operation, device writability, segment
+ * count, per-segment sector ranges and the end of the device are checked.
+ *
+ * Returns 0 if the request is acceptable. Otherwise request->status is set
+ * to BLKIF_RSP_ERROR and -1 is returned.
+ */
+static int xen_block_parse_request(XenBlockRequest *request)
+{
+    XenBlockDataPlane *dataplane = request->dataplane;
+    int64_t blk_len;
+    size_t len;
+    int i;
+
+    switch (request->req.operation) {
+    case BLKIF_OP_READ:
+        break;
+    case BLKIF_OP_FLUSH_DISKCACHE:
+        request->presync = 1;
+        if (!request->req.nr_segments) {
+            return 0;
+        }
+        /* fall through */
+    case BLKIF_OP_WRITE:
+        break;
+    case BLKIF_OP_DISCARD:
+        return 0;
+    default:
+        error_report("error: unknown operation (%d)", request->req.operation);
+        goto err;
+    }
+
+    if (request->req.operation != BLKIF_OP_READ &&
+        !blk_is_writable(dataplane->blk)) {
+        error_report("error: write req for ro device");
+        goto err;
+    }
+
+    request->start = request->req.sector_number * dataplane->sector_size;
+    for (i = 0; i < request->req.nr_segments; i++) {
+        if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+            error_report("error: nr_segments too big");
+            goto err;
+        }
+        if (request->req.seg[i].first_sect > request->req.seg[i].last_sect) {
+            error_report("error: first > last sector");
+            goto err;
+        }
+        /* The last sector must still start inside the granted page */
+        if (request->req.seg[i].last_sect * dataplane->sector_size >=
+            XC_PAGE_SIZE) {
+            error_report("error: page crossing");
+            goto err;
+        }
+
+        len = (request->req.seg[i].last_sect -
+               request->req.seg[i].first_sect + 1) * dataplane->sector_size;
+        request->size += len;
+    }
+
+    /*
+     * blk_getlength() reports failure as a negative value. Reject the
+     * request explicitly in that case: where the sum below is unsigned,
+     * a negative length would convert to a huge value and silently
+     * disable the bounds check.
+     */
+    blk_len = blk_getlength(dataplane->blk);
+    if (blk_len < 0) {
+        error_report("error: cannot determine device length");
+        goto err;
+    }
+    if (request->start + request->size > blk_len) {
+        error_report("error: access beyond end of file");
+        goto err;
+    }
+    return 0;
+
+err:
+    request->status = BLKIF_RSP_ERROR;
+    return -1;
+}
+
+/*
+ * Copy the data of @request between its bounce buffer and the guest pages
+ * named by the request's grant references.
+ *
+ * For BLKIF_OP_READ the copy goes from the buffer to the guest; for every
+ * other operation it goes from the guest to the buffer. Segments are packed
+ * back to back in the buffer.
+ *
+ * Returns 0 on success or when the request has no segments. On failure the
+ * error is reported, request->aio_errors is incremented and -1 is returned.
+ */
+static int xen_block_copy_request(XenBlockRequest *request)
+{
+ XenBlockDataPlane *dataplane = request->dataplane;
+ XenDevice *xendev = dataplane->xendev;
+ XenDeviceGrantCopySegment segs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+ int i, count;
+ bool to_domain = (request->req.operation == BLKIF_OP_READ);
+ void *virt = request->buf;
+ Error *local_err = NULL;
+
+ if (request->req.nr_segments == 0) {
+ return 0;
+ }
+
+ count = request->req.nr_segments;
+
+ for (i = 0; i < count; i++) {
+ if (to_domain) {
+ /* Guest page is the destination, local buffer the source */
+ segs[i].dest.foreign.ref = request->req.seg[i].gref;
+ segs[i].dest.foreign.offset = request->req.seg[i].first_sect *
+ dataplane->sector_size;
+ segs[i].source.virt = virt;
+ } else {
+ /* Guest page is the source, local buffer the destination */
+ segs[i].source.foreign.ref = request->req.seg[i].gref;
+ segs[i].source.foreign.offset = request->req.seg[i].first_sect *
+ dataplane->sector_size;
+ segs[i].dest.virt = virt;
+ }
+ segs[i].len = (request->req.seg[i].last_sect -
+ request->req.seg[i].first_sect + 1) *
+ dataplane->sector_size;
+ /* Advance to where the next segment's data lives in the buffer */
+ virt += segs[i].len;
+ }
+
+ xen_device_copy_grant_refs(xendev, to_domain, segs, count, &local_err);
+
+ if (local_err) {
+ error_reportf_err(local_err, "failed to copy data: ");
+
+ request->aio_errors++;
+ return -1;
+ }
+
+ return 0;
+}
+
+static int xen_block_do_aio(XenBlockRequest *request);
+
+/*
+ * Completion callback for the asynchronous block operations of a request.
+ *
+ * @opaque is the XenBlockRequest and @ret the result of the operation
+ * (non-zero means failure). Each call drops one aio_inflight count. The
+ * request is only finished - data copied to the guest for reads, status
+ * set, accounting updated, response sent - once the count reaches zero.
+ * xen_block_do_aio() also calls this directly with @ret 0.
+ */
+static void xen_block_complete_aio(void *opaque, int ret)
+{
+ XenBlockRequest *request = opaque;
+ XenBlockDataPlane *dataplane = request->dataplane;
+
+ aio_context_acquire(dataplane->ctx);
+
+ if (ret != 0) {
+ error_report("%s I/O error",
+ request->req.operation == BLKIF_OP_READ ?
+ "read" : "write");
+ request->aio_errors++;
+ }
+
+ request->aio_inflight--;
+ /* The pre-flush just finished: now submit the actual request */
+ if (request->presync) {
+ request->presync = 0;
+ xen_block_do_aio(request);
+ goto done;
+ }
+ /* Other operations of this request are still outstanding */
+ if (request->aio_inflight > 0) {
+ goto done;
+ }
+
+ switch (request->req.operation) {
+ case BLKIF_OP_READ:
+ /* in case of failure request->aio_errors is increased */
+ if (ret == 0) {
+ xen_block_copy_request(request);
+ }
+ break;
+ case BLKIF_OP_WRITE:
+ case BLKIF_OP_FLUSH_DISKCACHE:
+ default:
+ break;
+ }
+
+ request->status = request->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
+
+ /* Close the accounting started in xen_block_do_aio(), if any */
+ switch (request->req.operation) {
+ case BLKIF_OP_WRITE:
+ case BLKIF_OP_FLUSH_DISKCACHE:
+ if (!request->req.nr_segments) {
+ break;
+ }
+ /* fall through */
+ case BLKIF_OP_READ:
+ if (request->status == BLKIF_RSP_OKAY) {
+ block_acct_done(blk_get_stats(dataplane->blk), &request->acct);
+ } else {
+ block_acct_failed(blk_get_stats(dataplane->blk), &request->acct);
+ }
+ break;
+ case BLKIF_OP_DISCARD:
+ default:
+ break;
+ }
+
+ xen_block_complete_request(request);
+
+ /* More requests were seen on the ring: process them from the BH */
+ if (dataplane->more_work) {
+ qemu_bh_schedule(dataplane->bh);
+ }
+
+done:
+ aio_context_release(dataplane->ctx);
+}
+
+/*
+ * Submit a discard of @nr_sectors sectors starting at @sector_number,
+ * split into chunks of at most BDRV_REQUEST_MAX_BYTES bytes.
+ *
+ * request->aio_inflight is incremented once per submitted chunk. At least
+ * one chunk is always submitted, even for a zero-length range.
+ *
+ * Returns false, without submitting anything, if the sector range wraps
+ * around or its end exceeds INT64_MAX bytes; true otherwise.
+ */
+static bool xen_block_split_discard(XenBlockRequest *request,
+ blkif_sector_t sector_number,
+ uint64_t nr_sectors)
+{
+ XenBlockDataPlane *dataplane = request->dataplane;
+ int64_t byte_offset;
+ int byte_chunk;
+ uint64_t byte_remaining;
+ uint64_t sec_start = sector_number;
+ uint64_t sec_count = nr_sectors;
+
+ /* Wrap around, or overflowing byte limit? */
+ if (sec_start + sec_count < sec_count ||
+ sec_start + sec_count > INT64_MAX / dataplane->sector_size) {
+ return false;
+ }
+
+ byte_offset = sec_start * dataplane->sector_size;
+ byte_remaining = sec_count * dataplane->sector_size;
+
+ do {
+ /* Clamp each chunk to the largest size accepted per request */
+ byte_chunk = byte_remaining > BDRV_REQUEST_MAX_BYTES ?
+ BDRV_REQUEST_MAX_BYTES : byte_remaining;
+ request->aio_inflight++;
+ blk_aio_pdiscard(dataplane->blk, byte_offset, byte_chunk,
+ xen_block_complete_aio, request);
+ byte_remaining -= byte_chunk;
+ byte_offset += byte_chunk;
+ } while (byte_remaining > 0);
+
+ return true;
+}
+
+/*
+ * Submit the block layer operation(s) for @request.
+ *
+ * For writes and flushes that carry segments the guest data is first
+ * copied into the bounce buffer. If request->presync is set only a flush is
+ * submitted here; its completion clears presync and calls this function
+ * again for the actual operation.
+ *
+ * Returns 0 once the operation has been submitted. On failure the request
+ * is completed with BLKIF_RSP_ERROR and -1 is returned.
+ */
+static int xen_block_do_aio(XenBlockRequest *request)
+{
+ XenBlockDataPlane *dataplane = request->dataplane;
+
+ if (request->req.nr_segments &&
+ (request->req.operation == BLKIF_OP_WRITE ||
+ request->req.operation == BLKIF_OP_FLUSH_DISKCACHE) &&
+ xen_block_copy_request(request)) {
+ goto err;
+ }
+
+ /*
+ * Extra count held during submission. It is dropped either by the
+ * flush completion (presync case) or by the direct
+ * xen_block_complete_aio() call at the end of this function.
+ */
+ request->aio_inflight++;
+ if (request->presync) {
+ blk_aio_flush(request->dataplane->blk, xen_block_complete_aio,
+ request);
+ return 0;
+ }
+
+ switch (request->req.operation) {
+ case BLKIF_OP_READ:
+ qemu_iovec_add(&request->v, request->buf, request->size);
+ block_acct_start(blk_get_stats(dataplane->blk), &request->acct,
+ request->v.size, BLOCK_ACCT_READ);
+ request->aio_inflight++;
+ blk_aio_preadv(dataplane->blk, request->start, &request->v, 0,
+ xen_block_complete_aio, request);
+ break;
+ case BLKIF_OP_WRITE:
+ case BLKIF_OP_FLUSH_DISKCACHE:
+ /* A flush without data has nothing left to write */
+ if (!request->req.nr_segments) {
+ break;
+ }
+
+ qemu_iovec_add(&request->v, request->buf, request->size);
+ block_acct_start(blk_get_stats(dataplane->blk), &request->acct,
+ request->v.size,
+ request->req.operation == BLKIF_OP_WRITE ?
+ BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
+ request->aio_inflight++;
+ blk_aio_pwritev(dataplane->blk, request->start, &request->v, 0,
+ xen_block_complete_aio, request);
+ break;
+ case BLKIF_OP_DISCARD:
+ {
+ /* Reinterpret the stored request using the discard layout */
+ struct blkif_request_discard *req = (void *)&request->req;
+ if (!xen_block_split_discard(request, req->sector_number,
+ req->nr_sectors)) {
+ goto err;
+ }
+ break;
+ }
+ default:
+ /* unknown operation (shouldn't happen -- parse catches this) */
+ goto err;
+ }
+
+ /* Drop the submission count taken above */
+ xen_block_complete_aio(request, 0);
+
+ return 0;
+
+err:
+ request->status = BLKIF_RSP_ERROR;
+ xen_block_complete_request(request);
+ return -1;
+}
+
+/*
+ * Write the response for @request (id, operation, status) into the next
+ * response slot of the ring and push it to the frontend.
+ *
+ * dataplane->more_work is incremented if unconsumed requests are found on
+ * the ring afterwards.
+ *
+ * Returns the notify flag produced by
+ * RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(), or 0 without writing anything if
+ * the protocol is unknown.
+ */
+static int xen_block_send_response(XenBlockRequest *request)
+{
+ XenBlockDataPlane *dataplane = request->dataplane;
+ int send_notify = 0;
+ int have_requests = 0;
+ blkif_response_t *resp;
+
+ /* Place on the response ring for the relevant domain. */
+ switch (dataplane->protocol) {
+ case BLKIF_PROTOCOL_NATIVE:
+ resp = (blkif_response_t *)RING_GET_RESPONSE(
+ &dataplane->rings.native,
+ dataplane->rings.native.rsp_prod_pvt);
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ resp = (blkif_response_t *)RING_GET_RESPONSE(
+ &dataplane->rings.x86_32_part,
+ dataplane->rings.x86_32_part.rsp_prod_pvt);
+ break;
+ case BLKIF_PROTOCOL_X86_64:
+ resp = (blkif_response_t *)RING_GET_RESPONSE(
+ &dataplane->rings.x86_64_part,
+ dataplane->rings.x86_64_part.rsp_prod_pvt);
+ break;
+ default:
+ return 0;
+ }
+
+ resp->id = request->req.id;
+ resp->operation = request->req.operation;
+ resp->status = request->status;
+
+ dataplane->rings.common.rsp_prod_pvt++;
+
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&dataplane->rings.common,
+ send_notify);
+ if (dataplane->rings.common.rsp_prod_pvt ==
+ dataplane->rings.common.req_cons) {
+ /*
+ * Tail check for pending requests. Allows frontend to avoid
+ * notifications if requests are already in flight (lower
+ * overheads and promotes batching).
+ */
+ RING_FINAL_CHECK_FOR_REQUESTS(&dataplane->rings.common,
+ have_requests);
+ } else if (RING_HAS_UNCONSUMED_REQUESTS(&dataplane->rings.common)) {
+ have_requests = 1;
+ }
+
+ if (have_requests) {
+ dataplane->more_work++;
+ }
+ return send_notify;
+}
+
+/*
+ * Copy the ring request at index @rc into request->req, converting from
+ * the x86_32 or x86_64 ring layout where the protocol requires it.
+ *
+ * An unrecognised protocol leaves request->req untouched. Always returns 0.
+ */
+static int xen_block_get_request(XenBlockDataPlane *dataplane,
+ XenBlockRequest *request, RING_IDX rc)
+{
+ switch (dataplane->protocol) {
+ case BLKIF_PROTOCOL_NATIVE: {
+ blkif_request_t *req =
+ RING_GET_REQUEST(&dataplane->rings.native, rc);
+
+ memcpy(&request->req, req, sizeof(request->req));
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_32: {
+ blkif_x86_32_request_t *req =
+ RING_GET_REQUEST(&dataplane->rings.x86_32_part, rc);
+
+ blkif_get_x86_32_req(&request->req, req);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_64: {
+ blkif_x86_64_request_t *req =
+ RING_GET_REQUEST(&dataplane->rings.x86_64_part, rc);
+
+ blkif_get_x86_64_req(&request->req, req);
+ break;
+ }
+ }
+ /* Prevent the compiler from accessing the on-ring fields instead. */
+ barrier();
+ return 0;
+}
+
+/*
+ * Threshold of in-flight requests above which we will start using
+ * blk_io_plug()/blk_io_unplug() to batch requests.
+ */
+#define IO_PLUG_THRESHOLD 1
+
+/*
+ * Consume the requests currently published on the ring.
+ *
+ * Every request is copied off the ring, validated and either submitted to
+ * the block layer or, if validation fails, completed immediately with the
+ * error status (and accounted as invalid). dataplane->more_work is reset on
+ * entry and incremented when no request object is available, so the caller
+ * side can reschedule processing.
+ *
+ * Returns true if at least one request was taken from the ring.
+ */
+static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
+{
+    RING_IDX rc, rp;
+    XenBlockRequest *request;
+    int inflight_atstart = dataplane->requests_inflight;
+    int batched = 0;
+    bool done_something = false;
+
+    dataplane->more_work = 0;
+
+    rc = dataplane->rings.common.req_cons;
+    rp = dataplane->rings.common.sring->req_prod;
+    xen_rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+    /*
+     * If there were more than IO_PLUG_THRESHOLD requests in flight
+     * when we got here, this is an indication that the bottleneck
+     * is below us, so it's worth beginning to batch up I/O requests
+     * rather than submitting them immediately. The maximum number
+     * of requests we're willing to batch is the number already in
+     * flight, so it can grow up to max_requests when the bottleneck
+     * is below us.
+     */
+    if (inflight_atstart > IO_PLUG_THRESHOLD) {
+        blk_io_plug(dataplane->blk);
+    }
+    while (rc != rp) {
+        /* pull request from ring */
+        if (RING_REQUEST_CONS_OVERFLOW(&dataplane->rings.common, rc)) {
+            break;
+        }
+        request = xen_block_start_request(dataplane);
+        if (request == NULL) {
+            /* Out of request objects: leave the rest for a later pass */
+            dataplane->more_work++;
+            break;
+        }
+        xen_block_get_request(dataplane, request, rc);
+        dataplane->rings.common.req_cons = ++rc;
+        done_something = true;
+
+        /* parse them */
+        if (xen_block_parse_request(request) != 0) {
+            switch (request->req.operation) {
+            case BLKIF_OP_READ:
+                block_acct_invalid(blk_get_stats(dataplane->blk),
+                                   BLOCK_ACCT_READ);
+                break;
+            case BLKIF_OP_WRITE:
+                block_acct_invalid(blk_get_stats(dataplane->blk),
+                                   BLOCK_ACCT_WRITE);
+                break;
+            case BLKIF_OP_FLUSH_DISKCACHE:
+                block_acct_invalid(blk_get_stats(dataplane->blk),
+                                   BLOCK_ACCT_FLUSH);
+                break;
+            default:
+                break;
+            }
+
+            xen_block_complete_request(request);
+            continue;
+        }
+
+        /* Batch is full: release it before submitting the next request */
+        if (inflight_atstart > IO_PLUG_THRESHOLD &&
+            batched >= inflight_atstart) {
+            blk_io_unplug(dataplane->blk);
+        }
+        xen_block_do_aio(request);
+        if (inflight_atstart > IO_PLUG_THRESHOLD) {
+            if (batched >= inflight_atstart) {
+                /* Start a new batch */
+                blk_io_plug(dataplane->blk);
+                batched = 0;
+            } else {
+                batched++;
+            }
+        }
+    }
+    if (inflight_atstart > IO_PLUG_THRESHOLD) {
+        blk_io_unplug(dataplane->blk);
+    }
+
+    return done_something;
+}
+
+/*
+ * Bottom half: process ring requests while holding the dataplane's
+ * AioContext. @opaque is the XenBlockDataPlane.
+ */
+static void xen_block_dataplane_bh(void *opaque)
+{
+    XenBlockDataPlane *dp = opaque;
+
+    aio_context_acquire(dp->ctx);
+    (void)xen_block_handle_requests(dp);
+    aio_context_release(dp->ctx);
+}
+
+/*
+ * Event channel handler: process ring requests and report whether any
+ * request was taken from the ring. @opaque is the XenBlockDataPlane.
+ */
+static bool xen_block_dataplane_event(void *opaque)
+{
+    XenBlockDataPlane *dp = opaque;
+    bool progressed = xen_block_handle_requests(dp);
+
+    return progressed;
+}
+
+/*
+ * Allocate and initialise a dataplane for @xendev backed by @blk.
+ *
+ * If @iothread is given a reference to it is taken and its AioContext is
+ * used; otherwise the main loop AioContext is used. The request-processing
+ * bottom half is created in that context. Returns the new object.
+ */
+XenBlockDataPlane *xen_block_dataplane_create(XenDevice *xendev,
+                                              BlockBackend *blk,
+                                              unsigned int sector_size,
+                                              IOThread *iothread)
+{
+    XenBlockDataPlane *dp = g_new0(XenBlockDataPlane, 1);
+
+    dp->xendev = xendev;
+    dp->blk = blk;
+    dp->sector_size = sector_size;
+
+    QLIST_INIT(&dp->inflight);
+    QLIST_INIT(&dp->freelist);
+
+    if (!iothread) {
+        dp->ctx = qemu_get_aio_context();
+    } else {
+        dp->iothread = iothread;
+        object_ref(OBJECT(dp->iothread));
+        dp->ctx = iothread_get_aio_context(dp->iothread);
+    }
+
+    dp->bh = aio_bh_new(dp->ctx, xen_block_dataplane_bh, dp);
+
+    return dp;
+}
+
+/*
+ * Free @dataplane and every request object on its freelist.
+ *
+ * A NULL @dataplane is accepted and ignored. The bottom half is deleted and
+ * the iothread reference, if one was taken, is dropped.
+ */
+void xen_block_dataplane_destroy(XenBlockDataPlane *dataplane)
+{
+    XenBlockRequest *req;
+
+    if (dataplane == NULL) {
+        return;
+    }
+
+    /* Pop and free cached request objects until the freelist is empty */
+    while ((req = QLIST_FIRST(&dataplane->freelist)) != NULL) {
+        QLIST_REMOVE(req, list);
+        qemu_iovec_destroy(&req->v);
+        qemu_vfree(req->buf);
+        g_free(req);
+    }
+
+    qemu_bh_delete(dataplane->bh);
+    if (dataplane->iothread != NULL) {
+        object_unref(OBJECT(dataplane->iothread));
+    }
+
+    g_free(dataplane);
+}
+
+/*
+ * Stop @dataplane and release what xen_block_dataplane_start() set up.
+ *
+ * The event channel handler and the BlockBackend are moved back to the main
+ * loop AioContext, the bottom half is cancelled, the event channel is
+ * unbound, the shared ring is unmapped and the copy of the ring references
+ * is freed. Steps whose resource is absent are skipped, so this is also
+ * used to clean up after a partially completed start. A NULL @dataplane is
+ * ignored.
+ */
+void xen_block_dataplane_stop(XenBlockDataPlane *dataplane)
+{
+ XenDevice *xendev;
+
+ if (!dataplane) {
+ return;
+ }
+
+ xendev = dataplane->xendev;
+
+ aio_context_acquire(dataplane->ctx);
+ if (dataplane->event_channel) {
+ /* Only reason for failure is a NULL channel */
+ xen_device_set_event_channel_context(xendev, dataplane->event_channel,
+ qemu_get_aio_context(),
+ &error_abort);
+ }
+ /* Xen doesn't have multiple users for nodes, so this can't fail */
+ blk_set_aio_context(dataplane->blk, qemu_get_aio_context(), &error_abort);
+ aio_context_release(dataplane->ctx);
+
+ /*
+ * Now that the context has been moved onto the main thread, cancel
+ * further processing.
+ */
+ qemu_bh_cancel(dataplane->bh);
+
+ if (dataplane->event_channel) {
+ Error *local_err = NULL;
+
+ xen_device_unbind_event_channel(xendev, dataplane->event_channel,
+ &local_err);
+ /* Cleared even if unbinding reported an error */
+ dataplane->event_channel = NULL;
+
+ if (local_err) {
+ error_report_err(local_err);
+ }
+ }
+
+ if (dataplane->sring) {
+ Error *local_err = NULL;
+
+ xen_device_unmap_grant_refs(xendev, dataplane->sring,
+ dataplane->nr_ring_ref, &local_err);
+ /* Cleared even if unmapping reported an error */
+ dataplane->sring = NULL;
+
+ if (local_err) {
+ error_report_err(local_err);
+ }
+ }
+
+ g_free(dataplane->ring_ref);
+ dataplane->ring_ref = NULL;
+}
+
+/*
+ * Connect @dataplane to the frontend's shared ring and start processing.
+ *
+ * The @nr_ring_ref grant references in @ring_ref are copied and mapped, the
+ * back ring matching @protocol is initialised, @event_channel is bound to
+ * the request handler, and both the BlockBackend and the event channel are
+ * moved to the dataplane's AioContext.
+ *
+ * On failure @errp is set and everything acquired so far is released
+ * again, including the copy of the ring references.
+ */
+void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
+                               const unsigned int ring_ref[],
+                               unsigned int nr_ring_ref,
+                               unsigned int event_channel,
+                               unsigned int protocol,
+                               Error **errp)
+{
+    ERRP_GUARD();
+    XenDevice *xendev = dataplane->xendev;
+    AioContext *old_context;
+    unsigned int ring_size;
+    unsigned int i;
+
+    dataplane->nr_ring_ref = nr_ring_ref;
+    dataplane->ring_ref = g_new(unsigned int, nr_ring_ref);
+
+    for (i = 0; i < nr_ring_ref; i++) {
+        dataplane->ring_ref[i] = ring_ref[i];
+    }
+
+    dataplane->protocol = protocol;
+
+    /* The number of ring slots bounds the number of request objects */
+    ring_size = XC_PAGE_SIZE * dataplane->nr_ring_ref;
+    switch (dataplane->protocol) {
+    case BLKIF_PROTOCOL_NATIVE:
+    {
+        dataplane->max_requests = __CONST_RING_SIZE(blkif, ring_size);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_32:
+    {
+        dataplane->max_requests = __CONST_RING_SIZE(blkif_x86_32, ring_size);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_64:
+    {
+        dataplane->max_requests = __CONST_RING_SIZE(blkif_x86_64, ring_size);
+        break;
+    }
+    default:
+        error_setg(errp, "unknown protocol %u", dataplane->protocol);
+        /*
+         * Nothing but the reference array has been acquired yet. Release
+         * it here so that a later start does not overwrite and leak it.
+         */
+        g_free(dataplane->ring_ref);
+        dataplane->ring_ref = NULL;
+        return;
+    }
+
+    xen_device_set_max_grant_refs(xendev, dataplane->nr_ring_ref,
+                                  errp);
+    if (*errp) {
+        goto stop;
+    }
+
+    dataplane->sring = xen_device_map_grant_refs(xendev,
+                                                 dataplane->ring_ref,
+                                                 dataplane->nr_ring_ref,
+                                                 PROT_READ | PROT_WRITE,
+                                                 errp);
+    if (*errp) {
+        goto stop;
+    }
+
+    /* The protocol was validated above, so no default case is needed */
+    switch (dataplane->protocol) {
+    case BLKIF_PROTOCOL_NATIVE:
+    {
+        blkif_sring_t *sring_native = dataplane->sring;
+
+        BACK_RING_INIT(&dataplane->rings.native, sring_native, ring_size);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_32:
+    {
+        blkif_x86_32_sring_t *sring_x86_32 = dataplane->sring;
+
+        BACK_RING_INIT(&dataplane->rings.x86_32_part, sring_x86_32,
+                       ring_size);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_64:
+    {
+        blkif_x86_64_sring_t *sring_x86_64 = dataplane->sring;
+
+        BACK_RING_INIT(&dataplane->rings.x86_64_part, sring_x86_64,
+                       ring_size);
+        break;
+    }
+    }
+
+    dataplane->event_channel =
+        xen_device_bind_event_channel(xendev, event_channel,
+                                      xen_block_dataplane_event, dataplane,
+                                      errp);
+    if (*errp) {
+        goto stop;
+    }
+
+    old_context = blk_get_aio_context(dataplane->blk);
+    aio_context_acquire(old_context);
+    /* If other users keep the BlockBackend in the iothread, that's ok */
+    blk_set_aio_context(dataplane->blk, dataplane->ctx, NULL);
+    aio_context_release(old_context);
+
+    /* Only reason for failure is a NULL channel */
+    aio_context_acquire(dataplane->ctx);
+    xen_device_set_event_channel_context(xendev, dataplane->event_channel,
+                                         dataplane->ctx, &error_abort);
+    aio_context_release(dataplane->ctx);
+
+    return;
+
+stop:
+    xen_block_dataplane_stop(dataplane);
+}
diff --git a/hw/block/dataplane/xen-block.h b/hw/block/dataplane/xen-block.h
new file mode 100644
index 000000000..76dcd51c3
--- /dev/null
+++ b/hw/block/dataplane/xen-block.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018 Citrix Systems Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef HW_BLOCK_DATAPLANE_XEN_BLOCK_H
+#define HW_BLOCK_DATAPLANE_XEN_BLOCK_H
+
+#include "hw/block/block.h"
+#include "hw/xen/xen-bus.h"
+#include "sysemu/iothread.h"
+
+/* Opaque dataplane state; the layout is private to the implementation. */
+typedef struct XenBlockDataPlane XenBlockDataPlane;
+
+/* Allocate a dataplane for @xendev backed by @blk; @iothread is a pointer. */
+XenBlockDataPlane *xen_block_dataplane_create(XenDevice *xendev,
+ BlockBackend *blk,
+ unsigned int sector_size,
+ IOThread *iothread);
+/* Free @dataplane. */
+void xen_block_dataplane_destroy(XenBlockDataPlane *dataplane);
+/*
+ * Start @dataplane using the @nr_ring_ref references in @ring_ref, the
+ * given @event_channel and @protocol; failures are reported via @errp.
+ */
+void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
+ const unsigned int ring_ref[],
+ unsigned int nr_ring_ref,
+ unsigned int event_channel,
+ unsigned int protocol,
+ Error **errp);
+/* Stop @dataplane. */
+void xen_block_dataplane_stop(XenBlockDataPlane *dataplane);
+
+#endif /* HW_BLOCK_DATAPLANE_XEN_BLOCK_H */