author     Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>   2023-10-10 11:40:56 +0000
committer  Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>   2023-10-10 11:40:56 +0000
commit     e02cda008591317b1625707ff8e115a4841aa889 (patch)
tree       aee302e3cf8b59ec2d32ec481be3d1afddfc8968 /hw/block/dataplane
parent     cc668e6b7e0ffd8c9d130513d12053cf5eda1d3b (diff)
Introduce Virtio-loopback epsilon release:

The epsilon release introduces a new compatibility layer which makes the
virtio-loopback design work with QEMU and the rust-vmm vhost-user backend
without requiring any changes.
Signed-off-by: Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>
Change-Id: I52e57563e08a7d0bdc002f8e928ee61ba0c53dd9
Diffstat (limited to 'hw/block/dataplane')
-rw-r--r--   hw/block/dataplane/meson.build   |   2
-rw-r--r--   hw/block/dataplane/trace-events  |   5
-rw-r--r--   hw/block/dataplane/trace.h       |   1
-rw-r--r--   hw/block/dataplane/virtio-blk.c  | 369
-rw-r--r--   hw/block/dataplane/virtio-blk.h  |  31
-rw-r--r--   hw/block/dataplane/xen-block.c   | 828
-rw-r--r--   hw/block/dataplane/xen-block.h   |  30
7 files changed, 1266 insertions, 0 deletions
diff --git a/hw/block/dataplane/meson.build b/hw/block/dataplane/meson.build
new file mode 100644
index 000000000..12c6a264f
--- /dev/null
+++ b/hw/block/dataplane/meson.build
@@ -0,0 +1,2 @@
+specific_ss.add(when: 'CONFIG_VIRTIO_BLK', if_true: files('virtio-blk.c'))
+specific_ss.add(when: 'CONFIG_XEN', if_true: files('xen-block.c'))
diff --git a/hw/block/dataplane/trace-events b/hw/block/dataplane/trace-events
new file mode 100644
index 000000000..38fc3e750
--- /dev/null
+++ b/hw/block/dataplane/trace-events
@@ -0,0 +1,5 @@
+# See docs/devel/tracing.rst for syntax documentation.
+
+# virtio-blk.c
+virtio_blk_data_plane_start(void *s) "dataplane %p"
+virtio_blk_data_plane_stop(void *s) "dataplane %p"
diff --git a/hw/block/dataplane/trace.h b/hw/block/dataplane/trace.h
new file mode 100644
index 000000000..240cc5983
--- /dev/null
+++ b/hw/block/dataplane/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-hw_block_dataplane.h"
diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
new file mode 100644
index 000000000..ee5a5352d
--- /dev/null
+++ b/hw/block/dataplane/virtio-blk.c
@@ -0,0 +1,369 @@
+/*
+ * Dedicated thread for virtio-blk I/O processing
+ *
+ * Copyright 2012 IBM, Corp.
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ *   Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "trace.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+#include "qemu/thread.h"
+#include "qemu/error-report.h"
+#include "hw/virtio/virtio-access.h"
+#include "hw/virtio/virtio-blk.h"
+#include "virtio-blk.h"
+#include "block/aio.h"
+#include "hw/virtio/virtio-bus.h"
+#include "qom/object_interfaces.h"
+
+struct VirtIOBlockDataPlane {
+    bool starting;
+    bool stopping;
+
+    VirtIOBlkConf *conf;
+    VirtIODevice *vdev;
+    QEMUBH *bh;                     /* bh for guest notification */
+    unsigned long *batch_notify_vqs;
+    bool batch_notifications;
+
+    /* Note that these EventNotifiers are assigned by value.  This is
+     * fine as long as you do not call event_notifier_cleanup on them
+     * (because you don't own the file descriptor or handle; you just
+     * use it).
+     */
+    IOThread *iothread;
+    AioContext *ctx;
+};
+
+/* Raise an interrupt to signal guest, if necessary */
+void virtio_blk_data_plane_notify(VirtIOBlockDataPlane *s, VirtQueue *vq)
+{
+    if (s->batch_notifications) {
+        set_bit(virtio_get_queue_index(vq), s->batch_notify_vqs);
+        qemu_bh_schedule(s->bh);
+    } else {
+        virtio_notify_irqfd(s->vdev, vq);
+    }
+}
+
+static void notify_guest_bh(void *opaque)
+{
+    VirtIOBlockDataPlane *s = opaque;
+    unsigned nvqs = s->conf->num_queues;
+    unsigned long bitmap[BITS_TO_LONGS(nvqs)];
+    unsigned j;
+
+    memcpy(bitmap, s->batch_notify_vqs, sizeof(bitmap));
+    memset(s->batch_notify_vqs, 0, sizeof(bitmap));
+
+    for (j = 0; j < nvqs; j += BITS_PER_LONG) {
+        unsigned long bits = bitmap[j / BITS_PER_LONG];
+
+        while (bits != 0) {
+            unsigned i = j + ctzl(bits);
+            VirtQueue *vq = virtio_get_queue(s->vdev, i);
+
+            virtio_notify_irqfd(s->vdev, vq);
+
+            bits &= bits - 1; /* clear right-most bit */
+        }
+    }
+}
+
+/* Context: QEMU global mutex held */
+bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
+                                  VirtIOBlockDataPlane **dataplane,
+                                  Error **errp)
+{
+    VirtIOBlockDataPlane *s;
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+    *dataplane = NULL;
+
+    if (conf->iothread) {
+        if (!k->set_guest_notifiers || !k->ioeventfd_assign) {
+            error_setg(errp,
+                       "device is incompatible with iothread "
+                       "(transport does not support notifiers)");
+            return false;
+        }
+        if (!virtio_device_ioeventfd_enabled(vdev)) {
+            error_setg(errp, "ioeventfd is required for iothread");
+            return false;
+        }
+
+        /* If dataplane is (re-)enabled while the guest is running there could
+         * be block jobs that can conflict.
+         */
+        if (blk_op_is_blocked(conf->conf.blk, BLOCK_OP_TYPE_DATAPLANE, errp)) {
+            error_prepend(errp, "cannot start virtio-blk dataplane: ");
+            return false;
+        }
+    }
+    /* Don't try if transport does not support notifiers. */
+    if (!virtio_device_ioeventfd_enabled(vdev)) {
+        return false;
+    }
+
+    s = g_new0(VirtIOBlockDataPlane, 1);
+    s->vdev = vdev;
+    s->conf = conf;
+
+    if (conf->iothread) {
+        s->iothread = conf->iothread;
+        object_ref(OBJECT(s->iothread));
+        s->ctx = iothread_get_aio_context(s->iothread);
+    } else {
+        s->ctx = qemu_get_aio_context();
+    }
+    s->bh = aio_bh_new(s->ctx, notify_guest_bh, s);
+    s->batch_notify_vqs = bitmap_new(conf->num_queues);
+
+    *dataplane = s;
+
+    return true;
+}
+
+/* Context: QEMU global mutex held */
+void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s)
+{
+    VirtIOBlock *vblk;
+
+    if (!s) {
+        return;
+    }
+
+    vblk = VIRTIO_BLK(s->vdev);
+    assert(!vblk->dataplane_started);
+    g_free(s->batch_notify_vqs);
+    qemu_bh_delete(s->bh);
+    if (s->iothread) {
+        object_unref(OBJECT(s->iothread));
+    }
+    g_free(s);
+}
+
+static bool virtio_blk_data_plane_handle_output(VirtIODevice *vdev,
+                                                VirtQueue *vq)
+{
+    VirtIOBlock *s = (VirtIOBlock *)vdev;
+
+    assert(s->dataplane);
+    assert(s->dataplane_started);
+
+    return virtio_blk_handle_vq(s, vq);
+}
+
+/* Context: QEMU global mutex held */
+int virtio_blk_data_plane_start(VirtIODevice *vdev)
+{
+    VirtIOBlock *vblk = VIRTIO_BLK(vdev);
+    VirtIOBlockDataPlane *s = vblk->dataplane;
+    BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vblk)));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    AioContext *old_context;
+    unsigned i;
+    unsigned nvqs = s->conf->num_queues;
+    Error *local_err = NULL;
+    int r;
+
+    if (vblk->dataplane_started || s->starting) {
+        return 0;
+    }
+
+    s->starting = true;
+
+    if (!virtio_vdev_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX)) {
+        s->batch_notifications = true;
+    } else {
+        s->batch_notifications = false;
+    }
+
+    /* Set up guest notifier (irq) */
+    r = k->set_guest_notifiers(qbus->parent, nvqs, true);
+    if (r != 0) {
+        error_report("virtio-blk failed to set guest notifier (%d), "
+                     "ensure -accel kvm is set.", r);
+        goto fail_guest_notifiers;
+    }
+
+    /*
+     * Batch all the host notifiers in a single transaction to avoid
+     * quadratic time complexity in address_space_update_ioeventfds().
+     */
+    memory_region_transaction_begin();
+
+    /* Set up virtqueue notify */
+    for (i = 0; i < nvqs; i++) {
+        r = virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, true);
+        if (r != 0) {
+            int j = i;
+
+            fprintf(stderr, "virtio-blk failed to set host notifier (%d)\n", r);
+            while (i--) {
+                virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
+            }
+
+            /*
+             * The transaction expects the ioeventfds to be open when it
+             * commits. Do it now, before the cleanup loop.
+             */
+            memory_region_transaction_commit();
+
+            while (j--) {
+                virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), j);
+            }
+            goto fail_host_notifiers;
+        }
+    }
+
+    memory_region_transaction_commit();
+
+    s->starting = false;
+    vblk->dataplane_started = true;
+    trace_virtio_blk_data_plane_start(s);
+
+    old_context = blk_get_aio_context(s->conf->conf.blk);
+    aio_context_acquire(old_context);
+    r = blk_set_aio_context(s->conf->conf.blk, s->ctx, &local_err);
+    aio_context_release(old_context);
+    if (r < 0) {
+        error_report_err(local_err);
+        goto fail_aio_context;
+    }
+
+    /* Process queued requests before the ones in vring */
+    virtio_blk_process_queued_requests(vblk, false);
+
+    /* Kick right away to begin processing requests already in vring */
+    for (i = 0; i < nvqs; i++) {
+        VirtQueue *vq = virtio_get_queue(s->vdev, i);
+
+        event_notifier_set(virtio_queue_get_host_notifier(vq));
+    }
+
+    /* Get this show started by hooking up our callbacks */
+    aio_context_acquire(s->ctx);
+    for (i = 0; i < nvqs; i++) {
+        VirtQueue *vq = virtio_get_queue(s->vdev, i);
+
+        virtio_queue_aio_set_host_notifier_handler(vq, s->ctx,
+                virtio_blk_data_plane_handle_output);
+    }
+    aio_context_release(s->ctx);
+    return 0;
+
+  fail_aio_context:
+    memory_region_transaction_begin();
+
+    for (i = 0; i < nvqs; i++) {
+        virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
+    }
+
+    memory_region_transaction_commit();
+
+    for (i = 0; i < nvqs; i++) {
+        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i);
+    }
+  fail_host_notifiers:
+    k->set_guest_notifiers(qbus->parent, nvqs, false);
+  fail_guest_notifiers:
+    /*
+     * If we failed to set up the guest notifiers queued requests will be
+     * processed on the main context.
+     */
+    virtio_blk_process_queued_requests(vblk, false);
+    vblk->dataplane_disabled = true;
+    s->starting = false;
+    vblk->dataplane_started = true;
+    return -ENOSYS;
+}
+
+/* Stop notifications for new requests from guest.
+ *
+ * Context: BH in IOThread
+ */
+static void virtio_blk_data_plane_stop_bh(void *opaque)
+{
+    VirtIOBlockDataPlane *s = opaque;
+    unsigned i;
+
+    for (i = 0; i < s->conf->num_queues; i++) {
+        VirtQueue *vq = virtio_get_queue(s->vdev, i);
+
+        virtio_queue_aio_set_host_notifier_handler(vq, s->ctx, NULL);
+    }
+}
+
+/* Context: QEMU global mutex held */
+void virtio_blk_data_plane_stop(VirtIODevice *vdev)
+{
+    VirtIOBlock *vblk = VIRTIO_BLK(vdev);
+    VirtIOBlockDataPlane *s = vblk->dataplane;
+    BusState *qbus = qdev_get_parent_bus(DEVICE(vblk));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+    unsigned i;
+    unsigned nvqs = s->conf->num_queues;
+
+    if (!vblk->dataplane_started || s->stopping) {
+        return;
+    }
+
+    /* Better luck next time. */
+    if (vblk->dataplane_disabled) {
+        vblk->dataplane_disabled = false;
+        vblk->dataplane_started = false;
+        return;
+    }
+    s->stopping = true;
+    trace_virtio_blk_data_plane_stop(s);
+
+    aio_context_acquire(s->ctx);
+    aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s);
+
+    /* Drain and try to switch bs back to the QEMU main loop. If other users
+     * keep the BlockBackend in the iothread, that's ok */
+    blk_set_aio_context(s->conf->conf.blk, qemu_get_aio_context(), NULL);
+
+    aio_context_release(s->ctx);
+
+    /*
+     * Batch all the host notifiers in a single transaction to avoid
+     * quadratic time complexity in address_space_update_ioeventfds().
+     */
+    memory_region_transaction_begin();
+
+    for (i = 0; i < nvqs; i++) {
+        virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
+    }
+
+    /*
+     * The transaction expects the ioeventfds to be open when it
+     * commits. Do it now, before the cleanup loop.
+     */
+    memory_region_transaction_commit();
+
+    for (i = 0; i < nvqs; i++) {
+        virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i);
+    }
+
+    qemu_bh_cancel(s->bh);
+    notify_guest_bh(s); /* final chance to notify guest */
+
+    /* Clean up guest notifier (irq) */
+    k->set_guest_notifiers(qbus->parent, nvqs, false);
+
+    vblk->dataplane_started = false;
+    s->stopping = false;
+}
diff --git a/hw/block/dataplane/virtio-blk.h b/hw/block/dataplane/virtio-blk.h
new file mode 100644
index 000000000..5e18bb99a
--- /dev/null
+++ b/hw/block/dataplane/virtio-blk.h
@@ -0,0 +1,31 @@
+/*
+ * Dedicated thread for virtio-blk I/O processing
+ *
+ * Copyright 2012 IBM, Corp.
+ * Copyright 2012 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ *   Stefan Hajnoczi <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef HW_DATAPLANE_VIRTIO_BLK_H
+#define HW_DATAPLANE_VIRTIO_BLK_H
+
+#include "hw/virtio/virtio.h"
+
+typedef struct VirtIOBlockDataPlane VirtIOBlockDataPlane;
+
+bool virtio_blk_data_plane_create(VirtIODevice *vdev, VirtIOBlkConf *conf,
+                                  VirtIOBlockDataPlane **dataplane,
+                                  Error **errp);
+void virtio_blk_data_plane_destroy(VirtIOBlockDataPlane *s);
+void virtio_blk_data_plane_notify(VirtIOBlockDataPlane *s, VirtQueue *vq);
+
+int virtio_blk_data_plane_start(VirtIODevice *vdev);
+void virtio_blk_data_plane_stop(VirtIODevice *vdev);
+
+#endif /* HW_DATAPLANE_VIRTIO_BLK_H */
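The notify_guest_bh() loop in virtio-blk.c above uses a classic bitmap-walk
idiom: scan the pending-notification bitmap one word at a time, locate the
lowest set bit with ctzl(), and clear it with bits &= bits - 1. Here is a
minimal standalone sketch of that idiom in plain C; __builtin_ctzl stands in
for QEMU's ctzl(), and the queue indices in main() are made up for the demo:

#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* Visit every set bit (pending virtqueue index) in ascending order. */
static void visit_set_bits(const unsigned long *bitmap, unsigned nbits)
{
    for (unsigned j = 0; j < nbits; j += BITS_PER_LONG) {
        unsigned long bits = bitmap[j / BITS_PER_LONG];

        while (bits != 0) {
            unsigned i = j + __builtin_ctzl(bits); /* index of lowest set bit */
            printf("notify virtqueue %u\n", i);
            bits &= bits - 1;                      /* clear lowest set bit */
        }
    }
}

int main(void)
{
    unsigned long pending[1] = { (1UL << 0) | (1UL << 3) | (1UL << 7) };

    visit_set_bits(pending, 8);   /* prints queues 0, 3 and 7 */
    return 0;
}

Clearing the lowest set bit instead of re-scanning keeps the walk linear in
the number of pending queues rather than in the bitmap width.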
diff --git a/hw/block/dataplane/xen-block.c b/hw/block/dataplane/xen-block.c
new file mode 100644
index 000000000..860787580
--- /dev/null
+++ b/hw/block/dataplane/xen-block.c
@@ -0,0 +1,828 @@
+/*
+ * Copyright (c) 2018 Citrix Systems Inc.
+ * (c) Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "qapi/error.h"
+#include "hw/xen/xen_common.h"
+#include "hw/block/xen_blkif.h"
+#include "sysemu/block-backend.h"
+#include "sysemu/iothread.h"
+#include "xen-block.h"
+
+typedef struct XenBlockRequest {
+    blkif_request_t req;
+    int16_t status;
+    off_t start;
+    QEMUIOVector v;
+    void *buf;
+    size_t size;
+    int presync;
+    int aio_inflight;
+    int aio_errors;
+    XenBlockDataPlane *dataplane;
+    QLIST_ENTRY(XenBlockRequest) list;
+    BlockAcctCookie acct;
+} XenBlockRequest;
+
+struct XenBlockDataPlane {
+    XenDevice *xendev;
+    XenEventChannel *event_channel;
+    unsigned int *ring_ref;
+    unsigned int nr_ring_ref;
+    void *sring;
+    int protocol;
+    blkif_back_rings_t rings;
+    int more_work;
+    QLIST_HEAD(inflight_head, XenBlockRequest) inflight;
+    QLIST_HEAD(freelist_head, XenBlockRequest) freelist;
+    int requests_total;
+    int requests_inflight;
+    unsigned int max_requests;
+    BlockBackend *blk;
+    unsigned int sector_size;
+    QEMUBH *bh;
+    IOThread *iothread;
+    AioContext *ctx;
+};
+
+static int xen_block_send_response(XenBlockRequest *request);
+
+static void reset_request(XenBlockRequest *request)
+{
+    memset(&request->req, 0, sizeof(request->req));
+    request->status = 0;
+    request->start = 0;
+    request->size = 0;
+    request->presync = 0;
+
+    request->aio_inflight = 0;
+    request->aio_errors = 0;
+
+    request->dataplane = NULL;
+    memset(&request->list, 0, sizeof(request->list));
+    memset(&request->acct, 0, sizeof(request->acct));
+
+    qemu_iovec_reset(&request->v);
+}
+
+static XenBlockRequest *xen_block_start_request(XenBlockDataPlane *dataplane)
+{
+    XenBlockRequest *request = NULL;
+
+    if (QLIST_EMPTY(&dataplane->freelist)) {
+        if (dataplane->requests_total >= dataplane->max_requests) {
+            goto out;
+        }
+        /* allocate new struct */
+        request = g_malloc0(sizeof(*request));
+        request->dataplane = dataplane;
+        /*
+         * We cannot need more pages per request than this, and since we
+         * re-use requests, allocate the memory once here. It will be freed
+         * in xen_block_dataplane_destroy() when the request list is freed.
+         */
+        request->buf = qemu_memalign(XC_PAGE_SIZE,
+                                     BLKIF_MAX_SEGMENTS_PER_REQUEST *
+                                     XC_PAGE_SIZE);
+        dataplane->requests_total++;
+        qemu_iovec_init(&request->v, 1);
+    } else {
+        /* get one from freelist */
+        request = QLIST_FIRST(&dataplane->freelist);
+        QLIST_REMOVE(request, list);
+    }
+    QLIST_INSERT_HEAD(&dataplane->inflight, request, list);
+    dataplane->requests_inflight++;
+
+out:
+    return request;
+}
+
+static void xen_block_complete_request(XenBlockRequest *request)
+{
+    XenBlockDataPlane *dataplane = request->dataplane;
+
+    if (xen_block_send_response(request)) {
+        Error *local_err = NULL;
+
+        xen_device_notify_event_channel(dataplane->xendev,
+                                        dataplane->event_channel,
+                                        &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+        }
+    }
+
+    QLIST_REMOVE(request, list);
+    dataplane->requests_inflight--;
+    reset_request(request);
+    request->dataplane = dataplane;
+    QLIST_INSERT_HEAD(&dataplane->freelist, request, list);
+}
+
+/*
+ * translate request into iovec + start offset
+ * do sanity checks along the way
+ */
+static int xen_block_parse_request(XenBlockRequest *request)
+{
+    XenBlockDataPlane *dataplane = request->dataplane;
+    size_t len;
+    int i;
+
+    switch (request->req.operation) {
+    case BLKIF_OP_READ:
+        break;
+    case BLKIF_OP_FLUSH_DISKCACHE:
+        request->presync = 1;
+        if (!request->req.nr_segments) {
+            return 0;
+        }
+        /* fall through */
+    case BLKIF_OP_WRITE:
+        break;
+    case BLKIF_OP_DISCARD:
+        return 0;
+    default:
+        error_report("error: unknown operation (%d)", request->req.operation);
+        goto err;
+    };
+
+    if (request->req.operation != BLKIF_OP_READ &&
+        !blk_is_writable(dataplane->blk)) {
+        error_report("error: write req for ro device");
+        goto err;
+    }
+
+    request->start = request->req.sector_number * dataplane->sector_size;
+    for (i = 0; i < request->req.nr_segments; i++) {
+        if (i == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+            error_report("error: nr_segments too big");
+            goto err;
+        }
+        if (request->req.seg[i].first_sect > request->req.seg[i].last_sect) {
+            error_report("error: first > last sector");
+            goto err;
+        }
+        if (request->req.seg[i].last_sect * dataplane->sector_size >=
+            XC_PAGE_SIZE) {
+            error_report("error: page crossing");
+            goto err;
+        }
+
+        len = (request->req.seg[i].last_sect -
+               request->req.seg[i].first_sect + 1) * dataplane->sector_size;
+        request->size += len;
+    }
+    if (request->start + request->size > blk_getlength(dataplane->blk)) {
+        error_report("error: access beyond end of file");
+        goto err;
+    }
+    return 0;
+
+err:
+    request->status = BLKIF_RSP_ERROR;
+    return -1;
+}
+
+static int xen_block_copy_request(XenBlockRequest *request)
+{
+    XenBlockDataPlane *dataplane = request->dataplane;
+    XenDevice *xendev = dataplane->xendev;
+    XenDeviceGrantCopySegment segs[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+    int i, count;
+    bool to_domain = (request->req.operation == BLKIF_OP_READ);
+    void *virt = request->buf;
+    Error *local_err = NULL;
+
+    if (request->req.nr_segments == 0) {
+        return 0;
+    }
+
+    count = request->req.nr_segments;
+
+    for (i = 0; i < count; i++) {
+        if (to_domain) {
+            segs[i].dest.foreign.ref = request->req.seg[i].gref;
+            segs[i].dest.foreign.offset = request->req.seg[i].first_sect *
+                dataplane->sector_size;
+            segs[i].source.virt = virt;
+        } else {
+            segs[i].source.foreign.ref = request->req.seg[i].gref;
+            segs[i].source.foreign.offset = request->req.seg[i].first_sect *
+                dataplane->sector_size;
+            segs[i].dest.virt = virt;
+        }
+        segs[i].len = (request->req.seg[i].last_sect -
+                       request->req.seg[i].first_sect + 1) *
+            dataplane->sector_size;
+        virt += segs[i].len;
+    }
+
+    xen_device_copy_grant_refs(xendev, to_domain, segs, count, &local_err);
+
+    if (local_err) {
+        error_reportf_err(local_err, "failed to copy data: ");
+
+        request->aio_errors++;
+        return -1;
+    }
+
+    return 0;
+}
+
+static int xen_block_do_aio(XenBlockRequest *request);
+
+static void xen_block_complete_aio(void *opaque, int ret)
+{
+    XenBlockRequest *request = opaque;
+    XenBlockDataPlane *dataplane = request->dataplane;
+
+    aio_context_acquire(dataplane->ctx);
+
+    if (ret != 0) {
+        error_report("%s I/O error",
+                     request->req.operation == BLKIF_OP_READ ?
+                     "read" : "write");
+        request->aio_errors++;
+    }
+
+    request->aio_inflight--;
+    if (request->presync) {
+        request->presync = 0;
+        xen_block_do_aio(request);
+        goto done;
+    }
+    if (request->aio_inflight > 0) {
+        goto done;
+    }
+
+    switch (request->req.operation) {
+    case BLKIF_OP_READ:
+        /* in case of failure request->aio_errors is increased */
+        if (ret == 0) {
+            xen_block_copy_request(request);
+        }
+        break;
+    case BLKIF_OP_WRITE:
+    case BLKIF_OP_FLUSH_DISKCACHE:
+    default:
+        break;
+    }
+
+    request->status = request->aio_errors ? BLKIF_RSP_ERROR : BLKIF_RSP_OKAY;
+
+    switch (request->req.operation) {
+    case BLKIF_OP_WRITE:
+    case BLKIF_OP_FLUSH_DISKCACHE:
+        if (!request->req.nr_segments) {
+            break;
+        }
+        /* fall through */
+    case BLKIF_OP_READ:
+        if (request->status == BLKIF_RSP_OKAY) {
+            block_acct_done(blk_get_stats(dataplane->blk), &request->acct);
+        } else {
+            block_acct_failed(blk_get_stats(dataplane->blk), &request->acct);
+        }
+        break;
+    case BLKIF_OP_DISCARD:
+    default:
+        break;
+    }
+
+    xen_block_complete_request(request);
+
+    if (dataplane->more_work) {
+        qemu_bh_schedule(dataplane->bh);
+    }
+
+done:
+    aio_context_release(dataplane->ctx);
+}
+
+static bool xen_block_split_discard(XenBlockRequest *request,
+                                    blkif_sector_t sector_number,
+                                    uint64_t nr_sectors)
+{
+    XenBlockDataPlane *dataplane = request->dataplane;
+    int64_t byte_offset;
+    int byte_chunk;
+    uint64_t byte_remaining;
+    uint64_t sec_start = sector_number;
+    uint64_t sec_count = nr_sectors;
+
+    /* Wrap around, or overflowing byte limit? */
+    if (sec_start + sec_count < sec_count ||
+        sec_start + sec_count > INT64_MAX / dataplane->sector_size) {
+        return false;
+    }
+
+    byte_offset = sec_start * dataplane->sector_size;
+    byte_remaining = sec_count * dataplane->sector_size;
+
+    do {
+        byte_chunk = byte_remaining > BDRV_REQUEST_MAX_BYTES ?
+            BDRV_REQUEST_MAX_BYTES : byte_remaining;
+        request->aio_inflight++;
+        blk_aio_pdiscard(dataplane->blk, byte_offset, byte_chunk,
+                         xen_block_complete_aio, request);
+        byte_remaining -= byte_chunk;
+        byte_offset += byte_chunk;
+    } while (byte_remaining > 0);
+
+    return true;
+}
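xen_block_split_discard() above first rejects sector ranges whose byte
conversion would wrap or exceed INT64_MAX, then issues the discard in chunks
of at most BDRV_REQUEST_MAX_BYTES. A standalone model of the same chunking
logic; the 512-byte sector size and the 1 MiB MAX_CHUNK (standing in for
BDRV_REQUEST_MAX_BYTES) are assumptions for the demo:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define SECTOR_SIZE 512u
#define MAX_CHUNK   (1u << 20)   /* illustrative cap per discard request */

static bool split_discard(uint64_t sector_number, uint64_t nr_sectors)
{
    /* Reject wrap-around and ranges that overflow a signed byte offset. */
    if (sector_number + nr_sectors < nr_sectors ||
        sector_number + nr_sectors > INT64_MAX / SECTOR_SIZE) {
        return false;
    }

    int64_t byte_offset = sector_number * SECTOR_SIZE;
    uint64_t byte_remaining = nr_sectors * SECTOR_SIZE;

    do {
        uint64_t chunk = byte_remaining > MAX_CHUNK ? MAX_CHUNK
                                                    : byte_remaining;
        /* Stand-in for blk_aio_pdiscard() on each chunk. */
        printf("discard offset=%lld len=%llu\n",
               (long long)byte_offset, (unsigned long long)chunk);
        byte_offset += chunk;
        byte_remaining -= chunk;
    } while (byte_remaining > 0);

    return true;
}

int main(void)
{
    split_discard(0, 5000);   /* ~2.4 MiB -> three chunks */
    return 0;
}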
+
+static int xen_block_do_aio(XenBlockRequest *request)
+{
+    XenBlockDataPlane *dataplane = request->dataplane;
+
+    if (request->req.nr_segments &&
+        (request->req.operation == BLKIF_OP_WRITE ||
+         request->req.operation == BLKIF_OP_FLUSH_DISKCACHE) &&
+        xen_block_copy_request(request)) {
+        goto err;
+    }
+
+    request->aio_inflight++;
+    if (request->presync) {
+        blk_aio_flush(request->dataplane->blk, xen_block_complete_aio,
+                      request);
+        return 0;
+    }
+
+    switch (request->req.operation) {
+    case BLKIF_OP_READ:
+        qemu_iovec_add(&request->v, request->buf, request->size);
+        block_acct_start(blk_get_stats(dataplane->blk), &request->acct,
+                         request->v.size, BLOCK_ACCT_READ);
+        request->aio_inflight++;
+        blk_aio_preadv(dataplane->blk, request->start, &request->v, 0,
+                       xen_block_complete_aio, request);
+        break;
+    case BLKIF_OP_WRITE:
+    case BLKIF_OP_FLUSH_DISKCACHE:
+        if (!request->req.nr_segments) {
+            break;
+        }
+
+        qemu_iovec_add(&request->v, request->buf, request->size);
+        block_acct_start(blk_get_stats(dataplane->blk), &request->acct,
+                         request->v.size,
+                         request->req.operation == BLKIF_OP_WRITE ?
+                         BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
+        request->aio_inflight++;
+        blk_aio_pwritev(dataplane->blk, request->start, &request->v, 0,
+                        xen_block_complete_aio, request);
+        break;
+    case BLKIF_OP_DISCARD:
+    {
+        struct blkif_request_discard *req = (void *)&request->req;
+
+        if (!xen_block_split_discard(request, req->sector_number,
+                                     req->nr_sectors)) {
+            goto err;
+        }
+        break;
+    }
+    default:
+        /* unknown operation (shouldn't happen -- parse catches this) */
+        goto err;
+    }
+
+    xen_block_complete_aio(request, 0);
+
+    return 0;
+
+err:
+    request->status = BLKIF_RSP_ERROR;
+    xen_block_complete_request(request);
+    return -1;
+}
+
+static int xen_block_send_response(XenBlockRequest *request)
+{
+    XenBlockDataPlane *dataplane = request->dataplane;
+    int send_notify = 0;
+    int have_requests = 0;
+    blkif_response_t *resp;
+
+    /* Place on the response ring for the relevant domain. */
+    switch (dataplane->protocol) {
+    case BLKIF_PROTOCOL_NATIVE:
+        resp = (blkif_response_t *)RING_GET_RESPONSE(
+            &dataplane->rings.native,
+            dataplane->rings.native.rsp_prod_pvt);
+        break;
+    case BLKIF_PROTOCOL_X86_32:
+        resp = (blkif_response_t *)RING_GET_RESPONSE(
+            &dataplane->rings.x86_32_part,
+            dataplane->rings.x86_32_part.rsp_prod_pvt);
+        break;
+    case BLKIF_PROTOCOL_X86_64:
+        resp = (blkif_response_t *)RING_GET_RESPONSE(
+            &dataplane->rings.x86_64_part,
+            dataplane->rings.x86_64_part.rsp_prod_pvt);
+        break;
+    default:
+        return 0;
+    }
+
+    resp->id = request->req.id;
+    resp->operation = request->req.operation;
+    resp->status = request->status;
+
+    dataplane->rings.common.rsp_prod_pvt++;
+
+    RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&dataplane->rings.common,
+                                         send_notify);
+    if (dataplane->rings.common.rsp_prod_pvt ==
+        dataplane->rings.common.req_cons) {
+        /*
+         * Tail check for pending requests. Allows frontend to avoid
+         * notifications if requests are already in flight (lower
+         * overheads and promotes batching).
+         */
+        RING_FINAL_CHECK_FOR_REQUESTS(&dataplane->rings.common,
+                                      have_requests);
+    } else if (RING_HAS_UNCONSUMED_REQUESTS(&dataplane->rings.common)) {
+        have_requests = 1;
+    }
+
+    if (have_requests) {
+        dataplane->more_work++;
+    }
+    return send_notify;
+}
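The tail check in xen_block_send_response() closes a notification race: the
backend only re-arms frontend notifications once the ring looks empty, and
must then look again, because a request published in the meantime would
otherwise never trigger an event. A simplified single-threaded model of what
RING_FINAL_CHECK_FOR_REQUESTS() does; the toy_ring struct is a stand-in for
the shared blkif ring, and the real macro in Xen's ring.h also issues a
memory barrier between publishing req_event and re-reading req_prod:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

/* Toy stand-in for the shared blkif ring indices. */
struct toy_ring {
    uint32_t req_prod;   /* written by the frontend */
    uint32_t req_cons;   /* written by the backend (us) */
    uint32_t req_event;  /* frontend notifies when req_prod passes this */
};

/* Report whether work remains, re-arming frontend notifications only
 * when the ring appears empty, then re-checking for racing requests. */
static bool final_check_for_requests(struct toy_ring *r)
{
    if (r->req_prod != r->req_cons) {
        return true;                 /* more requests already queued */
    }
    r->req_event = r->req_cons + 1;  /* notify me about the next request */
    /* (memory barrier here in the real macro) */
    return r->req_prod != r->req_cons;
}

int main(void)
{
    struct toy_ring r = { .req_prod = 5, .req_cons = 5, .req_event = 0 };

    /* Ring drained: returns 0 (no work) and arms req_event for index 6. */
    printf("work left: %d, req_event now %u\n",
           final_check_for_requests(&r), r.req_event);
    return 0;
}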
+
+static int xen_block_get_request(XenBlockDataPlane *dataplane,
+                                 XenBlockRequest *request, RING_IDX rc)
+{
+    switch (dataplane->protocol) {
+    case BLKIF_PROTOCOL_NATIVE: {
+        blkif_request_t *req =
+            RING_GET_REQUEST(&dataplane->rings.native, rc);
+
+        memcpy(&request->req, req, sizeof(request->req));
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_32: {
+        blkif_x86_32_request_t *req =
+            RING_GET_REQUEST(&dataplane->rings.x86_32_part, rc);
+
+        blkif_get_x86_32_req(&request->req, req);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_64: {
+        blkif_x86_64_request_t *req =
+            RING_GET_REQUEST(&dataplane->rings.x86_64_part, rc);
+
+        blkif_get_x86_64_req(&request->req, req);
+        break;
+    }
+    }
+    /* Prevent the compiler from accessing the on-ring fields instead. */
+    barrier();
+    return 0;
+}
+
+/*
+ * Threshold of in-flight requests above which we will start using
+ * blk_io_plug()/blk_io_unplug() to batch requests.
+ */
+#define IO_PLUG_THRESHOLD 1
+
+static bool xen_block_handle_requests(XenBlockDataPlane *dataplane)
+{
+    RING_IDX rc, rp;
+    XenBlockRequest *request;
+    int inflight_atstart = dataplane->requests_inflight;
+    int batched = 0;
+    bool done_something = false;
+
+    dataplane->more_work = 0;
+
+    rc = dataplane->rings.common.req_cons;
+    rp = dataplane->rings.common.sring->req_prod;
+    xen_rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+    /*
+     * If there were more than IO_PLUG_THRESHOLD requests in flight
+     * when we got here, this is an indication that the bottleneck
+     * is below us, so it's worth beginning to batch up I/O requests
+     * rather than submitting them immediately. The maximum number
+     * of requests we're willing to batch is the number already in
+     * flight, so it can grow up to max_requests when the bottleneck
+     * is below us.
+     */
+    if (inflight_atstart > IO_PLUG_THRESHOLD) {
+        blk_io_plug(dataplane->blk);
+    }
+    while (rc != rp) {
+        /* pull request from ring */
+        if (RING_REQUEST_CONS_OVERFLOW(&dataplane->rings.common, rc)) {
+            break;
+        }
+        request = xen_block_start_request(dataplane);
+        if (request == NULL) {
+            dataplane->more_work++;
+            break;
+        }
+        xen_block_get_request(dataplane, request, rc);
+        dataplane->rings.common.req_cons = ++rc;
+        done_something = true;
+
+        /* parse them */
+        if (xen_block_parse_request(request) != 0) {
+            switch (request->req.operation) {
+            case BLKIF_OP_READ:
+                block_acct_invalid(blk_get_stats(dataplane->blk),
+                                   BLOCK_ACCT_READ);
+                break;
+            case BLKIF_OP_WRITE:
+                block_acct_invalid(blk_get_stats(dataplane->blk),
+                                   BLOCK_ACCT_WRITE);
+                break;
+            case BLKIF_OP_FLUSH_DISKCACHE:
+                block_acct_invalid(blk_get_stats(dataplane->blk),
+                                   BLOCK_ACCT_FLUSH);
+            default:
+                break;
+            };
+
+            xen_block_complete_request(request);
+            continue;
+        }
+
+        if (inflight_atstart > IO_PLUG_THRESHOLD &&
+            batched >= inflight_atstart) {
+            blk_io_unplug(dataplane->blk);
+        }
+        xen_block_do_aio(request);
+        if (inflight_atstart > IO_PLUG_THRESHOLD) {
+            if (batched >= inflight_atstart) {
+                blk_io_plug(dataplane->blk);
+                batched = 0;
+            } else {
+                batched++;
+            }
+        }
+    }
+    if (inflight_atstart > IO_PLUG_THRESHOLD) {
+        blk_io_unplug(dataplane->blk);
+    }
+
+    return done_something;
+}
+
+static void xen_block_dataplane_bh(void *opaque)
+{
+    XenBlockDataPlane *dataplane = opaque;
+
+    aio_context_acquire(dataplane->ctx);
+    xen_block_handle_requests(dataplane);
+    aio_context_release(dataplane->ctx);
+}
+
+static bool xen_block_dataplane_event(void *opaque)
+{
+    XenBlockDataPlane *dataplane = opaque;
+
+    return xen_block_handle_requests(dataplane);
+}
+
+XenBlockDataPlane *xen_block_dataplane_create(XenDevice *xendev,
+                                              BlockBackend *blk,
+                                              unsigned int sector_size,
+                                              IOThread *iothread)
+{
+    XenBlockDataPlane *dataplane = g_new0(XenBlockDataPlane, 1);
+
+    dataplane->xendev = xendev;
+    dataplane->blk = blk;
+    dataplane->sector_size = sector_size;
+
+    QLIST_INIT(&dataplane->inflight);
+    QLIST_INIT(&dataplane->freelist);
+
+    if (iothread) {
+        dataplane->iothread = iothread;
+        object_ref(OBJECT(dataplane->iothread));
+        dataplane->ctx = iothread_get_aio_context(dataplane->iothread);
+    } else {
+        dataplane->ctx = qemu_get_aio_context();
+    }
+    dataplane->bh = aio_bh_new(dataplane->ctx, xen_block_dataplane_bh,
+                               dataplane);
+
+    return dataplane;
+}
+
+void xen_block_dataplane_destroy(XenBlockDataPlane *dataplane)
+{
+    XenBlockRequest *request;
+
+    if (!dataplane) {
+        return;
+    }
+
+    while (!QLIST_EMPTY(&dataplane->freelist)) {
+        request = QLIST_FIRST(&dataplane->freelist);
+        QLIST_REMOVE(request, list);
+        qemu_iovec_destroy(&request->v);
+        qemu_vfree(request->buf);
+        g_free(request);
+    }
+
+    qemu_bh_delete(dataplane->bh);
+    if (dataplane->iothread) {
+        object_unref(OBJECT(dataplane->iothread));
+    }
+
+    g_free(dataplane);
+}
+
+void xen_block_dataplane_stop(XenBlockDataPlane *dataplane)
+{
+    XenDevice *xendev;
+
+    if (!dataplane) {
+        return;
+    }
+
+    xendev = dataplane->xendev;
+
+    aio_context_acquire(dataplane->ctx);
+    if (dataplane->event_channel) {
+        /* Only reason for failure is a NULL channel */
+        xen_device_set_event_channel_context(xendev, dataplane->event_channel,
+                                             qemu_get_aio_context(),
+                                             &error_abort);
+    }
+    /* Xen doesn't have multiple users for nodes, so this can't fail */
+    blk_set_aio_context(dataplane->blk, qemu_get_aio_context(), &error_abort);
+    aio_context_release(dataplane->ctx);
+
+    /*
+     * Now that the context has been moved onto the main thread, cancel
+     * further processing.
+     */
+    qemu_bh_cancel(dataplane->bh);
+
+    if (dataplane->event_channel) {
+        Error *local_err = NULL;
+
+        xen_device_unbind_event_channel(xendev, dataplane->event_channel,
+                                        &local_err);
+        dataplane->event_channel = NULL;
+
+        if (local_err) {
+            error_report_err(local_err);
+        }
+    }
+
+    if (dataplane->sring) {
+        Error *local_err = NULL;
+
+        xen_device_unmap_grant_refs(xendev, dataplane->sring,
+                                    dataplane->nr_ring_ref, &local_err);
+        dataplane->sring = NULL;
+
+        if (local_err) {
+            error_report_err(local_err);
+        }
+    }
+
+    g_free(dataplane->ring_ref);
+    dataplane->ring_ref = NULL;
+}
+
+void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
+                               const unsigned int ring_ref[],
+                               unsigned int nr_ring_ref,
+                               unsigned int event_channel,
+                               unsigned int protocol,
+                               Error **errp)
+{
+    ERRP_GUARD();
+    XenDevice *xendev = dataplane->xendev;
+    AioContext *old_context;
+    unsigned int ring_size;
+    unsigned int i;
+
+    dataplane->nr_ring_ref = nr_ring_ref;
+    dataplane->ring_ref = g_new(unsigned int, nr_ring_ref);
+
+    for (i = 0; i < nr_ring_ref; i++) {
+        dataplane->ring_ref[i] = ring_ref[i];
+    }
+
+    dataplane->protocol = protocol;
+
+    ring_size = XC_PAGE_SIZE * dataplane->nr_ring_ref;
+    switch (dataplane->protocol) {
+    case BLKIF_PROTOCOL_NATIVE:
+    {
+        dataplane->max_requests = __CONST_RING_SIZE(blkif, ring_size);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_32:
+    {
+        dataplane->max_requests = __CONST_RING_SIZE(blkif_x86_32, ring_size);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_64:
+    {
+        dataplane->max_requests = __CONST_RING_SIZE(blkif_x86_64, ring_size);
+        break;
+    }
+    default:
+        error_setg(errp, "unknown protocol %u", dataplane->protocol);
+        return;
+    }
+
+    xen_device_set_max_grant_refs(xendev, dataplane->nr_ring_ref,
+                                  errp);
+    if (*errp) {
+        goto stop;
+    }
+
+    dataplane->sring = xen_device_map_grant_refs(xendev,
+                                                 dataplane->ring_ref,
+                                                 dataplane->nr_ring_ref,
+                                                 PROT_READ | PROT_WRITE,
+                                                 errp);
+    if (*errp) {
+        goto stop;
+    }
+
+    switch (dataplane->protocol) {
+    case BLKIF_PROTOCOL_NATIVE:
+    {
+        blkif_sring_t *sring_native = dataplane->sring;
+
+        BACK_RING_INIT(&dataplane->rings.native, sring_native, ring_size);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_32:
+    {
+        blkif_x86_32_sring_t *sring_x86_32 = dataplane->sring;
+
+        BACK_RING_INIT(&dataplane->rings.x86_32_part, sring_x86_32,
+                       ring_size);
+        break;
+    }
+    case BLKIF_PROTOCOL_X86_64:
+    {
+        blkif_x86_64_sring_t *sring_x86_64 = dataplane->sring;
+
+        BACK_RING_INIT(&dataplane->rings.x86_64_part, sring_x86_64,
+                       ring_size);
+        break;
+    }
+    }
+
+    dataplane->event_channel =
+        xen_device_bind_event_channel(xendev, event_channel,
+                                      xen_block_dataplane_event, dataplane,
+                                      errp);
+    if (*errp) {
+        goto stop;
+    }
+
+    old_context = blk_get_aio_context(dataplane->blk);
+    aio_context_acquire(old_context);
+    /* If other users keep the BlockBackend in the iothread, that's ok */
+    blk_set_aio_context(dataplane->blk, dataplane->ctx, NULL);
+    aio_context_release(old_context);
+
+    /* Only reason for failure is a NULL channel */
+    aio_context_acquire(dataplane->ctx);
+    xen_device_set_event_channel_context(xendev, dataplane->event_channel,
+                                         dataplane->ctx, &error_abort);
+    aio_context_release(dataplane->ctx);
+
+    return;
+
+stop:
+    xen_block_dataplane_stop(dataplane);
+}
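The IO_PLUG_THRESHOLD policy in xen_block_handle_requests() above is easiest
to see in isolation: submissions are batched only when more than
IO_PLUG_THRESHOLD requests were already in flight at entry, and a batch is
flushed once it reaches that in-flight count. A runnable sketch of the same
control flow, with plug()/unplug()/submit() as hypothetical stand-ins for
blk_io_plug()/blk_io_unplug()/blk_aio_*():

#include <stdio.h>

#define IO_PLUG_THRESHOLD 1

static void plug(void)    { puts("plug: start queueing submissions"); }
static void unplug(void)  { puts("unplug: flush queued submissions"); }
static void submit(int n) { printf("submit request %d\n", n); }

static void handle_requests(int inflight_atstart, int nreq)
{
    int batched = 0;

    if (inflight_atstart > IO_PLUG_THRESHOLD) {
        plug();
    }
    for (int n = 0; n < nreq; n++) {
        /* Flush a full batch before this submission... */
        if (inflight_atstart > IO_PLUG_THRESHOLD &&
            batched >= inflight_atstart) {
            unplug();
        }
        submit(n);
        /* ...and start a new batch after it. */
        if (inflight_atstart > IO_PLUG_THRESHOLD) {
            if (batched >= inflight_atstart) {
                plug();
                batched = 0;
            } else {
                batched++;
            }
        }
    }
    if (inflight_atstart > IO_PLUG_THRESHOLD) {
        unplug();
    }
}

int main(void)
{
    handle_requests(4, 10);   /* 4 in flight at entry, 10 new requests */
    return 0;
}

Running it shows a flush after every four submissions: the batch size is
capped by the in-flight count observed at entry, exactly as the comment in
the source describes.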
diff --git a/hw/block/dataplane/xen-block.h b/hw/block/dataplane/xen-block.h
new file mode 100644
index 000000000..76dcd51c3
--- /dev/null
+++ b/hw/block/dataplane/xen-block.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018 Citrix Systems Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef HW_BLOCK_DATAPLANE_XEN_BLOCK_H
+#define HW_BLOCK_DATAPLANE_XEN_BLOCK_H
+
+#include "hw/block/block.h"
+#include "hw/xen/xen-bus.h"
+#include "sysemu/iothread.h"
+
+typedef struct XenBlockDataPlane XenBlockDataPlane;
+
+XenBlockDataPlane *xen_block_dataplane_create(XenDevice *xendev,
+                                              BlockBackend *blk,
+                                              unsigned int sector_size,
+                                              IOThread *iothread);
+void xen_block_dataplane_destroy(XenBlockDataPlane *dataplane);
+void xen_block_dataplane_start(XenBlockDataPlane *dataplane,
+                               const unsigned int ring_ref[],
+                               unsigned int nr_ring_ref,
+                               unsigned int event_channel,
+                               unsigned int protocol,
+                               Error **errp);
+void xen_block_dataplane_stop(XenBlockDataPlane *dataplane);
+
+#endif /* HW_BLOCK_DATAPLANE_XEN_BLOCK_H */
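For orientation, this is roughly how a caller might drive the interface
declared in xen-block.h above. It is an in-tree sketch, not a standalone
program: the 512-byte sector size and the abbreviated error handling are
assumptions, and in the real device the ring_ref, event_channel and protocol
values come from the frontend via xenstore:

#include "qemu/osdep.h"
#include "qapi/error.h"
#include "xen-block.h"

static void example_lifecycle(XenDevice *xendev, BlockBackend *blk,
                              IOThread *iothread,
                              const unsigned int ring_ref[],
                              unsigned int nr_ring_ref,
                              unsigned int event_channel,
                              unsigned int protocol)
{
    Error *local_err = NULL;
    XenBlockDataPlane *dp;

    dp = xen_block_dataplane_create(xendev, blk, 512, iothread);

    xen_block_dataplane_start(dp, ring_ref, nr_ring_ref, event_channel,
                              protocol, &local_err);
    if (local_err) {
        /* start already rolled itself back via its stop path */
        error_report_err(local_err);
    }

    /* ... device runs; requests are served from the IOThread ... */

    xen_block_dataplane_stop(dp);
    xen_block_dataplane_destroy(dp);
}

Stop and destroy are safe to call even after a failed start, since start
unwinds through xen_block_dataplane_stop() and leaves the freed pointers
NULLed.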