aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorTimos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>2023-10-10 11:40:56 +0000
committerTimos Ampelikiotis <t.ampelikiotis@virtualopensystems.com>2023-10-10 11:40:56 +0000
commite02cda008591317b1625707ff8e115a4841aa889 (patch)
treeaee302e3cf8b59ec2d32ec481be3d1afddfc8968 /net
parentcc668e6b7e0ffd8c9d130513d12053cf5eda1d3b (diff)
Introduce Virtio-loopback epsilon release:
The Epsilon release introduces a new compatibility layer which makes the virtio-loopback design work with QEMU and rust-vmm vhost-user backends without requiring any changes. Signed-off-by: Timos Ampelikiotis <t.ampelikiotis@virtualopensystems.com> Change-Id: I52e57563e08a7d0bdc002f8e928ee61ba0c53dd9
Diffstat (limited to 'net')
-rw-r--r--net/announce.c202
-rw-r--r--net/can/can_core.c176
-rw-r--r--net/can/can_host.c112
-rw-r--r--net/can/can_socketcan.c333
-rw-r--r--net/can/meson.build5
-rw-r--r--net/checksum.c208
-rw-r--r--net/clients.h66
-rw-r--r--net/colo-compare.c1502
-rw-r--r--net/colo-compare.h25
-rw-r--r--net/colo.c238
-rw-r--r--net/colo.h108
-rw-r--r--net/dump.c263
-rw-r--r--net/eth.c567
-rw-r--r--net/filter-buffer.c201
-rw-r--r--net/filter-mirror.c458
-rw-r--r--net/filter-replay.c89
-rw-r--r--net/filter-rewriter.c441
-rw-r--r--net/filter.c377
-rw-r--r--net/hub.c345
-rw-r--r--net/hub.h24
-rw-r--r--net/l2tpv3.c739
-rw-r--r--net/meson.build45
-rw-r--r--net/net.c1766
-rw-r--r--net/netmap.c431
-rw-r--r--net/queue.c304
-rw-r--r--net/slirp.c1124
-rw-r--r--net/socket.c783
-rw-r--r--net/tap-bsd.c258
-rw-r--r--net/tap-linux.c331
-rw-r--r--net/tap-linux.h54
-rw-r--r--net/tap-solaris.c262
-rw-r--r--net/tap-stub.c92
-rw-r--r--net/tap-win32.c832
-rw-r--r--net/tap.c1056
-rw-r--r--net/tap_int.h49
-rw-r--r--net/trace-events24
-rw-r--r--net/trace.h1
-rw-r--r--net/util.c59
-rw-r--r--net/util.h86
-rw-r--r--net/vde.c129
-rw-r--r--net/vhost-user-stub.c23
-rw-r--r--net/vhost-user.c438
-rw-r--r--net/vhost-vdpa.c315
43 files changed, 14941 insertions, 0 deletions
diff --git a/net/announce.c b/net/announce.c
new file mode 100644
index 000000000..26f057f5e
--- /dev/null
+++ b/net/announce.c
@@ -0,0 +1,202 @@
+/*
+ * Self-announce
+ * (c) 2017-2019 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "net/announce.h"
+#include "net/net.h"
+#include "qapi/clone-visitor.h"
+#include "qapi/qapi-visit-net.h"
+#include "qapi/qapi-commands-net.h"
+#include "trace.h"
+
+static GData *named_timers;
+
+/*
+ * Compute the delay before the next announcement round and re-arm the timer.
+ *
+ * The delay starts at params.initial and grows by params.step for every
+ * round already performed; a negative result or one above params.max is
+ * replaced by params.max. Returns the delay (in ms) that was programmed.
+ */
+int64_t qemu_announce_timer_step(AnnounceTimer *timer)
+{
+    int64_t delay = timer->params.initial +
+                    (timer->params.rounds - timer->round - 1) *
+                    timer->params.step;
+
+    if (delay > timer->params.max || delay < 0) {
+        delay = timer->params.max;
+    }
+
+    timer_mod(timer->tm, qemu_clock_get_ms(timer->type) + delay);
+    return delay;
+}
+
+/*
+ * Release everything owned by an announce timer: its QEMUTimer, the
+ * cloned interface list and the id string.
+ *
+ * When 'free_named' is true and the timer carries an id, the timer is
+ * additionally removed from the named-timer list and the AnnounceTimer
+ * structure itself is freed; the caller must not touch it afterwards.
+ */
+void qemu_announce_timer_del(AnnounceTimer *timer, bool free_named)
+{
+    const bool drop_timer = free_named && timer->params.has_id;
+
+    if (timer->tm != NULL) {
+        timer_free(timer->tm);
+        timer->tm = NULL;
+    }
+
+    qapi_free_strList(timer->params.interfaces);
+    timer->params.interfaces = NULL;
+
+    if (drop_timer) {
+        /* Sanity check: the list holds exactly one timer for this id. */
+        AnnounceTimer *registered =
+            g_datalist_get_data(&named_timers, timer->params.id);
+
+        assert(registered == timer);
+        g_datalist_remove_data(&named_timers, timer->params.id);
+    }
+
+    /* Trace while the id string is still alive. */
+    trace_qemu_announce_timer_del(free_named, drop_timer, timer->params.id);
+    g_free(timer->params.id);
+    timer->params.id = NULL;
+
+    if (drop_timer) {
+        g_free(timer);
+    }
+}
+
+/*
+ * Under BQL/main thread.
+ *
+ * Re-initialise 'timer' from scratch: drop whatever it currently owns,
+ * deep-copy 'params' into it, start the round counter at params->rounds
+ * and create a fresh millisecond timer on clock 'type' that will invoke
+ * 'cb' with 'opaque'.
+ */
+void qemu_announce_timer_reset(AnnounceTimer *timer,
+                               AnnounceParameters *params,
+                               QEMUClockType type,
+                               QEMUTimerCB *cb,
+                               void *opaque)
+{
+    /*
+     * Holding the BQL means the old timer cannot be firing right now,
+     * so it is safe to delete it. The structure itself is kept.
+     */
+    qemu_announce_timer_del(timer, false);
+
+    QAPI_CLONE_MEMBERS(AnnounceParameters, &timer->params, params);
+    timer->type = type;
+    timer->tm = timer_new_ms(type, cb, opaque);
+    timer->round = params->rounds;
+}
+
+#ifndef ETH_P_RARP
+#define ETH_P_RARP 0x8035
+#endif
+#define ARP_HTYPE_ETH 0x0001
+#define ARP_PTYPE_IP 0x0800
+#define ARP_OP_REQUEST_REV 0x3
+
+/*
+ * Store a 16-bit value in network (big-endian) byte order at 'p'.
+ * Done byte by byte so that 'p' needs no particular alignment and no
+ * uint8_t storage is accessed through a uint16_t lvalue.
+ */
+static void announce_put_be16(uint8_t *p, uint16_t value)
+{
+    p[0] = value >> 8;
+    p[1] = value & 0xff;
+}
+
+/*
+ * Build a broadcast reverse-ARP request announcing 'mac_addr'.
+ *
+ * buf:      output buffer, must hold at least 60 bytes
+ * mac_addr: 6-byte MAC address used as Ethernet source and as both the
+ *           source and target hardware address of the RARP payload
+ *
+ * Returns the frame length, always 60 (minimum Ethernet frame size
+ * without the FCS, which is left to the hardware).
+ */
+static int announce_self_create(uint8_t *buf,
+                                uint8_t *mac_addr)
+{
+    /* Ethernet header. */
+    memset(buf, 0xff, 6);                        /* destination MAC addr */
+    memcpy(buf + 6, mac_addr, 6);                /* source MAC addr */
+    announce_put_be16(buf + 12, ETH_P_RARP);     /* ethertype */
+
+    /* RARP header. */
+    announce_put_be16(buf + 14, ARP_HTYPE_ETH);  /* hardware addr space */
+    announce_put_be16(buf + 16, ARP_PTYPE_IP);   /* protocol addr space */
+    buf[18] = 6;                                 /* hardware addr length (ethernet) */
+    buf[19] = 4;                                 /* protocol addr length (IPv4) */
+    announce_put_be16(buf + 20, ARP_OP_REQUEST_REV); /* opcode */
+    memcpy(buf + 22, mac_addr, 6);               /* source hw addr */
+    memset(buf + 28, 0x00, 4);                   /* source protocol addr */
+    memcpy(buf + 32, mac_addr, 6);               /* target hw addr */
+    memset(buf + 38, 0x00, 4);                   /* target protocol addr */
+
+    /* Padding to get up to 60 bytes (ethernet min packet size, minus FCS). */
+    memset(buf + 42, 0x00, 18);
+
+    return 60; /* len (FCS will be added by hardware) */
+}
+
+/*
+ * qemu_foreach_nic() callback: announce one NIC.
+ *
+ * 'opaque' is the AnnounceTimer driving the announcement. If the timer
+ * parameters carry an interface list, only NICs whose name appears in it
+ * are announced. For an announced NIC a RARP frame is sent on its first
+ * queue and, if the NIC provides its own announce hook, that is called too.
+ */
+static void qemu_announce_self_iter(NICState *nic, void *opaque)
+{
+    AnnounceTimer *timer = opaque;
+    /* With an explicit list we skip unless our name turns up in it. */
+    bool skip = timer->params.has_interfaces;
+
+    if (skip) {
+        strList *it;
+
+        for (it = timer->params.interfaces; it != NULL; it = it->next) {
+            if (strcmp(it->value, nic->ncs->name) == 0) {
+                skip = false; /* found us */
+                break;
+            }
+        }
+    }
+
+    trace_qemu_announce_self_iter(timer->params.has_id ? timer->params.id : "_",
+                                  nic->ncs->name,
+                                  qemu_ether_ntoa(&nic->conf->macaddr), skip);
+
+    if (skip) {
+        return;
+    }
+
+    uint8_t frame[60];
+    int frame_len = announce_self_create(frame, nic->conf->macaddr.a);
+
+    qemu_send_packet_raw(qemu_get_queue(nic), frame, frame_len);
+
+    /* if the NIC provides its own announcement support, use it as well */
+    if (nic->ncs->info->announce) {
+        nic->ncs->info->announce(nic->ncs);
+    }
+}
+/*
+ * Run one announcement round over all NICs, then either schedule the
+ * next round or, once the round counter reaches zero, tear the timer
+ * down (freeing it if it is a named timer).
+ */
+static void qemu_announce_self_once(void *opaque)
+{
+    AnnounceTimer *timer = opaque;
+
+    qemu_foreach_nic(qemu_announce_self_iter, timer);
+
+    timer->round--;
+    if (timer->round == 0) {
+        qemu_announce_timer_del(timer, true);
+        return;
+    }
+
+    qemu_announce_timer_step(timer);
+}
+
+/*
+ * Start a self-announcement sequence on 'timer' using 'params'.
+ *
+ * The timer is reset onto QEMU_CLOCK_REALTIME with
+ * qemu_announce_self_once() as its callback. With a non-zero round count
+ * the first round runs immediately; with zero rounds the timer is simply
+ * deleted again.
+ */
+void qemu_announce_self(AnnounceTimer *timer, AnnounceParameters *params)
+{
+    qemu_announce_timer_reset(timer, params, QEMU_CLOCK_REALTIME,
+                              qemu_announce_self_once, timer);
+
+    if (!params->rounds) {
+        qemu_announce_timer_del(timer, true);
+        return;
+    }
+
+    qemu_announce_self_once(timer);
+}
+
+/*
+ * QMP handler: start (or restart) the announce timer identified by
+ * params->id. A request without an id is given the empty string as id,
+ * so all such requests share one timer. A timer is allocated and
+ * registered in the named-timer list the first time an id is seen.
+ * 'errp' is not written by this function.
+ */
+void qmp_announce_self(AnnounceParameters *params, Error **errp)
+{
+    AnnounceTimer *entry;
+
+    if (!params->has_id) {
+        params->has_id = true;
+        params->id = g_strdup("");
+    }
+
+    entry = g_datalist_get_data(&named_timers, params->id);
+    if (entry == NULL) {
+        entry = g_new0(AnnounceTimer, 1);
+        g_datalist_set_data(&named_timers, params->id, entry);
+    }
+
+    qemu_announce_self(entry, params);
+}
diff --git a/net/can/can_core.c b/net/can/can_core.c
new file mode 100644
index 000000000..0115d7879
--- /dev/null
+++ b/net/can/can_core.c
@@ -0,0 +1,176 @@
+/*
+ * CAN common CAN bus emulation support
+ *
+ * Copyright (c) 2013-2014 Jin Yang
+ * Copyright (c) 2014-2018 Pavel Pisa
+ *
+ * Initial development supported by Google GSoC 2013 from RTEMS project slot
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "chardev/char.h"
+#include "qemu/module.h"
+#include "qemu/sockets.h"
+#include "qapi/error.h"
+#include "net/can_emu.h"
+#include "qom/object_interfaces.h"
+
+/* CAN DLC to real data length conversion helpers */
+
+/* Payload length in bytes for each of the 16 data length codes. */
+static const uint8_t dlc2len[] = {
+    0, 1, 2, 3, 4, 5, 6, 7,
+    8, 12, 16, 20, 24, 32, 48, 64
+};
+
+/*
+ * Translate a data length code into a payload length.
+ * Only the low four bits of 'can_dlc' are used, so any input is safe.
+ */
+uint8_t can_dlc2len(uint8_t can_dlc)
+{
+    const uint8_t code = can_dlc & 0x0F;
+
+    return dlc2len[code];
+}
+
+/* Smallest data length code able to carry a payload of 0..64 bytes. */
+static const uint8_t len2dlc[] = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8,          /* 0 - 8 */
+    9, 9, 9, 9,                         /* 9 - 12 */
+    10, 10, 10, 10,                     /* 13 - 16 */
+    11, 11, 11, 11,                     /* 17 - 20 */
+    12, 12, 12, 12,                     /* 21 - 24 */
+    13, 13, 13, 13, 13, 13, 13, 13,     /* 25 - 32 */
+    14, 14, 14, 14, 14, 14, 14, 14,     /* 33 - 40 */
+    14, 14, 14, 14, 14, 14, 14, 14,     /* 41 - 48 */
+    15, 15, 15, 15, 15, 15, 15, 15,     /* 49 - 56 */
+    15, 15, 15, 15, 15, 15, 15, 15      /* 57 - 64 */
+};
+
+/*
+ * Translate a payload length into a data length code.
+ * Lengths above 64 bytes map to the largest code, 0xF.
+ */
+uint8_t can_len2dlc(uint8_t len)
+{
+    return unlikely(len > 64) ? 0xF : len2dlc[len];
+}
+
+struct CanBusState { /* instance state of a TYPE_CAN_BUS object */
+ Object object; /* base object; must stay first, instances are cast from Object * */
+
+ QTAILQ_HEAD(, CanBusClientState) clients; /* clients attached via can_bus_insert_client() */
+};
+
+/* Instance initialiser for TYPE_CAN_BUS: a new bus has no clients. */
+static void can_bus_instance_init(Object *object)
+{
+    QTAILQ_INIT(&((CanBusState *)object)->clients);
+}
+
+/*
+ * Attach 'client' to 'bus': the client is appended to the bus client
+ * list and its back-pointer is set. Always returns 0.
+ */
+int can_bus_insert_client(CanBusState *bus, CanBusClientState *client)
+{
+    QTAILQ_INSERT_TAIL(&bus->clients, client, next);
+    client->bus = bus;
+
+    return 0;
+}
+
+/*
+ * Detach 'client' from the bus it is attached to.
+ * Returns 1 if the client was removed, 0 if it was not attached.
+ */
+int can_bus_remove_client(CanBusClientState *client)
+{
+    if (client->bus != NULL) {
+        QTAILQ_REMOVE(&client->bus->clients, client, next);
+        client->bus = NULL;
+        return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Deliver 'frames_cnt' frames from 'client' to every other client on the
+ * same bus that reports it can receive.
+ *
+ * Returns -1 if the sender is not attached to a bus, 1 if at least one
+ * peer's receive callback returned a positive value, 0 otherwise.
+ */
+ssize_t can_bus_client_send(CanBusClientState *client,
+                            const struct qemu_can_frame *frames,
+                            size_t frames_cnt)
+{
+    CanBusClientState *receiver;
+    int delivered = 0;
+
+    if (client->bus == NULL) {
+        return -1;
+    }
+
+    QTAILQ_FOREACH(receiver, &client->bus->clients, next) {
+        if (!receiver->info->can_receive(receiver)) {
+            continue;
+        }
+        if (receiver == client) {
+            /* No loopback support for now */
+            continue;
+        }
+        if (receiver->info->receive(receiver, frames, frames_cnt) > 0) {
+            delivered = 1;
+        }
+    }
+
+    return delivered;
+}
+
+/*
+ * Test 'can_id' against 'filter'; returns non-zero on a match.
+ *
+ * If the error flag is set in either the id or the filter mask, the
+ * result only says whether the mask has the error flag set. Otherwise
+ * the masked ids are compared, and the outcome is inverted when the
+ * filter id carries QEMU_CAN_INV_FILTER.
+ */
+int can_bus_filter_match(struct qemu_can_filter *filter, qemu_canid_t can_id)
+{
+    int same;
+
+    if ((can_id | filter->can_mask) & QEMU_CAN_ERR_FLAG) {
+        return (filter->can_mask & QEMU_CAN_ERR_FLAG) != 0;
+    }
+
+    same = (can_id & filter->can_mask) == (filter->can_id & filter->can_mask);
+    if (filter->can_id & QEMU_CAN_INV_FILTER) {
+        return !same;
+    }
+    return same;
+}
+
+/*
+ * Placeholder: the supplied filters are ignored and nothing is stored.
+ * Always returns 0.
+ */
+int can_bus_client_set_filters(CanBusClientState *client,
+                               const struct qemu_can_filter *filters,
+                               size_t filters_cnt)
+{
+    return 0;
+}
+
+
+/* UserCreatable hook: always answers that a CAN bus cannot be deleted. */
+static bool can_bus_can_be_deleted(UserCreatable *uc)
+{
+    return false;
+}
+
+/* Class initialiser for TYPE_CAN_BUS: installs the can_be_deleted hook. */
+static void can_bus_class_init(ObjectClass *klass,
+                               void *class_data G_GNUC_UNUSED)
+{
+    USER_CREATABLE_CLASS(klass)->can_be_deleted = can_bus_can_be_deleted;
+}
+
+/* QOM type description of the CAN bus: a user-creatable plain object. */
+static const TypeInfo can_bus_info = {
+    .name = TYPE_CAN_BUS,
+    .parent = TYPE_OBJECT,
+    .instance_size = sizeof(CanBusState),
+    .instance_init = can_bus_instance_init,
+    .class_init = can_bus_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        { TYPE_USER_CREATABLE },
+        { }
+    }
+};
+
+/* Registers the CAN bus type; hooked up through type_init() below. */
+static void can_bus_register_types(void)
+{
+    type_register_static(&can_bus_info);
+}
+
+type_init(can_bus_register_types);
diff --git a/net/can/can_host.c b/net/can/can_host.c
new file mode 100644
index 000000000..a3c84028c
--- /dev/null
+++ b/net/can/can_host.c
@@ -0,0 +1,112 @@
+/*
+ * CAN generic CAN host connection support
+ *
+ * Copyright (c) 2013-2014 Jin Yang
+ * Copyright (c) 2014-2018 Pavel Pisa
+ *
+ * Initial development supported by Google GSoC 2013 from RTEMS project slot
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "chardev/char.h"
+#include "qemu/module.h"
+#include "qemu/sockets.h"
+#include "qapi/error.h"
+#include "qom/object_interfaces.h"
+#include "net/can_emu.h"
+#include "net/can_host.h"
+
+struct CanBusState { /* NOTE(review): same layout as the definition in net/can/can_core.c; presumably the two must stay in sync */
+ Object object; /* base object */
+
+ QTAILQ_HEAD(, CanBusClientState) clients; /* list of clients attached to the bus */
+};
+
+/*
+ * Detach the host connection from its bus, then let the concrete
+ * backend class release its resources.
+ */
+static void can_host_disconnect(CanHostState *ch)
+{
+    CanHostClass *backend = CAN_HOST_GET_CLASS(ch);
+
+    can_bus_remove_client(&ch->bus_client);
+    backend->disconnect(ch);
+}
+
+/*
+ * Connect the host backend and attach it to the bus named by the
+ * 'canbus' property.
+ *
+ * Fails (setting *errp) if the property is unset or if the backend's
+ * connect hook reports an error; in both cases the client is not
+ * inserted into the bus.
+ */
+static void can_host_connect(CanHostState *ch, Error **errp)
+{
+    CanHostClass *backend = CAN_HOST_GET_CLASS(ch);
+    Error *err = NULL;
+
+    if (!ch->bus) {
+        error_setg(errp, "'canbus' property not set");
+        return;
+    }
+
+    backend->connect(ch, &err);
+    if (err != NULL) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    can_bus_insert_client(ch->bus, &ch->bus_client);
+}
+
+/* ObjectClass unparent hook: disconnects the host connection. */
+static void can_host_unparent(Object *obj)
+{
+    CanHostState *ch = CAN_HOST(obj);
+
+    can_host_disconnect(ch);
+}
+
+/* UserCreatable complete hook: connects the host, reporting via errp. */
+static void can_host_complete(UserCreatable *uc, Error **errp)
+{
+    CanHostState *ch = CAN_HOST(uc);
+
+    can_host_connect(ch, errp);
+}
+
+/*
+ * Class initialiser for the abstract CAN host type: installs the
+ * unparent/complete hooks and adds the strong "canbus" link property
+ * that points at CanHostState.bus.
+ */
+static void can_host_class_init(ObjectClass *klass,
+                                void *class_data G_GNUC_UNUSED)
+{
+    UserCreatableClass *creatable = USER_CREATABLE_CLASS(klass);
+
+    creatable->complete = can_host_complete;
+    klass->unparent = can_host_unparent;
+
+    object_class_property_add_link(klass, "canbus", TYPE_CAN_BUS,
+                                   offsetof(CanHostState, bus),
+                                   object_property_allow_set_link,
+                                   OBJ_PROP_LINK_STRONG);
+}
+
+/* QOM type description of the abstract, user-creatable CAN host base. */
+static const TypeInfo can_host_info = {
+    .name = TYPE_CAN_HOST,
+    .parent = TYPE_OBJECT,
+    .abstract = true,
+    .instance_size = sizeof(CanHostState),
+    .class_size = sizeof(CanHostClass),
+    .class_init = can_host_class_init,
+    .interfaces = (InterfaceInfo[]) {
+        { TYPE_USER_CREATABLE },
+        { }
+    }
+};
+
+/* Registers the CAN host base type; hooked up through type_init() below. */
+static void can_host_register_types(void)
+{
+    type_register_static(&can_host_info);
+}
+
+type_init(can_host_register_types);
diff --git a/net/can/can_socketcan.c b/net/can/can_socketcan.c
new file mode 100644
index 000000000..4b68f60c6
--- /dev/null
+++ b/net/can/can_socketcan.c
@@ -0,0 +1,333 @@
+/*
+ * CAN c support to connect to the Linux host SocketCAN interfaces
+ *
+ * Copyright (c) 2013-2014 Jin Yang
+ * Copyright (c) 2014-2018 Pavel Pisa
+ *
+ * Initial development supported by Google GSoC 2013 from RTEMS project slot
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "qemu/main-loop.h"
+#include "qemu/module.h"
+#include "qapi/error.h"
+#include "chardev/char.h"
+#include "qemu/sockets.h"
+#include "qemu/error-report.h"
+#include "net/can_emu.h"
+#include "net/can_host.h"
+
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <linux/can.h>
+#include <linux/can/raw.h>
+#include "qom/object.h"
+
+#ifndef DEBUG_CAN
+#define DEBUG_CAN 0
+#endif /*DEBUG_CAN*/
+
+#define TYPE_CAN_HOST_SOCKETCAN "can-host-socketcan"
+OBJECT_DECLARE_SIMPLE_TYPE(CanHostSocketCAN, CAN_HOST_SOCKETCAN)
+
+#define CAN_READ_BUF_LEN 5 /* number of frames the read buffer can hold */
+struct CanHostSocketCAN { /* instance state of a "can-host-socketcan" object */
+ CanHostState parent; /* base CAN host state */
+ char *ifname; /* host interface name; set through the "if" property */
+
+ qemu_can_filter *rfilter; /* receive filters; allocated on connect, freed on disconnect */
+ int rfilter_num; /* number of entries in rfilter */
+ can_err_mask_t err_mask; /* mask handed to CAN_RAW_ERR_FILTER on connect */
+
+ qemu_can_frame buf[CAN_READ_BUF_LEN]; /* read buffer; the read handler fills buf[0] only */
+ int bufcnt; /* return value of the last read() on fd */
+ int bufptr; /* not referenced by the code visible in this file */
+
+ int fd; /* CAN_RAW socket, or -1 while not connected */
+};
+
+/* Check that QEMU and Linux kernel flags encoding and structure matches */
+QEMU_BUILD_BUG_ON(QEMU_CAN_EFF_FLAG != CAN_EFF_FLAG);
+QEMU_BUILD_BUG_ON(QEMU_CAN_RTR_FLAG != CAN_RTR_FLAG);
+QEMU_BUILD_BUG_ON(QEMU_CAN_ERR_FLAG != CAN_ERR_FLAG);
+QEMU_BUILD_BUG_ON(QEMU_CAN_INV_FILTER != CAN_INV_FILTER);
+QEMU_BUILD_BUG_ON(offsetof(qemu_can_frame, data)
+ != offsetof(struct can_frame, data));
+
+/*
+ * Debug helper: write one frame to the QEMU log as
+ * "[cansocketcan]: <id> [<dlc>] <EFF|SFF> <RTR|DAT> <data bytes...>".
+ * The log is locked for the whole message and flushed at the end.
+ */
+static void can_host_socketcan_display_msg(struct qemu_can_frame *msg)
+{
+    const char *id_kind = msg->can_id & QEMU_CAN_EFF_FLAG ? "EFF" : "SFF";
+    const char *frame_kind = msg->can_id & QEMU_CAN_RTR_FLAG ? "RTR" : "DAT";
+    FILE *logfile = qemu_log_lock();
+    int pos = 0;
+
+    qemu_log("[cansocketcan]: %03X [%01d] %s %s",
+             msg->can_id & QEMU_CAN_EFF_MASK,
+             msg->can_dlc,
+             id_kind,
+             frame_kind);
+
+    while (pos < msg->can_dlc) {
+        qemu_log(" %02X", msg->data[pos]);
+        pos++;
+    }
+
+    qemu_log("\n");
+    qemu_log_flush();
+    qemu_log_unlock(logfile);
+}
+
+/*
+ * fd read handler: read one frame from the host socket and forward it
+ * to the emulated bus.
+ *
+ * Without CAN FD mode the flags byte is cleared; in CAN FD mode a read
+ * longer than CAN_MTU marks the frame as a CAN FD frame. A failed read
+ * is reported with a warning and nothing is forwarded.
+ */
+static void can_host_socketcan_read(void *opaque)
+{
+    CanHostSocketCAN *c = opaque;
+    CanHostState *ch = CAN_HOST(c);
+    qemu_can_frame *frame = &c->buf[0];
+
+    /* Only one frame per syscall for now; buf has room for CAN_READ_BUF_LEN. */
+    c->bufcnt = read(c->fd, c->buf, sizeof(qemu_can_frame));
+    if (c->bufcnt < 0) {
+        warn_report("CAN bus host read failed (%s)", strerror(errno));
+        return;
+    }
+
+    if (ch->bus_client.fd_mode) {
+        if (c->bufcnt > CAN_MTU) {
+            frame->flags |= QEMU_CAN_FRMF_TYPE_FD;
+        }
+    } else {
+        frame->flags = 0;
+    }
+
+    can_bus_client_send(&ch->bus_client, c->buf, 1);
+
+    if (DEBUG_CAN) {
+        can_host_socketcan_display_msg(c->buf);
+    }
+}
+
+/* Bus client hook: this backend always reports that it can receive. */
+static bool can_host_socketcan_can_receive(CanBusClientState *client)
+{
+    return true;
+}
+
+/*
+ * Bus client hook: write a frame coming from the emulated bus to the
+ * host socket. Only the first frame of 'frames' is written.
+ *
+ * Returns 1 when the frame was written completely, 0 when a CAN FD
+ * frame is dropped because the socket is not in CAN FD mode, and -1
+ * when the socket is closed or the write failed or was short.
+ */
+static ssize_t can_host_socketcan_receive(CanBusClientState *client,
+                                          const qemu_can_frame *frames,
+                                          size_t frames_cnt)
+{
+    CanHostState *ch = container_of(client, CanHostState, bus_client);
+    CanHostSocketCAN *c = CAN_HOST_SOCKETCAN(ch);
+    bool is_fd_frame;
+    size_t len;
+    int res;
+
+    if (c->fd < 0) {
+        return -1;
+    }
+
+    is_fd_frame = frames->flags & QEMU_CAN_FRMF_TYPE_FD;
+    if (is_fd_frame && !ch->bus_client.fd_mode) {
+        return 0;
+    }
+    len = is_fd_frame ? CANFD_MTU : CAN_MTU;
+
+    res = write(c->fd, frames, len);
+
+    if (res == 0) {
+        warn_report("[cansocketcan]: write message to host returns zero");
+        return -1;
+    }
+    if (res < 0) {
+        warn_report("[cansocketcan]: write to host failed (%s)",
+                    strerror(errno));
+        return -1;
+    }
+    if (res != len) {
+        warn_report("[cansocketcan]: write to host truncated");
+        return -1;
+    }
+
+    return 1;
+}
+
+/*
+ * Backend disconnect hook: unregister the fd handler, close the socket
+ * if one is open, and release the receive filter array.
+ */
+static void can_host_socketcan_disconnect(CanHostState *ch)
+{
+    CanHostSocketCAN *c = CAN_HOST_SOCKETCAN(ch);
+    const bool is_open = c->fd >= 0;
+
+    if (is_open) {
+        qemu_set_fd_handler(c->fd, NULL, NULL, c);
+        close(c->fd);
+        c->fd = -1;
+    }
+
+    g_free(c->rfilter);
+    c->rfilter_num = 0;
+    c->rfilter = NULL;
+}
+
+/* Bus client callbacks installed on a successful connect. */
+static CanBusClientInfo can_host_socketcan_bus_client_info = {
+    .receive = can_host_socketcan_receive,
+    .can_receive = can_host_socketcan_can_receive,
+};
+
+/*
+ * Backend connect hook: open a CAN_RAW socket, bind it to the host
+ * interface named by the "if" property and start forwarding received
+ * frames to the emulated bus.
+ *
+ * If the interface MTU allows it, the socket is switched to CAN FD mode
+ * and bus_client.fd_mode is set. Failures to enable CAN FD or to install
+ * the error/receive filters are only warned about; the connection still
+ * proceeds.
+ *
+ * On success c->fd holds the socket, the bus client callbacks are
+ * installed and can_host_socketcan_read() is the fd read handler.
+ * On failure *errp is set, the socket is closed and the filter array
+ * is freed.
+ */
+static void can_host_socketcan_connect(CanHostState *ch, Error **errp)
+{
+    CanHostSocketCAN *c = CAN_HOST_SOCKETCAN(ch);
+    int s; /* can raw socket */
+    int mtu;
+    int enable_canfd = 1;
+    struct sockaddr_can addr;
+    struct ifreq ifr;
+
+    if (!c->ifname) {
+        error_setg(errp, "'if' property not set");
+        return;
+    }
+
+    /* open socket */
+    s = qemu_socket(PF_CAN, SOCK_RAW, CAN_RAW);
+    if (s < 0) {
+        error_setg_errno(errp, errno, "failed to create CAN_RAW socket");
+        return;
+    }
+
+    /* Do not hand uninitialised stack bytes to the kernel. */
+    memset(&addr, 0, sizeof(addr));
+    addr.can_family = AF_CAN;
+
+    /* Bounded copy; the "if" setter already rejects over-long names. */
+    memset(&ifr, 0, sizeof(ifr));
+    g_strlcpy(ifr.ifr_name, c->ifname, sizeof(ifr.ifr_name));
+
+    /* resolve the interface name to an index */
+    if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
+        error_setg_errno(errp, errno,
+                         "SocketCAN host interface %s not available",
+                         c->ifname);
+        goto fail;
+    }
+    addr.can_ifindex = ifr.ifr_ifindex;
+
+    /* check if a CAN FD frame fits into the CAN netdevice */
+    if (ioctl(s, SIOCGIFMTU, &ifr) < 0) {
+        error_setg_errno(errp, errno,
+                         "SocketCAN host interface %s SIOCGIFMTU failed",
+                         c->ifname);
+        goto fail;
+    }
+    mtu = ifr.ifr_mtu;
+
+    if (mtu >= CANFD_MTU) {
+        /* interface is ok - try to switch the socket into CAN FD mode */
+        if (setsockopt(s, SOL_CAN_RAW, CAN_RAW_FD_FRAMES,
+                       &enable_canfd, sizeof(enable_canfd))) {
+            warn_report("SocketCAN host interface %s enabling CAN FD failed",
+                        c->ifname);
+        } else {
+            c->parent.bus_client.fd_mode = true;
+        }
+    }
+
+    c->err_mask = 0xffffffff; /* Receive error frames. */
+    if (setsockopt(s, SOL_CAN_RAW, CAN_RAW_ERR_FILTER,
+                   &c->err_mask, sizeof(c->err_mask)) < 0) {
+        warn_report("SocketCAN host interface %s setting error filter "
+                    "failed (%s)", c->ifname, strerror(errno));
+    }
+
+    c->rfilter_num = 1;
+    c->rfilter = g_new(struct qemu_can_filter, c->rfilter_num);
+
+    /*
+     * Receive all data frames: id 0 with an all-zero mask matches every
+     * identifier. OR-ing CAN_INV_FILTER into can_id would instead let no
+     * data frame through.
+     */
+    c->rfilter[0].can_id = 0;
+    c->rfilter[0].can_mask = 0;
+
+    if (setsockopt(s, SOL_CAN_RAW, CAN_RAW_FILTER, c->rfilter,
+                   c->rfilter_num * sizeof(struct qemu_can_filter)) < 0) {
+        warn_report("SocketCAN host interface %s setting receive filter "
+                    "failed (%s)", c->ifname, strerror(errno));
+    }
+
+    if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+        error_setg_errno(errp, errno, "failed to bind to host interface %s",
+                         c->ifname);
+        goto fail;
+    }
+
+    c->fd = s;
+    ch->bus_client.info = &can_host_socketcan_bus_client_info;
+    qemu_set_fd_handler(c->fd, can_host_socketcan_read, NULL, c);
+    return;
+
+fail:
+    close(s);
+    g_free(c->rfilter);
+    c->rfilter = NULL;
+    c->rfilter_num = 0;
+}
+
+/*
+ * Getter for the "if" property: returns a newly allocated copy of the
+ * configured interface name (g_strdup of the stored pointer).
+ */
+static char *can_host_socketcan_get_if(Object *obj, Error **errp)
+{
+    const CanHostSocketCAN *c = CAN_HOST_SOCKETCAN(obj);
+    char *copy = g_strdup(c->ifname);
+
+    return copy;
+}
+
+/*
+ * Setter for the "if" property: store a copy of the host interface name.
+ *
+ * Rejected (with *errp set) if the name does not fit into
+ * ifreq.ifr_name including its terminating NUL, or if the socket is
+ * already connected. On success any previous name is freed.
+ */
+static void can_host_socketcan_set_if(Object *obj, const char *value,
+                                      Error **errp)
+{
+    CanHostSocketCAN *c = CAN_HOST_SOCKETCAN(obj);
+    struct ifreq ifr;
+
+    if (strlen(value) >= sizeof(ifr.ifr_name)) {
+        /* sizeof yields size_t, so the matching conversion is %zu */
+        error_setg(errp, "CAN interface name longer than %zu characters",
+                   sizeof(ifr.ifr_name) - 1);
+        return;
+    }
+
+    if (c->fd != -1) {
+        error_setg(errp, "CAN interface already connected");
+        return;
+    }
+
+    g_free(c->ifname);
+    c->ifname = g_strdup(value);
+}
+
+/* Instance initialiser: mark the socket as not connected (fd == -1). */
+static void can_host_socketcan_instance_init(Object *obj)
+{
+    CAN_HOST_SOCKETCAN(obj)->fd = -1;
+}
+
+/*
+ * Class initialiser: installs the SocketCAN connect/disconnect hooks
+ * and adds the string property "if" backed by the getter/setter above.
+ */
+static void can_host_socketcan_class_init(ObjectClass *klass,
+                                          void *class_data G_GNUC_UNUSED)
+{
+    CanHostClass *host_class = CAN_HOST_CLASS(klass);
+
+    host_class->disconnect = can_host_socketcan_disconnect;
+    host_class->connect = can_host_socketcan_connect;
+
+    object_class_property_add_str(klass, "if",
+                                  can_host_socketcan_get_if,
+                                  can_host_socketcan_set_if);
+}
+
+/* QOM type description of the SocketCAN backend, derived from TYPE_CAN_HOST. */
+static const TypeInfo can_host_socketcan_info = {
+    .name = TYPE_CAN_HOST_SOCKETCAN,
+    .parent = TYPE_CAN_HOST,
+    .instance_size = sizeof(CanHostSocketCAN),
+    .instance_init = can_host_socketcan_instance_init,
+    .class_init = can_host_socketcan_class_init,
+};
+
+/* Registers the SocketCAN backend type; hooked up through type_init() below. */
+static void can_host_register_types(void)
+{
+    type_register_static(&can_host_socketcan_info);
+}
+
+type_init(can_host_register_types);
diff --git a/net/can/meson.build b/net/can/meson.build
new file mode 100644
index 000000000..f53d9ec54
--- /dev/null
+++ b/net/can/meson.build
@@ -0,0 +1,5 @@
+can_ss = ss.source_set() # source set collecting the CAN bus emulation files
+can_ss.add(files('can_core.c', 'can_host.c')) # bus core and generic host connection: added unconditionally
+can_ss.add(when: 'CONFIG_LINUX', if_true: files('can_socketcan.c')) # SocketCAN backend: only when CONFIG_LINUX is set
+
+softmmu_ss.add_all(when: 'CONFIG_CAN_BUS', if_true: can_ss) # the whole set is pulled in only when CONFIG_CAN_BUS is set
diff --git a/net/checksum.c b/net/checksum.c
new file mode 100644
index 000000000..68245fd74
--- /dev/null
+++ b/net/checksum.c
@@ -0,0 +1,208 @@
+/*
+ * IP checksumming functions.
+ * (c) 2008 Gerd Hoffmann <kraxel@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; under version 2 or later of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "net/checksum.h"
+#include "net/eth.h"
+
+/*
+ * Accumulate the (unfolded) 16-bit one's-complement sum of 'len' bytes
+ * at 'buf'.
+ *
+ * 'seq' is the byte offset of buf[0] within the overall data being
+ * summed: for an even offset the bytes at even indices are the high
+ * halves of the 16-bit words, for an odd offset the low halves.
+ */
+uint32_t net_checksum_add_cont(int len, uint8_t *buf, int seq)
+{
+    uint32_t even_bytes = 0;   /* buf[0], buf[2], ... */
+    uint32_t odd_bytes = 0;    /* buf[1], buf[3], ... */
+    const int last_pair = len - 1;
+    int pos = 0;
+
+    while (pos < last_pair) {
+        even_bytes += (uint32_t)buf[pos];
+        odd_bytes += (uint32_t)buf[pos + 1];
+        pos += 2;
+    }
+
+    /* trailing byte of an odd-length buffer */
+    if (pos < len) {
+        even_bytes += (uint32_t)buf[pos];
+    }
+
+    return (seq & 1) ? even_bytes + (odd_bytes << 8)
+                     : odd_bytes + (even_bytes << 8);
+}
+
+/*
+ * Fold a 32-bit accumulated sum down to 16 bits (adding the carries
+ * back in) and return its one's complement.
+ */
+uint16_t net_checksum_finish(uint32_t sum)
+{
+    while ((sum >> 16) != 0) {
+        uint32_t carry = sum >> 16;
+
+        sum = (sum & 0xFFFF) + carry;
+    }
+
+    return ~sum;
+}
+
+/*
+ * Compute the TCP/UDP checksum including the IPv4 pseudo header.
+ *
+ * length: number of bytes at 'buf' (L4 header plus payload)
+ * proto:  IP protocol number
+ * addrs:  8 bytes: source address followed by destination address
+ * buf:    start of the L4 header
+ */
+uint16_t net_checksum_tcpudp(uint16_t length, uint16_t proto,
+                             uint8_t *addrs, uint8_t *buf)
+{
+    uint32_t acc = 0;
+
+    acc += proto + length;                   /* protocol & length */
+    acc += net_checksum_add(8, addrs);       /* src + dst address */
+    acc += net_checksum_add(length, buf);    /* payload */
+
+    return net_checksum_finish(acc);
+}
+
+/*
+ * Fill in the IPv4 header checksum and/or the TCP/UDP checksum of the
+ * Ethernet frame at 'data', in place.
+ *
+ * data:      start of the Ethernet header (may carry up to two VLAN tags)
+ * length:    number of valid bytes at 'data'
+ * csum_flag: any combination of CSUM_IP, CSUM_TCP and CSUM_UDP selecting
+ *            which checksums to compute
+ *
+ * The frame is left untouched if it is too short for the headers it
+ * claims to carry, is not IPv4, or has an inconsistent IP header length.
+ * For IP fragments and for frames shorter than the IP total length only
+ * the IP header checksum is computed.
+ */
+void net_checksum_calculate(uint8_t *data, int length, int csum_flag)
+{
+    /*
+     * Keep the header sizes as int: 'length' is signed, and comparing it
+     * against a size_t would turn a negative length into a huge value
+     * that passes every "is it long enough" check.
+     */
+    const int eth_len = sizeof(struct eth_header);
+    const int vlan_len = sizeof(struct vlan_header);
+    int mac_hdr_len, ip_hdr_len, ip_len;
+    struct ip_header *ip;
+    uint8_t *l4;
+    uint16_t csum;
+
+    /*
+     * Note: We cannot assume "data" is aligned, so all the code uses
+     * some macros that take care of possible unaligned access for
+     * struct members (just in case).
+     */
+
+    /* Ensure we have at least an Eth header */
+    if (length < eth_len) {
+        return;
+    }
+
+    /* Handle the optional VLAN headers */
+    switch (lduw_be_p(&PKT_GET_ETH_HDR(data)->h_proto)) {
+    case ETH_P_VLAN:
+        mac_hdr_len = eth_len + vlan_len;
+        break;
+    case ETH_P_DVLAN:
+        /* The inner ethertype lives in the first VLAN header. */
+        if (length < eth_len + vlan_len) {
+            return;
+        }
+        if (lduw_be_p(&PKT_GET_VLAN_HDR(data)->h_proto) == ETH_P_VLAN) {
+            mac_hdr_len = eth_len + 2 * vlan_len;
+        } else {
+            mac_hdr_len = eth_len + vlan_len;
+        }
+        break;
+    default:
+        mac_hdr_len = eth_len;
+        break;
+    }
+
+    /* The whole MAC header (including VLAN tags) must be present */
+    if (length < mac_hdr_len) {
+        return;
+    }
+    length -= mac_hdr_len;
+
+    /* Now check we have an IP header (after the optional VLAN headers) */
+    if (length < (int)sizeof(struct ip_header)) {
+        return;
+    }
+
+    ip = (struct ip_header *)(data + mac_hdr_len);
+
+    if (IP_HEADER_VERSION(ip) != IP_HEADER_VERSION_4) {
+        return; /* not IPv4 */
+    }
+
+    /* The header length field must be sane and fit into the frame */
+    ip_hdr_len = IP_HDR_GET_LEN(ip);
+    if (ip_hdr_len < (int)sizeof(struct ip_header) || ip_hdr_len > length) {
+        return;
+    }
+
+    /* Calculate IP checksum */
+    if (csum_flag & CSUM_IP) {
+        stw_he_p(&ip->ip_sum, 0);
+        csum = net_raw_checksum((uint8_t *)ip, ip_hdr_len);
+        stw_be_p(&ip->ip_sum, csum);
+    }
+
+    if (IP4_IS_FRAGMENT(ip)) {
+        return; /* a fragmented IP packet */
+    }
+
+    ip_len = lduw_be_p(&ip->ip_len);
+
+    /* Last, check that we have enough data for the whole IP packet */
+    if (length < ip_len || ip_len < ip_hdr_len) {
+        return;
+    }
+
+    /* From here on ip_len is the L4 length: header plus payload */
+    ip_len -= ip_hdr_len;
+
+    /* The L4 header follows the IP header including any IP options */
+    l4 = (uint8_t *)ip + ip_hdr_len;
+
+    switch (ip->ip_p) {
+    case IP_PROTO_TCP:
+    {
+        tcp_header *tcp = (tcp_header *)l4;
+
+        if (!(csum_flag & CSUM_TCP)) {
+            return;
+        }
+
+        if (ip_len < (int)sizeof(tcp_header)) {
+            return;
+        }
+
+        /* Set csum to 0 */
+        stw_he_p(&tcp->th_sum, 0);
+
+        csum = net_checksum_tcpudp(ip_len, ip->ip_p,
+                                   (uint8_t *)&ip->ip_src,
+                                   l4);
+
+        /* Store computed csum */
+        stw_be_p(&tcp->th_sum, csum);
+
+        break;
+    }
+    case IP_PROTO_UDP:
+    {
+        udp_header *udp = (udp_header *)l4;
+
+        if (!(csum_flag & CSUM_UDP)) {
+            return;
+        }
+
+        if (ip_len < (int)sizeof(udp_header)) {
+            return;
+        }
+
+        /* Set csum to 0 */
+        stw_he_p(&udp->uh_sum, 0);
+
+        csum = net_checksum_tcpudp(ip_len, ip->ip_p,
+                                   (uint8_t *)&ip->ip_src,
+                                   l4);
+
+        /* Store computed csum */
+        stw_be_p(&udp->uh_sum, csum);
+
+        break;
+    }
+    default:
+        /* Can't handle any other protocol */
+        break;
+    }
+}
+
+/*
+ * Accumulate the (unfolded) checksum of 'size' bytes of a scatter/gather
+ * list, starting 'iov_off' bytes into it.
+ *
+ * 'csum_offset' is the byte offset of the first summed byte within the
+ * data being checksummed; it is advanced per chunk so that each chunk is
+ * summed with the right byte parity. Summing stops early if the list
+ * ends before 'size' bytes were consumed.
+ */
+uint32_t
+net_checksum_add_iov(const struct iovec *iov, const unsigned int iov_cnt,
+                     uint32_t iov_off, uint32_t size, uint32_t csum_offset)
+{
+    size_t seg_start = 0;   /* offset of iov[idx] within the whole list */
+    uint32_t acc = 0;
+    unsigned int idx;
+
+    for (idx = 0; idx < iov_cnt && size; idx++) {
+        const size_t seg_end = seg_start + iov[idx].iov_len;
+
+        if (iov_off < seg_end) {
+            size_t chunk = MIN(seg_end - iov_off, size);
+            void *chunk_buf = (uint8_t *)iov[idx].iov_base +
+                              (iov_off - seg_start);
+
+            acc += net_checksum_add_cont(chunk, chunk_buf, csum_offset);
+
+            csum_offset += chunk;
+            iov_off += chunk;
+            size -= chunk;
+        }
+
+        seg_start = seg_end;
+    }
+
+    return acc;
+}
diff --git a/net/clients.h b/net/clients.h
new file mode 100644
index 000000000..92f9b59ae
--- /dev/null
+++ b/net/clients.h
@@ -0,0 +1,66 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef QEMU_NET_CLIENTS_H
+#define QEMU_NET_CLIENTS_H
+
+#include "net/net.h"
+
+int net_init_dump(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp);
+
+#ifdef CONFIG_SLIRP
+int net_init_slirp(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp);
+#endif /* CONFIG_SLIRP */
+
+int net_init_hubport(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp);
+
+int net_init_socket(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp);
+
+int net_init_tap(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp);
+
+int net_init_bridge(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp);
+
+int net_init_l2tpv3(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp);
+#ifdef CONFIG_VDE
+int net_init_vde(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp);
+#endif /* CONFIG_VDE */
+
+#ifdef CONFIG_NETMAP
+int net_init_netmap(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp);
+#endif /* CONFIG_NETMAP */
+
+int net_init_vhost_user(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp);
+
+int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp);
+#endif /* QEMU_NET_CLIENTS_H */
diff --git a/net/colo-compare.c b/net/colo-compare.c
new file mode 100644
index 000000000..b966e7e51
--- /dev/null
+++ b/net/colo-compare.c
@@ -0,0 +1,1502 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+#include "qapi/error.h"
+#include "net/net.h"
+#include "net/eth.h"
+#include "qom/object_interfaces.h"
+#include "qemu/iov.h"
+#include "qom/object.h"
+#include "net/queue.h"
+#include "chardev/char-fe.h"
+#include "qemu/sockets.h"
+#include "colo.h"
+#include "sysemu/iothread.h"
+#include "net/colo-compare.h"
+#include "migration/colo.h"
+#include "migration/migration.h"
+#include "util.h"
+
+#include "block/aio-wait.h"
+#include "qemu/coroutine.h"
+
+/* QOM type name and instance cast helper for the colo-compare object */
+#define TYPE_COLO_COMPARE "colo-compare"
+typedef struct CompareState CompareState;
+DECLARE_INSTANCE_CHECKER(CompareState, COLO_COMPARE,
+ TYPE_COLO_COMPARE)
+
+/* List of CompareState instances, walked by colo_notify_compares_event() */
+static QTAILQ_HEAD(, CompareState) net_compares =
+ QTAILQ_HEAD_INITIALIZER(net_compares);
+
+/* Notified on an inconsistency when no notify_dev is configured */
+static NotifierList colo_compare_notifiers =
+ NOTIFIER_LIST_INITIALIZER(colo_compare_notifiers);
+
+/* Value reported by compare_chr_can_read() */
+#define COMPARE_READ_LEN_MAX NET_BUFSIZE
+/* Default for max_queue_size when the property is left at 0 */
+#define MAX_QUEUE_SIZE 1024
+
+/* Bits of the "mark" produced by colo_mark_tcp_pkt() */
+#define COLO_COMPARE_FREE_PRIMARY 0x01
+#define COLO_COMPARE_FREE_SECONDARY 0x02
+
+/* Defaults for expired_scan_cycle and compare_timeout, in milliseconds */
+#define REGULAR_PACKET_CHECK_MS 1000
+#define DEFAULT_TIME_OUT_MS 3000
+
+/* Define to hexdump mismatching packets to stderr */
+/* #define DEBUG_COLO_PACKETS */
+
+/* Held by colo_notify_compares_event() while it reads colo_compare_active */
+static QemuMutex colo_compare_mutex;
+static bool colo_compare_active;
+/*
+ * event_mtx protects event_unhandled_count; event_complete_cond is
+ * broadcast each time a compare instance has finished handling an event.
+ */
+static QemuMutex event_mtx;
+static QemuCond event_complete_cond;
+static int event_unhandled_count;
+/* Per-connection queue length limit checked by colo_insert_packet() */
+static uint32_t max_queue_size;
+
+/*
+ * + CompareState ++
+ * | |
+ * +---------------+ +---------------+ +---------------+
+ * | conn list + - > conn + ------- > conn + -- > ......
+ * +---------------+ +---------------+ +---------------+
+ * | | | | | |
+ * +---------------+ +---v----+ +---v----+ +---v----+ +---v----+
+ * |primary | |secondary |primary | |secondary
+ * |packet | |packet + |packet | |packet +
+ * +--------+ +--------+ +--------+ +--------+
+ * | | | |
+ * +---v----+ +---v----+ +---v----+ +---v----+
+ * |primary | |secondary |primary | |secondary
+ * |packet | |packet + |packet | |packet +
+ * +--------+ +--------+ +--------+ +--------+
+ * | | | |
+ * +---v----+ +---v----+ +---v----+ +---v----+
+ * |primary | |secondary |primary | |secondary
+ * |packet | |packet + |packet | |packet +
+ * +--------+ +--------+ +--------+ +--------+
+ */
+
+/*
+ * State of one outgoing channel (output or notify chardev) and of the
+ * coroutine that drains it.
+ */
+typedef struct SendCo {
+ Coroutine *co; /* running _compare_chr_send() coroutine, NULL once done */
+ struct CompareState *s; /* owning compare instance */
+ CharBackend *chr; /* chardev the entries are written to */
+ GQueue send_list; /* SendEntry items: pushed at head, popped at tail */
+ bool notify_remote_frame; /* true for the notify channel */
+ bool done; /* no coroutine is currently draining send_list */
+ int ret; /* result of the last coroutine run */
+} SendCo;
+
+/* One queued buffer waiting to be written by _compare_chr_send() */
+typedef struct SendEntry {
+ uint32_t size; /* number of bytes in buf */
+ uint32_t vnet_hdr_len; /* sent ahead of the payload when vnet_hdr is on */
+ uint8_t *buf; /* released with g_free() after the write */
+} SendEntry;
+
+/* Per-object state of one colo-compare instance */
+struct CompareState {
+ Object parent;
+
+ /* chardev ids configured through the object properties */
+ char *pri_indev;
+ char *sec_indev;
+ char *outdev;
+ char *notify_dev;
+ /* front ends bound to the chardevs named above */
+ CharBackend chr_pri_in;
+ CharBackend chr_sec_in;
+ CharBackend chr_out;
+ CharBackend chr_notify_dev;
+ /* packet reassembly state fed by the three input handlers */
+ SocketReadState pri_rs;
+ SocketReadState sec_rs;
+ SocketReadState notify_rs;
+ /* outgoing queues for chr_out and chr_notify_dev */
+ SendCo out_sendco;
+ SendCo notify_sendco;
+ bool vnet_hdr; /* send/expect a vnet header length per packet */
+ uint64_t compare_timeout; /* ms before a queued packet counts as old */
+ uint32_t expired_scan_cycle; /* ms between old-packet scans */
+
+ /*
+ * Record the connection that through the NIC
+ * Element type: Connection
+ */
+ GQueue conn_list;
+ /* Record the connection without repetition */
+ GHashTable *connection_track_table;
+
+ IOThread *iothread;
+ GMainContext *worker_context; /* GMainContext of iothread */
+ QEMUTimer *packet_check_timer; /* drives check_old_packet_regular() */
+
+ QEMUBH *event_bh; /* runs colo_compare_handle_event() */
+ enum colo_event event; /* event to handle in event_bh */
+
+ QTAILQ_ENTRY(CompareState) next; /* link in net_compares */
+};
+
+/* Class structure of the colo-compare type; adds nothing to ObjectClass */
+typedef struct CompareClass {
+ ObjectClass parent_class;
+} CompareClass;
+
+/* Which input a packet arrived on; also indexes colo_mode[] */
+enum {
+ PRIMARY_IN = 0,
+ SECONDARY_IN,
+};
+
+/* Human-readable input names, used in trace output */
+static const char *colo_mode[] = {
+ [PRIMARY_IN] = "primary",
+ [SECONDARY_IN] = "secondary",
+};
+
+/* Forward declaration: used by helpers that precede the definition */
+static int compare_chr_send(CompareState *s,
+ uint8_t *buf,
+ uint32_t size,
+ uint32_t vnet_hdr_len,
+ bool notify_remote_frame,
+ bool zero_copy);
+
+/*
+ * Return true when the packet_len bytes at buf are exactly the
+ * characters of str (terminating NUL excluded).
+ */
+static bool packet_matches_str(const char *str,
+                               const uint8_t *buf,
+                               uint32_t packet_len)
+{
+    bool same_length = (packet_len == strlen(str));
+
+    return same_length && memcmp(str, buf, packet_len) == 0;
+}
+
+/*
+ * Queue the "DO_CHECKPOINT" message on the notify channel and report
+ * an error if queuing it fails.
+ */
+static void notify_remote_frame(CompareState *s)
+{
+    char msg[] = "DO_CHECKPOINT";
+
+    if (compare_chr_send(s, (uint8_t *)msg, strlen(msg), 0, true, false) < 0) {
+        error_report("Notify Xen COLO-frame failed");
+    }
+}
+
+/*
+ * Signal that primary and secondary diverged: through the notify
+ * chardev when one is configured, otherwise through the registered
+ * notifier list.
+ */
+static void colo_compare_inconsistency_notify(CompareState *s)
+{
+    if (!s->notify_dev) {
+        notifier_list_notify(&colo_compare_notifiers,
+                             migrate_get_current());
+        return;
+    }
+
+    notify_remote_frame(s);
+}
+
+/*
+ * Use restricted to colo_insert_packet()
+ *
+ * Comparison callback handed to g_queue_insert_sorted(): the result is
+ * the difference between the two packets' tcp_seq values. data is unused.
+ */
+static gint seq_sorter(Packet *a, Packet *b, gpointer data)
+{
+ return a->tcp_seq - b->tcp_seq;
+}
+
+/*
+ * Decode the TCP header of the packet passed in data and cache the
+ * derived values in the Packet: host-order sequence and ack numbers,
+ * total header size, payload size, end sequence number and flags.
+ * *max_ack is updated from the packet's ack number.
+ */
+static void fill_pkt_tcp_info(void *data, uint32_t *max_ack)
+{
+ Packet *pkt = data;
+ struct tcp_hdr *tcphd;
+
+ tcphd = (struct tcp_hdr *)pkt->transport_header;
+
+ pkt->tcp_seq = ntohl(tcphd->th_seq);
+ pkt->tcp_ack = ntohl(tcphd->th_ack);
+ /* Need to consider ACK will bigger than uint32_t MAX */
+ /*
+ * NOTE(review): if tcp_ack is an unsigned 32-bit field the difference
+ * below is unsigned, so the test holds whenever the two values differ
+ * and *max_ack follows the most recent ack rather than the largest
+ * one - confirm the intent.
+ */
+ *max_ack = pkt->tcp_ack - *max_ack > 0 ? pkt->tcp_ack : *max_ack;
+ /* bytes from the start of the packet data to the end of the TCP header */
+ pkt->header_size = pkt->transport_header - (uint8_t *)pkt->data
+ + (tcphd->th_off << 2);
+ pkt->payload_size = pkt->size - pkt->header_size;
+ pkt->seq_end = pkt->tcp_seq + pkt->payload_size;
+ pkt->flags = tcphd->th_flags;
+}
+
+/*
+ * Add pkt to queue unless the queue already holds more than
+ * max_queue_size packets. TCP packets are inserted in seq_sorter()
+ * order after their TCP info has been filled in; everything else is
+ * appended at the tail.
+ *
+ * Returns 1 when the packet was queued, 0 when the caller has to
+ * drop it.
+ */
+static int colo_insert_packet(GQueue *queue, Packet *pkt, uint32_t *max_ack)
+{
+    if (g_queue_get_length(queue) > max_queue_size) {
+        return 0;
+    }
+
+    if (pkt->ip->ip_p != IPPROTO_TCP) {
+        g_queue_push_tail(queue, pkt);
+        return 1;
+    }
+
+    fill_pkt_tcp_info(pkt, max_ack);
+    g_queue_insert_sorted(queue, pkt, (GCompareDataFunc)seq_sorter, NULL);
+    return 1;
+}
+
+/*
+ * Wrap the packet just completed on the primary or secondary read
+ * state (selected by mode) in a Packet, look up its connection and
+ * queue it on that connection's primary or secondary list.
+ *
+ * Returns -1 when parse_packet_early() rejects the packet; the caller
+ * then deals with the raw buffer itself. Returns 0 otherwise and
+ * stores the connection in *con, even when the packet had to be
+ * dropped because the queue was full.
+ */
+static int packet_enqueue(CompareState *s, int mode, Connection **con)
+{
+    SocketReadState *rs = (mode == PRIMARY_IN) ? &s->pri_rs : &s->sec_rs;
+    ConnectionKey key;
+    Connection *conn;
+    Packet *pkt;
+    int queued;
+
+    pkt = packet_new(rs->buf, rs->packet_len, rs->vnet_hdr_len);
+
+    if (parse_packet_early(pkt)) {
+        packet_destroy(pkt, NULL);
+        return -1;
+    }
+    fill_connection_key(pkt, &key, false);
+
+    conn = connection_get(s->connection_track_table, &key, &s->conn_list);
+
+    /* put the connection on conn_list the first time it is seen */
+    if (!conn->processing) {
+        g_queue_push_tail(&s->conn_list, conn);
+        conn->processing = true;
+    }
+
+    if (mode == PRIMARY_IN) {
+        queued = colo_insert_packet(&conn->primary_list, pkt, &conn->pack);
+    } else {
+        queued = colo_insert_packet(&conn->secondary_list, pkt, &conn->sack);
+    }
+
+    if (!queued) {
+        trace_colo_compare_drop_packet(colo_mode[mode],
+            "queue size too big, drop packet");
+        packet_destroy(pkt, NULL);
+    }
+
+    *con = conn;
+
+    return 0;
+}
+
+/*
+ * Wrap-around aware sequence comparison: true when seq1 is ahead of
+ * seq2, judged by the sign of their 32-bit difference.
+ */
+static inline bool after(uint32_t seq1, uint32_t seq2)
+{
+    int32_t delta = (int32_t)(seq1 - seq2);
+
+    return delta > 0;
+}
+
+/*
+ * Hand a primary packet to the output channel in zero-copy mode and
+ * then release the Packet with packet_destroy_partial(). A send
+ * failure is only reported; the packet is released either way.
+ */
+static void colo_release_primary_pkt(CompareState *s, Packet *pkt)
+{
+    if (compare_chr_send(s, pkt->data, pkt->size, pkt->vnet_hdr_len,
+                         false, true) < 0) {
+        error_report("colo send primary packet failed");
+    }
+
+    trace_colo_compare_main("packet same and release packet");
+    packet_destroy_partial(pkt, NULL);
+}
+
+/*
+ * The IP packets sent by primary and secondary
+ * will be compared in here
+ * TODO support ip fragment, Out-Of-Order
+ *
+ * Compares len bytes starting poffset bytes into the primary packet
+ * data with len bytes starting soffset bytes into the secondary
+ * packet data.
+ *
+ * return: 0 means packet same
+ * > 0 || < 0 means packet different
+ */
+static int colo_compare_packet_payload(Packet *ppkt,
+ Packet *spkt,
+ uint16_t poffset,
+ uint16_t soffset,
+ uint16_t len)
+
+{
+ /* only format the addresses when the ip_info trace event is enabled */
+ if (trace_event_get_state_backends(TRACE_COLO_COMPARE_IP_INFO)) {
+ char pri_ip_src[20], pri_ip_dst[20], sec_ip_src[20], sec_ip_dst[20];
+
+ /* each inet_ntoa() result is copied out before the next call */
+ strcpy(pri_ip_src, inet_ntoa(ppkt->ip->ip_src));
+ strcpy(pri_ip_dst, inet_ntoa(ppkt->ip->ip_dst));
+ strcpy(sec_ip_src, inet_ntoa(spkt->ip->ip_src));
+ strcpy(sec_ip_dst, inet_ntoa(spkt->ip->ip_dst));
+
+ trace_colo_compare_ip_info(ppkt->size, pri_ip_src,
+ pri_ip_dst, spkt->size,
+ sec_ip_src, sec_ip_dst);
+ }
+
+ return memcmp(ppkt->data + poffset, spkt->data + soffset, len);
+}
+
+/*
+ * Compare the not-yet-compared payload of a primary and a secondary
+ * TCP packet and tell the caller which of the two can be released.
+ *
+ * On return *mark holds COLO_COMPARE_FREE_PRIMARY,
+ * COLO_COMPARE_FREE_SECONDARY, both, or 0.
+ *
+ * Returns true when the compared payload is consistent and the caller
+ * should go on with the next comparison, false when a checkpoint is
+ * needed.
+ */
+static bool colo_mark_tcp_pkt(Packet *ppkt, Packet *spkt,
+ int8_t *mark, uint32_t max_ack)
+{
+ *mark = 0;
+
+ /* same sequence range on both sides: compare the whole payload */
+ if (ppkt->tcp_seq == spkt->tcp_seq && ppkt->seq_end == spkt->seq_end) {
+ if (!colo_compare_packet_payload(ppkt, spkt,
+ ppkt->header_size, spkt->header_size,
+ ppkt->payload_size)) {
+ *mark = COLO_COMPARE_FREE_SECONDARY | COLO_COMPARE_FREE_PRIMARY;
+ return true;
+ }
+ }
+
+ /* one part of secondary packet payload still need to be compared */
+ if (!after(ppkt->seq_end, spkt->seq_end)) {
+ if (!colo_compare_packet_payload(ppkt, spkt,
+ ppkt->header_size + ppkt->offset,
+ spkt->header_size + spkt->offset,
+ ppkt->payload_size - ppkt->offset)) {
+ if (!after(ppkt->tcp_ack, max_ack)) {
+ *mark = COLO_COMPARE_FREE_PRIMARY;
+ /* remember how far into the secondary payload we got */
+ spkt->offset += ppkt->payload_size - ppkt->offset;
+ return true;
+ } else {
+ /* secondary guest hasn't ack the data, don't send
+ * out this packet
+ */
+ return false;
+ }
+ }
+ } else {
+ /* primary packet is longer than secondary packet, compare
+ * the same part and mark the primary packet offset
+ */
+ if (!colo_compare_packet_payload(ppkt, spkt,
+ ppkt->header_size + ppkt->offset,
+ spkt->header_size + spkt->offset,
+ spkt->payload_size - spkt->offset)) {
+ *mark = COLO_COMPARE_FREE_SECONDARY;
+ ppkt->offset += spkt->payload_size - spkt->offset;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/*
+ * Walk the primary and secondary TCP queues of one connection in
+ * parallel. Packets without payload and packets that lie entirely at
+ * or before conn->compare_seq are passed through (primary) or dropped
+ * (secondary) without comparison; the rest is compared with
+ * colo_mark_tcp_pkt(). A mismatch puts both packets back at the head
+ * of their queues and raises an inconsistency notification.
+ */
+static void colo_compare_tcp(CompareState *s, Connection *conn)
+{
+    Packet *ppkt = NULL, *spkt = NULL;
+    int8_t mark;
+
+    /*
+     * If ppkt and spkt have the same payload, but ppkt's ACK
+     * is greater than spkt's ACK, in this case we can not
+     * send the ppkt because it will cause the secondary guest
+     * to miss sending some data in the next. Therefore, we
+     * record the maximum ACK in the current queue at both
+     * primary side and secondary side. Only when the ack is
+     * less than the smaller of the two maximum ack, then we
+     * can ensure that the packet's payload is acknowledged by
+     * primary and secondary.
+     *
+     * pack and sack are uint32_t, so "pack - sack > 0" would hold
+     * whenever they differ; after() compares them modulo 2^32 and
+     * really selects the smaller one.
+     */
+    uint32_t min_ack = after(conn->pack, conn->sack) ?
+                       conn->sack : conn->pack;
+
+pri:
+    if (g_queue_is_empty(&conn->primary_list)) {
+        return;
+    }
+    ppkt = g_queue_pop_head(&conn->primary_list);
+sec:
+    if (g_queue_is_empty(&conn->secondary_list)) {
+        g_queue_push_head(&conn->primary_list, ppkt);
+        return;
+    }
+    spkt = g_queue_pop_head(&conn->secondary_list);
+
+    /* a primary packet without payload is forwarded right away */
+    if (ppkt->tcp_seq == ppkt->seq_end) {
+        colo_release_primary_pkt(s, ppkt);
+        ppkt = NULL;
+    }
+
+    if (ppkt && conn->compare_seq && !after(ppkt->seq_end, conn->compare_seq)) {
+        trace_colo_compare_main("pri: this packet has compared");
+        colo_release_primary_pkt(s, ppkt);
+        ppkt = NULL;
+    }
+
+    /* a secondary packet without payload is simply dropped */
+    if (spkt->tcp_seq == spkt->seq_end) {
+        packet_destroy(spkt, NULL);
+        if (!ppkt) {
+            goto pri;
+        } else {
+            goto sec;
+        }
+    } else {
+        if (conn->compare_seq && !after(spkt->seq_end, conn->compare_seq)) {
+            trace_colo_compare_main("sec: this packet has compared");
+            packet_destroy(spkt, NULL);
+            if (!ppkt) {
+                goto pri;
+            } else {
+                goto sec;
+            }
+        }
+        if (!ppkt) {
+            /* primary packet already released: keep spkt for the next one */
+            g_queue_push_head(&conn->secondary_list, spkt);
+            goto pri;
+        }
+    }
+
+    if (colo_mark_tcp_pkt(ppkt, spkt, &mark, min_ack)) {
+        trace_colo_compare_tcp_info("pri",
+                                    ppkt->tcp_seq, ppkt->tcp_ack,
+                                    ppkt->header_size, ppkt->payload_size,
+                                    ppkt->offset, ppkt->flags);
+
+        trace_colo_compare_tcp_info("sec",
+                                    spkt->tcp_seq, spkt->tcp_ack,
+                                    spkt->header_size, spkt->payload_size,
+                                    spkt->offset, spkt->flags);
+
+        if (mark == COLO_COMPARE_FREE_PRIMARY) {
+            conn->compare_seq = ppkt->seq_end;
+            colo_release_primary_pkt(s, ppkt);
+            g_queue_push_head(&conn->secondary_list, spkt);
+            goto pri;
+        } else if (mark == COLO_COMPARE_FREE_SECONDARY) {
+            conn->compare_seq = spkt->seq_end;
+            packet_destroy(spkt, NULL);
+            goto sec;
+        } else if (mark == (COLO_COMPARE_FREE_PRIMARY | COLO_COMPARE_FREE_SECONDARY)) {
+            conn->compare_seq = ppkt->seq_end;
+            colo_release_primary_pkt(s, ppkt);
+            packet_destroy(spkt, NULL);
+            goto pri;
+        }
+    } else {
+        g_queue_push_head(&conn->primary_list, ppkt);
+        g_queue_push_head(&conn->secondary_list, spkt);
+
+#ifdef DEBUG_COLO_PACKETS
+        qemu_hexdump(stderr, "colo-compare ppkt", ppkt->data, ppkt->size);
+        qemu_hexdump(stderr, "colo-compare spkt", spkt->data, spkt->size);
+#endif
+
+        colo_compare_inconsistency_notify(s);
+    }
+}
+
+
+/*
+ * g_queue_find_custom() callback used for UDP connections: spkt is an
+ * element of the secondary list, ppkt the primary packet searched for.
+ *
+ * Both packets belong to the same connection, so addresses, ports and
+ * protocol already agree, and the remaining IP header fields
+ * (identification, TOS, TTL, checksum) are deliberately ignored. Only
+ * the bytes behind the IP header are compared.
+ *
+ * Returns 0 when the packets match, -1 otherwise.
+ */
+static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
+{
+    uint16_t ip_hdr_len = ppkt->ip->ip_hl << 2;
+    uint16_t payload_off = ip_hdr_len + ETH_HLEN + ppkt->vnet_hdr_len;
+
+    trace_colo_compare_main("compare udp");
+
+    if (ppkt->size != spkt->size) {
+        trace_colo_compare_main("UDP: payload size of packets are different");
+        return -1;
+    }
+
+    if (!colo_compare_packet_payload(ppkt, spkt, payload_off, payload_off,
+                                     ppkt->size - payload_off)) {
+        return 0;
+    }
+
+    trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size);
+    trace_colo_compare_udp_miscompare("Secondary pkt size", spkt->size);
+#ifdef DEBUG_COLO_PACKETS
+    qemu_hexdump(stderr, "colo-compare pri pkt", ppkt->data, ppkt->size);
+    qemu_hexdump(stderr, "colo-compare sec pkt", spkt->data, spkt->size);
+#endif
+    return -1;
+}
+
+/*
+ * g_queue_find_custom() callback used for ICMP connections: spkt is an
+ * element of the secondary list, ppkt the primary packet searched for.
+ *
+ * As both packets belong to the same connection the IP header is not
+ * inspected (identification, TOS, TTL and checksum may differ); only
+ * the bytes behind the IP header are compared.
+ *
+ * Returns 0 when the packets match, -1 otherwise.
+ */
+static int colo_packet_compare_icmp(Packet *spkt, Packet *ppkt)
+{
+    uint16_t ip_hdr_len = ppkt->ip->ip_hl << 2;
+    uint16_t payload_off = ip_hdr_len + ETH_HLEN + ppkt->vnet_hdr_len;
+
+    trace_colo_compare_main("compare icmp");
+
+    if (ppkt->size != spkt->size) {
+        trace_colo_compare_main("ICMP: payload size of packets are different");
+        return -1;
+    }
+
+    if (!colo_compare_packet_payload(ppkt, spkt, payload_off, payload_off,
+                                     ppkt->size - payload_off)) {
+        return 0;
+    }
+
+    trace_colo_compare_icmp_miscompare("primary pkt size",
+                                       ppkt->size);
+    trace_colo_compare_icmp_miscompare("Secondary pkt size",
+                                       spkt->size);
+#ifdef DEBUG_COLO_PACKETS
+    qemu_hexdump(stderr, "colo-compare pri pkt", ppkt->data, ppkt->size);
+    qemu_hexdump(stderr, "colo-compare sec pkt", spkt->data, spkt->size);
+#endif
+    return -1;
+}
+
+/*
+ * g_queue_find_custom() callback for every protocol without a
+ * dedicated comparison: everything behind the vnet header is compared.
+ *
+ * Returns 0 when the packets match; -1 on a size mismatch, otherwise
+ * the non-zero memcmp() result.
+ */
+static int colo_packet_compare_other(Packet *spkt, Packet *ppkt)
+{
+    uint16_t skip = ppkt->vnet_hdr_len;
+
+    trace_colo_compare_main("compare other");
+
+    if (ppkt->size == spkt->size) {
+        return colo_compare_packet_payload(ppkt, spkt, skip, skip,
+                                           ppkt->size - skip);
+    }
+
+    trace_colo_compare_main("Other: payload size of packets are different");
+    return -1;
+}
+
+/*
+ * g_queue_find_custom() callback: "matches" (returns 0) when pkt has
+ * been queued for more than *check_time milliseconds, returns 1
+ * otherwise.
+ */
+static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time)
+{
+    int64_t now = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+
+    if ((now - pkt->creation_ms) <= (*check_time)) {
+        return 1;
+    }
+
+    trace_colo_old_packet_check_found(pkt->creation_ms);
+    return 0;
+}
+
+/*
+ * Add notify to the list that colo_compare_inconsistency_notify()
+ * triggers when no notify_dev is configured.
+ */
+void colo_compare_register_notifier(Notifier *notify)
+{
+ notifier_list_add(&colo_compare_notifiers, notify);
+}
+
+/* Counterpart of colo_compare_register_notifier(): removes notify again */
+void colo_compare_unregister_notifier(Notifier *notify)
+{
+ notifier_remove(notify);
+}
+
+/*
+ * g_queue_find_custom() callback over conn_list: look for a packet
+ * older than s->compare_timeout, first in the primary list and then
+ * in the secondary list of conn.
+ *
+ * Returns 0 (which stops the search) after requesting a checkpoint
+ * when such a packet exists, 1 otherwise.
+ */
+static int colo_old_packet_check_one_conn(Connection *conn,
+                                          CompareState *s)
+{
+    bool expired = false;
+
+    if (!g_queue_is_empty(&conn->primary_list) &&
+        g_queue_find_custom(&conn->primary_list,
+                            &s->compare_timeout,
+                            (GCompareFunc)colo_old_packet_check_one)) {
+        expired = true;
+    } else if (!g_queue_is_empty(&conn->secondary_list) &&
+               g_queue_find_custom(&conn->secondary_list,
+                                   &s->compare_timeout,
+                                   (GCompareFunc)colo_old_packet_check_one)) {
+        expired = true;
+    }
+
+    if (!expired) {
+        return 1;
+    }
+
+    /* Do checkpoint will flush old packet */
+    colo_compare_inconsistency_notify(s);
+    return 0;
+}
+
+/*
+ * Look for old packets that the secondary hasn't matched,
+ * if we have some then we have to checkpoint to wake
+ * the secondary up.
+ *
+ * opaque is the CompareState whose conn_list is scanned.
+ */
+static void colo_old_packet_check(void *opaque)
+{
+ CompareState *s = opaque;
+
+ /*
+ * If we find one old packet, stop finding job and notify
+ * COLO frame do checkpoint.
+ */
+ g_queue_find_custom(&s->conn_list, s,
+ (GCompareFunc)colo_old_packet_check_one_conn);
+}
+
+/*
+ * Generic comparison loop for non-TCP connections. While both lists
+ * hold packets, take the head of the primary list and search the
+ * secondary list for a packet HandlePacket() accepts (returns 0 for).
+ * A match releases the primary packet to the output and destroys the
+ * secondary one; a miss puts the primary packet back, raises an
+ * inconsistency notification and ends the loop.
+ */
+static void colo_compare_packet(CompareState *s, Connection *conn,
+                                int (*HandlePacket)(Packet *spkt,
+                                                    Packet *ppkt))
+{
+    for (;;) {
+        Packet *ppkt;
+        GList *match;
+
+        if (g_queue_is_empty(&conn->primary_list) ||
+            g_queue_is_empty(&conn->secondary_list)) {
+            break;
+        }
+
+        ppkt = g_queue_pop_head(&conn->primary_list);
+        match = g_queue_find_custom(&conn->secondary_list,
+                                    ppkt, (GCompareFunc)HandlePacket);
+
+        if (!match) {
+            /*
+             * The counterpart may simply not have arrived yet; the
+             * packet stays queued for the next comparison and, once
+             * it times out, triggers a checkpoint request.
+             */
+            trace_colo_compare_main("packet different");
+            g_queue_push_head(&conn->primary_list, ppkt);
+
+            colo_compare_inconsistency_notify(s);
+            break;
+        }
+
+        colo_release_primary_pkt(s, ppkt);
+        packet_destroy(match->data, NULL);
+        g_queue_delete_link(&conn->secondary_list, match);
+    }
+}
+
+/*
+ * Run the comparison that fits the connection's IP protocol after a
+ * new packet has been queued to it. opaque is the Connection,
+ * user_data the CompareState.
+ */
+static void colo_compare_connection(void *opaque, void *user_data)
+{
+    Connection *conn = opaque;
+    CompareState *s = user_data;
+    int (*handler)(Packet *spkt, Packet *ppkt);
+
+    if (conn->ip_proto == IPPROTO_TCP) {
+        colo_compare_tcp(s, conn);
+        return;
+    }
+
+    if (conn->ip_proto == IPPROTO_UDP) {
+        handler = colo_packet_compare_udp;
+    } else if (conn->ip_proto == IPPROTO_ICMP) {
+        handler = colo_packet_compare_icmp;
+    } else {
+        handler = colo_packet_compare_other;
+    }
+
+    colo_compare_packet(s, conn, handler);
+}
+
+/*
+ * Coroutine body that drains a SendCo queue. For every entry it
+ * writes the payload length in network byte order, then - on the
+ * output channel with vnet_hdr enabled - the vnet header length, then
+ * the payload itself. The entry and its buffer are freed whether or
+ * not the writes succeeded.
+ *
+ * The first short or failed write discards the remaining entries and
+ * stores a negative value in sendco->ret; a clean run stores 0. In
+ * both cases the coroutine marks itself done and kicks AIO waiters.
+ */
+static void coroutine_fn _compare_chr_send(void *opaque)
+{
+    SendCo *sendco = opaque;
+    CompareState *s = sendco->s;
+    int ret = 0;
+
+    while (!g_queue_is_empty(&sendco->send_list)) {
+        SendEntry *entry = g_queue_pop_tail(&sendco->send_list);
+        uint32_t len = htonl(entry->size);
+        bool failed;
+
+        ret = qemu_chr_fe_write_all(sendco->chr, (uint8_t *)&len, sizeof(len));
+        failed = ret != sizeof(len);
+
+        if (!failed && !sendco->notify_remote_frame && s->vnet_hdr) {
+            /*
+             * The vnet header length lets the receiving module (e.g.
+             * filter-redirector) parse the packet correctly.
+             */
+            len = htonl(entry->vnet_hdr_len);
+
+            ret = qemu_chr_fe_write_all(sendco->chr,
+                                        (uint8_t *)&len,
+                                        sizeof(len));
+            failed = ret != sizeof(len);
+        }
+
+        if (!failed) {
+            ret = qemu_chr_fe_write_all(sendco->chr,
+                                        (uint8_t *)entry->buf,
+                                        entry->size);
+            failed = ret != entry->size;
+        }
+
+        g_free(entry->buf);
+        g_slice_free(SendEntry, entry);
+
+        if (failed) {
+            goto err;
+        }
+    }
+
+    sendco->ret = 0;
+    goto out;
+
+err:
+    /* drop everything that is still queued */
+    while (!g_queue_is_empty(&sendco->send_list)) {
+        SendEntry *stale = g_queue_pop_tail(&sendco->send_list);
+
+        g_free(stale->buf);
+        g_slice_free(SendEntry, stale);
+    }
+    sendco->ret = ret < 0 ? ret : -EIO;
+out:
+    sendco->co = NULL;
+    sendco->done = true;
+    aio_wait_kick();
+}
+
+/*
+ * Queue size bytes from buf on the notify channel
+ * (notify_remote_frame) or on the output channel and make sure a
+ * coroutine is draining that queue.
+ *
+ * With zero_copy the queue entry takes buf itself and frees it with
+ * g_free() after sending; otherwise the data is copied.
+ *
+ * Returns -1 for an empty buffer, the coroutine's result when it
+ * finished before this call returns, and 0 otherwise (success is
+ * assumed while the send is still in flight).
+ */
+static int compare_chr_send(CompareState *s,
+                            uint8_t *buf,
+                            uint32_t size,
+                            uint32_t vnet_hdr_len,
+                            bool notify_remote_frame,
+                            bool zero_copy)
+{
+    SendCo *sendco = notify_remote_frame ? &s->notify_sendco
+                                         : &s->out_sendco;
+    SendEntry *entry;
+
+    if (size == 0) {
+        return -1;
+    }
+
+    entry = g_slice_new(SendEntry);
+    entry->size = size;
+    entry->vnet_hdr_len = vnet_hdr_len;
+    if (!zero_copy) {
+        entry->buf = g_malloc(size);
+        memcpy(entry->buf, buf, size);
+    } else {
+        entry->buf = buf;
+    }
+    g_queue_push_head(&sendco->send_list, entry);
+
+    if (!sendco->done) {
+        /* a coroutine is already draining the queue; assume success */
+        return 0;
+    }
+
+    sendco->co = qemu_coroutine_create(_compare_chr_send, sendco);
+    sendco->done = false;
+    qemu_coroutine_enter(sendco->co);
+
+    /* report early errors, otherwise assume success */
+    return sendco->done ? sendco->ret : 0;
+}
+
+/*
+ * can-read callback shared by all input chardevs: always reports
+ * COMPARE_READ_LEN_MAX. opaque is unused.
+ */
+static int compare_chr_can_read(void *opaque)
+{
+ return COMPARE_READ_LEN_MAX;
+}
+
+/*
+ * Read handler of the primary input chardev (installed by
+ * colo_compare_iothread()). Feeds the received bytes into the primary
+ * read state; if that fails, the chardev's handlers are removed and
+ * an error is reported.
+ */
+static void compare_pri_chr_in(void *opaque, const uint8_t *buf, int size)
+{
+    CompareState *s = COLO_COMPARE(opaque);
+
+    if (net_fill_rstate(&s->pri_rs, buf, size) != -1) {
+        return;
+    }
+
+    qemu_chr_fe_set_handlers(&s->chr_pri_in, NULL, NULL, NULL, NULL,
+                             NULL, NULL, true);
+    error_report("colo-compare primary_in error");
+}
+
+/*
+ * Read handler of the secondary input chardev (installed by
+ * colo_compare_iothread()). Feeds the received bytes into the
+ * secondary read state; if that fails, the chardev's handlers are
+ * removed and an error is reported.
+ */
+static void compare_sec_chr_in(void *opaque, const uint8_t *buf, int size)
+{
+    CompareState *s = COLO_COMPARE(opaque);
+
+    if (net_fill_rstate(&s->sec_rs, buf, size) != -1) {
+        return;
+    }
+
+    qemu_chr_fe_set_handlers(&s->chr_sec_in, NULL, NULL, NULL, NULL,
+                             NULL, NULL, true);
+    error_report("colo-compare secondary_in error");
+}
+
+/*
+ * Read handler of the notify chardev. Feeds the received bytes into
+ * the notify read state; if that fails, the chardev's handlers are
+ * removed and an error is reported.
+ */
+static void compare_notify_chr(void *opaque, const uint8_t *buf, int size)
+{
+    CompareState *s = COLO_COMPARE(opaque);
+
+    if (net_fill_rstate(&s->notify_rs, buf, size) != -1) {
+        return;
+    }
+
+    qemu_chr_fe_set_handlers(&s->chr_notify_dev, NULL, NULL, NULL, NULL,
+                             NULL, NULL, true);
+    error_report("colo-compare notify_dev error");
+}
+
+/*
+ * Timer callback: scan for packets the other side never produced an
+ * equivalent of (a hit requests a checkpoint), then re-arm the timer
+ * expired_scan_cycle milliseconds from now.
+ */
+static void check_old_packet_regular(void *opaque)
+{
+    CompareState *s = opaque;
+    int64_t next_scan;
+
+    colo_old_packet_check(s);
+
+    next_scan = qemu_clock_get_ms(QEMU_CLOCK_HOST) + s->expired_scan_cycle;
+    timer_mod(s->packet_check_timer, next_scan);
+}
+
+/*
+ * Public API, used by the COLO frame to broadcast an event.
+ *
+ * Does nothing while colo-compare is inactive. Otherwise the event is
+ * stored in every registered compare instance, its bottom half is
+ * scheduled, and the caller blocks until all of them have handled it.
+ * opaque and errp are not used.
+ */
+void colo_notify_compares_event(void *opaque, int event, Error **errp)
+{
+    CompareState *cmp;
+
+    qemu_mutex_lock(&colo_compare_mutex);
+
+    if (colo_compare_active) {
+        qemu_mutex_lock(&event_mtx);
+        QTAILQ_FOREACH(cmp, &net_compares, next) {
+            cmp->event = event;
+            qemu_bh_schedule(cmp->event_bh);
+            event_unhandled_count++;
+        }
+        /* Wait all compare threads to finish handling this event */
+        while (event_unhandled_count > 0) {
+            qemu_cond_wait(&event_complete_cond, &event_mtx);
+        }
+        qemu_mutex_unlock(&event_mtx);
+    }
+
+    qemu_mutex_unlock(&colo_compare_mutex);
+}
+
+/*
+ * Create the old-packet scan timer in the iothread's AioContext and
+ * arm it for the first scan, expired_scan_cycle milliseconds from now.
+ */
+static void colo_compare_timer_init(CompareState *s)
+{
+    AioContext *ctx = iothread_get_aio_context(s->iothread);
+    int64_t first_scan;
+
+    s->packet_check_timer = aio_timer_new(ctx, QEMU_CLOCK_HOST, SCALE_MS,
+                                          check_old_packet_regular, s);
+
+    first_scan = qemu_clock_get_ms(QEMU_CLOCK_HOST) + s->expired_scan_cycle;
+    timer_mod(s->packet_check_timer, first_scan);
+}
+
+/* Free the old-packet scan timer, if there is one, and forget it */
+static void colo_compare_timer_del(CompareState *s)
+{
+    if (!s->packet_check_timer) {
+        return;
+    }
+
+    timer_free(s->packet_check_timer);
+    s->packet_check_timer = NULL;
+}
+
+static void colo_flush_packets(void *opaque, void *user_data);
+
+/*
+ * Bottom half scheduled by colo_notify_compares_event(). A checkpoint
+ * event flushes the packets of every connection; all other events
+ * need no work here. Afterwards the event is accounted as handled and
+ * waiters on event_complete_cond are woken.
+ */
+static void colo_compare_handle_event(void *opaque)
+{
+    CompareState *s = opaque;
+
+    if (s->event == COLO_EVENT_CHECKPOINT) {
+        g_queue_foreach(&s->conn_list, colo_flush_packets, s);
+    }
+
+    qemu_mutex_lock(&event_mtx);
+    assert(event_unhandled_count > 0);
+    event_unhandled_count--;
+    qemu_cond_broadcast(&event_complete_cond);
+    qemu_mutex_unlock(&event_mtx);
+}
+
+/*
+ * Attach this compare instance to its iothread: take a reference on
+ * the iothread, install the chardev read handlers on the iothread's
+ * GMainContext, start the old-packet timer and create the event
+ * bottom half in the iothread's AioContext.
+ */
+static void colo_compare_iothread(CompareState *s)
+{
+ AioContext *ctx = iothread_get_aio_context(s->iothread);
+ object_ref(OBJECT(s->iothread));
+ s->worker_context = iothread_get_g_main_context(s->iothread);
+
+ qemu_chr_fe_set_handlers(&s->chr_pri_in, compare_chr_can_read,
+ compare_pri_chr_in, NULL, NULL,
+ s, s->worker_context, true);
+ qemu_chr_fe_set_handlers(&s->chr_sec_in, compare_chr_can_read,
+ compare_sec_chr_in, NULL, NULL,
+ s, s->worker_context, true);
+ /* the notify chardev is optional */
+ if (s->notify_dev) {
+ qemu_chr_fe_set_handlers(&s->chr_notify_dev, compare_chr_can_read,
+ compare_notify_chr, NULL, NULL,
+ s, s->worker_context, true);
+ }
+
+ colo_compare_timer_init(s);
+ s->event_bh = aio_bh_new(ctx, colo_compare_handle_event, s);
+}
+
+/* Property getter: returns a newly allocated copy of pri_indev */
+static char *compare_get_pri_indev(Object *obj, Error **errp)
+{
+ CompareState *s = COLO_COMPARE(obj);
+
+ return g_strdup(s->pri_indev);
+}
+
+/* Property setter: frees the old pri_indev and stores a copy of value */
+static void compare_set_pri_indev(Object *obj, const char *value, Error **errp)
+{
+ CompareState *s = COLO_COMPARE(obj);
+
+ g_free(s->pri_indev);
+ s->pri_indev = g_strdup(value);
+}
+
+/* Property getter: returns a newly allocated copy of sec_indev */
+static char *compare_get_sec_indev(Object *obj, Error **errp)
+{
+ CompareState *s = COLO_COMPARE(obj);
+
+ return g_strdup(s->sec_indev);
+}
+
+/* Property setter: frees the old sec_indev and stores a copy of value */
+static void compare_set_sec_indev(Object *obj, const char *value, Error **errp)
+{
+ CompareState *s = COLO_COMPARE(obj);
+
+ g_free(s->sec_indev);
+ s->sec_indev = g_strdup(value);
+}
+
+/* Property getter: returns a newly allocated copy of outdev */
+static char *compare_get_outdev(Object *obj, Error **errp)
+{
+ CompareState *s = COLO_COMPARE(obj);
+
+ return g_strdup(s->outdev);
+}
+
+/* Property setter: frees the old outdev and stores a copy of value */
+static void compare_set_outdev(Object *obj, const char *value, Error **errp)
+{
+ CompareState *s = COLO_COMPARE(obj);
+
+ g_free(s->outdev);
+ s->outdev = g_strdup(value);
+}
+
+/* Property getter: returns the vnet_hdr flag */
+static bool compare_get_vnet_hdr(Object *obj, Error **errp)
+{
+ CompareState *s = COLO_COMPARE(obj);
+
+ return s->vnet_hdr;
+}
+
+/* Property setter: stores value in the vnet_hdr flag */
+static void compare_set_vnet_hdr(Object *obj,
+ bool value,
+ Error **errp)
+{
+ CompareState *s = COLO_COMPARE(obj);
+
+ s->vnet_hdr = value;
+}
+
+/* Property getter: returns a newly allocated copy of notify_dev */
+static char *compare_get_notify_dev(Object *obj, Error **errp)
+{
+ CompareState *s = COLO_COMPARE(obj);
+
+ return g_strdup(s->notify_dev);
+}
+
+/* Property setter: frees the old notify_dev and stores a copy of value */
+static void compare_set_notify_dev(Object *obj, const char *value, Error **errp)
+{
+ CompareState *s = COLO_COMPARE(obj);
+
+ g_free(s->notify_dev);
+ s->notify_dev = g_strdup(value);
+}
+
+/* Property getter: visits compare_timeout as a uint64 */
+static void compare_get_timeout(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ CompareState *s = COLO_COMPARE(obj);
+ uint64_t value = s->compare_timeout;
+
+ visit_type_uint64(v, name, &value, errp);
+}
+
+/*
+ * Property setter for compare_timeout. The value is read as a uint32
+ * and must be non-zero; zero sets an error in errp and leaves the
+ * current timeout untouched.
+ *
+ * NOTE(review): the getter visits a uint64 while this setter visits a
+ * uint32 - confirm which width the property is meant to have.
+ */
+static void compare_set_timeout(Object *obj, Visitor *v,
+                                const char *name, void *opaque,
+                                Error **errp)
+{
+    CompareState *s = COLO_COMPARE(obj);
+    uint32_t timeout_ms;
+
+    if (!visit_type_uint32(v, name, &timeout_ms, errp)) {
+        return;
+    }
+
+    if (timeout_ms) {
+        s->compare_timeout = timeout_ms;
+    } else {
+        error_setg(errp, "Property '%s.%s' requires a positive value",
+                   object_get_typename(obj), name);
+    }
+}
+
+/* Property getter: visits expired_scan_cycle as a uint32 */
+static void compare_get_expired_scan_cycle(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ CompareState *s = COLO_COMPARE(obj);
+ uint32_t value = s->expired_scan_cycle;
+
+ visit_type_uint32(v, name, &value, errp);
+}
+
+/*
+ * Property setter for expired_scan_cycle. The value is read as a
+ * uint32 and must be non-zero; zero sets an error in errp and leaves
+ * the current cycle untouched.
+ */
+static void compare_set_expired_scan_cycle(Object *obj, Visitor *v,
+                                           const char *name, void *opaque,
+                                           Error **errp)
+{
+    CompareState *s = COLO_COMPARE(obj);
+    uint32_t cycle_ms;
+
+    if (!visit_type_uint32(v, name, &cycle_ms, errp)) {
+        return;
+    }
+
+    if (cycle_ms) {
+        s->expired_scan_cycle = cycle_ms;
+    } else {
+        error_setg(errp, "Property '%s.%s' requires a positive value",
+                   object_get_typename(obj), name);
+    }
+}
+
+/* Property getter: visits the file-wide max_queue_size as a uint32 */
+static void get_max_queue_size(Object *obj, Visitor *v,
+ const char *name, void *opaque,
+ Error **errp)
+{
+ uint32_t value = max_queue_size;
+
+ visit_type_uint32(v, name, &value, errp);
+}
+
+/*
+ * Property setter for the file-wide max_queue_size.
+ *
+ * The value is visited as a uint32, the type of max_queue_size and
+ * the type the getter reports, so an out-of-range input is rejected
+ * by the visitor instead of being silently truncated on assignment.
+ * Zero is rejected with an error in errp; in both failure cases the
+ * current limit is left untouched.
+ */
+static void set_max_queue_size(Object *obj, Visitor *v,
+                               const char *name, void *opaque,
+                               Error **errp)
+{
+    uint32_t value;
+
+    if (!visit_type_uint32(v, name, &value, errp)) {
+        return;
+    }
+    if (!value) {
+        error_setg(errp, "Property '%s.%s' requires a positive value",
+                   object_get_typename(obj), name);
+        return;
+    }
+    max_queue_size = value;
+}
+
+/*
+ * Called when a complete packet has been assembled on the primary
+ * read state. A packet that packet_enqueue() accepts is compared
+ * within its connection; one it rejects is forwarded to the output
+ * unchanged (copied, not zero-copy).
+ */
+static void compare_pri_rs_finalize(SocketReadState *pri_rs)
+{
+    CompareState *s = container_of(pri_rs, CompareState, pri_rs);
+    Connection *conn = NULL;
+
+    if (packet_enqueue(s, PRIMARY_IN, &conn) == 0) {
+        /* compare packet in the specified connection */
+        colo_compare_connection(conn, s);
+        return;
+    }
+
+    trace_colo_compare_main("primary: unsupported packet in");
+    compare_chr_send(s, pri_rs->buf, pri_rs->packet_len,
+                     pri_rs->vnet_hdr_len, false, false);
+}
+
+/*
+ * Called when a complete packet has been assembled on the secondary
+ * read state. A packet that packet_enqueue() accepts is compared
+ * within its connection; one it rejects is only traced.
+ */
+static void compare_sec_rs_finalize(SocketReadState *sec_rs)
+{
+    CompareState *s = container_of(sec_rs, CompareState, sec_rs);
+    Connection *conn = NULL;
+
+    if (packet_enqueue(s, SECONDARY_IN, &conn) == 0) {
+        /* compare packet in the specified connection */
+        colo_compare_connection(conn, s);
+        return;
+    }
+
+    trace_colo_compare_main("secondary: unsupported packet in");
+}
+
+/*
+ * Called when a complete message has arrived on the notify chardev.
+ * "COLO_USERSPACE_PROXY_INIT" is answered with
+ * "COLO_COMPARE_GET_XEN_INIT", "COLO_CHECKPOINT" flushes the packets
+ * of every connection, anything else is reported as unsupported.
+ */
+static void compare_notify_rs_finalize(SocketReadState *notify_rs)
+{
+    CompareState *s = container_of(notify_rs, CompareState, notify_rs);
+    const char msg[] = "COLO_COMPARE_GET_XEN_INIT";
+
+    if (packet_matches_str("COLO_USERSPACE_PROXY_INIT",
+                           notify_rs->buf,
+                           notify_rs->packet_len)) {
+        if (compare_chr_send(s, (uint8_t *)msg, strlen(msg), 0,
+                             true, false) < 0) {
+            error_report("Notify Xen COLO-frame INIT failed");
+        }
+        return;
+    }
+
+    if (packet_matches_str("COLO_CHECKPOINT",
+                           notify_rs->buf,
+                           notify_rs->packet_len)) {
+        /* colo-compare do checkpoint, flush pri packet and remove sec packet */
+        g_queue_foreach(&s->conn_list, colo_flush_packets, s);
+        return;
+    }
+
+    error_report("COLO compare got unsupported instruction");
+}
+
+/*
+ * Look up the chardev named chr_name, store it in *chr and check that
+ * it is reconnectable and able to switch GMainContext.
+ *
+ * Returns 0 on success. Returns 1 with errp set when the chardev does
+ * not exist or lacks one of the two features.
+ */
+static int find_and_check_chardev(Chardev **chr,
+                                  char *chr_name,
+                                  Error **errp)
+{
+    Chardev *found = qemu_chr_find(chr_name);
+
+    *chr = found;
+    if (!found) {
+        error_setg(errp, "Device '%s' not found",
+                   chr_name);
+        return 1;
+    }
+
+    if (!qemu_chr_has_feature(found, QEMU_CHAR_FEATURE_RECONNECTABLE)) {
+        error_setg(errp, "chardev \"%s\" is not reconnectable",
+                   chr_name);
+        return 1;
+    }
+
+    if (!qemu_chr_has_feature(found, QEMU_CHAR_FEATURE_GCONTEXT)) {
+        error_setg(errp, "chardev \"%s\" cannot switch context",
+                   chr_name);
+        return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Called from the main thread on the primary
+ * to setup colo-compare.
+ */
+static void colo_compare_complete(UserCreatable *uc, Error **errp)
+{
+ CompareState *s = COLO_COMPARE(uc);
+ Chardev *chr;
+
+ if (!s->pri_indev || !s->sec_indev || !s->outdev || !s->iothread) {
+ error_setg(errp, "colo compare needs 'primary_in' ,"
+ "'secondary_in','outdev','iothread' property set");
+ return;
+ } else if (!strcmp(s->pri_indev, s->outdev) ||
+ !strcmp(s->sec_indev, s->outdev) ||
+ !strcmp(s->pri_indev, s->sec_indev)) {
+ error_setg(errp, "'indev' and 'outdev' could not be same "
+ "for compare module");
+ return;
+ }
+
+ if (!s->compare_timeout) {
+ /* Set default value to 3000 MS */
+ s->compare_timeout = DEFAULT_TIME_OUT_MS;
+ }
+
+ if (!s->expired_scan_cycle) {
+ /* Set default value to 3000 MS */
+ s->expired_scan_cycle = REGULAR_PACKET_CHECK_MS;
+ }
+
+ if (!max_queue_size) {
+ /* Set default queue size to 1024 */
+ max_queue_size = MAX_QUEUE_SIZE;
+ }
+
+ if (find_and_check_chardev(&chr, s->pri_indev, errp) ||
+ !qemu_chr_fe_init(&s->chr_pri_in, chr, errp)) {
+ return;
+ }
+
+ if (find_and_check_chardev(&chr, s->sec_indev, errp) ||
+ !qemu_chr_fe_init(&s->chr_sec_in, chr, errp)) {
+ return;
+ }
+
+ if (find_and_check_chardev(&chr, s->outdev, errp) ||
+ !qemu_chr_fe_init(&s->chr_out, chr, errp)) {
+ return;
+ }
+
+ net_socket_rs_init(&s->pri_rs, compare_pri_rs_finalize, s->vnet_hdr);
+ net_socket_rs_init(&s->sec_rs, compare_sec_rs_finalize, s->vnet_hdr);
+
+ /* Try to enable remote notify chardev, currently just for Xen COLO */
+ if (s->notify_dev) {
+ if (find_and_check_chardev(&chr, s->notify_dev, errp) ||
+ !qemu_chr_fe_init(&s->chr_notify_dev, chr, errp)) {
+ return;
+ }
+
+ net_socket_rs_init(&s->notify_rs, compare_notify_rs_finalize,
+ s->vnet_hdr);
+ }
+
+ s->out_sendco.s = s;
+ s->out_sendco.chr = &s->chr_out;
+ s->out_sendco.notify_remote_frame = false;
+ s->out_sendco.done = true;
+ g_queue_init(&s->out_sendco.send_list);
+
+ if (s->notify_dev) {
+ s->notify_sendco.s = s;
+ s->notify_sendco.chr = &s->chr_notify_dev;
+ s->notify_sendco.notify_remote_frame = true;
+ s->notify_sendco.done = true;
+ g_queue_init(&s->notify_sendco.send_list);
+ }
+
+ g_queue_init(&s->conn_list);
+
+ s->connection_track_table = g_hash_table_new_full(connection_key_hash,
+ connection_key_equal,
+ g_free,
+ connection_destroy);
+
+ colo_compare_iothread(s);
+
+ qemu_mutex_lock(&colo_compare_mutex);
+ if (!colo_compare_active) {
+ qemu_mutex_init(&event_mtx);
+ qemu_cond_init(&event_complete_cond);
+ colo_compare_active = true;
+ }
+ QTAILQ_INSERT_TAIL(&net_compares, s, next);
+ qemu_mutex_unlock(&colo_compare_mutex);
+
+ return;
+}
+
+static void colo_flush_packets(void *opaque, void *user_data)
+{
+ CompareState *s = user_data;
+ Connection *conn = opaque;
+ Packet *pkt = NULL;
+
+ while (!g_queue_is_empty(&conn->primary_list)) {
+ pkt = g_queue_pop_head(&conn->primary_list);
+ compare_chr_send(s,
+ pkt->data,
+ pkt->size,
+ pkt->vnet_hdr_len,
+ false,
+ true);
+ packet_destroy_partial(pkt, NULL);
+ }
+ while (!g_queue_is_empty(&conn->secondary_list)) {
+ pkt = g_queue_pop_head(&conn->secondary_list);
+ packet_destroy(pkt, NULL);
+ }
+}
+
+static void colo_compare_class_init(ObjectClass *oc, void *data)
+{
+ UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);
+
+ ucc->complete = colo_compare_complete;
+}
+
+static void colo_compare_init(Object *obj)
+{
+ CompareState *s = COLO_COMPARE(obj);
+
+ object_property_add_str(obj, "primary_in",
+ compare_get_pri_indev, compare_set_pri_indev);
+ object_property_add_str(obj, "secondary_in",
+ compare_get_sec_indev, compare_set_sec_indev);
+ object_property_add_str(obj, "outdev",
+ compare_get_outdev, compare_set_outdev);
+ object_property_add_link(obj, "iothread", TYPE_IOTHREAD,
+ (Object **)&s->iothread,
+ object_property_allow_set_link,
+ OBJ_PROP_LINK_STRONG);
+ /* This parameter just for Xen COLO */
+ object_property_add_str(obj, "notify_dev",
+ compare_get_notify_dev, compare_set_notify_dev);
+
+ object_property_add(obj, "compare_timeout", "uint64",
+ compare_get_timeout,
+ compare_set_timeout, NULL, NULL);
+
+ object_property_add(obj, "expired_scan_cycle", "uint32",
+ compare_get_expired_scan_cycle,
+ compare_set_expired_scan_cycle, NULL, NULL);
+
+ object_property_add(obj, "max_queue_size", "uint32",
+ get_max_queue_size,
+ set_max_queue_size, NULL, NULL);
+
+ s->vnet_hdr = false;
+ object_property_add_bool(obj, "vnet_hdr_support", compare_get_vnet_hdr,
+ compare_set_vnet_hdr);
+}
+
+void colo_compare_cleanup(void)
+{
+ CompareState *tmp = NULL;
+ CompareState *n = NULL;
+
+ QTAILQ_FOREACH_SAFE(tmp, &net_compares, next, n) {
+ object_unparent(OBJECT(tmp));
+ }
+}
+
+static void colo_compare_finalize(Object *obj)
+{
+ CompareState *s = COLO_COMPARE(obj);
+ CompareState *tmp = NULL;
+
+ qemu_mutex_lock(&colo_compare_mutex);
+ QTAILQ_FOREACH(tmp, &net_compares, next) {
+ if (tmp == s) {
+ QTAILQ_REMOVE(&net_compares, s, next);
+ break;
+ }
+ }
+ if (QTAILQ_EMPTY(&net_compares)) {
+ colo_compare_active = false;
+ qemu_mutex_destroy(&event_mtx);
+ qemu_cond_destroy(&event_complete_cond);
+ }
+ qemu_mutex_unlock(&colo_compare_mutex);
+
+ qemu_chr_fe_deinit(&s->chr_pri_in, false);
+ qemu_chr_fe_deinit(&s->chr_sec_in, false);
+ qemu_chr_fe_deinit(&s->chr_out, false);
+ if (s->notify_dev) {
+ qemu_chr_fe_deinit(&s->chr_notify_dev, false);
+ }
+
+ colo_compare_timer_del(s);
+
+ qemu_bh_delete(s->event_bh);
+
+ AioContext *ctx = iothread_get_aio_context(s->iothread);
+ aio_context_acquire(ctx);
+ AIO_WAIT_WHILE(ctx, !s->out_sendco.done);
+ if (s->notify_dev) {
+ AIO_WAIT_WHILE(ctx, !s->notify_sendco.done);
+ }
+ aio_context_release(ctx);
+
+ /* Release all unhandled packets after compare thead exited */
+ g_queue_foreach(&s->conn_list, colo_flush_packets, s);
+ AIO_WAIT_WHILE(NULL, !s->out_sendco.done);
+
+ g_queue_clear(&s->conn_list);
+ g_queue_clear(&s->out_sendco.send_list);
+ if (s->notify_dev) {
+ g_queue_clear(&s->notify_sendco.send_list);
+ }
+
+ if (s->connection_track_table) {
+ g_hash_table_destroy(s->connection_track_table);
+ }
+
+ object_unref(OBJECT(s->iothread));
+
+ g_free(s->pri_indev);
+ g_free(s->sec_indev);
+ g_free(s->outdev);
+ g_free(s->notify_dev);
+}
+
+static void __attribute__((__constructor__)) colo_compare_init_globals(void)
+{
+ colo_compare_active = false;
+ qemu_mutex_init(&colo_compare_mutex);
+}
+
+static const TypeInfo colo_compare_info = {
+ .name = TYPE_COLO_COMPARE,
+ .parent = TYPE_OBJECT,
+ .instance_size = sizeof(CompareState),
+ .instance_init = colo_compare_init,
+ .instance_finalize = colo_compare_finalize,
+ .class_size = sizeof(CompareClass),
+ .class_init = colo_compare_class_init,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_USER_CREATABLE },
+ { }
+ }
+};
+
+static void register_types(void)
+{
+ type_register_static(&colo_compare_info);
+}
+
+type_init(register_types);
diff --git a/net/colo-compare.h b/net/colo-compare.h
new file mode 100644
index 000000000..b055270da
--- /dev/null
+++ b/net/colo-compare.h
@@ -0,0 +1,25 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2017 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2017 FUJITSU LIMITED
+ * Copyright (c) 2017 Intel Corporation
+ *
+ * Authors:
+ * zhanghailiang <zhang.zhanghailiang@huawei.com>
+ * Zhang Chen <zhangckid@gmail.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_COLO_COMPARE_H
+#define QEMU_COLO_COMPARE_H
+
+void colo_notify_compares_event(void *opaque, int event, Error **errp);
+void colo_compare_register_notifier(Notifier *notify);
+void colo_compare_unregister_notifier(Notifier *notify);
+void colo_compare_cleanup(void);
+
+#endif /* QEMU_COLO_COMPARE_H */
diff --git a/net/colo.c b/net/colo.c
new file mode 100644
index 000000000..1f8162f59
--- /dev/null
+++ b/net/colo.c
@@ -0,0 +1,238 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "trace.h"
+#include "colo.h"
+#include "util.h"
+
+uint32_t connection_key_hash(const void *opaque)
+{
+ const ConnectionKey *key = opaque;
+ uint32_t a, b, c;
+
+ /* Jenkins hash */
+ a = b = c = JHASH_INITVAL + sizeof(*key);
+ a += key->src.s_addr;
+ b += key->dst.s_addr;
+ c += (key->src_port | key->dst_port << 16);
+ __jhash_mix(a, b, c);
+
+ a += key->ip_proto;
+ __jhash_final(a, b, c);
+
+ return c;
+}
+
+int connection_key_equal(const void *key1, const void *key2)
+{
+ return memcmp(key1, key2, sizeof(ConnectionKey)) == 0;
+}
+
+int parse_packet_early(Packet *pkt)
+{
+ int network_length;
+ static const uint8_t vlan[] = {0x81, 0x00};
+ uint8_t *data = pkt->data + pkt->vnet_hdr_len;
+ uint16_t l3_proto;
+ ssize_t l2hdr_len = eth_get_l2_hdr_length(data);
+
+ if (pkt->size < ETH_HLEN + pkt->vnet_hdr_len) {
+ trace_colo_proxy_main("pkt->size < ETH_HLEN");
+ return 1;
+ }
+
+ /*
+ * TODO: support vlan.
+ */
+ if (!memcmp(&data[12], vlan, sizeof(vlan))) {
+ trace_colo_proxy_main("COLO-proxy don't support vlan");
+ return 1;
+ }
+
+ pkt->network_header = data + l2hdr_len;
+
+ const struct iovec l2vec = {
+ .iov_base = (void *) data,
+ .iov_len = l2hdr_len
+ };
+ l3_proto = eth_get_l3_proto(&l2vec, 1, l2hdr_len);
+
+ if (l3_proto != ETH_P_IP) {
+ return 1;
+ }
+
+ network_length = pkt->ip->ip_hl * 4;
+ if (pkt->size < l2hdr_len + network_length + pkt->vnet_hdr_len) {
+ trace_colo_proxy_main("pkt->size < network_header + network_length");
+ return 1;
+ }
+ pkt->transport_header = pkt->network_header + network_length;
+
+ return 0;
+}
+
+void extract_ip_and_port(uint32_t tmp_ports, ConnectionKey *key,
+ Packet *pkt, bool reverse)
+{
+ if (reverse) {
+ key->src = pkt->ip->ip_dst;
+ key->dst = pkt->ip->ip_src;
+ key->src_port = ntohs(tmp_ports & 0xffff);
+ key->dst_port = ntohs(tmp_ports >> 16);
+ } else {
+ key->src = pkt->ip->ip_src;
+ key->dst = pkt->ip->ip_dst;
+ key->src_port = ntohs(tmp_ports >> 16);
+ key->dst_port = ntohs(tmp_ports & 0xffff);
+ }
+}
+
+void fill_connection_key(Packet *pkt, ConnectionKey *key, bool reverse)
+{
+ uint32_t tmp_ports = 0;
+
+ key->ip_proto = pkt->ip->ip_p;
+
+ switch (key->ip_proto) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_DCCP:
+ case IPPROTO_ESP:
+ case IPPROTO_SCTP:
+ case IPPROTO_UDPLITE:
+ tmp_ports = *(uint32_t *)(pkt->transport_header);
+ break;
+ case IPPROTO_AH:
+ tmp_ports = *(uint32_t *)(pkt->transport_header + 4);
+ break;
+ default:
+ break;
+ }
+
+ extract_ip_and_port(tmp_ports, key, pkt, reverse);
+}
+
+Connection *connection_new(ConnectionKey *key)
+{
+ Connection *conn = g_slice_new0(Connection);
+
+ conn->ip_proto = key->ip_proto;
+ conn->processing = false;
+ conn->tcp_state = TCPS_CLOSED;
+ g_queue_init(&conn->primary_list);
+ g_queue_init(&conn->secondary_list);
+
+ return conn;
+}
+
+void connection_destroy(void *opaque)
+{
+ Connection *conn = opaque;
+
+ g_queue_foreach(&conn->primary_list, packet_destroy, NULL);
+ g_queue_clear(&conn->primary_list);
+ g_queue_foreach(&conn->secondary_list, packet_destroy, NULL);
+ g_queue_clear(&conn->secondary_list);
+ g_slice_free(Connection, conn);
+}
+
+Packet *packet_new(const void *data, int size, int vnet_hdr_len)
+{
+ Packet *pkt = g_slice_new0(Packet);
+
+ pkt->data = g_memdup(data, size);
+ pkt->size = size;
+ pkt->creation_ms = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+ pkt->vnet_hdr_len = vnet_hdr_len;
+
+ return pkt;
+}
+
+/*
+ * packet_new_nocopy will not copy data, so the caller can't release
+ * the data. And it will be released in packet_destroy.
+ */
+Packet *packet_new_nocopy(void *data, int size, int vnet_hdr_len)
+{
+ Packet *pkt = g_slice_new0(Packet);
+
+ pkt->data = data;
+ pkt->size = size;
+ pkt->creation_ms = qemu_clock_get_ms(QEMU_CLOCK_HOST);
+ pkt->vnet_hdr_len = vnet_hdr_len;
+
+ return pkt;
+}
+
+void packet_destroy(void *opaque, void *user_data)
+{
+ Packet *pkt = opaque;
+
+ g_free(pkt->data);
+ g_slice_free(Packet, pkt);
+}
+
+void packet_destroy_partial(void *opaque, void *user_data)
+{
+ Packet *pkt = opaque;
+
+ g_slice_free(Packet, pkt);
+}
+
+/*
+ * Clear hashtable, stop this hash growing really huge
+ */
+void connection_hashtable_reset(GHashTable *connection_track_table)
+{
+ g_hash_table_remove_all(connection_track_table);
+}
+
+/* if not found, create a new connection and add to hash table */
+Connection *connection_get(GHashTable *connection_track_table,
+ ConnectionKey *key,
+ GQueue *conn_list)
+{
+ Connection *conn = g_hash_table_lookup(connection_track_table, key);
+
+ if (conn == NULL) {
+ ConnectionKey *new_key = g_memdup(key, sizeof(*key));
+
+ conn = connection_new(key);
+
+ if (g_hash_table_size(connection_track_table) > HASHTABLE_MAX_SIZE) {
+ trace_colo_proxy_main("colo proxy connection hashtable full,"
+ " clear it");
+ connection_hashtable_reset(connection_track_table);
+ /*
+ * clear the conn_list
+ */
+ while (!g_queue_is_empty(conn_list)) {
+ connection_destroy(g_queue_pop_head(conn_list));
+ }
+ }
+
+ g_hash_table_insert(connection_track_table, new_key, conn);
+ }
+
+ return conn;
+}
+
+bool connection_has_tracked(GHashTable *connection_track_table,
+ ConnectionKey *key)
+{
+ Connection *conn = g_hash_table_lookup(connection_track_table, key);
+
+ return conn ? true : false;
+}
diff --git a/net/colo.h b/net/colo.h
new file mode 100644
index 000000000..8b3e8d5a8
--- /dev/null
+++ b/net/colo.h
@@ -0,0 +1,108 @@
+/*
+ * COarse-grain LOck-stepping Virtual Machines for Non-stop Service (COLO)
+ * (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#ifndef NET_COLO_H
+#define NET_COLO_H
+
+#include "qemu/jhash.h"
+#include "qemu/timer.h"
+#include "net/eth.h"
+
/* connection_get() empties the tracking table once it exceeds this size. */
#define HASHTABLE_MAX_SIZE 16384

/* Fallbacks for hosts whose headers lack these IP protocol numbers. */
#ifndef IPPROTO_DCCP
#define IPPROTO_DCCP 33
#endif

#ifndef IPPROTO_SCTP
#define IPPROTO_SCTP 132
#endif

#ifndef IPPROTO_UDPLITE
#define IPPROTO_UDPLITE 136
#endif
+
/* One captured packet plus the parsing state attached to it. */
typedef struct Packet {
    /* Raw packet bytes; freed by packet_destroy(). */
    void *data;
    /* Start of the network header, also viewable as an IP header. */
    union {
        uint8_t *network_header;
        struct ip *ip;
    };
    /* Start of the transport header. */
    uint8_t *transport_header;
    /* Number of bytes in data. */
    int size;
    /* Time of packet creation, in wall clock ms */
    int64_t creation_ms;
    /* vnet_hdr_len as received from the filter */
    uint32_t vnet_hdr_len;
    /* TCP sequence number */
    uint32_t tcp_seq;
    /* TCP acknowledgement number */
    uint32_t tcp_ack;
    /* Sequence number of the last byte of the packet */
    uint32_t seq_end;
    /* Header length */
    uint8_t header_size;
    /* Payload length */
    uint16_t payload_size;
    /* Payload offset, i.e. the length that has already been compared */
    uint16_t offset;
    /* Flags (aka control bits) */
    uint8_t flags;
} Packet;
+
+typedef struct ConnectionKey {
+ /* (src, dst) must be grouped, in the same way than in IP header */
+ struct in_addr src;
+ struct in_addr dst;
+ uint16_t src_port;
+ uint16_t dst_port;
+ uint8_t ip_proto;
+} QEMU_PACKED ConnectionKey;
+
+typedef struct Connection {
+ /* connection primary send queue: element type: Packet */
+ GQueue primary_list;
+ /* connection secondary send queue: element type: Packet */
+ GQueue secondary_list;
+ /* flag to enqueue unprocessed_connections */
+ bool processing;
+ uint8_t ip_proto;
+ /* record the sequence number that has been compared */
+ uint32_t compare_seq;
+ /* the maximum of acknowledgement number in primary_list queue */
+ uint32_t pack;
+ /* the maximum of acknowledgement number in secondary_list queue */
+ uint32_t sack;
+ /* offset = secondary_seq - primary_seq */
+ uint32_t offset;
+
+ int tcp_state; /* TCP FSM state */
+ uint32_t fin_ack_seq; /* the seq of 'fin=1,ack=1' */
+} Connection;
+
+uint32_t connection_key_hash(const void *opaque);
+int connection_key_equal(const void *opaque1, const void *opaque2);
+int parse_packet_early(Packet *pkt);
+void extract_ip_and_port(uint32_t tmp_ports, ConnectionKey *key,
+ Packet *pkt, bool reverse);
+void fill_connection_key(Packet *pkt, ConnectionKey *key, bool reverse);
+Connection *connection_new(ConnectionKey *key);
+void connection_destroy(void *opaque);
+Connection *connection_get(GHashTable *connection_track_table,
+ ConnectionKey *key,
+ GQueue *conn_list);
+bool connection_has_tracked(GHashTable *connection_track_table,
+ ConnectionKey *key);
+void connection_hashtable_reset(GHashTable *connection_track_table);
+Packet *packet_new(const void *data, int size, int vnet_hdr_len);
+Packet *packet_new_nocopy(void *data, int size, int vnet_hdr_len);
+void packet_destroy(void *opaque, void *user_data);
+void packet_destroy_partial(void *opaque, void *user_data);
+
+#endif /* NET_COLO_H */
diff --git a/net/dump.c b/net/dump.c
new file mode 100644
index 000000000..a07ba6240
--- /dev/null
+++ b/net/dump.c
@@ -0,0 +1,263 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "clients.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/iov.h"
+#include "qemu/module.h"
+#include "qemu/timer.h"
+#include "qapi/visitor.h"
+#include "net/filter.h"
+#include "qom/object.h"
+
/* State of one capture file. */
typedef struct DumpState {
    /* Value added to the seconds part of every packet timestamp. */
    int64_t start_ts;
    /* Capture file descriptor; a negative value means dumping is stopped. */
    int fd;
    /* Maximum number of payload bytes stored per packet. */
    int pcap_caplen;
} DumpState;
+
/* Magic number written into pcap_file_hdr.magic. */
#define PCAP_MAGIC 0xa1b2c3d4

/* Header written once at the start of the capture file. */
struct pcap_file_hdr {
    uint32_t magic;
    uint16_t version_major;
    uint16_t version_minor;
    int32_t thiszone;
    uint32_t sigfigs;
    uint32_t snaplen;
    uint32_t linktype;
};

/* Record header written in front of every captured packet. */
struct pcap_sf_pkthdr {
    struct {
        int32_t tv_sec;
        int32_t tv_usec;
    } ts;
    /* Number of bytes actually stored. */
    uint32_t caplen;
    /* Full length of the packet. */
    uint32_t len;
};
+
+static ssize_t dump_receive_iov(DumpState *s, const struct iovec *iov, int cnt)
+{
+ struct pcap_sf_pkthdr hdr;
+ int64_t ts;
+ int caplen;
+ size_t size = iov_size(iov, cnt);
+ struct iovec dumpiov[cnt + 1];
+
+ /* Early return in case of previous error. */
+ if (s->fd < 0) {
+ return size;
+ }
+
+ ts = qemu_clock_get_us(QEMU_CLOCK_VIRTUAL);
+ caplen = size > s->pcap_caplen ? s->pcap_caplen : size;
+
+ hdr.ts.tv_sec = ts / 1000000 + s->start_ts;
+ hdr.ts.tv_usec = ts % 1000000;
+ hdr.caplen = caplen;
+ hdr.len = size;
+
+ dumpiov[0].iov_base = &hdr;
+ dumpiov[0].iov_len = sizeof(hdr);
+ cnt = iov_copy(&dumpiov[1], cnt, iov, cnt, 0, caplen);
+
+ if (writev(s->fd, dumpiov, cnt + 1) != sizeof(hdr) + caplen) {
+ error_report("network dump write error - stopping dump");
+ close(s->fd);
+ s->fd = -1;
+ }
+
+ return size;
+}
+
+static void dump_cleanup(DumpState *s)
+{
+ close(s->fd);
+ s->fd = -1;
+}
+
+static int net_dump_state_init(DumpState *s, const char *filename,
+ int len, Error **errp)
+{
+ struct pcap_file_hdr hdr;
+ struct tm tm;
+ int fd;
+
+ fd = open(filename, O_CREAT | O_TRUNC | O_WRONLY | O_BINARY, 0644);
+ if (fd < 0) {
+ error_setg_errno(errp, errno, "net dump: can't open %s", filename);
+ return -1;
+ }
+
+ hdr.magic = PCAP_MAGIC;
+ hdr.version_major = 2;
+ hdr.version_minor = 4;
+ hdr.thiszone = 0;
+ hdr.sigfigs = 0;
+ hdr.snaplen = len;
+ hdr.linktype = 1;
+
+ if (write(fd, &hdr, sizeof(hdr)) < sizeof(hdr)) {
+ error_setg_errno(errp, errno, "net dump write error");
+ close(fd);
+ return -1;
+ }
+
+ s->fd = fd;
+ s->pcap_caplen = len;
+
+ qemu_get_timedate(&tm, 0);
+ s->start_ts = mktime(&tm);
+
+ return 0;
+}
+
+#define TYPE_FILTER_DUMP "filter-dump"
+
+OBJECT_DECLARE_SIMPLE_TYPE(NetFilterDumpState, FILTER_DUMP)
+
+struct NetFilterDumpState {
+ NetFilterState nfs;
+ DumpState ds;
+ char *filename;
+ uint32_t maxlen;
+};
+
+static ssize_t filter_dump_receive_iov(NetFilterState *nf, NetClientState *sndr,
+ unsigned flags, const struct iovec *iov,
+ int iovcnt, NetPacketSent *sent_cb)
+{
+ NetFilterDumpState *nfds = FILTER_DUMP(nf);
+
+ dump_receive_iov(&nfds->ds, iov, iovcnt);
+ return 0;
+}
+
+static void filter_dump_cleanup(NetFilterState *nf)
+{
+ NetFilterDumpState *nfds = FILTER_DUMP(nf);
+
+ dump_cleanup(&nfds->ds);
+}
+
+static void filter_dump_setup(NetFilterState *nf, Error **errp)
+{
+ NetFilterDumpState *nfds = FILTER_DUMP(nf);
+
+ if (!nfds->filename) {
+ error_setg(errp, "dump filter needs 'file' property set!");
+ return;
+ }
+
+ net_dump_state_init(&nfds->ds, nfds->filename, nfds->maxlen, errp);
+}
+
+static void filter_dump_get_maxlen(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ NetFilterDumpState *nfds = FILTER_DUMP(obj);
+ uint32_t value = nfds->maxlen;
+
+ visit_type_uint32(v, name, &value, errp);
+}
+
+static void filter_dump_set_maxlen(Object *obj, Visitor *v, const char *name,
+ void *opaque, Error **errp)
+{
+ NetFilterDumpState *nfds = FILTER_DUMP(obj);
+ uint32_t value;
+
+ if (!visit_type_uint32(v, name, &value, errp)) {
+ return;
+ }
+ if (value == 0) {
+ error_setg(errp, "Property '%s.%s' doesn't take value '%u'",
+ object_get_typename(obj), name, value);
+ return;
+ }
+ nfds->maxlen = value;
+}
+
+static char *file_dump_get_filename(Object *obj, Error **errp)
+{
+ NetFilterDumpState *nfds = FILTER_DUMP(obj);
+
+ return g_strdup(nfds->filename);
+}
+
+static void file_dump_set_filename(Object *obj, const char *value, Error **errp)
+{
+ NetFilterDumpState *nfds = FILTER_DUMP(obj);
+
+ g_free(nfds->filename);
+ nfds->filename = g_strdup(value);
+}
+
+static void filter_dump_instance_init(Object *obj)
+{
+ NetFilterDumpState *nfds = FILTER_DUMP(obj);
+
+ nfds->maxlen = 65536;
+}
+
+static void filter_dump_instance_finalize(Object *obj)
+{
+ NetFilterDumpState *nfds = FILTER_DUMP(obj);
+
+ g_free(nfds->filename);
+}
+
+static void filter_dump_class_init(ObjectClass *oc, void *data)
+{
+ NetFilterClass *nfc = NETFILTER_CLASS(oc);
+
+ object_class_property_add(oc, "maxlen", "uint32", filter_dump_get_maxlen,
+ filter_dump_set_maxlen, NULL, NULL);
+ object_class_property_add_str(oc, "file", file_dump_get_filename,
+ file_dump_set_filename);
+
+ nfc->setup = filter_dump_setup;
+ nfc->cleanup = filter_dump_cleanup;
+ nfc->receive_iov = filter_dump_receive_iov;
+}
+
+static const TypeInfo filter_dump_info = {
+ .name = TYPE_FILTER_DUMP,
+ .parent = TYPE_NETFILTER,
+ .class_init = filter_dump_class_init,
+ .instance_init = filter_dump_instance_init,
+ .instance_finalize = filter_dump_instance_finalize,
+ .instance_size = sizeof(NetFilterDumpState),
+};
+
+static void filter_dump_register_types(void)
+{
+ type_register_static(&filter_dump_info);
+}
+
+type_init(filter_dump_register_types);
diff --git a/net/eth.c b/net/eth.c
new file mode 100644
index 000000000..fe876d1a5
--- /dev/null
+++ b/net/eth.c
@@ -0,0 +1,567 @@
+/*
+ * QEMU network structures definitions and helper functions
+ *
+ * Copyright (c) 2012 Ravello Systems LTD (http://ravellosystems.com)
+ *
+ * Developed by Daynix Computing LTD (http://www.daynix.com)
+ *
+ * Authors:
+ * Dmitry Fleytman <dmitry@daynix.com>
+ * Tamir Shomer <tamirs@daynix.com>
+ * Yan Vugenfirer <yan@daynix.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "net/eth.h"
+#include "net/checksum.h"
+#include "net/tap.h"
+
+void eth_setup_vlan_headers_ex(struct eth_header *ehdr, uint16_t vlan_tag,
+ uint16_t vlan_ethtype, bool *is_new)
+{
+ struct vlan_header *vhdr = PKT_GET_VLAN_HDR(ehdr);
+
+ switch (be16_to_cpu(ehdr->h_proto)) {
+ case ETH_P_VLAN:
+ case ETH_P_DVLAN:
+ /* vlan hdr exists */
+ *is_new = false;
+ break;
+
+ default:
+ /* No VLAN header, put a new one */
+ vhdr->h_proto = ehdr->h_proto;
+ ehdr->h_proto = cpu_to_be16(vlan_ethtype);
+ *is_new = true;
+ break;
+ }
+ vhdr->h_tci = cpu_to_be16(vlan_tag);
+}
+
+uint8_t
+eth_get_gso_type(uint16_t l3_proto, uint8_t *l3_hdr, uint8_t l4proto)
+{
+ uint8_t ecn_state = 0;
+
+ if (l3_proto == ETH_P_IP) {
+ struct ip_header *iphdr = (struct ip_header *) l3_hdr;
+
+ if (IP_HEADER_VERSION(iphdr) == IP_HEADER_VERSION_4) {
+ if (IPTOS_ECN(iphdr->ip_tos) == IPTOS_ECN_CE) {
+ ecn_state = VIRTIO_NET_HDR_GSO_ECN;
+ }
+ if (l4proto == IP_PROTO_TCP) {
+ return VIRTIO_NET_HDR_GSO_TCPV4 | ecn_state;
+ } else if (l4proto == IP_PROTO_UDP) {
+ return VIRTIO_NET_HDR_GSO_UDP | ecn_state;
+ }
+ }
+ } else if (l3_proto == ETH_P_IPV6) {
+ struct ip6_header *ip6hdr = (struct ip6_header *) l3_hdr;
+
+ if (IP6_ECN(ip6hdr->ip6_ecn_acc) == IP6_ECN_CE) {
+ ecn_state = VIRTIO_NET_HDR_GSO_ECN;
+ }
+
+ if (l4proto == IP_PROTO_TCP) {
+ return VIRTIO_NET_HDR_GSO_TCPV6 | ecn_state;
+ }
+ }
+ qemu_log_mask(LOG_UNIMP, "%s: probably not GSO frame, "
+ "unknown L3 protocol: 0x%04"PRIx16"\n", __func__, l3_proto);
+
+ return VIRTIO_NET_HDR_GSO_NONE | ecn_state;
+}
+
+uint16_t
+eth_get_l3_proto(const struct iovec *l2hdr_iov, int iovcnt, size_t l2hdr_len)
+{
+ uint16_t proto;
+ size_t copied;
+ size_t size = iov_size(l2hdr_iov, iovcnt);
+ size_t proto_offset = l2hdr_len - sizeof(proto);
+
+ if (size < proto_offset) {
+ return ETH_P_UNKNOWN;
+ }
+
+ copied = iov_to_buf(l2hdr_iov, iovcnt, proto_offset,
+ &proto, sizeof(proto));
+
+ return (copied == sizeof(proto)) ? be16_to_cpu(proto) : ETH_P_UNKNOWN;
+}
+
/*
 * Copy @length bytes starting at @offset of the iovec into @buffer.
 *
 * Returns true only when @offset does not exceed @input_size and the
 * whole chunk could be copied.
 */
static bool
_eth_copy_chunk(size_t input_size,
                const struct iovec *iov, int iovcnt,
                size_t offset, size_t length,
                void *buffer)
{
    if (input_size < offset) {
        return false;
    }

    return iov_to_buf(iov, iovcnt, offset, buffer, length) >= length;
}
+
+static bool
+_eth_tcp_has_data(bool is_ip4,
+ const struct ip_header *ip4_hdr,
+ const struct ip6_header *ip6_hdr,
+ size_t full_ip6hdr_len,
+ const struct tcp_header *tcp)
+{
+ uint32_t l4len;
+
+ if (is_ip4) {
+ l4len = be16_to_cpu(ip4_hdr->ip_len) - IP_HDR_GET_LEN(ip4_hdr);
+ } else {
+ size_t opts_len = full_ip6hdr_len - sizeof(struct ip6_header);
+ l4len = be16_to_cpu(ip6_hdr->ip6_ctlun.ip6_un1.ip6_un1_plen) - opts_len;
+ }
+
+ return l4len > TCP_HEADER_DATA_OFFSET(tcp);
+}
+
+void eth_get_protocols(const struct iovec *iov, int iovcnt,
+ bool *isip4, bool *isip6,
+ bool *isudp, bool *istcp,
+ size_t *l3hdr_off,
+ size_t *l4hdr_off,
+ size_t *l5hdr_off,
+ eth_ip6_hdr_info *ip6hdr_info,
+ eth_ip4_hdr_info *ip4hdr_info,
+ eth_l4_hdr_info *l4hdr_info)
+{
+ int proto;
+ bool fragment = false;
+ size_t l2hdr_len = eth_get_l2_hdr_length_iov(iov, iovcnt);
+ size_t input_size = iov_size(iov, iovcnt);
+ size_t copied;
+
+ *isip4 = *isip6 = *isudp = *istcp = false;
+
+ proto = eth_get_l3_proto(iov, iovcnt, l2hdr_len);
+
+ *l3hdr_off = l2hdr_len;
+
+ if (proto == ETH_P_IP) {
+ struct ip_header *iphdr = &ip4hdr_info->ip4_hdr;
+
+ if (input_size < l2hdr_len) {
+ return;
+ }
+
+ copied = iov_to_buf(iov, iovcnt, l2hdr_len, iphdr, sizeof(*iphdr));
+
+ *isip4 = true;
+
+ if (copied < sizeof(*iphdr)) {
+ return;
+ }
+
+ if (IP_HEADER_VERSION(iphdr) == IP_HEADER_VERSION_4) {
+ if (iphdr->ip_p == IP_PROTO_TCP) {
+ *istcp = true;
+ } else if (iphdr->ip_p == IP_PROTO_UDP) {
+ *isudp = true;
+ }
+ }
+
+ ip4hdr_info->fragment = IP4_IS_FRAGMENT(iphdr);
+ *l4hdr_off = l2hdr_len + IP_HDR_GET_LEN(iphdr);
+
+ fragment = ip4hdr_info->fragment;
+ } else if (proto == ETH_P_IPV6) {
+
+ *isip6 = true;
+ if (eth_parse_ipv6_hdr(iov, iovcnt, l2hdr_len,
+ ip6hdr_info)) {
+ if (ip6hdr_info->l4proto == IP_PROTO_TCP) {
+ *istcp = true;
+ } else if (ip6hdr_info->l4proto == IP_PROTO_UDP) {
+ *isudp = true;
+ }
+ } else {
+ return;
+ }
+
+ *l4hdr_off = l2hdr_len + ip6hdr_info->full_hdr_len;
+ fragment = ip6hdr_info->fragment;
+ }
+
+ if (!fragment) {
+ if (*istcp) {
+ *istcp = _eth_copy_chunk(input_size,
+ iov, iovcnt,
+ *l4hdr_off, sizeof(l4hdr_info->hdr.tcp),
+ &l4hdr_info->hdr.tcp);
+
+ if (*istcp) {
+ *l5hdr_off = *l4hdr_off +
+ TCP_HEADER_DATA_OFFSET(&l4hdr_info->hdr.tcp);
+
+ l4hdr_info->has_tcp_data =
+ _eth_tcp_has_data(proto == ETH_P_IP,
+ &ip4hdr_info->ip4_hdr,
+ &ip6hdr_info->ip6_hdr,
+ *l4hdr_off - *l3hdr_off,
+ &l4hdr_info->hdr.tcp);
+ }
+ } else if (*isudp) {
+ *isudp = _eth_copy_chunk(input_size,
+ iov, iovcnt,
+ *l4hdr_off, sizeof(l4hdr_info->hdr.udp),
+ &l4hdr_info->hdr.udp);
+ *l5hdr_off = *l4hdr_off + sizeof(l4hdr_info->hdr.udp);
+ }
+ }
+}
+
+size_t
+eth_strip_vlan(const struct iovec *iov, int iovcnt, size_t iovoff,
+ uint8_t *new_ehdr_buf,
+ uint16_t *payload_offset, uint16_t *tci)
+{
+ struct vlan_header vlan_hdr;
+ struct eth_header *new_ehdr = (struct eth_header *) new_ehdr_buf;
+
+ size_t copied = iov_to_buf(iov, iovcnt, iovoff,
+ new_ehdr, sizeof(*new_ehdr));
+
+ if (copied < sizeof(*new_ehdr)) {
+ return 0;
+ }
+
+ switch (be16_to_cpu(new_ehdr->h_proto)) {
+ case ETH_P_VLAN:
+ case ETH_P_DVLAN:
+ copied = iov_to_buf(iov, iovcnt, iovoff + sizeof(*new_ehdr),
+ &vlan_hdr, sizeof(vlan_hdr));
+
+ if (copied < sizeof(vlan_hdr)) {
+ return 0;
+ }
+
+ new_ehdr->h_proto = vlan_hdr.h_proto;
+
+ *tci = be16_to_cpu(vlan_hdr.h_tci);
+ *payload_offset = iovoff + sizeof(*new_ehdr) + sizeof(vlan_hdr);
+
+ if (be16_to_cpu(new_ehdr->h_proto) == ETH_P_VLAN) {
+
+ copied = iov_to_buf(iov, iovcnt, *payload_offset,
+ PKT_GET_VLAN_HDR(new_ehdr), sizeof(vlan_hdr));
+
+ if (copied < sizeof(vlan_hdr)) {
+ return 0;
+ }
+
+ *payload_offset += sizeof(vlan_hdr);
+
+ return sizeof(struct eth_header) + sizeof(struct vlan_header);
+ } else {
+ return sizeof(struct eth_header);
+ }
+ default:
+ return 0;
+ }
+}
+
+size_t
+eth_strip_vlan_ex(const struct iovec *iov, int iovcnt, size_t iovoff,
+ uint16_t vet, uint8_t *new_ehdr_buf,
+ uint16_t *payload_offset, uint16_t *tci)
+{
+ struct vlan_header vlan_hdr;
+ struct eth_header *new_ehdr = (struct eth_header *) new_ehdr_buf;
+
+ size_t copied = iov_to_buf(iov, iovcnt, iovoff,
+ new_ehdr, sizeof(*new_ehdr));
+
+ if (copied < sizeof(*new_ehdr)) {
+ return 0;
+ }
+
+ if (be16_to_cpu(new_ehdr->h_proto) == vet) {
+ copied = iov_to_buf(iov, iovcnt, iovoff + sizeof(*new_ehdr),
+ &vlan_hdr, sizeof(vlan_hdr));
+
+ if (copied < sizeof(vlan_hdr)) {
+ return 0;
+ }
+
+ new_ehdr->h_proto = vlan_hdr.h_proto;
+
+ *tci = be16_to_cpu(vlan_hdr.h_tci);
+ *payload_offset = iovoff + sizeof(*new_ehdr) + sizeof(vlan_hdr);
+ return sizeof(struct eth_header);
+ }
+
+ return 0;
+}
+
+/*
+ * Fill in the IPv4 fragmentation fields of l3hdr for a single fragment.
+ *
+ * Nothing is modified unless eth_get_l3_proto() reports ETH_P_IP for the
+ * supplied L2 header.  frag_offset is given in bytes and must be a multiple
+ * of IP_FRAG_UNIT_SIZE (asserted).  ip_len is set to
+ * l3payload_len + l3hdr_len.
+ */
+void
+eth_setup_ip4_fragmentation(const void *l2hdr, size_t l2hdr_len,
+                            void *l3hdr, size_t l3hdr_len,
+                            size_t l3payload_len,
+                            size_t frag_offset, bool more_frags)
+{
+    const struct iovec l2vec = {
+        .iov_base = (void *) l2hdr,
+        .iov_len = l2hdr_len
+    };
+    struct ip_header *iphdr;
+    uint16_t frag_off_units;
+    uint16_t kept_flags;
+    uint16_t new_ip_off;
+
+    if (eth_get_l3_proto(&l2vec, 1, l2hdr_len) != ETH_P_IP) {
+        return;
+    }
+
+    iphdr = (struct ip_header *) l3hdr;
+    frag_off_units = frag_offset / IP_FRAG_UNIT_SIZE;
+
+    assert(frag_offset % IP_FRAG_UNIT_SIZE == 0);
+    assert((frag_off_units & ~IP_OFFMASK) == 0);
+
+    /* Preserve every bit outside the offset field and the MF flag. */
+    kept_flags = be16_to_cpu(iphdr->ip_off) & ~(IP_OFFMASK|IP_MF);
+    new_ip_off = frag_off_units | kept_flags;
+    if (more_frags) {
+        new_ip_off |= IP_MF;
+    }
+
+    iphdr->ip_off = cpu_to_be16(new_ip_off);
+    iphdr->ip_len = cpu_to_be16(l3payload_len + l3hdr_len);
+}
+
+/*
+ * Recompute the checksum of the IPv4 header at l3hdr (l3hdr_len bytes) and
+ * store it, big-endian, in the header's ip_sum field.
+ */
+void
+eth_fix_ip4_checksum(void *l3hdr, size_t l3hdr_len)
+{
+    struct ip_header *ip = (struct ip_header *) l3hdr;
+
+    /* Zero the field first so the old value does not enter the new sum. */
+    ip->ip_sum = 0;
+    ip->ip_sum = cpu_to_be16(net_raw_checksum(l3hdr, l3hdr_len));
+}
+
+/*
+ * Build the IPv4 pseudo header for iphdr with payload length csl and return
+ * net_checksum_add() over it.  *cso is set to the pseudo header size.
+ */
+uint32_t
+eth_calc_ip4_pseudo_hdr_csum(struct ip_header *iphdr,
+                             uint16_t csl,
+                             uint32_t *cso)
+{
+    struct ip_pseudo_header pseudo;
+
+    pseudo.zeros = 0;
+    pseudo.ip_proto = iphdr->ip_p;
+    pseudo.ip_payload = cpu_to_be16(csl);
+    pseudo.ip_src = iphdr->ip_src;
+    pseudo.ip_dst = iphdr->ip_dst;
+
+    *cso = sizeof(pseudo);
+    return net_checksum_add(*cso, (uint8_t *) &pseudo);
+}
+
+/*
+ * Build the IPv6 pseudo header for iphdr with payload length csl and next
+ * header l4_proto, and return net_checksum_add() over it.  *cso is set to
+ * the pseudo header size.
+ */
+uint32_t
+eth_calc_ip6_pseudo_hdr_csum(struct ip6_header *iphdr,
+                             uint16_t csl,
+                             uint8_t l4_proto,
+                             uint32_t *cso)
+{
+    struct ip6_pseudo_header pseudo;
+
+    pseudo.next_hdr = l4_proto;
+    pseudo.zero[2] = 0;
+    pseudo.zero[1] = 0;
+    pseudo.zero[0] = 0;
+    pseudo.len = cpu_to_be16(csl);
+    pseudo.ip6_dst = iphdr->ip6_dst;
+    pseudo.ip6_src = iphdr->ip6_src;
+
+    *cso = sizeof(pseudo);
+    return net_checksum_add(*cso, (uint8_t *)&pseudo);
+}
+
+/* Tell whether hdr_type is one of the IPv6 extension header types below. */
+static bool
+eth_is_ip6_extension_header_type(uint8_t hdr_type)
+{
+    return hdr_type == IP6_HOP_BY_HOP ||
+           hdr_type == IP6_ROUTING ||
+           hdr_type == IP6_FRAGMENT ||
+           hdr_type == IP6_ESP ||
+           hdr_type == IP6_AUTHENTICATION ||
+           hdr_type == IP6_DESTINATON ||
+           hdr_type == IP6_MOBILITY;
+}
+
+/*
+ * Read the address that follows the routing extension header located at
+ * ext_hdr_offset into dst_addr.
+ *
+ * Returns true only when the packet is long enough and the routing header
+ * has rtype 2 with segleft 1.  ext_hdr is not used here.
+ */
+static bool
+_eth_get_rss_ex_dst_addr(const struct iovec *pkt, int pkt_frags,
+                        size_t ext_hdr_offset,
+                        struct ip6_ext_hdr *ext_hdr,
+                        struct in6_address *dst_addr)
+{
+    struct ip6_ext_hdr_routing routing;
+    size_t copied;
+
+    /* Length is validated up front, so the reads below cannot come short. */
+    if (iov_size(pkt, pkt_frags) <
+        ext_hdr_offset + sizeof(routing) + sizeof(*dst_addr)) {
+        return false;
+    }
+
+    copied = iov_to_buf(pkt, pkt_frags, ext_hdr_offset,
+                        &routing, sizeof(routing));
+    assert(copied == sizeof(routing));
+
+    if (routing.rtype != 2 || routing.segleft != 1) {
+        return false;
+    }
+
+    copied = iov_to_buf(pkt, pkt_frags, ext_hdr_offset + sizeof(routing),
+                        dst_addr, sizeof(*dst_addr));
+    assert(copied == sizeof(*dst_addr));
+
+    return true;
+}
+
+/*
+ * Walk the options of the destination-options extension header located at
+ * dsthdr_offset looking for an IP6_OPT_HOME option, and copy the address
+ * that follows its option header into src_addr.
+ *
+ * Returns true only when such an option is found and a full address could
+ * be read; false on truncated input, on an option that overruns the
+ * extension header, or when no such option exists.
+ */
+static bool
+_eth_get_rss_ex_src_addr(const struct iovec *pkt, int pkt_frags,
+                        size_t dsthdr_offset,
+                        struct ip6_ext_hdr *ext_hdr,
+                        struct in6_address *src_addr)
+{
+    const size_t pkt_len = iov_size(pkt, pkt_frags);
+    /* Bytes of option data remaining inside this extension header. */
+    size_t remaining = (ext_hdr->ip6r_len + 1) * 8 - sizeof(*ext_hdr);
+    size_t cursor = dsthdr_offset + sizeof(*ext_hdr);
+    struct ip6_option_hdr opt;
+
+    while (remaining > sizeof(opt)) {
+        size_t copied, optlen;
+
+        if (pkt_len < cursor) {
+            return false;
+        }
+
+        copied = iov_to_buf(pkt, pkt_frags, cursor, &opt, sizeof(opt));
+        if (copied != sizeof(opt)) {
+            return false;
+        }
+
+        /* IP6_OPT_PAD1 is a single byte; other options are header + len. */
+        if (opt.type == IP6_OPT_PAD1) {
+            optlen = 1;
+        } else {
+            optlen = opt.len + sizeof(opt);
+        }
+
+        if (optlen > remaining) {
+            return false;
+        }
+
+        if (opt.type == IP6_OPT_HOME) {
+            if (pkt_len < cursor + sizeof(opt)) {
+                return false;
+            }
+
+            copied = iov_to_buf(pkt, pkt_frags, cursor + sizeof(opt),
+                                src_addr, sizeof(*src_addr));
+
+            return copied == sizeof(*src_addr);
+        }
+
+        cursor += optlen;
+        remaining -= optlen;
+    }
+
+    return false;
+}
+
+/*
+ * Parse the IPv6 header at ip6hdr_off and every extension header chained
+ * after it, filling *info.
+ *
+ * On success (true): info->ip6_hdr holds the fixed header,
+ * info->full_hdr_len the fixed header plus all extension headers walked,
+ * info->l4proto the first next-header value that is not an extension
+ * header, info->has_ext_hdrs whether any extension header was present, and
+ * info->fragment whether a fragment header was seen.  The rss_ex_dst and
+ * rss_ex_src fields are valid only when the matching *_valid flag is set.
+ *
+ * Returns false when the packet is too short for a header that must be
+ * read; *info may be partially filled in that case.
+ */
+bool eth_parse_ipv6_hdr(const struct iovec *pkt, int pkt_frags,
+ size_t ip6hdr_off, eth_ip6_hdr_info *info)
+{
+ struct ip6_ext_hdr ext_hdr;
+ size_t bytes_read;
+ uint8_t curr_ext_hdr_type;
+ size_t input_size = iov_size(pkt, pkt_frags);
+
+ info->rss_ex_dst_valid = false;
+ info->rss_ex_src_valid = false;
+ info->fragment = false;
+
+ if (input_size < ip6hdr_off) {
+ return false;
+ }
+
+ bytes_read = iov_to_buf(pkt, pkt_frags, ip6hdr_off,
+ &info->ip6_hdr, sizeof(info->ip6_hdr));
+ if (bytes_read < sizeof(info->ip6_hdr)) {
+ return false;
+ }
+
+ info->full_hdr_len = sizeof(struct ip6_header);
+
+ curr_ext_hdr_type = info->ip6_hdr.ip6_nxt;
+
+ /* No extension headers: the next header is the L4 protocol itself. */
+ if (!eth_is_ip6_extension_header_type(curr_ext_hdr_type)) {
+ info->l4proto = info->ip6_hdr.ip6_nxt;
+ info->has_ext_hdrs = false;
+ return true;
+ }
+
+ info->has_ext_hdrs = true;
+
+ /* Walk the chain; full_hdr_len is the offset of the current header. */
+ do {
+ if (input_size < ip6hdr_off + info->full_hdr_len) {
+ return false;
+ }
+
+ bytes_read = iov_to_buf(pkt, pkt_frags, ip6hdr_off + info->full_hdr_len,
+ &ext_hdr, sizeof(ext_hdr));
+
+ if (bytes_read < sizeof(ext_hdr)) {
+ return false;
+ }
+
+ if (curr_ext_hdr_type == IP6_ROUTING) {
+ /* Only a routing header sized for exactly one address is examined. */
+ if (ext_hdr.ip6r_len == sizeof(struct in6_address) / 8) {
+ info->rss_ex_dst_valid =
+ _eth_get_rss_ex_dst_addr(pkt, pkt_frags,
+ ip6hdr_off + info->full_hdr_len,
+ &ext_hdr, &info->rss_ex_dst);
+ }
+ } else if (curr_ext_hdr_type == IP6_DESTINATON) {
+ info->rss_ex_src_valid =
+ _eth_get_rss_ex_src_addr(pkt, pkt_frags,
+ ip6hdr_off + info->full_hdr_len,
+ &ext_hdr, &info->rss_ex_src);
+ } else if (curr_ext_hdr_type == IP6_FRAGMENT) {
+ info->fragment = true;
+ }
+
+ /* ip6r_len excludes the first unit, hence the "+ 1". */
+ info->full_hdr_len += (ext_hdr.ip6r_len + 1) * IP6_EXT_GRANULARITY;
+ curr_ext_hdr_type = ext_hdr.ip6r_nxt;
+ } while (eth_is_ip6_extension_header_type(curr_ext_hdr_type));
+
+ info->l4proto = ext_hdr.ip6r_nxt;
+ return true;
+}
+
+/*
+ * Copy pkt into padded_pkt and zero-fill it up to ETH_ZLEN when pkt_size is
+ * below ETH_ZLEN.
+ *
+ * *padded_buflen must be at least ETH_ZLEN on entry (asserted).  Returns
+ * true and sets *padded_buflen to ETH_ZLEN when padding was applied;
+ * returns false, leaving both outputs untouched, otherwise.
+ */
+bool eth_pad_short_frame(uint8_t *padded_pkt, size_t *padded_buflen,
+                         const void *pkt, size_t pkt_size)
+{
+    assert(padded_buflen && *padded_buflen >= ETH_ZLEN);
+
+    if (pkt_size < ETH_ZLEN) {
+        /* pad to minimum Ethernet frame length */
+        memcpy(padded_pkt, pkt, pkt_size);
+        memset(padded_pkt + pkt_size, 0, ETH_ZLEN - pkt_size);
+        *padded_buflen = ETH_ZLEN;
+        return true;
+    }
+
+    return false;
+}
diff --git a/net/filter-buffer.c b/net/filter-buffer.c
new file mode 100644
index 000000000..283dc9cbe
--- /dev/null
+++ b/net/filter-buffer.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2015 FUJITSU LIMITED
+ * Author: Yang Hongyang <yanghy@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "net/filter.h"
+#include "net/queue.h"
+#include "qapi/error.h"
+#include "qemu/timer.h"
+#include "qemu/iov.h"
+#include "qapi/qapi-builtin-visit.h"
+#include "qapi/qmp/qerror.h"
+#include "qom/object.h"
+
+#define TYPE_FILTER_BUFFER "filter-buffer"
+
+OBJECT_DECLARE_SIMPLE_TYPE(FilterBufferState, FILTER_BUFFER)
+
+/* Per-instance state of the "filter-buffer" net filter. */
+struct FilterBufferState {
+ NetFilterState parent_obj;
+
+ /* Queue holding the buffered packets until they are flushed. */
+ NetQueue *incoming_queue;
+ /* Release period in microseconds; added to qemu_clock_get_us(). */
+ uint32_t interval;
+ /* Timer whose callback flushes incoming_queue and then rearms itself. */
+ QEMUTimer release_timer;
+};
+
+/*
+ * Try to deliver every buffered packet; whatever cannot be delivered is
+ * purged (dropped) from the queue.
+ */
+static void filter_buffer_flush(NetFilterState *nf)
+{
+    FilterBufferState *fbs = FILTER_BUFFER(nf);
+
+    if (qemu_net_queue_flush(fbs->incoming_queue)) {
+        return;
+    }
+
+    /* Unable to empty the queue, purge remaining packets */
+    qemu_net_queue_purge(fbs->incoming_queue, nf->netdev);
+}
+
+/* Timer callback: release the buffered packets, then rearm the timer. */
+static void filter_buffer_release_timer(void *opaque)
+{
+    NetFilterState *nf = opaque;
+    FilterBufferState *fbs = FILTER_BUFFER(nf);
+    int64_t next_fire;
+
+    /*
+     * Note: filter_buffer_flush() drops packets that can't be sent.
+     * TODO: They should stay queued, but currently the next filter or
+     * receiver has no way to notify us that it can receive more packets.
+     */
+    filter_buffer_flush(nf);
+
+    /* Fire again in fbs->interval microseconds. */
+    next_fire = qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + fbs->interval;
+    timer_mod(&fbs->release_timer, next_fire);
+}
+
+/* filter APIs */
+/*
+ * receive_iov hook: append the packet to the incoming queue and report it
+ * as fully consumed by returning its size.
+ */
+static ssize_t filter_buffer_receive_iov(NetFilterState *nf,
+ NetClientState *sender,
+ unsigned flags,
+ const struct iovec *iov,
+ int iovcnt,
+ NetPacketSent *sent_cb)
+{
+ FilterBufferState *s = FILTER_BUFFER(nf);
+
+ /*
+ * We return the packet size when buffering a packet, so the sender
+ * treats it as already sent and sent_cb must not be called later
+ * (hence NULL is passed as the queue callback below).
+ *
+ * FIXME: Even if the guest can't receive packets for some reason,
+ * the filter can still accept packets until its internal queue is full.
+ * For example:
+ * For some reason, the receiver could not receive more packets
+ * (.can_receive() returns false). Without a filter, at most one packet
+ * will be queued in the incoming queue and the sender's poll will be
+ * disabled until its sent_cb() is called. With a filter, it will keep
+ * receiving packets without caring about the receiver. This is
+ * suboptimal and may need more thought (e.g. keeping sent_cb).
+ */
+ qemu_net_queue_append_iov(s->incoming_queue, sender, flags,
+ iov, iovcnt, NULL);
+ return iov_size(iov, iovcnt);
+}
+
+/* cleanup hook: stop the release timer, flush and free the queue. */
+static void filter_buffer_cleanup(NetFilterState *nf)
+{
+    FilterBufferState *fbs = FILTER_BUFFER(nf);
+
+    if (fbs->interval) {
+        timer_del(&fbs->release_timer);
+    }
+
+    if (!fbs->incoming_queue) {
+        return;
+    }
+
+    /* flush packets */
+    filter_buffer_flush(nf);
+    g_free(fbs->incoming_queue);
+}
+
+/* Initialise and arm the release timer; a zero interval leaves it unset. */
+static void filter_buffer_setup_timer(NetFilterState *nf)
+{
+    FilterBufferState *fbs = FILTER_BUFFER(nf);
+    int64_t first_fire;
+
+    if (!fbs->interval) {
+        return;
+    }
+
+    timer_init_us(&fbs->release_timer, QEMU_CLOCK_VIRTUAL,
+                  filter_buffer_release_timer, nf);
+
+    /* First expiry is fbs->interval microseconds from now. */
+    first_fire = qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + fbs->interval;
+    timer_mod(&fbs->release_timer, first_fire);
+}
+
+/*
+ * setup hook: reject a zero interval, then create the packet queue and
+ * start the release timer.
+ */
+static void filter_buffer_setup(NetFilterState *nf, Error **errp)
+{
+    FilterBufferState *fbs = FILTER_BUFFER(nf);
+
+    /*
+     * A zero interval may become acceptable once VM FT solutions such as
+     * MC or COLO use this filter to release packets on demand.
+     */
+    if (fbs->interval == 0) {
+        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "interval",
+                   "a non-zero interval");
+        return;
+    }
+
+    fbs->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
+    filter_buffer_setup_timer(nf);
+}
+
+/*
+ * status_changed hook: when the filter is switched on, (re)start the
+ * release timer; when switched off, stop the timer and flush the queue.
+ */
+static void filter_buffer_status_changed(NetFilterState *nf, Error **errp)
+{
+    FilterBufferState *fbs = FILTER_BUFFER(nf);
+
+    if (nf->on) {
+        filter_buffer_setup_timer(nf);
+        return;
+    }
+
+    if (fbs->interval) {
+        timer_del(&fbs->release_timer);
+    }
+    filter_buffer_flush(nf);
+}
+
+/* Property getter for "interval": visit a copy of the current value. */
+static void filter_buffer_get_interval(Object *obj, Visitor *v,
+                                       const char *name, void *opaque,
+                                       Error **errp)
+{
+    uint32_t current = FILTER_BUFFER(obj)->interval;
+
+    visit_type_uint32(v, name, &current, errp);
+}
+
+/*
+ * Property setter for "interval": accept any non-zero uint32 and report an
+ * error through errp for zero, leaving the stored value unchanged.
+ */
+static void filter_buffer_set_interval(Object *obj, Visitor *v,
+                                       const char *name, void *opaque,
+                                       Error **errp)
+{
+    FilterBufferState *fbs = FILTER_BUFFER(obj);
+    uint32_t requested;
+
+    if (!visit_type_uint32(v, name, &requested, errp)) {
+        return;
+    }
+
+    if (requested) {
+        fbs->interval = requested;
+        return;
+    }
+
+    error_setg(errp, "Property '%s.%s' requires a positive value",
+               object_get_typename(obj), name);
+}
+
+/*
+ * Class initialiser: register the "interval" property and install the
+ * filter-buffer implementations of the NetFilterClass hooks.
+ */
+static void filter_buffer_class_init(ObjectClass *oc, void *data)
+{
+ NetFilterClass *nfc = NETFILTER_CLASS(oc);
+
+ object_class_property_add(oc, "interval", "uint32",
+ filter_buffer_get_interval,
+ filter_buffer_set_interval, NULL, NULL);
+
+ nfc->setup = filter_buffer_setup;
+ nfc->cleanup = filter_buffer_cleanup;
+ nfc->receive_iov = filter_buffer_receive_iov;
+ nfc->status_changed = filter_buffer_status_changed;
+}
+
+/* Type description for "filter-buffer", a child of TYPE_NETFILTER. */
+static const TypeInfo filter_buffer_info = {
+ .name = TYPE_FILTER_BUFFER,
+ .parent = TYPE_NETFILTER,
+ .class_init = filter_buffer_class_init,
+ .instance_size = sizeof(FilterBufferState),
+};
+
+/* Register the type above; hooked in through type_init() below. */
+static void register_types(void)
+{
+ type_register_static(&filter_buffer_info);
+}
+
+type_init(register_types);
diff --git a/net/filter-mirror.c b/net/filter-mirror.c
new file mode 100644
index 000000000..f20240cc9
--- /dev/null
+++ b/net/filter-mirror.c
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "net/filter.h"
+#include "net/net.h"
+#include "qapi/error.h"
+#include "qom/object.h"
+#include "qemu/main-loop.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+#include "chardev/char-fe.h"
+#include "qemu/iov.h"
+#include "qemu/sockets.h"
+
+#define TYPE_FILTER_MIRROR "filter-mirror"
+typedef struct MirrorState MirrorState;
+DECLARE_INSTANCE_CHECKER(MirrorState, FILTER_MIRROR,
+ TYPE_FILTER_MIRROR)
+
+#define TYPE_FILTER_REDIRECTOR "filter-redirector"
+DECLARE_INSTANCE_CHECKER(MirrorState, FILTER_REDIRECTOR,
+ TYPE_FILTER_REDIRECTOR)
+
+#define REDIRECTOR_MAX_LEN NET_BUFSIZE
+
+/* State shared by the "filter-mirror" and "filter-redirector" types. */
+struct MirrorState {
+ NetFilterState parent_obj;
+ /* Name of the input chardev (set via the redirector "indev" property). */
+ char *indev;
+ /* Name of the output chardev (set via the "outdev" property). */
+ char *outdev;
+ /* Front ends bound to the chardevs named by indev / outdev. */
+ CharBackend chr_in;
+ CharBackend chr_out;
+ /* Reassembly state for the byte stream read from chr_in. */
+ SocketReadState rs;
+ /* When true, filter_send() also emits the netdev's vnet header length. */
+ bool vnet_hdr;
+};
+
+/*
+ * Write one packet to s->chr_out, framed as:
+ *   - 32-bit packet size in network byte order,
+ *   - (only when s->vnet_hdr is set) 32-bit vnet header length of the
+ *     filter's netdev, network byte order,
+ *   - the packet bytes, linearised from the iovec.
+ *
+ * Returns the packet size on success and 0 for an empty packet.  On a
+ * failed or short write it returns the negative value reported by
+ * qemu_chr_fe_write_all(), or -EIO when that value was not negative.
+ */
+static int filter_send(MirrorState *s,
+                       const struct iovec *iov,
+                       int iovcnt)
+{
+    NetFilterState *nf = NETFILTER(s);
+    ssize_t size = iov_size(iov, iovcnt);
+    uint32_t len = 0;
+    int ret = 0;
+    char *buf;
+
+    if (!size) {
+        return 0;
+    }
+
+    len = htonl(size);
+    ret = qemu_chr_fe_write_all(&s->chr_out, (uint8_t *)&len, sizeof(len));
+    if (ret != sizeof(len)) {
+        return ret < 0 ? ret : -EIO;
+    }
+
+    if (s->vnet_hdr) {
+        /*
+         * With vnet_hdr = on, the vnet header length is sent as well so
+         * that other modules (like colo-compare) know how to parse the
+         * packet correctly.
+         */
+        ssize_t vnet_hdr_len = nf->netdev->vnet_hdr_len;
+
+        len = htonl(vnet_hdr_len);
+        ret = qemu_chr_fe_write_all(&s->chr_out, (uint8_t *)&len, sizeof(len));
+        if (ret != sizeof(len)) {
+            return ret < 0 ? ret : -EIO;
+        }
+    }
+
+    buf = g_malloc(size);
+    iov_to_buf(iov, iovcnt, 0, buf, size);
+    ret = qemu_chr_fe_write_all(&s->chr_out, (uint8_t *)buf, size);
+    g_free(buf);
+    if (ret != size) {
+        return ret < 0 ? ret : -EIO;
+    }
+
+    return size;
+}
+
+/*
+ * Inject a packet received from the input chardev into the filter chain
+ * after nf.  Depending on nf->direction it is passed on as coming from the
+ * netdev (TX), from the netdev's peer (RX), or both (ALL).
+ */
+static void redirector_to_filter(NetFilterState *nf,
+                                 const uint8_t *buf,
+                                 int len)
+{
+    const bool both = (nf->direction == NET_FILTER_DIRECTION_ALL);
+    struct iovec iov = {
+        .iov_base = (void *)buf,
+        .iov_len = len,
+    };
+
+    if (both || nf->direction == NET_FILTER_DIRECTION_TX) {
+        qemu_netfilter_pass_to_next(nf->netdev, 0, &iov, 1, nf);
+    }
+
+    if (both || nf->direction == NET_FILTER_DIRECTION_RX) {
+        qemu_netfilter_pass_to_next(nf->netdev->peer, 0, &iov, 1, nf);
+    }
+}
+
+/* chardev can-read handler: always offer REDIRECTOR_MAX_LEN bytes. */
+static int redirector_chr_can_read(void *opaque)
+{
+ return REDIRECTOR_MAX_LEN;
+}
+
+/*
+ * chardev read handler: feed the bytes into the packet reassembly state.
+ * If net_fill_rstate() reports -1 the input handlers are removed.
+ */
+static void redirector_chr_read(void *opaque, const uint8_t *buf, int size)
+{
+    NetFilterState *nf = opaque;
+    MirrorState *ms = FILTER_REDIRECTOR(nf);
+
+    if (net_fill_rstate(&ms->rs, buf, size) == -1) {
+        qemu_chr_fe_set_handlers(&ms->chr_in, NULL, NULL, NULL,
+                                 NULL, NULL, NULL, true);
+    }
+}
+
+/*
+ * chardev event handler: on CHR_EVENT_CLOSED remove the input handlers;
+ * every other event is ignored.
+ */
+static void redirector_chr_event(void *opaque, QEMUChrEvent event)
+{
+    NetFilterState *nf = opaque;
+    MirrorState *ms = FILTER_REDIRECTOR(nf);
+
+    if (event == CHR_EVENT_CLOSED) {
+        qemu_chr_fe_set_handlers(&ms->chr_in, NULL, NULL, NULL,
+                                 NULL, NULL, NULL, true);
+    }
+}
+
+/*
+ * receive_iov hook of filter-mirror: copy the packet to the output chardev
+ * and always return 0, so a mirroring failure is only reported and never
+ * interrupts the normal packet path.
+ */
+static ssize_t filter_mirror_receive_iov(NetFilterState *nf,
+                                         NetClientState *sender,
+                                         unsigned flags,
+                                         const struct iovec *iov,
+                                         int iovcnt,
+                                         NetPacketSent *sent_cb)
+{
+    MirrorState *ms = FILTER_MIRROR(nf);
+    int rc = filter_send(ms, iov, iovcnt);
+
+    if (rc < 0) {
+        error_report("filter mirror send failed(%s)", strerror(-rc));
+    }
+
+    return 0;
+}
+
+/*
+ * receive_iov hook of filter-redirector: when the output chardev is
+ * connected, send the packet there and return filter_send()'s result
+ * (reporting a negative one); otherwise return 0.
+ */
+static ssize_t filter_redirector_receive_iov(NetFilterState *nf,
+                                             NetClientState *sender,
+                                             unsigned flags,
+                                             const struct iovec *iov,
+                                             int iovcnt,
+                                             NetPacketSent *sent_cb)
+{
+    MirrorState *ms = FILTER_REDIRECTOR(nf);
+    int rc;
+
+    if (!qemu_chr_fe_backend_connected(&ms->chr_out)) {
+        return 0;
+    }
+
+    rc = filter_send(ms, iov, iovcnt);
+    if (rc < 0) {
+        error_report("filter redirector send failed(%s)", strerror(-rc));
+    }
+    return rc;
+}
+
+/* cleanup hook of filter-mirror: release the output chardev front end. */
+static void filter_mirror_cleanup(NetFilterState *nf)
+{
+ MirrorState *s = FILTER_MIRROR(nf);
+
+ qemu_chr_fe_deinit(&s->chr_out, false);
+}
+
+/* cleanup hook of filter-redirector: release both chardev front ends. */
+static void filter_redirector_cleanup(NetFilterState *nf)
+{
+ MirrorState *s = FILTER_REDIRECTOR(nf);
+
+ qemu_chr_fe_deinit(&s->chr_in, false);
+ qemu_chr_fe_deinit(&s->chr_out, false);
+}
+
+/*
+ * setup hook of filter-mirror: look up the chardev named by 'outdev' and
+ * bind chr_out to it.  A missing property or an unknown chardev is
+ * reported through errp.
+ */
+static void filter_mirror_setup(NetFilterState *nf, Error **errp)
+{
+    MirrorState *ms = FILTER_MIRROR(nf);
+    Chardev *out;
+
+    if (!ms->outdev) {
+        error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
+                  "filter-mirror parameter"
+                  " 'outdev' cannot be empty");
+        return;
+    }
+
+    out = qemu_chr_find(ms->outdev);
+    if (!out) {
+        error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
+                  "Device '%s' not found", ms->outdev);
+        return;
+    }
+
+    qemu_chr_fe_init(&ms->chr_out, out, errp);
+}
+
+/*
+ * Finalize callback passed to net_socket_rs_init(): hand the packet held
+ * in rs (rs->buf, rs->packet_len) to the filter chain.
+ */
+static void redirector_rs_finalize(SocketReadState *rs)
+{
+ /* rs is embedded in MirrorState, so the owner can be recovered from it. */
+ MirrorState *s = container_of(rs, MirrorState, rs);
+ NetFilterState *nf = NETFILTER(s);
+
+ redirector_to_filter(nf, rs->buf, rs->packet_len);
+}
+
+/*
+ * setup hook of filter-redirector.
+ *
+ * At least one of 'indev' / 'outdev' must be set, and they must not name
+ * the same chardev.  When 'indev' is set, chr_in is bound to it and the
+ * read/event handlers are installed; when 'outdev' is set, chr_out is
+ * bound to it.  Failures are reported through errp.
+ */
+static void filter_redirector_setup(NetFilterState *nf, Error **errp)
+{
+ MirrorState *s = FILTER_REDIRECTOR(nf);
+ Chardev *chr;
+
+ if (!s->indev && !s->outdev) {
+ error_setg(errp, "filter redirector needs 'indev' or "
+ "'outdev' at least one property set");
+ return;
+ } else if (s->indev && s->outdev) {
+ if (!strcmp(s->indev, s->outdev)) {
+ error_setg(errp, "'indev' and 'outdev' could not be same "
+ "for filter redirector");
+ return;
+ }
+ }
+
+ /* Completed packets are delivered through redirector_rs_finalize(). */
+ net_socket_rs_init(&s->rs, redirector_rs_finalize, s->vnet_hdr);
+
+ if (s->indev) {
+ chr = qemu_chr_find(s->indev);
+ if (chr == NULL) {
+ error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
+ "IN Device '%s' not found", s->indev);
+ return;
+ }
+
+ if (!qemu_chr_fe_init(&s->chr_in, chr, errp)) {
+ return;
+ }
+
+ qemu_chr_fe_set_handlers(&s->chr_in, redirector_chr_can_read,
+ redirector_chr_read, redirector_chr_event,
+ NULL, nf, NULL, true);
+ }
+
+ if (s->outdev) {
+ chr = qemu_chr_find(s->outdev);
+ if (chr == NULL) {
+ error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
+ "OUT Device '%s' not found", s->outdev);
+ return;
+ }
+ if (!qemu_chr_fe_init(&s->chr_out, chr, errp)) {
+ return;
+ }
+ }
+}
+
+/* Getter for "indev": returns a newly allocated copy of the name. */
+static char *filter_redirector_get_indev(Object *obj, Error **errp)
+{
+    return g_strdup(FILTER_REDIRECTOR(obj)->indev);
+}
+
+/* Setter for "indev": replace the stored name with a copy of value. */
+static void filter_redirector_set_indev(Object *obj,
+                                        const char *value,
+                                        Error **errp)
+{
+    MirrorState *ms = FILTER_REDIRECTOR(obj);
+    char *previous = ms->indev;
+
+    g_free(previous);
+    ms->indev = g_strdup(value);
+}
+
+/* Getter for "outdev": returns a newly allocated copy of the name. */
+static char *filter_mirror_get_outdev(Object *obj, Error **errp)
+{
+    return g_strdup(FILTER_MIRROR(obj)->outdev);
+}
+
+/*
+ * Setter for "outdev": replace the stored name with a copy of value and
+ * report an error when the resulting name is NULL.
+ */
+static void filter_mirror_set_outdev(Object *obj,
+                                     const char *value,
+                                     Error **errp)
+{
+    MirrorState *ms = FILTER_MIRROR(obj);
+
+    g_free(ms->outdev);
+    ms->outdev = g_strdup(value);
+
+    if (ms->outdev == NULL) {
+        error_setg(errp, "filter mirror needs 'outdev' "
+                   "property set");
+    }
+}
+
+/* Getter for the "vnet_hdr_support" property of filter-mirror. */
+static bool filter_mirror_get_vnet_hdr(Object *obj, Error **errp)
+{
+    return FILTER_MIRROR(obj)->vnet_hdr;
+}
+
+/* Setter for the "vnet_hdr_support" property of filter-mirror. */
+static void filter_mirror_set_vnet_hdr(Object *obj, bool value, Error **errp)
+{
+    FILTER_MIRROR(obj)->vnet_hdr = value;
+}
+
+/* Getter for "outdev": returns a newly allocated copy of the name. */
+static char *filter_redirector_get_outdev(Object *obj, Error **errp)
+{
+    return g_strdup(FILTER_REDIRECTOR(obj)->outdev);
+}
+
+/* Setter for "outdev": replace the stored name with a copy of value. */
+static void filter_redirector_set_outdev(Object *obj,
+                                         const char *value,
+                                         Error **errp)
+{
+    MirrorState *ms = FILTER_REDIRECTOR(obj);
+    char *previous = ms->outdev;
+
+    g_free(previous);
+    ms->outdev = g_strdup(value);
+}
+
+/* Getter for the "vnet_hdr_support" property of filter-redirector. */
+static bool filter_redirector_get_vnet_hdr(Object *obj, Error **errp)
+{
+    return FILTER_REDIRECTOR(obj)->vnet_hdr;
+}
+
+/* Setter for the "vnet_hdr_support" property of filter-redirector. */
+static void filter_redirector_set_vnet_hdr(Object *obj,
+                                           bool value,
+                                           Error **errp)
+{
+    FILTER_REDIRECTOR(obj)->vnet_hdr = value;
+}
+
+/*
+ * Class initialiser of filter-mirror: register the "outdev" and
+ * "vnet_hdr_support" properties and install the NetFilterClass hooks.
+ */
+static void filter_mirror_class_init(ObjectClass *oc, void *data)
+{
+ NetFilterClass *nfc = NETFILTER_CLASS(oc);
+
+ object_class_property_add_str(oc, "outdev", filter_mirror_get_outdev,
+ filter_mirror_set_outdev);
+ object_class_property_add_bool(oc, "vnet_hdr_support",
+ filter_mirror_get_vnet_hdr,
+ filter_mirror_set_vnet_hdr);
+
+ nfc->setup = filter_mirror_setup;
+ nfc->cleanup = filter_mirror_cleanup;
+ nfc->receive_iov = filter_mirror_receive_iov;
+}
+
+/*
+ * Class initialiser of filter-redirector: register the "indev", "outdev"
+ * and "vnet_hdr_support" properties and install the NetFilterClass hooks.
+ */
+static void filter_redirector_class_init(ObjectClass *oc, void *data)
+{
+ NetFilterClass *nfc = NETFILTER_CLASS(oc);
+
+ object_class_property_add_str(oc, "indev", filter_redirector_get_indev,
+ filter_redirector_set_indev);
+ object_class_property_add_str(oc, "outdev", filter_redirector_get_outdev,
+ filter_redirector_set_outdev);
+ object_class_property_add_bool(oc, "vnet_hdr_support",
+ filter_redirector_get_vnet_hdr,
+ filter_redirector_set_vnet_hdr);
+
+ nfc->setup = filter_redirector_setup;
+ nfc->cleanup = filter_redirector_cleanup;
+ nfc->receive_iov = filter_redirector_receive_iov;
+}
+
+/* Instance initialiser of filter-mirror: vnet header support starts off. */
+static void filter_mirror_init(Object *obj)
+{
+ MirrorState *s = FILTER_MIRROR(obj);
+
+ s->vnet_hdr = false;
+}
+
+/* Instance initialiser of filter-redirector: vnet header support off. */
+static void filter_redirector_init(Object *obj)
+{
+ MirrorState *s = FILTER_REDIRECTOR(obj);
+
+ s->vnet_hdr = false;
+}
+
+/* Instance finaliser of filter-mirror: free the stored 'outdev' name. */
+static void filter_mirror_fini(Object *obj)
+{
+ MirrorState *s = FILTER_MIRROR(obj);
+
+ g_free(s->outdev);
+}
+
+/* Instance finaliser of filter-redirector: free both chardev names. */
+static void filter_redirector_fini(Object *obj)
+{
+ MirrorState *s = FILTER_REDIRECTOR(obj);
+
+ g_free(s->indev);
+ g_free(s->outdev);
+}
+
+/* Type description for "filter-redirector", a child of TYPE_NETFILTER. */
+static const TypeInfo filter_redirector_info = {
+ .name = TYPE_FILTER_REDIRECTOR,
+ .parent = TYPE_NETFILTER,
+ .class_init = filter_redirector_class_init,
+ .instance_init = filter_redirector_init,
+ .instance_finalize = filter_redirector_fini,
+ .instance_size = sizeof(MirrorState),
+};
+
+/* Type description for "filter-mirror"; it shares MirrorState. */
+static const TypeInfo filter_mirror_info = {
+ .name = TYPE_FILTER_MIRROR,
+ .parent = TYPE_NETFILTER,
+ .class_init = filter_mirror_class_init,
+ .instance_init = filter_mirror_init,
+ .instance_finalize = filter_mirror_fini,
+ .instance_size = sizeof(MirrorState),
+};
+
+/* Register both types; hooked in through type_init() below. */
+static void register_types(void)
+{
+ type_register_static(&filter_mirror_info);
+ type_register_static(&filter_redirector_info);
+}
+
+type_init(register_types);
diff --git a/net/filter-replay.c b/net/filter-replay.c
new file mode 100644
index 000000000..54690676e
--- /dev/null
+++ b/net/filter-replay.c
@@ -0,0 +1,89 @@
+/*
+ * filter-replay.c
+ *
+ * Copyright (c) 2010-2016 Institute for System Programming
+ * of the Russian Academy of Sciences.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "clients.h"
+#include "qemu/error-report.h"
+#include "qemu/iov.h"
+#include "qemu/module.h"
+#include "qemu/timer.h"
+#include "qapi/visitor.h"
+#include "net/filter.h"
+#include "sysemu/replay.h"
+#include "qom/object.h"
+
+#define TYPE_FILTER_REPLAY "filter-replay"
+
+OBJECT_DECLARE_SIMPLE_TYPE(NetFilterReplayState, FILTER_REPLAY)
+
+/* Per-instance state of the "filter-replay" net filter. */
+struct NetFilterReplayState {
+ NetFilterState nfs;
+ /* Handle returned by replay_register_net() for this filter. */
+ ReplayNetState *rns;
+};
+
+/*
+ * receive_iov hook of filter-replay.
+ *
+ * Record mode: packets sent by the filter's own netdev are handed to
+ * replay_net_packet_event() and reported as consumed; others pass (0).
+ * Play mode: every packet is reported as consumed, i.e. dropped here.
+ * Any other mode: every packet passes (0).
+ */
+static ssize_t filter_replay_receive_iov(NetFilterState *nf,
+                                         NetClientState *sndr,
+                                         unsigned flags,
+                                         const struct iovec *iov,
+                                         int iovcnt, NetPacketSent *sent_cb)
+{
+    NetFilterReplayState *state = FILTER_REPLAY(nf);
+
+    if (replay_mode == REPLAY_MODE_RECORD) {
+        if (nf->netdev != sndr) {
+            return 0;
+        }
+        replay_net_packet_event(state->rns, flags, iov, iovcnt);
+        return iov_size(iov, iovcnt);
+    }
+
+    if (replay_mode == REPLAY_MODE_PLAY) {
+        /*
+         * Drop all packets in replay mode.
+         * Packets from the log will be injected by the replay module.
+         */
+        return iov_size(iov, iovcnt);
+    }
+
+    /* Pass all the packets. */
+    return 0;
+}
+
+/* Instance initialiser: register this filter with the replay module. */
+static void filter_replay_instance_init(Object *obj)
+{
+ NetFilterReplayState *nfrs = FILTER_REPLAY(obj);
+ nfrs->rns = replay_register_net(&nfrs->nfs);
+}
+
+/* Instance finaliser: undo the registration made at instance init. */
+static void filter_replay_instance_finalize(Object *obj)
+{
+ NetFilterReplayState *nfrs = FILTER_REPLAY(obj);
+ replay_unregister_net(nfrs->rns);
+}
+
+/* Class initialiser: only the receive_iov hook is provided. */
+static void filter_replay_class_init(ObjectClass *oc, void *data)
+{
+ NetFilterClass *nfc = NETFILTER_CLASS(oc);
+
+ nfc->receive_iov = filter_replay_receive_iov;
+}
+
+/* Type description for "filter-replay", a child of TYPE_NETFILTER. */
+static const TypeInfo filter_replay_info = {
+ .name = TYPE_FILTER_REPLAY,
+ .parent = TYPE_NETFILTER,
+ .class_init = filter_replay_class_init,
+ .instance_init = filter_replay_instance_init,
+ .instance_finalize = filter_replay_instance_finalize,
+ .instance_size = sizeof(NetFilterReplayState),
+};
+
+/* Register the type above; hooked in through type_init() below. */
+static void filter_replay_register_types(void)
+{
+ type_register_static(&filter_replay_info);
+}
+
+type_init(filter_replay_register_types);
diff --git a/net/filter-rewriter.c b/net/filter-rewriter.c
new file mode 100644
index 000000000..bf05023dc
--- /dev/null
+++ b/net/filter-rewriter.c
@@ -0,0 +1,441 @@
+/*
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Author: Zhang Chen <zhangchen.fnst@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "trace.h"
+#include "colo.h"
+#include "net/filter.h"
+#include "net/net.h"
+#include "qemu/error-report.h"
+#include "qom/object.h"
+#include "qemu/main-loop.h"
+#include "qemu/iov.h"
+#include "net/checksum.h"
+#include "net/colo.h"
+#include "migration/colo.h"
+#include "util.h"
+
+#define TYPE_FILTER_REWRITER "filter-rewriter"
+OBJECT_DECLARE_SIMPLE_TYPE(RewriterState, FILTER_REWRITER)
+
+#define FAILOVER_MODE_ON true
+#define FAILOVER_MODE_OFF false
+
+/* Per-instance state of the "filter-rewriter" net filter. */
+struct RewriterState {
+ NetFilterState parent_obj;
+ /* Queue through which rewritten packets are sent to the next filter. */
+ NetQueue *incoming_queue;
+ /* hashtable to save connection */
+ GHashTable *connection_track_table;
+ /* When true, the netdev's vnet header length is applied to packets. */
+ bool vnet_hdr;
+ /* Once set, packets of untracked connections are passed unmodified. */
+ bool failover_mode;
+};
+
+/* Switch the rewriter into failover mode. */
+static void filter_rewriter_failover_mode(RewriterState *s)
+{
+ s->failover_mode = FAILOVER_MODE_ON;
+}
+
+/*
+ * Try to deliver every queued packet; whatever cannot be delivered is
+ * purged (dropped) from the queue.
+ */
+static void filter_rewriter_flush(NetFilterState *nf)
+{
+    RewriterState *rs = FILTER_REWRITER(nf);
+
+    if (qemu_net_queue_flush(rs->incoming_queue)) {
+        return;
+    }
+
+    /* Unable to empty the queue, purge remaining packets */
+    qemu_net_queue_purge(rs->incoming_queue, nf->netdev);
+}
+
+/*
+ * Returns 1 when parse_packet_early() succeeds (returns 0) and the IP
+ * protocol of pkt is IPPROTO_TCP; returns 0 otherwise.
+ */
+static int is_tcp_packet(Packet *pkt)
+{
+    return !parse_packet_early(pkt) && pkt->ip->ip_p == IPPROTO_TCP;
+}
+
+/*
+ * Handle a TCP packet sent from the primary side (sender == netdev).
+ *
+ * Advances the per-connection TCP state stored in conn, finalises
+ * conn->offset (secondary_seq - primary_seq) when the handshake completes,
+ * and, while the offset is non-zero, rewrites th_ack by that offset and
+ * recomputes the TCP checksum.
+ *
+ * When the connection reaches TCPS_CLOSED it is removed from
+ * rf->connection_track_table.  The table was created with
+ * connection_destroy as its value-destroy callback, so conn must be treated
+ * as gone after the removal; this function returns right away in that case.
+ *
+ * Always returns 0.
+ */
+static int handle_primary_tcp_pkt(RewriterState *rf,
+                                  Connection *conn,
+                                  Packet *pkt, ConnectionKey *key)
+{
+    struct tcp_hdr *tcp_pkt;
+
+    tcp_pkt = (struct tcp_hdr *)pkt->transport_header;
+    if (trace_event_get_state_backends(TRACE_COLO_FILTER_REWRITER_PKT_INFO)) {
+        trace_colo_filter_rewriter_pkt_info(__func__,
+                    inet_ntoa(pkt->ip->ip_src), inet_ntoa(pkt->ip->ip_dst),
+                    ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack),
+                    tcp_pkt->th_flags);
+    }
+    if (trace_event_get_state_backends(
+          TRACE_COLO_FILTER_REWRITER_CONN_OFFSET)) {
+        trace_colo_filter_rewriter_conn_offset(conn->offset);
+    }
+
+    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN)) &&
+        conn->tcp_state == TCPS_SYN_SENT) {
+        conn->tcp_state = TCPS_ESTABLISHED;
+    }
+
+    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
+        /*
+         * This state marks that the offset update below has to run once
+         * for each independent tcp connection.
+         */
+        conn->tcp_state = TCPS_SYN_RECEIVED;
+    }
+
+    if (((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK)) {
+        if (conn->tcp_state == TCPS_SYN_RECEIVED) {
+            /*
+             * offset = secondary_seq - primary seq
+             * ack packet sent by guest from primary node,
+             * so we use th_ack - 1 get primary_seq
+             */
+            conn->offset -= (ntohl(tcp_pkt->th_ack) - 1);
+            conn->tcp_state = TCPS_ESTABLISHED;
+        }
+        if (conn->offset) {
+            /* handle packets to the secondary from the primary */
+            tcp_pkt->th_ack = htonl(ntohl(tcp_pkt->th_ack) + conn->offset);
+
+            net_checksum_calculate((uint8_t *)pkt->data + pkt->vnet_hdr_len,
+                                   pkt->size - pkt->vnet_hdr_len, CSUM_TCP);
+        }
+
+        /*
+         * Passive close step 3
+         */
+        if ((conn->tcp_state == TCPS_LAST_ACK) &&
+            (ntohl(tcp_pkt->th_ack) == (conn->fin_ack_seq + 1))) {
+            conn->tcp_state = TCPS_CLOSED;
+            g_hash_table_remove(rf->connection_track_table, key);
+            /*
+             * conn was released by the removal above.  The segment may also
+             * carry FIN, so do not fall through to the FIN handling below,
+             * which would read conn->tcp_state.
+             */
+            return 0;
+        }
+    }
+
+    if ((tcp_pkt->th_flags & TH_FIN) == TH_FIN) {
+        /*
+         * Passive close.
+         * Step 1:
+         * The *server* side of this connect is VM, *client* tries to close
+         * the connection. We will into CLOSE_WAIT status.
+         *
+         * Step 2:
+         * In this step we will into LAST_ACK status.
+         *
+         * We got 'fin=1, ack=1' packet from server side, we need to
+         * record the seq of 'fin=1, ack=1' packet.
+         *
+         * Step 3:
+         * We got 'ack=1' packets from client side, it acks 'fin=1, ack=1'
+         * packet from server side. From this point, we can ensure that there
+         * will be no packets in the connection, except that, some errors
+         * happen between the path of 'filter object' and vNIC, if this rare
+         * case really happen, we can still create a new connection,
+         * So it is safe to remove the connection from connection_track_table.
+         *
+         */
+        if (conn->tcp_state == TCPS_ESTABLISHED) {
+            conn->tcp_state = TCPS_CLOSE_WAIT;
+        }
+
+        /*
+         * Active close step 2.
+         */
+        if (conn->tcp_state == TCPS_FIN_WAIT_1) {
+            /*
+             * For simplify implementation, we needn't wait 2MSL time
+             * in filter rewriter. Because guest kernel will track the
+             * TCP status and wait 2MSL time, if client resend the FIN
+             * packet, guest will apply the last ACK too.
+             * So, we skip the TCPS_TIME_WAIT state here and go straight
+             * to TCPS_CLOSED state.
+             */
+            conn->tcp_state = TCPS_CLOSED;
+            g_hash_table_remove(rf->connection_track_table, key);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Handle a TCP packet from the secondary guest (sender != netdev).
+ *
+ * Records the secondary's initial sequence number in conn->offset while
+ * the connection is in TCPS_SYN_RECEIVED, advances the per-connection TCP
+ * state, and, while the offset is non-zero, rewrites th_seq of ACK packets
+ * by that offset and recomputes the TCP checksum.
+ *
+ * rf and key are not used here.  Always returns 0.
+ */
+static int handle_secondary_tcp_pkt(RewriterState *rf,
+ Connection *conn,
+ Packet *pkt, ConnectionKey *key)
+{
+ struct tcp_hdr *tcp_pkt;
+
+ tcp_pkt = (struct tcp_hdr *)pkt->transport_header;
+
+ if (trace_event_get_state_backends(TRACE_COLO_FILTER_REWRITER_PKT_INFO)) {
+ trace_colo_filter_rewriter_pkt_info(__func__,
+ inet_ntoa(pkt->ip->ip_src), inet_ntoa(pkt->ip->ip_dst),
+ ntohl(tcp_pkt->th_seq), ntohl(tcp_pkt->th_ack),
+ tcp_pkt->th_flags);
+ }
+ if (trace_event_get_state_backends(
+ TRACE_COLO_FILTER_REWRITER_CONN_OFFSET)) {
+ trace_colo_filter_rewriter_conn_offset(conn->offset);
+ }
+
+ if (conn->tcp_state == TCPS_SYN_RECEIVED &&
+ ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == (TH_ACK | TH_SYN))) {
+ /*
+ * save offset = secondary_seq and then
+ * in handle_primary_tcp_pkt make offset
+ * = secondary_seq - primary_seq
+ */
+ conn->offset = ntohl(tcp_pkt->th_seq);
+ }
+
+ /* VM active connect */
+ if (conn->tcp_state == TCPS_CLOSED &&
+ ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_SYN)) {
+ conn->tcp_state = TCPS_SYN_SENT;
+ }
+
+ if ((tcp_pkt->th_flags & (TH_ACK | TH_SYN)) == TH_ACK) {
+ /* Only need to adjust seq while offset is Non-zero */
+ if (conn->offset) {
+ /* handle packets to the primary from the secondary*/
+ tcp_pkt->th_seq = htonl(ntohl(tcp_pkt->th_seq) - conn->offset);
+
+ net_checksum_calculate((uint8_t *)pkt->data + pkt->vnet_hdr_len,
+ pkt->size - pkt->vnet_hdr_len, CSUM_TCP);
+ }
+ }
+
+ /*
+ * Passive close step 2:
+ * remember the sequence number of the FIN+ACK so that
+ * handle_primary_tcp_pkt() can recognise the ACK that answers it.
+ */
+ if (conn->tcp_state == TCPS_CLOSE_WAIT &&
+ (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == (TH_ACK | TH_FIN)) {
+ conn->fin_ack_seq = ntohl(tcp_pkt->th_seq);
+ conn->tcp_state = TCPS_LAST_ACK;
+ }
+
+ /*
+ * Active close
+ *
+ * Step 1:
+ * The *server* side of this connect is VM, *server* tries to close
+ * the connection.
+ *
+ * Step 2:
+ * We will into CLOSE_WAIT status.
+ * We simplify the TCPS_FIN_WAIT_2, TCPS_TIME_WAIT and
+ * CLOSING status.
+ *
+ * NOTE(review): the code below moves to TCPS_FIN_WAIT_1, not
+ * CLOSE_WAIT as the step 2 text says, and it fires only for a FIN
+ * without ACK - confirm which is intended.
+ */
+ if (conn->tcp_state == TCPS_ESTABLISHED &&
+ (tcp_pkt->th_flags & (TH_ACK | TH_FIN)) == TH_FIN) {
+ conn->tcp_state = TCPS_FIN_WAIT_1;
+ }
+
+ return 0;
+}
+
+/*
+ * receive_iov hook of filter-rewriter.
+ *
+ * The packet is linearised into a private buffer.  A TCP packet is run
+ * through handle_primary_tcp_pkt() (sender is the filter's netdev) or
+ * handle_secondary_tcp_pkt() (any other sender); when the handler returns
+ * 0 the possibly rewritten copy is sent through incoming_queue and 1 is
+ * returned to report the original as consumed.
+ *
+ * Returns 0, letting the original packet continue unchanged, for non-TCP
+ * packets and, in failover mode, for connections that are not tracked.
+ */
+static ssize_t colo_rewriter_receive_iov(NetFilterState *nf,
+                                         NetClientState *sender,
+                                         unsigned flags,
+                                         const struct iovec *iov,
+                                         int iovcnt,
+                                         NetPacketSent *sent_cb)
+{
+    RewriterState *s = FILTER_REWRITER(nf);
+    const bool from_netdev = (sender == nf->netdev);
+    ssize_t size = iov_size(iov, iovcnt);
+    ssize_t vnet_hdr_len = 0;
+    char *buf = g_malloc0(size);
+    ConnectionKey key;
+    Connection *conn;
+    Packet *pkt;
+    int rc;
+
+    iov_to_buf(iov, iovcnt, 0, buf, size);
+
+    if (s->vnet_hdr) {
+        vnet_hdr_len = nf->netdev->vnet_hdr_len;
+    }
+
+    pkt = packet_new_nocopy(buf, size, vnet_hdr_len);
+
+    /*
+     * Only TCP packets are rewritten, so that the secondary guest's
+     * connection can be established successfully.
+     */
+    if (!pkt || !is_tcp_packet(pkt)) {
+        goto out;
+    }
+
+    fill_connection_key(pkt, &key, from_netdev);
+
+    /* After failover we needn't change new TCP packet */
+    if (s->failover_mode &&
+        !connection_has_tracked(s->connection_track_table, &key)) {
+        goto out;
+    }
+
+    conn = connection_get(s->connection_track_table, &key, NULL);
+
+    if (from_netdev) {
+        /* NET_FILTER_DIRECTION_TX */
+        rc = handle_primary_tcp_pkt(s, conn, pkt, &key);
+    } else {
+        /* NET_FILTER_DIRECTION_RX */
+        rc = handle_secondary_tcp_pkt(s, conn, pkt, &key);
+    }
+
+    if (rc) {
+        goto out;
+    }
+
+    /*
+     * The original packet is blocked here; the rewritten copy is sent
+     * in its place.
+     */
+    qemu_net_queue_send(s->incoming_queue, sender, 0,
+                        (const uint8_t *)pkt->data, pkt->size, NULL);
+    packet_destroy(pkt, NULL);
+    return 1;
+
+out:
+    packet_destroy(pkt, NULL);
+    return 0;
+}
+
+/* g_hash_table_foreach() callback: clear the connection's seq offset. */
+static void reset_seq_offset(gpointer key, gpointer value, gpointer user_data)
+{
+    ((Connection *)value)->offset = 0;
+}
+
+/* g_hash_table_find() predicate: true for a connection with an offset. */
+static gboolean offset_is_nonzero(gpointer key,
+                                  gpointer value,
+                                  gpointer user_data)
+{
+    const Connection *tracked = (Connection *)value;
+
+    return tracked->offset != 0;
+}
+
+/*
+ * handle_event hook of filter-rewriter.
+ *
+ * COLO_EVENT_CHECKPOINT: reset the seq offset of every tracked connection.
+ * COLO_EVENT_FAILOVER: enter failover mode, but only when no tracked
+ * connection still carries a non-zero offset.
+ * Other events are ignored.
+ */
+static void colo_rewriter_handle_event(NetFilterState *nf, int event,
+                                       Error **errp)
+{
+    RewriterState *state = FILTER_REWRITER(nf);
+
+    if (event == COLO_EVENT_CHECKPOINT) {
+        g_hash_table_foreach(state->connection_track_table,
+                             reset_seq_offset, NULL);
+    } else if (event == COLO_EVENT_FAILOVER) {
+        if (!g_hash_table_find(state->connection_track_table,
+                               offset_is_nonzero, NULL)) {
+            filter_rewriter_failover_mode(state);
+        }
+    }
+}
+
+/*
+ * cleanup hook: flush and free the packet queue, then destroy the
+ * connection tracking table.
+ */
+static void colo_rewriter_cleanup(NetFilterState *nf)
+{
+ RewriterState *s = FILTER_REWRITER(nf);
+
+ /* flush packets */
+ if (s->incoming_queue) {
+ filter_rewriter_flush(nf);
+ g_free(s->incoming_queue);
+ }
+
+ g_hash_table_destroy(s->connection_track_table);
+}
+
+/*
+ * setup hook: create the connection tracking table and the packet queue.
+ * The table frees keys with g_free and values with connection_destroy when
+ * entries are removed or the table is destroyed.
+ */
+static void colo_rewriter_setup(NetFilterState *nf, Error **errp)
+{
+ RewriterState *s = FILTER_REWRITER(nf);
+
+ s->connection_track_table = g_hash_table_new_full(connection_key_hash,
+ connection_key_equal,
+ g_free,
+ connection_destroy);
+ s->incoming_queue = qemu_new_net_queue(qemu_netfilter_pass_to_next, nf);
+}
+
+/* Getter for the "vnet_hdr_support" property of filter-rewriter. */
+static bool filter_rewriter_get_vnet_hdr(Object *obj, Error **errp)
+{
+    return FILTER_REWRITER(obj)->vnet_hdr;
+}
+
+/* Setter for the "vnet_hdr_support" property of filter-rewriter. */
+static void filter_rewriter_set_vnet_hdr(Object *obj,
+                                         bool value,
+                                         Error **errp)
+{
+    FILTER_REWRITER(obj)->vnet_hdr = value;
+}
+
+/* Instance initialiser: vnet header support and failover mode start off. */
+static void filter_rewriter_init(Object *obj)
+{
+ RewriterState *s = FILTER_REWRITER(obj);
+
+ s->vnet_hdr = false;
+ s->failover_mode = FAILOVER_MODE_OFF;
+}
+
+/*
+ * Class initialiser: register the "vnet_hdr_support" property and install
+ * the filter-rewriter implementations of the NetFilterClass hooks.
+ */
+static void colo_rewriter_class_init(ObjectClass *oc, void *data)
+{
+ NetFilterClass *nfc = NETFILTER_CLASS(oc);
+
+ object_class_property_add_bool(oc, "vnet_hdr_support",
+ filter_rewriter_get_vnet_hdr,
+ filter_rewriter_set_vnet_hdr);
+
+ nfc->setup = colo_rewriter_setup;
+ nfc->cleanup = colo_rewriter_cleanup;
+ nfc->receive_iov = colo_rewriter_receive_iov;
+ nfc->handle_event = colo_rewriter_handle_event;
+}
+
+/* Type description for "filter-rewriter", a child of TYPE_NETFILTER. */
+static const TypeInfo colo_rewriter_info = {
+ .name = TYPE_FILTER_REWRITER,
+ .parent = TYPE_NETFILTER,
+ .class_init = colo_rewriter_class_init,
+ .instance_init = filter_rewriter_init,
+ .instance_size = sizeof(RewriterState),
+};
+
+/* Register the type above; hooked in through type_init() below. */
+static void register_types(void)
+{
+ type_register_static(&colo_rewriter_info);
+}
+
+type_init(register_types);
diff --git a/net/filter.c b/net/filter.c
new file mode 100644
index 000000000..3fe88fa43
--- /dev/null
+++ b/net/filter.c
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2015 FUJITSU LIMITED
+ * Author: Yang Hongyang <yanghy@cn.fujitsu.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later. See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/error-report.h"
+
+#include "net/filter.h"
+#include "net/net.h"
+#include "net/vhost_net.h"
+#include "qom/object_interfaces.h"
+#include "qemu/iov.h"
+#include "qemu/module.h"
+#include "net/colo.h"
+#include "migration/colo.h"
+
+/* A filter whose "on" flag is cleared takes no part in packet handling. */
+static inline bool qemu_can_skip_netfilter(NetFilterState *nf)
+{
+    bool enabled = nf->on;
+
+    return !enabled;
+}
+
+/*
+ * Offer a packet travelling in @direction to filter @nf.
+ *
+ * Returns 0 without calling the filter when the filter is switched off or
+ * handles a different direction; otherwise returns whatever the class'
+ * receive_iov callback returns.
+ */
+ssize_t qemu_netfilter_receive(NetFilterState *nf,
+                               NetFilterDirection direction,
+                               NetClientState *sender,
+                               unsigned flags,
+                               const struct iovec *iov,
+                               int iovcnt,
+                               NetPacketSent *sent_cb)
+{
+    bool direction_matches;
+
+    if (qemu_can_skip_netfilter(nf)) {
+        return 0;
+    }
+
+    direction_matches = nf->direction == direction ||
+                        nf->direction == NET_FILTER_DIRECTION_ALL;
+    if (!direction_matches) {
+        return 0;
+    }
+
+    return NETFILTER_GET_CLASS(OBJECT(nf))->receive_iov(
+        nf, sender, flags, iov, iovcnt, sent_cb);
+}
+
+/*
+ * Return the neighbour of @nf in its filter list: the following element
+ * for TX traffic, the preceding one for any other direction.
+ */
+static NetFilterState *netfilter_next(NetFilterState *nf,
+                                      NetFilterDirection dir)
+{
+    if (dir == NET_FILTER_DIRECTION_TX) {
+        /* forward walk through filters */
+        return QTAILQ_NEXT(nf, next);
+    }
+
+    /* reverse order */
+    return QTAILQ_PREV(nf, next);
+}
+
+/*
+ * Continue delivery of a packet after filter @opaque has released it.
+ *
+ * The packet is offered to each remaining filter in the direction of
+ * travel.  If one of them returns a non-zero value, that value is returned
+ * and delivery stops there.  Otherwise the packet is queued on the peer of
+ * @sender.
+ *
+ * Returns the non-zero filter result, or the total size of @iov in every
+ * other case (including when @sender or its peer is missing).
+ */
+ssize_t qemu_netfilter_pass_to_next(NetClientState *sender,
+                                    unsigned flags,
+                                    const struct iovec *iov,
+                                    int iovcnt,
+                                    void *opaque)
+{
+    /* ssize_t rather than int: holds the ssize_t result of a filter */
+    ssize_t ret = 0;
+    int direction;
+    NetFilterState *nf = opaque;
+    NetFilterState *next = NULL;
+
+    if (!sender || !sender->peer) {
+        /* no receiver, or sender has been deleted: nothing to pass on */
+        goto out;
+    }
+
+    if (nf->direction == NET_FILTER_DIRECTION_ALL) {
+        if (sender == nf->netdev) {
+            /* This packet is sent by netdev itself */
+            direction = NET_FILTER_DIRECTION_TX;
+        } else {
+            direction = NET_FILTER_DIRECTION_RX;
+        }
+    } else {
+        direction = nf->direction;
+    }
+
+    next = netfilter_next(nf, direction);
+    while (next) {
+        /*
+         * If qemu_netfilter_pass_to_next() has been called, the packet
+         * was held by a filter which has already returned its size to
+         * the sender, so sent_cb must not be called later; just pass
+         * NULL to the next filter.
+         */
+        ret = qemu_netfilter_receive(next, direction, sender, flags, iov,
+                                     iovcnt, NULL);
+        if (ret) {
+            return ret;
+        }
+        next = netfilter_next(next, direction);
+    }
+
+    /*
+     * We have gone through all filters, pass the packet to the receiver.
+     * Do the validity check again in case the sender or the receiver was
+     * deleted while we went through the filters.
+     */
+    if (sender && sender->peer) {
+        qemu_net_queue_send_iov(sender->peer->incoming_queue,
+                                sender, flags, iov, iovcnt, NULL);
+    }
+
+out:
+    /* no receiver, or sender has been deleted */
+    return iov_size(iov, iovcnt);
+}
+
+/* Getter of the "netdev" property: returns a newly allocated copy. */
+static char *netfilter_get_netdev_id(Object *obj, Error **errp)
+{
+    return g_strdup(NETFILTER(obj)->netdev_id);
+}
+
+/*
+ * Setter of the "netdev" property: keep a private copy of @str.
+ * A value stored by an earlier call is released, so repeated sets do not
+ * leak.
+ */
+static void netfilter_set_netdev_id(Object *obj, const char *str, Error **errp)
+{
+    NetFilterState *nf = NETFILTER(obj);
+    char *copy = g_strdup(str);
+
+    /* copy first, free second: stays correct even if @str aliases the old value */
+    g_free(nf->netdev_id);
+    nf->netdev_id = copy;
+}
+
+/* Getter of the "queue" enum property: the direction the filter handles. */
+static int netfilter_get_direction(Object *obj, Error **errp G_GNUC_UNUSED)
+{
+    return NETFILTER(obj)->direction;
+}
+
+/* Setter of the "queue" enum property: store @direction as given. */
+static void netfilter_set_direction(Object *obj, int direction, Error **errp)
+{
+    NETFILTER(obj)->direction = direction;
+}
+
+/* Getter of the "status" property: a new string, either "on" or "off". */
+static char *netfilter_get_status(Object *obj, Error **errp)
+{
+    NetFilterState *nf = NETFILTER(obj);
+    const char *text = nf->on ? "on" : "off";
+
+    return g_strdup(text);
+}
+
+/*
+ * Setter of the "status" property.
+ *
+ * Accepts only "on" or "off"; anything else sets @errp.  When the value
+ * really changes and the filter is already bound to a netdev, the class'
+ * optional status_changed callback is invoked.
+ */
+static void netfilter_set_status(Object *obj, const char *str, Error **errp)
+{
+    NetFilterClass *klass = NETFILTER_GET_CLASS(obj);
+    NetFilterState *nf = NETFILTER(obj);
+    bool want_on = strcmp(str, "on") == 0;
+
+    if (!want_on && strcmp(str, "off") != 0) {
+        error_setg(errp, "Invalid value for netfilter status, "
+                   "should be 'on' or 'off'");
+        return;
+    }
+
+    if (nf->on != want_on) {
+        nf->on = !nf->on;
+        if (nf->netdev && klass->status_changed) {
+            klass->status_changed(nf, errp);
+        }
+    }
+}
+
+/* Getter of the "position" property: returns a newly allocated copy. */
+static char *netfilter_get_position(Object *obj, Error **errp)
+{
+    return g_strdup(NETFILTER(obj)->position);
+}
+
+/*
+ * Setter of the "position" property: keep a private copy of @str.
+ *
+ * The instance initializer already stores an allocated default here, so
+ * the previous value has to be released before it is replaced.
+ */
+static void netfilter_set_position(Object *obj, const char *str, Error **errp)
+{
+    NetFilterState *nf = NETFILTER(obj);
+    char *copy = g_strdup(str);
+
+    /* copy first, free second: stays correct even if @str aliases the old value */
+    g_free(nf->position);
+    nf->position = copy;
+}
+
+/* Getter of the "insert" property: a new string, "before" or "behind". */
+static char *netfilter_get_insert(Object *obj, Error **errp)
+{
+    NetFilterState *nf = NETFILTER(obj);
+    const char *text = nf->insert_before_flag ? "before" : "behind";
+
+    return g_strdup(text);
+}
+
+/*
+ * Setter of the "insert" property.  Accepts only "before" or "behind";
+ * anything else sets @errp and leaves the flag untouched.
+ */
+static void netfilter_set_insert(Object *obj, const char *str, Error **errp)
+{
+    NetFilterState *nf = NETFILTER(obj);
+    bool before = strcmp(str, "before") == 0;
+
+    if (!before && strcmp(str, "behind") != 0) {
+        error_setg(errp, "Invalid value for netfilter insert, "
+                   "should be 'before' or 'behind'");
+        return;
+    }
+
+    nf->insert_before_flag = before;
+}
+
+/*
+ * Instance initializer: a new filter is switched on and is placed at the
+ * "tail" position, inserted "behind", unless properties say otherwise.
+ */
+static void netfilter_init(Object *obj)
+{
+    NetFilterState *filter = NETFILTER(obj);
+
+    filter->position = g_strdup("tail");
+    filter->insert_before_flag = false;
+    filter->on = true;
+}
+
+/*
+ * UserCreatable completion hook: bind the filter to its netdev and link
+ * it into that netdev's filter list.
+ *
+ * Fails with @errp set when no "netdev" was given, when the id does not
+ * resolve to exactly one non-NIC client, when that client uses vhost, when
+ * "position" is malformed or names something that is not a filter of the
+ * same netdev, or when the class' setup hook reports an error.
+ */
+static void netfilter_complete(UserCreatable *uc, Error **errp)
+{
+    NetFilterState *nf = NETFILTER(uc);
+    NetFilterState *position = NULL;
+    NetClientState *ncs[MAX_QUEUE_NUM];
+    NetFilterClass *nfc = NETFILTER_GET_CLASS(uc);
+    int queues;
+    Error *local_err = NULL;
+
+    if (!nf->netdev_id) {
+        error_setg(errp, "Parameter 'netdev' is required");
+        return;
+    }
+
+    queues = qemu_find_net_clients_except(nf->netdev_id, ncs,
+                                          NET_CLIENT_DRIVER_NIC,
+                                          MAX_QUEUE_NUM);
+    if (queues < 1) {
+        error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "netdev",
+                   "a network backend id");
+        return;
+    } else if (queues > 1) {
+        error_setg(errp, "multiqueue is not supported");
+        return;
+    }
+
+    if (get_vhost_net(ncs[0])) {
+        error_setg(errp, "Vhost is not supported");
+        return;
+    }
+
+    if (strcmp(nf->position, "head") && strcmp(nf->position, "tail")) {
+        Object *container;
+        Object *obj;
+        const char *position_id;
+
+        if (!g_str_has_prefix(nf->position, "id=")) {
+            error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "position",
+                       "'head', 'tail' or 'id=<id>'");
+            return;
+        }
+
+        /* the id is whatever follows the "id=" prefix; no copy needed */
+        position_id = nf->position + 3;
+
+        /* Search for the position to insert before/behind */
+        container = object_get_objects_root();
+        obj = object_resolve_path_component(container, position_id);
+        if (!obj) {
+            error_setg(errp, "filter '%s' not found", position_id);
+            return;
+        }
+
+        /*
+         * The id is user input and may name any object.  NETFILTER() is a
+         * checked cast that aborts on a type mismatch, so test the type
+         * explicitly and report a proper error instead.
+         */
+        position = (NetFilterState *)object_dynamic_cast(obj, TYPE_NETFILTER);
+        if (!position) {
+            error_setg(errp, "object '%s' is not a netfilter", position_id);
+            return;
+        }
+
+        if (position->netdev != ncs[0]) {
+            error_setg(errp, "filter '%s' belongs to a different netdev",
+                       position_id);
+            return;
+        }
+    }
+
+    nf->netdev = ncs[0];
+
+    if (nfc->setup) {
+        nfc->setup(nf, &local_err);
+        if (local_err) {
+            error_propagate(errp, local_err);
+            return;
+        }
+    }
+
+    if (position) {
+        if (nf->insert_before_flag) {
+            QTAILQ_INSERT_BEFORE(position, nf, next);
+        } else {
+            QTAILQ_INSERT_AFTER(&nf->netdev->filters, position, nf, next);
+        }
+    } else if (!strcmp(nf->position, "head")) {
+        QTAILQ_INSERT_HEAD(&nf->netdev->filters, nf, next);
+    } else if (!strcmp(nf->position, "tail")) {
+        QTAILQ_INSERT_TAIL(&nf->netdev->filters, nf, next);
+    }
+}
+
+/*
+ * Instance finalizer: run the class' optional cleanup hook, unlink the
+ * filter from its netdev's list if it is on it, then free the strings
+ * owned by the instance.
+ */
+static void netfilter_finalize(Object *obj)
+{
+    NetFilterClass *klass = NETFILTER_GET_CLASS(obj);
+    NetFilterState *nf = NETFILTER(obj);
+
+    if (klass->cleanup) {
+        klass->cleanup(nf);
+    }
+
+    if (nf->netdev) {
+        if (!QTAILQ_EMPTY(&nf->netdev->filters) && QTAILQ_IN_USE(nf, next)) {
+            QTAILQ_REMOVE(&nf->netdev->filters, nf, next);
+        }
+    }
+
+    g_free(nf->netdev_id);
+    g_free(nf->position);
+}
+
+/*
+ * Default COLO event handler: a failover event switches the filter off by
+ * setting its "status" property; every other event is ignored.
+ */
+static void default_handle_event(NetFilterState *nf, int event, Error **errp)
+{
+    if (event == COLO_EVENT_FAILOVER) {
+        object_property_set_str(OBJECT(nf), "status", "off", errp);
+    }
+}
+
+/*
+ * Class initializer of the abstract netfilter type: install the completion
+ * hook and the default event handler, and register the "netdev", "queue",
+ * "status", "position" and "insert" properties.
+ */
+static void netfilter_class_init(ObjectClass *oc, void *data)
+{
+    UserCreatableClass *creatable = USER_CREATABLE_CLASS(oc);
+    NetFilterClass *filter_class = NETFILTER_CLASS(oc);
+
+    creatable->complete = netfilter_complete;
+    filter_class->handle_event = default_handle_event;
+
+    object_class_property_add_str(oc, "netdev",
+                                  netfilter_get_netdev_id,
+                                  netfilter_set_netdev_id);
+    object_class_property_add_enum(oc, "queue", "NetFilterDirection",
+                                   &NetFilterDirection_lookup,
+                                   netfilter_get_direction,
+                                   netfilter_set_direction);
+    object_class_property_add_str(oc, "status",
+                                  netfilter_get_status,
+                                  netfilter_set_status);
+    object_class_property_add_str(oc, "position",
+                                  netfilter_get_position,
+                                  netfilter_set_position);
+    object_class_property_add_str(oc, "insert",
+                                  netfilter_get_insert,
+                                  netfilter_set_insert);
+}
+
+/*
+ * Type description of the abstract netfilter base type.  It implements the
+ * TYPE_USER_CREATABLE interface.
+ */
+static const TypeInfo netfilter_info = {
+ .name = TYPE_NETFILTER,
+ .parent = TYPE_OBJECT,
+ .abstract = true,
+ .class_size = sizeof(NetFilterClass),
+ .class_init = netfilter_class_init,
+ .instance_size = sizeof(NetFilterState),
+ .instance_init = netfilter_init,
+ .instance_finalize = netfilter_finalize,
+ .interfaces = (InterfaceInfo[]) {
+ { TYPE_USER_CREATABLE },
+ { }
+ }
+};
+
+/* Hand netfilter_info to the type system. */
+static void register_types(void)
+{
+ type_register_static(&netfilter_info);
+}
+
+type_init(register_types);
diff --git a/net/hub.c b/net/hub.c
new file mode 100644
index 000000000..1375738bf
--- /dev/null
+++ b/net/hub.c
@@ -0,0 +1,345 @@
+/*
+ * Hub net client
+ *
+ * Copyright IBM, Corp. 2012
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "monitor/monitor.h"
+#include "net/net.h"
+#include "clients.h"
+#include "hub.h"
+#include "qemu/iov.h"
+#include "qemu/error-report.h"
+#include "sysemu/qtest.h"
+
+/*
+ * A hub broadcasts incoming packets to all its ports except the source port.
+ * Hubs can be used to provide independent emulated network segments.
+ */
+
+typedef struct NetHub NetHub;
+
+/* One port of a hub; embeds the net client that represents the port. */
+typedef struct NetHubPort {
+ NetClientState nc;
+ QLIST_ENTRY(NetHubPort) next; /* linkage in NetHub.ports */
+ NetHub *hub; /* hub this port belongs to */
+ int id; /* taken from the hub's num_ports counter at creation */
+} NetHubPort;
+
+/* A hub: an id plus the list of its ports. */
+struct NetHub {
+ int id;
+ QLIST_ENTRY(NetHub) next; /* linkage in the global hubs list */
+ int num_ports; /* incremented for every port created on this hub */
+ QLIST_HEAD(, NetHubPort) ports;
+};
+
+/* All hubs created so far. */
+static QLIST_HEAD(, NetHub) hubs = QLIST_HEAD_INITIALIZER(&hubs);
+
+/*
+ * Send a linear packet out of every port of @hub except @source_port.
+ * The results of the individual sends are ignored; always returns @len.
+ */
+static ssize_t net_hub_receive(NetHub *hub, NetHubPort *source_port,
+                               const uint8_t *buf, size_t len)
+{
+    NetHubPort *dst;
+
+    QLIST_FOREACH(dst, &hub->ports, next) {
+        if (dst != source_port) {
+            qemu_send_packet(&dst->nc, buf, len);
+        }
+    }
+
+    return len;
+}
+
+/*
+ * Scatter/gather variant of net_hub_receive(): send @iov out of every port
+ * of @hub except @source_port and return the total size of @iov.
+ */
+static ssize_t net_hub_receive_iov(NetHub *hub, NetHubPort *source_port,
+                                   const struct iovec *iov, int iovcnt)
+{
+    ssize_t total = iov_size(iov, iovcnt);
+    NetHubPort *dst;
+
+    QLIST_FOREACH(dst, &hub->ports, next) {
+        if (dst != source_port) {
+            qemu_sendv_packet(&dst->nc, iov, iovcnt);
+        }
+    }
+
+    return total;
+}
+
+/* Allocate an empty hub numbered @id and add it to the global hub list. */
+static NetHub *net_hub_new(int id)
+{
+    NetHub *new_hub = g_new(NetHub, 1);
+
+    new_hub->id = id;
+    new_hub->num_ports = 0;
+    QLIST_INIT(&new_hub->ports);
+    QLIST_INSERT_HEAD(&hubs, new_hub, next);
+
+    return new_hub;
+}
+
+/*
+ * can_receive callback: a port can take a packet as soon as at least one
+ * of the other ports of its hub is able to send one on.
+ */
+static bool net_hub_port_can_receive(NetClientState *nc)
+{
+    NetHubPort *self = DO_UPCAST(NetHubPort, nc, nc);
+    NetHubPort *other;
+
+    QLIST_FOREACH(other, &self->hub->ports, next) {
+        if (other != self && qemu_can_send_packet(&other->nc)) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
+/* receive callback: forward the packet to the other ports of the hub. */
+static ssize_t net_hub_port_receive(NetClientState *nc,
+                                    const uint8_t *buf, size_t len)
+{
+    NetHubPort *src = DO_UPCAST(NetHubPort, nc, nc);
+    NetHub *hub = src->hub;
+
+    return net_hub_receive(hub, src, buf, len);
+}
+
+/* receive_iov callback: forward the vector to the other ports of the hub. */
+static ssize_t net_hub_port_receive_iov(NetClientState *nc,
+                                        const struct iovec *iov, int iovcnt)
+{
+    NetHubPort *src = DO_UPCAST(NetHubPort, nc, nc);
+    NetHub *hub = src->hub;
+
+    return net_hub_receive_iov(hub, src, iov, iovcnt);
+}
+
+/* cleanup callback: take the port off its hub's port list. */
+static void net_hub_port_cleanup(NetClientState *nc)
+{
+    NetHubPort *doomed = DO_UPCAST(NetHubPort, nc, nc);
+
+    QLIST_REMOVE(doomed, next);
+}
+
+/* Net client callbacks of a hub port. */
+static NetClientInfo net_hub_port_info = {
+ .type = NET_CLIENT_DRIVER_HUBPORT,
+ .size = sizeof(NetHubPort),
+ .can_receive = net_hub_port_can_receive,
+ .receive = net_hub_port_receive,
+ .receive_iov = net_hub_port_receive_iov,
+ .cleanup = net_hub_port_cleanup,
+};
+
+/*
+ * Create a port on @hub and put it on the hub's port list.
+ *
+ * The port number is the hub's running port counter.  Without @name the
+ * client is called "hub<hubid>port<portid>".
+ */
+static NetHubPort *net_hub_port_new(NetHub *hub, const char *name,
+                                    NetClientState *hubpeer)
+{
+    char fallback_name[128];
+    NetClientState *nc;
+    NetHubPort *port;
+    int port_id = hub->num_ports;
+
+    hub->num_ports++;
+
+    if (name == NULL) {
+        snprintf(fallback_name, sizeof(fallback_name),
+                 "hub%dport%d", hub->id, port_id);
+        name = fallback_name;
+    }
+
+    nc = qemu_new_net_client(&net_hub_port_info, hubpeer, "hub", name);
+    port = DO_UPCAST(NetHubPort, nc, nc);
+    port->hub = hub;
+    port->id = port_id;
+
+    QLIST_INSERT_HEAD(&hub->ports, port, next);
+
+    return port;
+}
+
+/**
+ * Create a port on a given hub
+ * @hub_id: Number of the hub
+ * @name: Net client name or NULL for default name.
+ * @hubpeer: Peer to use (if "netdev=id" has been specified)
+ *
+ * If there is no existing hub with the given id then a new hub is created.
+ *
+ * Returns the net client of the new port.
+ */
+NetClientState *net_hub_add_port(int hub_id, const char *name,
+                                 NetClientState *hubpeer)
+{
+    NetHub *found = NULL;
+    NetHub *iter;
+
+    QLIST_FOREACH(iter, &hubs, next) {
+        if (iter->id == hub_id) {
+            found = iter;
+            break;
+        }
+    }
+
+    if (found == NULL) {
+        found = net_hub_new(hub_id);
+    }
+
+    return &net_hub_port_new(found, name, hubpeer)->nc;
+}
+
+/**
+ * Find an available port on a hub; otherwise create a new port
+ *
+ * A port counts as available when it has no peer.  If the hub does not
+ * exist yet, or all of its ports have a peer, a new port with a default
+ * name and no peer is added.
+ */
+NetClientState *net_hub_port_find(int hub_id)
+{
+    NetHub *hub;
+    NetHubPort *port;
+
+    QLIST_FOREACH(hub, &hubs, next) {
+        if (hub->id != hub_id) {
+            continue;
+        }
+
+        QLIST_FOREACH(port, &hub->ports, next) {
+            if (port->nc.peer == NULL) {
+                return &port->nc;
+            }
+        }
+        break;
+    }
+
+    return net_hub_add_port(hub_id, NULL, NULL);
+}
+
+/**
+ * Print hub configuration
+ *
+ * For every hub, prints its id followed by one line per port: the port's
+ * name and, when the port has a peer, a description of that peer.
+ */
+void net_hub_info(Monitor *mon)
+{
+    NetHub *hub;
+    NetHubPort *port;
+
+    QLIST_FOREACH(hub, &hubs, next) {
+        monitor_printf(mon, "hub %d\n", hub->id);
+
+        QLIST_FOREACH(port, &hub->ports, next) {
+            monitor_printf(mon, " \\ %s", port->nc.name);
+
+            if (!port->nc.peer) {
+                monitor_printf(mon, "\n");
+                continue;
+            }
+
+            monitor_printf(mon, ": ");
+            print_net_client(mon, port->nc.peer);
+        }
+    }
+}
+
+/**
+ * Get the hub id that a client is connected to
+ *
+ * @nc: a hub port, or a client whose peer is a hub port
+ * @id: Pointer for hub id output, may be NULL
+ *
+ * Returns 0 on success, -ENOENT when neither @nc nor its peer is a hub
+ * port.
+ */
+int net_hub_id_for_client(NetClientState *nc, int *id)
+{
+    NetClientState *port_nc;
+    NetHubPort *port;
+
+    if (nc->info->type == NET_CLIENT_DRIVER_HUBPORT) {
+        port_nc = nc;
+    } else if (nc->peer != NULL &&
+               nc->peer->info->type == NET_CLIENT_DRIVER_HUBPORT) {
+        port_nc = nc->peer;
+    } else {
+        return -ENOENT;
+    }
+
+    port = DO_UPCAST(NetHubPort, nc, port_nc);
+    if (id != NULL) {
+        *id = port->hub->id;
+    }
+
+    return 0;
+}
+
+/*
+ * Create a hub port from "hubport" netdev options.
+ *
+ * When the options name a netdev, that netdev becomes the peer of the new
+ * port; failing to find it is an error.  Returns 0 on success, -1 with
+ * @errp set otherwise.
+ */
+int net_init_hubport(const Netdev *netdev, const char *name,
+                     NetClientState *peer, Error **errp)
+{
+    const NetdevHubPortOptions *opts;
+    NetClientState *port_peer = NULL;
+
+    assert(netdev->type == NET_CLIENT_DRIVER_HUBPORT);
+    assert(!peer);
+    opts = &netdev->u.hubport;
+
+    if (opts->has_netdev) {
+        port_peer = qemu_find_netdev(opts->netdev);
+        if (port_peer == NULL) {
+            error_setg(errp, "netdev '%s' not found", opts->netdev);
+            return -1;
+        }
+    }
+
+    net_hub_add_port(opts->hubid, name, port_peer);
+
+    return 0;
+}
+
+/**
+ * Warn if hub configurations are likely wrong
+ *
+ * Warns about ports without a peer, about hubs that have a host device
+ * but no NIC, and (unless qtest is enabled) about hubs that have a NIC
+ * but no host device.
+ */
+void net_hub_check_clients(void)
+{
+    NetHub *hub;
+    NetHubPort *port;
+
+    QLIST_FOREACH(hub, &hubs, next) {
+        bool nic_seen = false;
+        bool host_dev_seen = false;
+
+        QLIST_FOREACH(port, &hub->ports, next) {
+            NetClientState *peer = port->nc.peer;
+
+            if (peer == NULL) {
+                warn_report("hub port %s has no peer", port->nc.name);
+                continue;
+            }
+
+            switch (peer->info->type) {
+            case NET_CLIENT_DRIVER_NIC:
+                nic_seen = true;
+                break;
+            case NET_CLIENT_DRIVER_USER:
+            case NET_CLIENT_DRIVER_TAP:
+            case NET_CLIENT_DRIVER_SOCKET:
+            case NET_CLIENT_DRIVER_VDE:
+            case NET_CLIENT_DRIVER_VHOST_USER:
+                host_dev_seen = true;
+                break;
+            default:
+                break;
+            }
+        }
+
+        if (host_dev_seen && !nic_seen) {
+            warn_report("hub %d with no nics", hub->id);
+        }
+        if (nic_seen && !host_dev_seen && !qtest_enabled()) {
+            warn_report("hub %d is not connected to host network", hub->id);
+        }
+    }
+}
+
+/*
+ * Flush the incoming queue of every port of the hub except the port @nc
+ * itself.  Returns true when the accumulated flush results are non-zero.
+ */
+bool net_hub_flush(NetClientState *nc)
+{
+    NetHubPort *self = DO_UPCAST(NetHubPort, nc, nc);
+    NetHubPort *other;
+    int flushed = 0;
+
+    QLIST_FOREACH(other, &self->hub->ports, next) {
+        if (other == self) {
+            continue;
+        }
+        flushed += qemu_net_queue_flush(other->nc.incoming_queue);
+    }
+
+    return flushed != 0;
+}
diff --git a/net/hub.h b/net/hub.h
new file mode 100644
index 000000000..ce45f7b39
--- /dev/null
+++ b/net/hub.h
@@ -0,0 +1,24 @@
+/*
+ * Hub net client
+ *
+ * Copyright IBM, Corp. 2012
+ *
+ * Authors:
+ * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
+ * Zhi Yong Wu <wuzhy@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#ifndef NET_HUB_H
+#define NET_HUB_H
+
+NetClientState *net_hub_add_port(int hub_id, const char *name,
+ NetClientState *hubpeer);
+void net_hub_info(Monitor *mon);
+void net_hub_check_clients(void);
+bool net_hub_flush(NetClientState *nc);
+
+#endif /* NET_HUB_H */
diff --git a/net/l2tpv3.c b/net/l2tpv3.c
new file mode 100644
index 000000000..e4d4218db
--- /dev/null
+++ b/net/l2tpv3.c
@@ -0,0 +1,739 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2012-2014 Cisco Systems
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include <linux/ip.h>
+#include <netdb.h>
+#include "net/net.h"
+#include "clients.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/option.h"
+#include "qemu/sockets.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+
+
+/* The buffer size needs to be investigated for optimum numbers and
+ * optimum means of paging in on different systems. This size is
+ * chosen to be sufficient to accommodate one packet with some headers
+ */
+
+#define BUFFER_ALIGN sysconf(_SC_PAGESIZE)
+#define BUFFER_SIZE 2048
+#define IOVSIZE 2
+#define MAX_L2TPV3_MSGCNT 64
+#define MAX_L2TPV3_IOVCNT (MAX_L2TPV3_MSGCNT * IOVSIZE)
+
+/* Header set to 0x30000 signifies a data packet */
+
+#define L2TPV3_DATA_PACKET 0x30000
+
+/* IANA-assigned IP protocol ID for L2TPv3 */
+
+#ifndef IPPROTO_L2TP
+#define IPPROTO_L2TP 0x73
+#endif
+
+/* Per-client state of an l2tpv3 network backend. */
+typedef struct NetL2TPV3State {
+ NetClientState nc;
+ int fd;
+
+ /*
+ * These are used for xmit, which happens one packet at a time:
+ * header_buf holds the L2TPv3 header that is rebuilt for every packet,
+ * vec is the scatter/gather list (header first) passed to sendmsg().
+ */
+
+ uint8_t *header_buf;
+ struct iovec *vec;
+
+ /*
+ * These are used for receive - try to "eat" up to MAX_L2TPV3_MSGCNT
+ * packets at a time
+ */
+
+ struct mmsghdr *msgvec;
+
+ /*
+ * peer address
+ */
+
+ struct sockaddr_storage *dgram_dst;
+ uint32_t dst_size;
+
+ /*
+ * L2TPv3 parameters
+ */
+
+ uint64_t rx_cookie;
+ uint64_t tx_cookie;
+ uint32_t rx_session;
+ uint32_t tx_session;
+ uint32_t header_size;
+ uint32_t counter;
+
+ /*
+ * DOS avoidance in error handling: set once the first bad header has
+ * been reported, so that later ones are not logged again
+ */
+
+ bool header_mismatch;
+
+ /*
+ * Ring buffer handling (indices into msgvec)
+ */
+
+ int queue_head;
+ int queue_tail;
+ int queue_depth;
+
+ /*
+ * Precomputed offsets
+ */
+
+ uint32_t offset;
+ uint32_t cookie_offset;
+ uint32_t counter_offset;
+ uint32_t session_offset;
+
+ /* Poll Control */
+
+ bool read_poll;
+ bool write_poll;
+
+ /* Flags */
+
+ bool ipv6;
+ bool udp;
+ bool has_counter;
+ bool pin_counter;
+ bool cookie;
+ bool cookie_is_64;
+
+} NetL2TPV3State;
+
+static void net_l2tpv3_send(void *opaque);
+static void l2tpv3_writable(void *opaque);
+
+/*
+ * (Re)register the fd handlers so that they match the current read_poll
+ * and write_poll flags; a cleared flag removes the matching handler.
+ */
+static void l2tpv3_update_fd_handler(NetL2TPV3State *s)
+{
+    void (*on_readable)(void *) = NULL;
+    void (*on_writable)(void *) = NULL;
+
+    if (s->read_poll) {
+        on_readable = net_l2tpv3_send;
+    }
+    if (s->write_poll) {
+        on_writable = l2tpv3_writable;
+    }
+
+    qemu_set_fd_handler(s->fd, on_readable, on_writable, s);
+}
+
+/* Switch read polling on or off; does nothing when already in that state. */
+static void l2tpv3_read_poll(NetL2TPV3State *s, bool enable)
+{
+    if (s->read_poll == enable) {
+        return;
+    }
+
+    s->read_poll = enable;
+    l2tpv3_update_fd_handler(s);
+}
+
+/* Switch write polling on or off; does nothing when already in that state. */
+static void l2tpv3_write_poll(NetL2TPV3State *s, bool enable)
+{
+    if (s->write_poll == enable) {
+        return;
+    }
+
+    s->write_poll = enable;
+    l2tpv3_update_fd_handler(s);
+}
+
+/*
+ * Write handler of the socket: stop polling for writability and flush the
+ * packets that were queued for this client.
+ */
+static void l2tpv3_writable(void *opaque)
+{
+    NetL2TPV3State *state = opaque;
+
+    l2tpv3_write_poll(state, false);
+    qemu_flush_queued_packets(&state->nc);
+}
+
+/* Completion callback of an asynchronous send: resume read polling. */
+static void l2tpv3_send_completed(NetClientState *nc, ssize_t len)
+{
+    NetL2TPV3State *state = DO_UPCAST(NetL2TPV3State, nc, nc);
+
+    l2tpv3_read_poll(state, true);
+}
+
+/* poll callback: switch write polling, then read polling, to @enable. */
+static void l2tpv3_poll(NetClientState *nc, bool enable)
+{
+    NetL2TPV3State *state = DO_UPCAST(NetL2TPV3State, nc, nc);
+
+    l2tpv3_write_poll(state, enable);
+    l2tpv3_read_poll(state, enable);
+}
+
+/*
+ * Fill s->header_buf with the header of the next outgoing packet: the
+ * data-packet marker (UDP only), the tx session id, the tx cookie (32 or
+ * 64 bit, if cookies are in use) and the counter (if counters are in use).
+ * A pinned counter is written as 0, otherwise the counter is incremented
+ * before it is stored.
+ */
+static void l2tpv3_form_header(NetL2TPV3State *s)
+{
+    uint8_t *hdr = s->header_buf;
+
+    if (s->udp) {
+        stl_be_p((uint32_t *) hdr, L2TPV3_DATA_PACKET);
+    }
+
+    stl_be_p((uint32_t *) (hdr + s->session_offset), s->tx_session);
+
+    if (s->cookie && s->cookie_is_64) {
+        stq_be_p((uint64_t *)(hdr + s->cookie_offset), s->tx_cookie);
+    } else if (s->cookie) {
+        stl_be_p((uint32_t *) (hdr + s->cookie_offset), s->tx_cookie);
+    }
+
+    if (s->has_counter) {
+        uint32_t *counter = (uint32_t *)(hdr + s->counter_offset);
+
+        if (s->pin_counter) {
+            *counter = 0;
+        } else {
+            stl_be_p(counter, ++s->counter);
+        }
+    }
+}
+
+/*
+ * receive_iov callback: transmit a scatter/gather packet to the peer
+ * address, prefixed with a freshly built L2TPv3 header.
+ *
+ * Returns the number of payload bytes sent (header excluded), 0 when the
+ * socket buffer is full (write polling is then switched on), -1 when
+ * @iovcnt does not fit into the transmit vector, or a negative errno
+ * value for any other send failure.
+ */
+static ssize_t net_l2tpv3_receive_dgram_iov(NetClientState *nc,
+                                            const struct iovec *iov,
+                                            int iovcnt)
+{
+    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
+
+    struct msghdr message;
+    /* ssize_t to match both sendmsg() and this function's return type */
+    ssize_t ret;
+
+    /* slot 0 of s->vec is reserved for the header */
+    if (iovcnt > MAX_L2TPV3_IOVCNT - 1) {
+        error_report(
+            "iovec too long %d > %d, change l2tpv3.h",
+            iovcnt, MAX_L2TPV3_IOVCNT
+        );
+        return -1;
+    }
+    l2tpv3_form_header(s);
+    memcpy(s->vec + 1, iov, iovcnt * sizeof(struct iovec));
+    s->vec->iov_base = s->header_buf;
+    s->vec->iov_len = s->offset;
+    message.msg_name = s->dgram_dst;
+    message.msg_namelen = s->dst_size;
+    message.msg_iov = s->vec;
+    message.msg_iovlen = iovcnt + 1;
+    message.msg_control = NULL;
+    message.msg_controllen = 0;
+    message.msg_flags = 0;
+    do {
+        ret = sendmsg(s->fd, &message, 0);
+    } while ((ret == -1) && (errno == EINTR));
+    if (ret > 0) {
+        /* report payload bytes only */
+        ret -= s->offset;
+    } else if (ret == 0) {
+        /* belt and braces - should not occur on DGRAM
+         * we should get an error and never a 0 send
+         */
+        ret = iov_size(iov, iovcnt);
+    } else {
+        ret = -errno;
+        if (ret == -EAGAIN || ret == -ENOBUFS) {
+            /* signal upper layer that socket buffer is full */
+            l2tpv3_write_poll(s, true);
+            ret = 0;
+        }
+    }
+    return ret;
+}
+
+/*
+ * receive callback: transmit a linear packet to the peer address,
+ * prefixed with a freshly built L2TPv3 header.
+ *
+ * Returns the number of payload bytes sent (header excluded), 0 when the
+ * socket buffer is full (write polling is then switched on), or a negative
+ * errno value for any other send failure.
+ */
+static ssize_t net_l2tpv3_receive_dgram(NetClientState *nc,
+                                        const uint8_t *buf,
+                                        size_t size)
+{
+    NetL2TPV3State *s = DO_UPCAST(NetL2TPV3State, nc, nc);
+    struct msghdr message;
+    ssize_t sent = 0;
+
+    l2tpv3_form_header(s);
+
+    /* slot 0: header, slot 1: payload */
+    s->vec[0].iov_base = s->header_buf;
+    s->vec[0].iov_len = s->offset;
+    s->vec[1].iov_base = (void *) buf;
+    s->vec[1].iov_len = size;
+
+    message.msg_name = s->dgram_dst;
+    message.msg_namelen = s->dst_size;
+    message.msg_iov = s->vec;
+    message.msg_iovlen = 2;
+    message.msg_control = NULL;
+    message.msg_controllen = 0;
+    message.msg_flags = 0;
+
+    do {
+        sent = sendmsg(s->fd, &message, 0);
+    } while ((sent == -1) && (errno == EINTR));
+
+    if (sent > 0) {
+        sent -= s->offset;
+        return sent;
+    }
+
+    if (sent == 0) {
+        /* belt and braces - should not occur on DGRAM
+         * we should get an error and never a 0 send
+         */
+        sent = size;
+        return sent;
+    }
+
+    sent = -errno;
+    if (sent == -EAGAIN || sent == -ENOBUFS) {
+        /* signal upper layer that socket buffer is full */
+        l2tpv3_write_poll(s, true);
+        sent = 0;
+    }
+    return sent;
+}
+
+/*
+ * Check the cookie (if cookies are in use) and the session id of a
+ * received header against the expected rx values.
+ *
+ * For raw IPv4 sockets @buf starts with the IP header, which is skipped
+ * first.  Returns 0 when the header matches, -1 otherwise; the mismatch
+ * is logged only while no header mismatch has been recorded yet.
+ */
+static int l2tpv3_verify_header(NetL2TPV3State *s, uint8_t *buf)
+{
+    uint64_t cookie;
+
+    if (!s->udp && !s->ipv6) {
+        buf += sizeof(struct iphdr) /* fix for ipv4 raw */;
+    }
+
+    /* we do not do a strict check for "data" packets as per
+     * the RFC spec because the pure IP spec does not have
+     * that anyway.
+     */
+
+    if (s->cookie) {
+        if (s->cookie_is_64) {
+            cookie = ldq_be_p(buf + s->cookie_offset);
+        } else {
+            cookie = ldl_be_p(buf + s->cookie_offset) & 0xffffffffULL;
+        }
+
+        if (cookie != s->rx_cookie) {
+            if (!s->header_mismatch) {
+                error_report("unknown cookie id");
+            }
+            return -1;
+        }
+    }
+
+    if (ldl_be_p(buf + s->session_offset) == s->rx_session) {
+        return 0;
+    }
+
+    if (!s->header_mismatch) {
+        error_report("session mismatch");
+    }
+    return -1;
+}
+
+/*
+ * Deliver received messages from the ring, starting at queue_tail.
+ *
+ * A message is delivered when it carries payload beyond the header and the
+ * header verifies; otherwise it is dropped as a bad read.  The first header
+ * verification failure is logged and recorded in header_mismatch.  Every
+ * processed slot advances queue_tail and reduces queue_depth.
+ */
+static void net_l2tpv3_process_queue(NetL2TPV3State *s)
+{
+ int size = 0;
+ struct iovec *vec;
+ bool bad_read;
+ int data_size;
+ struct mmsghdr *msgvec;
+
+ /* go into ring mode only if there is a "pending" tail */
+ if (s->queue_depth > 0) {
+ do {
+ msgvec = s->msgvec + s->queue_tail;
+ if (msgvec->msg_len > 0) {
+ /* payload length = received length minus the header part */
+ data_size = msgvec->msg_len - s->header_size;
+ vec = msgvec->msg_hdr.msg_iov;
+ if ((data_size > 0) &&
+ (l2tpv3_verify_header(s, vec->iov_base) == 0)) {
+ /* step from the header iovec to the payload iovec */
+ vec++;
+ /* Use the legacy delivery for now, we will
+ * switch to using our own ring as a queueing mechanism
+ * at a later date
+ */
+ size = qemu_send_packet_async(
+ &s->nc,
+ vec->iov_base,
+ data_size,
+ l2tpv3_send_completed
+ );
+ if (size == 0) {
+ /* stop reading; l2tpv3_send_completed() turns it back on */
+ l2tpv3_read_poll(s, false);
+ }
+ bad_read = false;
+ } else {
+ bad_read = true;
+ if (!s->header_mismatch) {
+ /* report error only once */
+ error_report("l2tpv3 header verification failed");
+ s->header_mismatch = true;
+ }
+ }
+ } else {
+ bad_read = true;
+ }
+ s->queue_tail = (s->queue_tail + 1) % MAX_L2TPV3_MSGCNT;
+ s->queue_depth--;
+ } while (
+ /* go on while there is work, the peer can take it, and the last
+ * slot was either sent or skipped as bad
+ */
+ (s->queue_depth > 0) &&
+ qemu_can_send_packet(&s->nc) &&
+ ((size > 0) || bad_read)
+ );
+ }
+}
+
+/*
+ * Read handler of the socket: receive as many datagrams as currently fit
+ * into the message vector with one recvmmsg() call, account for them in
+ * the ring, then process the queue.
+ */
+static void net_l2tpv3_send(void *opaque)
+{
+ NetL2TPV3State *s = opaque;
+ int target_count, count;
+ struct mmsghdr *msgvec;
+
+ /* go into ring mode only if there is a "pending" tail */
+
+ if (s->queue_depth) {
+
+ /* The ring buffer we use has variable intake
+ * count of how much we can read varies - adjust accordingly
+ */
+
+ target_count = MAX_L2TPV3_MSGCNT - s->queue_depth;
+
+ /* Ensure we do not overrun the ring when we have
+ * a lot of enqueued packets
+ */
+
+ if (s->queue_head + target_count > MAX_L2TPV3_MSGCNT) {
+ target_count = MAX_L2TPV3_MSGCNT - s->queue_head;
+ }
+ } else {
+
+ /* we do not have any pending packets - we can use
+ * the whole message vector linearly instead of using
+ * it as a ring
+ */
+
+ s->queue_head = 0;
+ s->queue_tail = 0;
+ target_count = MAX_L2TPV3_MSGCNT;
+ }
+
+ msgvec = s->msgvec + s->queue_head;
+ if (target_count > 0) {
+ /* retry when interrupted by a signal */
+ do {
+ count = recvmmsg(
+ s->fd,
+ msgvec,
+ target_count, MSG_DONTWAIT, NULL);
+ } while ((count == -1) && (errno == EINTR));
+ if (count < 0) {
+ /* Recv error - we still need to flush packets here,
+ * (re)set queue head to current position
+ */
+ count = 0;
+ }
+ s->queue_head = (s->queue_head + count) % MAX_L2TPV3_MSGCNT;
+ s->queue_depth += count;
+ }
+ net_l2tpv3_process_queue(s);
+}
+
+/*
+ * Free a message vector created by build_l2tpv3_vector().
+ *
+ * @msgvec: vector to free, may be NULL
+ * @count: number of messages in the vector
+ * @iovcount: number of iovec entries per message
+ *
+ * In each message the first buffer (the header) comes from g_malloc()
+ * while the following ones come from qemu_memalign(); each has to be
+ * released with the matching function.
+ */
+static void destroy_vector(struct mmsghdr *msgvec, int count, int iovcount)
+{
+    int i, j;
+
+    if (!msgvec) {
+        return;
+    }
+
+    for (i = 0; i < count; i++) {
+        struct iovec *iov = msgvec[i].msg_hdr.msg_iov;
+
+        if (!iov) {
+            continue;
+        }
+        for (j = 0; j < iovcount; j++) {
+            if (j == 0) {
+                g_free(iov[j].iov_base);
+            } else {
+                /* aligned allocation: must not go through g_free() */
+                qemu_vfree(iov[j].iov_base);
+            }
+        }
+        g_free(iov);
+    }
+    g_free(msgvec);
+}
+
+/*
+ * Allocate a receive vector of @count messages.
+ *
+ * Each message gets two buffers: one of s->header_size bytes for the
+ * header and one of BUFFER_SIZE bytes, aligned to BUFFER_ALIGN, for the
+ * payload.  No source address or control data is requested.
+ */
+static struct mmsghdr *build_l2tpv3_vector(NetL2TPV3State *s, int count)
+{
+    struct mmsghdr *vector = g_new(struct mmsghdr, count);
+    int idx;
+
+    for (idx = 0; idx < count; idx++) {
+        struct msghdr *hdr = &vector[idx].msg_hdr;
+        struct iovec *bufs = g_new(struct iovec, IOVSIZE);
+
+        bufs[0].iov_base = g_malloc(s->header_size);
+        bufs[0].iov_len = s->header_size;
+        bufs[1].iov_base = qemu_memalign(BUFFER_ALIGN, BUFFER_SIZE);
+        bufs[1].iov_len = BUFFER_SIZE;
+
+        hdr->msg_name = NULL;
+        hdr->msg_namelen = 0;
+        hdr->msg_iov = bufs;
+        hdr->msg_iovlen = 2;
+        hdr->msg_control = NULL;
+        hdr->msg_controllen = 0;
+        hdr->msg_flags = 0;
+    }
+
+    return vector;
+}
+
+/*
+ * cleanup callback: drop queued packets, stop polling the socket, close
+ * it if it is valid, and free all buffers owned by the client state.
+ */
+static void net_l2tpv3_cleanup(NetClientState *nc)
+{
+    NetL2TPV3State *state = DO_UPCAST(NetL2TPV3State, nc, nc);
+
+    qemu_purge_queued_packets(nc);
+
+    l2tpv3_read_poll(state, false);
+    l2tpv3_write_poll(state, false);
+
+    if (state->fd >= 0) {
+        close(state->fd);
+    }
+
+    destroy_vector(state->msgvec, MAX_L2TPV3_MSGCNT, IOVSIZE);
+    g_free(state->vec);
+    g_free(state->header_buf);
+    g_free(state->dgram_dst);
+}
+
+/* Net client callbacks of the l2tpv3 backend. */
+static NetClientInfo net_l2tpv3_info = {
+ .type = NET_CLIENT_DRIVER_L2TPV3,
+ .size = sizeof(NetL2TPV3State),
+ .receive = net_l2tpv3_receive_dgram,
+ .receive_iov = net_l2tpv3_receive_dgram_iov,
+ .poll = l2tpv3_poll,
+ .cleanup = net_l2tpv3_cleanup,
+};
+
+/*
+ * Create an l2tpv3 net client from the user supplied options.
+ *
+ * Validates the options, computes the header layout, resolves and binds
+ * the local address, resolves the remote address, allocates the transmit
+ * and receive buffers and starts read polling on the socket.
+ *
+ * Returns 0 on success.  On failure the half-built client is deleted,
+ * @errp is set and -1 is returned.
+ */
+int net_init_l2tpv3(const Netdev *netdev,
+                    const char *name,
+                    NetClientState *peer, Error **errp)
+{
+    const NetdevL2TPv3Options *l2tpv3;
+    NetL2TPV3State *s;
+    NetClientState *nc;
+    int fd = -1, gairet;
+    struct addrinfo hints;
+    struct addrinfo *result = NULL;
+    char *srcport, *dstport;
+
+    nc = qemu_new_net_client(&net_l2tpv3_info, peer, "l2tpv3", name);
+
+    s = DO_UPCAST(NetL2TPV3State, nc, nc);
+
+    /*
+     * net_l2tpv3_cleanup() closes s->fd whenever it is >= 0.  Keep it
+     * invalid until the socket is really handed over below, so that a
+     * failure before that point (presumably reaching the cleanup hook
+     * through qemu_del_net_client()) cannot close an unrelated descriptor.
+     */
+    s->fd = -1;
+    s->queue_head = 0;
+    s->queue_tail = 0;
+    s->queue_depth = 0;
+    s->header_mismatch = false;
+
+    assert(netdev->type == NET_CLIENT_DRIVER_L2TPV3);
+    l2tpv3 = &netdev->u.l2tpv3;
+
+    s->ipv6 = l2tpv3->has_ipv6 && l2tpv3->ipv6;
+
+    if ((l2tpv3->has_offset) && (l2tpv3->offset > 256)) {
+        error_setg(errp, "offset must be less than 256 bytes");
+        goto outerr;
+    }
+
+    if (l2tpv3->has_rxcookie || l2tpv3->has_txcookie) {
+        if (l2tpv3->has_rxcookie && l2tpv3->has_txcookie) {
+            s->cookie = true;
+        } else {
+            error_setg(errp,
+                       "require both 'rxcookie' and 'txcookie' or neither");
+            goto outerr;
+        }
+    } else {
+        s->cookie = false;
+    }
+
+    /*
+     * Like every other boolean option here, the flag only counts when it
+     * is both present and true; "cookie64=off" must not select 64 bit
+     * cookies.
+     */
+    s->cookie_is_64 = l2tpv3->has_cookie64 && l2tpv3->cookie64;
+
+    if (l2tpv3->has_udp && l2tpv3->udp) {
+        s->udp = true;
+        if (!(l2tpv3->has_srcport && l2tpv3->has_dstport)) {
+            error_setg(errp, "need both src and dst port for udp");
+            goto outerr;
+        } else {
+            srcport = l2tpv3->srcport;
+            dstport = l2tpv3->dstport;
+        }
+    } else {
+        s->udp = false;
+        srcport = NULL;
+        dstport = NULL;
+    }
+
+    /* header layout for the plain IP case: session id, then cookie/counter */
+    s->offset = 4;
+    s->session_offset = 0;
+    s->cookie_offset = 4;
+    s->counter_offset = 4;
+
+    s->tx_session = l2tpv3->txsession;
+    if (l2tpv3->has_rxsession) {
+        s->rx_session = l2tpv3->rxsession;
+    } else {
+        s->rx_session = s->tx_session;
+    }
+
+    if (s->cookie) {
+        s->rx_cookie = l2tpv3->rxcookie;
+        s->tx_cookie = l2tpv3->txcookie;
+        if (s->cookie_is_64) {
+            /* 64 bit cookie */
+            s->offset += 8;
+            s->counter_offset += 8;
+        } else {
+            /* 32 bit cookie */
+            s->offset += 4;
+            s->counter_offset += 4;
+        }
+    }
+
+    memset(&hints, 0, sizeof(hints));
+
+    if (s->ipv6) {
+        hints.ai_family = AF_INET6;
+    } else {
+        hints.ai_family = AF_INET;
+    }
+    if (s->udp) {
+        hints.ai_socktype = SOCK_DGRAM;
+        hints.ai_protocol = 0;
+        /* the UDP encapsulation puts 4 more bytes in front of the session id */
+        s->offset += 4;
+        s->counter_offset += 4;
+        s->session_offset += 4;
+        s->cookie_offset += 4;
+    } else {
+        hints.ai_socktype = SOCK_RAW;
+        hints.ai_protocol = IPPROTO_L2TP;
+    }
+
+    gairet = getaddrinfo(l2tpv3->src, srcport, &hints, &result);
+
+    if ((gairet != 0) || (result == NULL)) {
+        error_setg(errp, "could not resolve src, errno = %s",
+                   gai_strerror(gairet));
+        goto outerr;
+    }
+    fd = socket(result->ai_family, result->ai_socktype, result->ai_protocol);
+    if (fd == -1) {
+        error_setg(errp, "socket creation failed, errno = %d",
+                   errno);
+        goto outerr;
+    }
+    if (bind(fd, (struct sockaddr *) result->ai_addr, result->ai_addrlen)) {
+        error_setg(errp, "could not bind socket err=%i", errno);
+        goto outerr;
+    }
+
+    freeaddrinfo(result);
+    result = NULL;
+
+    /* the same hints (family, socket type, protocol) apply to the peer */
+    gairet = getaddrinfo(l2tpv3->dst, dstport, &hints, &result);
+    if ((gairet != 0) || (result == NULL)) {
+        error_setg(errp, "could not resolve dst, error = %s",
+                   gai_strerror(gairet));
+        goto outerr;
+    }
+
+    s->dgram_dst = g_new0(struct sockaddr_storage, 1);
+    memcpy(s->dgram_dst, result->ai_addr, result->ai_addrlen);
+    s->dst_size = result->ai_addrlen;
+
+    freeaddrinfo(result);
+    result = NULL;
+
+    if (l2tpv3->has_counter && l2tpv3->counter) {
+        s->has_counter = true;
+        s->offset += 4;
+    } else {
+        s->has_counter = false;
+    }
+
+    if (l2tpv3->has_pincounter && l2tpv3->pincounter) {
+        s->has_counter = true; /* pin counter implies that there is counter */
+        s->pin_counter = true;
+    } else {
+        s->pin_counter = false;
+    }
+
+    if (l2tpv3->has_offset) {
+        /* extra offset */
+        s->offset += l2tpv3->offset;
+    }
+
+    if ((s->ipv6) || (s->udp)) {
+        s->header_size = s->offset;
+    } else {
+        /* raw IPv4 sockets deliver the IP header along with the data */
+        s->header_size = s->offset + sizeof(struct iphdr);
+    }
+
+    s->msgvec = build_l2tpv3_vector(s, MAX_L2TPV3_MSGCNT);
+    s->vec = g_new(struct iovec, MAX_L2TPV3_IOVCNT);
+    s->header_buf = g_malloc(s->header_size);
+
+    qemu_set_nonblock(fd);
+
+    /* from here on the socket belongs to the client state */
+    s->fd = fd;
+    s->counter = 0;
+
+    l2tpv3_read_poll(s, true);
+
+    snprintf(s->nc.info_str, sizeof(s->nc.info_str),
+             "l2tpv3: connected");
+    return 0;
+outerr:
+    qemu_del_net_client(nc);
+    if (fd >= 0) {
+        close(fd);
+    }
+    if (result) {
+        freeaddrinfo(result);
+    }
+    return -1;
+}
+
diff --git a/net/meson.build b/net/meson.build
new file mode 100644
index 000000000..847bc2ac8
--- /dev/null
+++ b/net/meson.build
@@ -0,0 +1,45 @@
+# Network sources added unconditionally to the system-emulation source set.
+softmmu_ss.add(files(
+ 'announce.c',
+ 'checksum.c',
+ 'colo-compare.c',
+ 'colo.c',
+ 'dump.c',
+ 'eth.c',
+ 'filter-buffer.c',
+ 'filter-mirror.c',
+ 'filter-rewriter.c',
+ 'filter.c',
+ 'hub.c',
+ 'net.c',
+ 'queue.c',
+ 'socket.c',
+ 'util.c',
+))
+
+# filter-replay.c is only added when CONFIG_TCG is set.
+softmmu_ss.add(when: 'CONFIG_TCG', if_true: files('filter-replay.c'))
+
+# Optional backends, each added only when its feature/dependency is present.
+if have_l2tpv3
+ softmmu_ss.add(files('l2tpv3.c'))
+endif
+softmmu_ss.add(when: slirp, if_true: files('slirp.c'))
+softmmu_ss.add(when: vde, if_true: files('vde.c'))
+if have_netmap
+ softmmu_ss.add(files('netmap.c'))
+endif
+# vhost-user: real implementation with CONFIG_VIRTIO_NET, stub otherwise;
+# the whole set is only pulled in when CONFIG_VHOST_NET_USER is set.
+vhost_user_ss = ss.source_set()
+vhost_user_ss.add(when: 'CONFIG_VIRTIO_NET', if_true: files('vhost-user.c'), if_false: files('vhost-user-stub.c'))
+softmmu_ss.add_all(when: 'CONFIG_VHOST_NET_USER', if_true: vhost_user_ss)
+softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-user-stub.c'))
+
+# Host-specific tap helpers; tap-stub.c is appended to the POSIX tap sources
+# when config_host has none of CONFIG_LINUX, CONFIG_BSD or CONFIG_SOLARIS.
+softmmu_ss.add(when: 'CONFIG_LINUX', if_true: files('tap-linux.c'))
+softmmu_ss.add(when: 'CONFIG_BSD', if_true: files('tap-bsd.c'))
+softmmu_ss.add(when: 'CONFIG_SOLARIS', if_true: files('tap-solaris.c'))
+tap_posix = ['tap.c']
+if not config_host.has_key('CONFIG_LINUX') and not config_host.has_key('CONFIG_BSD') and not config_host.has_key('CONFIG_SOLARIS')
+ tap_posix += 'tap-stub.c'
+endif
+softmmu_ss.add(when: 'CONFIG_POSIX', if_true: files(tap_posix))
+softmmu_ss.add(when: 'CONFIG_WIN32', if_true: files('tap-win32.c'))
+softmmu_ss.add(when: 'CONFIG_VHOST_NET_VDPA', if_true: files('vhost-vdpa.c'))
+
+# Descend into the CAN bus sub-directory.
+subdir('can')
diff --git a/net/net.c b/net/net.c
new file mode 100644
index 000000000..f0d14dbfc
--- /dev/null
+++ b/net/net.c
@@ -0,0 +1,1766 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+
+#include "net/net.h"
+#include "clients.h"
+#include "hub.h"
+#include "hw/qdev-properties.h"
+#include "net/slirp.h"
+#include "net/eth.h"
+#include "util.h"
+
+#include "monitor/monitor.h"
+#include "qemu/help_option.h"
+#include "qapi/qapi-commands-net.h"
+#include "qapi/qapi-visit-net.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qerror.h"
+#include "qemu/error-report.h"
+#include "qemu/sockets.h"
+#include "qemu/cutils.h"
+#include "qemu/config-file.h"
+#include "qemu/ctype.h"
+#include "qemu/id.h"
+#include "qemu/iov.h"
+#include "qemu/qemu-print.h"
+#include "qemu/main-loop.h"
+#include "qemu/option.h"
+#include "qapi/error.h"
+#include "qapi/opts-visitor.h"
+#include "sysemu/runstate.h"
+#include "net/colo-compare.h"
+#include "net/filter.h"
+#include "qapi/string-output-visitor.h"
+
+/* Net bridge is currently not supported for W32. */
+#if !defined(_WIN32)
+# define CONFIG_NET_BRIDGE
+#endif
+
+static VMChangeStateEntry *net_change_state_entry;
+static QTAILQ_HEAD(, NetClientState) net_clients;
+
+/***********************************************************/
+/* network device redirectors */
+
+/*
+ * Parse a "host:port" string into an IPv4 socket address.
+ *
+ * @saddr: output; zeroed first, then filled with AF_INET, address and port
+ * @str:   input of the form "host:port"; an empty host yields address 0
+ * @errp:  receives a description of the failure
+ *
+ * A host that starts with a digit is parsed with inet_aton(); any other
+ * non-empty host is resolved with gethostbyname().  The port is parsed with
+ * strtol() in base 0 and must lie in the range 0..65535.
+ *
+ * Returns 0 on success, -1 on error (with @errp set).
+ */
+int parse_host_port(struct sockaddr_in *saddr, const char *str,
+                    Error **errp)
+{
+    gchar **substrings;
+    struct hostent *he;
+    const char *addr, *p;
+    char *end;
+    long port;
+    int ret = 0;
+
+    memset(saddr, 0, sizeof(*saddr));
+
+    substrings = g_strsplit(str, ":", 2);
+    if (!substrings || !substrings[0] || !substrings[1]) {
+        error_setg(errp, "host address '%s' doesn't contain ':' "
+                   "separating host from port", str);
+        ret = -1;
+        goto out;
+    }
+
+    addr = substrings[0];
+    p = substrings[1];
+
+    saddr->sin_family = AF_INET;
+    if (addr[0] == '\0') {
+        saddr->sin_addr.s_addr = 0;
+    } else {
+        if (qemu_isdigit(addr[0])) {
+            if (!inet_aton(addr, &saddr->sin_addr)) {
+                error_setg(errp, "host address '%s' is not a valid "
+                           "IPv4 address", addr);
+                ret = -1;
+                goto out;
+            }
+        } else {
+            he = gethostbyname(addr);
+            if (he == NULL) {
+                error_setg(errp, "can't resolve host address '%s'", addr);
+                ret = -1;
+                goto out;
+            }
+            saddr->sin_addr = *(struct in_addr *)he->h_addr;
+        }
+    }
+    port = strtol(p, &end, 0);
+    /*
+     * Reject a port with no digits at all, and one that does not fit in
+     * 16 bits: htons() would otherwise silently wrap it to another port.
+     */
+    if (end == p || port < 0 || port > 65535) {
+        error_setg(errp, "port number '%s' is invalid", p);
+        ret = -1;
+        goto out;
+    }
+    saddr->sin_port = htons(port);
+
+out:
+    g_strfreev(substrings);
+    return ret;
+}
+
+/*
+ * Format the six bytes at @macaddr as lower-case "xx:xx:xx:xx:xx:xx".
+ * The string is allocated with g_strdup_printf().
+ */
+char *qemu_mac_strdup_printf(const uint8_t *macaddr)
+{
+    const uint8_t *m = macaddr;
+    char *text;
+
+    text = g_strdup_printf("%.2x:%.2x:%.2x:%.2x:%.2x:%.2x",
+                           m[0], m[1], m[2], m[3], m[4], m[5]);
+    return text;
+}
+
+/*
+ * Write "model=<model>,macaddr=<mac>" into nc->info_str, truncating to the
+ * size of that buffer.
+ */
+void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6])
+{
+    const uint8_t *m = macaddr;
+
+    snprintf(nc->info_str, sizeof(nc->info_str),
+             "model=%s,macaddr=%02x:%02x:%02x:%02x:%02x:%02x",
+             nc->model, m[0], m[1], m[2], m[3], m[4], m[5]);
+}
+
+static int mac_table[256] = {0};
+
+/*
+ * Bump the use count for the last MAC byte of @macaddr, provided that byte
+ * is in the tracked range 0x56..0xFE.
+ */
+static void qemu_macaddr_set_used(MACAddr *macaddr)
+{
+    int last = macaddr->a[5];
+
+    if (last >= 0x56 && last < 0xFF) {
+        mac_table[last]++;
+    }
+}
+
+/*
+ * Drop one use of the last MAC byte of @macaddr.  Only addresses whose first
+ * five bytes are 52:54:00:12:34 and whose last byte is in 0x56..0xFE are
+ * tracked; anything else is ignored.
+ */
+static void qemu_macaddr_set_free(MACAddr *macaddr)
+{
+    static const MACAddr base = { .a = { 0x52, 0x54, 0x00, 0x12, 0x34, 0 } };
+    bool has_default_prefix;
+    int last;
+
+    has_default_prefix =
+        memcmp(macaddr->a, &base.a, (sizeof(base.a) - 1)) == 0;
+    if (!has_default_prefix) {
+        return;
+    }
+
+    last = macaddr->a[5];
+    if (last >= 0x56 && last < 0xFF) {
+        mac_table[last]--;
+    }
+}
+
+/*
+ * Return the lowest value in 0x56..0xFE whose use count in mac_table is
+ * zero, or -1 when every value in that range is in use.
+ */
+static int qemu_macaddr_get_free(void)
+{
+    int candidate = 0x56;
+
+    while (candidate < 0xFF && mac_table[candidate] != 0) {
+        candidate++;
+    }
+
+    return candidate < 0xFF ? candidate : -1;
+}
+
+/*
+ * Give @macaddr a default value if it is all zeroes.
+ *
+ * A non-zero address is left untouched; if it carries the 52:54:00:12:34
+ * prefix its last byte is recorded as used.  An all-zero address is set to
+ * that prefix plus the value returned by qemu_macaddr_get_free(), and then
+ * recorded as used.
+ */
+void qemu_macaddr_default_if_unset(MACAddr *macaddr)
+{
+    static const MACAddr zero = { .a = { 0,0,0,0,0,0 } };
+    static const MACAddr base = { .a = { 0x52, 0x54, 0x00, 0x12, 0x34, 0 } };
+    const size_t prefix_len = sizeof(base.a) - 1;
+
+    if (memcmp(macaddr, &zero, sizeof(zero)) != 0) {
+        /* Caller supplied an address: only account for it, never change it. */
+        if (memcmp(macaddr->a, &base.a, prefix_len) == 0) {
+            qemu_macaddr_set_used(macaddr);
+        }
+        return;
+    }
+
+    macaddr->a[0] = base.a[0];
+    macaddr->a[1] = base.a[1];
+    macaddr->a[2] = base.a[2];
+    macaddr->a[3] = base.a[3];
+    macaddr->a[4] = base.a[4];
+    macaddr->a[5] = qemu_macaddr_get_free();
+    qemu_macaddr_set_used(macaddr);
+}
+
+/**
+ * Build a "<model>.<n>" name for a net client.
+ *
+ * <n> is the number of clients already on net_clients, other than @nc1
+ * itself, whose model string equals @model.
+ *
+ * Only net clients created with the legacy -net option and NICs need this.
+ */
+static char *assign_name(NetClientState *nc1, const char *model)
+{
+    NetClientState *it;
+    int same_model = 0;
+
+    QTAILQ_FOREACH(it, &net_clients, next) {
+        if (it != nc1 && strcmp(it->model, model) == 0) {
+            same_model++;
+        }
+    }
+
+    return g_strdup_printf("%s.%d", model, same_model);
+}
+
+/*
+ * Destructor passed to qemu_net_client_setup() by qemu_new_net_client() and
+ * qemu_new_net_control_client(): releases the g_malloc0()'d client.
+ */
+static void qemu_net_client_destructor(NetClientState *nc)
+{
+ g_free(nc);
+}
+static ssize_t qemu_deliver_packet_iov(NetClientState *sender,
+ unsigned flags,
+ const struct iovec *iov,
+ int iovcnt,
+ void *opaque);
+
+/*
+ * Common initialisation of an already allocated NetClientState.
+ *
+ * Stores @info, duplicates @model and takes a copy of @name, or an
+ * auto-generated "<model>.<n>" name when @name is NULL.  If @peer is given
+ * it must not have a peer yet; the two clients are linked to each other.
+ * The client is appended to net_clients, gets an incoming queue that
+ * delivers through qemu_deliver_packet_iov(), and an empty filter list.
+ */
+static void qemu_net_client_setup(NetClientState *nc,
+ NetClientInfo *info,
+ NetClientState *peer,
+ const char *model,
+ const char *name,
+ NetClientDestructor *destructor,
+ bool is_datapath)
+{
+ nc->info = info;
+ nc->model = g_strdup(model);
+ if (name) {
+ nc->name = g_strdup(name);
+ } else {
+ /* nc is not on net_clients yet, so it is not counted by assign_name() */
+ nc->name = assign_name(nc, model);
+ }
+
+ if (peer) {
+ assert(!peer->peer);
+ nc->peer = peer;
+ peer->peer = nc;
+ }
+ QTAILQ_INSERT_TAIL(&net_clients, nc, next);
+
+ nc->incoming_queue = qemu_new_net_queue(qemu_deliver_packet_iov, nc);
+ nc->destructor = destructor;
+ nc->is_datapath = is_datapath;
+ QTAILQ_INIT(&nc->filters);
+}
+
+/*
+ * Allocate a zeroed client of info->size bytes and set it up as a datapath
+ * client (is_datapath = true) that frees itself through
+ * qemu_net_client_destructor().
+ */
+NetClientState *qemu_new_net_client(NetClientInfo *info,
+                                    NetClientState *peer,
+                                    const char *model,
+                                    const char *name)
+{
+    NetClientState *client;
+
+    assert(info->size >= sizeof(NetClientState));
+
+    client = g_malloc0(info->size);
+    qemu_net_client_setup(client, info, peer, model, name,
+                          qemu_net_client_destructor, true);
+    return client;
+}
+
+/*
+ * Same as qemu_new_net_client(), except that the client is set up with
+ * is_datapath = false.
+ */
+NetClientState *qemu_new_net_control_client(NetClientInfo *info,
+                                            NetClientState *peer,
+                                            const char *model,
+                                            const char *name)
+{
+    NetClientState *client;
+
+    assert(info->size >= sizeof(NetClientState));
+
+    client = g_malloc0(info->size);
+    qemu_net_client_setup(client, info, peer, model, name,
+                          qemu_net_client_destructor, false);
+    return client;
+}
+
+/*
+ * Allocate a NIC with one NetClientState per queue.
+ *
+ * The number of queues is conf->peers.queues, but at least 1.  A single
+ * zeroed allocation holds info->size bytes followed by the array of
+ * per-queue NetClientStates; nic->ncs points at that trailing array.
+ * Queue i is set up with conf->peers.ncs[i] as its peer.
+ */
+NICState *qemu_new_nic(NetClientInfo *info,
+ NICConf *conf,
+ const char *model,
+ const char *name,
+ void *opaque)
+{
+ NetClientState **peers = conf->peers.ncs;
+ NICState *nic;
+ int i, queues = MAX(1, conf->peers.queues);
+
+ assert(info->type == NET_CLIENT_DRIVER_NIC);
+ assert(info->size >= sizeof(NICState));
+
+ nic = g_malloc0(info->size + sizeof(NetClientState) * queues);
+ /* the queue array starts right after the info->size bytes of NIC state */
+ nic->ncs = (void *)nic + info->size;
+ nic->conf = conf;
+ nic->opaque = opaque;
+
+ for (i = 0; i < queues; i++) {
+ /*
+ * No destructor: the queue lives inside the NIC allocation, which
+ * qemu_del_nic() releases with g_free(nic).
+ */
+ qemu_net_client_setup(&nic->ncs[i], info, peers[i], model, name,
+ NULL, true);
+ nic->ncs[i].queue_index = i;
+ }
+
+ return nic;
+}
+
+/* Return the NetClientState of queue @queue_index of @nic (no bounds check). */
+NetClientState *qemu_get_subqueue(NICState *nic, int queue_index)
+{
+    return &nic->ncs[queue_index];
+}
+
+/* Return the first queue (index 0) of @nic. */
+NetClientState *qemu_get_queue(NICState *nic)
+{
+    NetClientState *first = qemu_get_subqueue(nic, 0);
+
+    return first;
+}
+
+/*
+ * Map a NIC queue back to its NICState.
+ *
+ * Steps back nc->queue_index entries to reach queue 0, then nc->info->size
+ * bytes further back, which is where qemu_new_nic() placed the NICState.
+ */
+NICState *qemu_get_nic(NetClientState *nc)
+{
+    NetClientState *queue0 = nc - nc->queue_index;
+    char *base = (char *)queue0 - nc->info->size;
+
+    return (NICState *)base;
+}
+
+/* Return the opaque pointer stored in the NIC that owns queue @nc. */
+void *qemu_get_nic_opaque(NetClientState *nc)
+{
+    return qemu_get_nic(nc)->opaque;
+}
+
+/*
+ * Return the peer of the client found @queue_index entries after @nc.
+ * @nc must not be NULL.
+ */
+NetClientState *qemu_get_peer(NetClientState *nc, int queue_index)
+{
+    assert(nc != NULL);
+    return nc[queue_index].peer;
+}
+
+/*
+ * Unlink @nc from net_clients, then run its type-specific cleanup callback
+ * if it has one.  Nothing is freed here.
+ */
+static void qemu_cleanup_net_client(NetClientState *nc)
+{
+    QTAILQ_REMOVE(&net_clients, nc, next);
+
+    if (!nc->info->cleanup) {
+        return;
+    }
+    nc->info->cleanup(nc);
+}
+
+/*
+ * Release everything owned by @nc: its incoming queue, the back-link held
+ * by its peer, its name and model strings, and finally the client itself
+ * through its destructor, if one was set.
+ */
+static void qemu_free_net_client(NetClientState *nc)
+{
+ if (nc->incoming_queue) {
+ qemu_del_net_queue(nc->incoming_queue);
+ }
+ if (nc->peer) {
+ /* the peer must not keep pointing at a client that is going away */
+ nc->peer->peer = NULL;
+ }
+ g_free(nc->name);
+ g_free(nc->model);
+ if (nc->destructor) {
+ /* must be last: with qemu_net_client_destructor this frees nc itself */
+ nc->destructor(nc);
+ }
+}
+
+/*
+ * Delete a non-NIC net client together with every other non-NIC client
+ * that has the same name (the queues of a multiqueue backend).
+ *
+ * Filters attached to @nc are unparented first.  If the peer is a NIC, the
+ * clients are only cleaned up (removed from net_clients, cleanup callback
+ * run) and the NIC is flagged with peer_deleted; the memory is released
+ * later by qemu_del_nic().  Otherwise each client is cleaned up and freed.
+ */
+void qemu_del_net_client(NetClientState *nc)
+{
+ NetClientState *ncs[MAX_QUEUE_NUM];
+ int queues, i;
+ NetFilterState *nf, *next;
+
+ assert(nc->info->type != NET_CLIENT_DRIVER_NIC);
+
+ /* If the NetClientState belongs to a multiqueue backend, we will change all
+ * other NetClientStates also.
+ */
+ /*
+ * NOTE(review): qemu_find_net_clients_except() returns the total number of
+ * matches even when it exceeds the MAX_QUEUE_NUM entries it stores; the
+ * loops below index ncs[] up to that count - confirm it cannot be larger.
+ */
+ queues = qemu_find_net_clients_except(nc->name, ncs,
+ NET_CLIENT_DRIVER_NIC,
+ MAX_QUEUE_NUM);
+ assert(queues != 0);
+
+ QTAILQ_FOREACH_SAFE(nf, &nc->filters, next, next) {
+ object_unparent(OBJECT(nf));
+ }
+
+ /* If there is a peer NIC, delete and cleanup client, but do not free. */
+ if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_NIC) {
+ NICState *nic = qemu_get_nic(nc->peer);
+ if (nic->peer_deleted) {
+ /* this backend was already deleted once; nothing left to do */
+ return;
+ }
+ nic->peer_deleted = true;
+
+ for (i = 0; i < queues; i++) {
+ ncs[i]->peer->link_down = true;
+ }
+
+ /* let the NIC model react to the link going down */
+ if (nc->peer->info->link_status_changed) {
+ nc->peer->info->link_status_changed(nc->peer);
+ }
+
+ for (i = 0; i < queues; i++) {
+ qemu_cleanup_net_client(ncs[i]);
+ }
+
+ return;
+ }
+
+ for (i = 0; i < queues; i++) {
+ qemu_cleanup_net_client(ncs[i]);
+ qemu_free_net_client(ncs[i]);
+ }
+}
+
+/*
+ * Destroy a NIC created by qemu_new_nic().
+ *
+ * Releases the NIC's MAC address from the default-address table, deals with
+ * each queue's peer (freeing it if the backend was already deleted,
+ * otherwise purging packets it still has queued), then cleans up and frees
+ * the queues in reverse order and finally frees the NIC allocation.
+ */
+void qemu_del_nic(NICState *nic)
+{
+ int i, queues = MAX(nic->conf->peers.queues, 1);
+
+ qemu_macaddr_set_free(&nic->conf->macaddr);
+
+ for (i = 0; i < queues; i++) {
+ NetClientState *nc = qemu_get_subqueue(nic, i);
+ /* If this is a peer NIC and peer has already been deleted, free it now. */
+ if (nic->peer_deleted) {
+ qemu_free_net_client(nc->peer);
+ } else if (nc->peer) {
+ /* if there are RX packets pending, complete them */
+ qemu_purge_queued_packets(nc->peer);
+ }
+ }
+
+ for (i = queues - 1; i >= 0; i--) {
+ NetClientState *nc = qemu_get_subqueue(nic, i);
+
+ qemu_cleanup_net_client(nc);
+ qemu_free_net_client(nc);
+ }
+
+ /* the queues live inside this allocation, so it must be freed last */
+ g_free(nic);
+}
+
+/*
+ * Call @func(nic, @opaque) once for every NIC: only the queue-0 client of
+ * each NIC on net_clients triggers a call.
+ */
+void qemu_foreach_nic(qemu_nic_foreach func, void *opaque)
+{
+    NetClientState *it;
+
+    QTAILQ_FOREACH(it, &net_clients, next) {
+        if (it->info->type != NET_CLIENT_DRIVER_NIC) {
+            continue;
+        }
+        if (it->queue_index != 0) {
+            continue;
+        }
+        func(qemu_get_nic(it), opaque);
+    }
+}
+
+/*
+ * Forward to the client's has_ufo callback; false when @nc is NULL or the
+ * callback is not provided.
+ */
+bool qemu_has_ufo(NetClientState *nc)
+{
+    if (nc && nc->info->has_ufo) {
+        return nc->info->has_ufo(nc);
+    }
+
+    return false;
+}
+
+/*
+ * Forward to the client's has_vnet_hdr callback; false when @nc is NULL or
+ * the callback is not provided.
+ */
+bool qemu_has_vnet_hdr(NetClientState *nc)
+{
+    if (nc && nc->info->has_vnet_hdr) {
+        return nc->info->has_vnet_hdr(nc);
+    }
+
+    return false;
+}
+
+/*
+ * Forward @len to the client's has_vnet_hdr_len callback; false when @nc is
+ * NULL or the callback is not provided.
+ */
+bool qemu_has_vnet_hdr_len(NetClientState *nc, int len)
+{
+    if (nc && nc->info->has_vnet_hdr_len) {
+        return nc->info->has_vnet_hdr_len(nc, len);
+    }
+
+    return false;
+}
+
+/*
+ * Forward @enable to the client's using_vnet_hdr callback; a no-op when @nc
+ * is NULL or the callback is not provided.
+ */
+void qemu_using_vnet_hdr(NetClientState *nc, bool enable)
+{
+    if (nc && nc->info->using_vnet_hdr) {
+        nc->info->using_vnet_hdr(nc, enable);
+    }
+}
+
+/*
+ * Forward the offload flags to the client's set_offload callback; a no-op
+ * when @nc is NULL or the callback is not provided.
+ */
+void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
+                      int ecn, int ufo)
+{
+    if (nc && nc->info->set_offload) {
+        nc->info->set_offload(nc, csum, tso4, tso6, ecn, ufo);
+    }
+}
+
+/*
+ * Record @len in nc->vnet_hdr_len and pass it to the client's
+ * set_vnet_hdr_len callback.  Nothing is recorded when @nc is NULL or the
+ * callback is not provided.
+ */
+void qemu_set_vnet_hdr_len(NetClientState *nc, int len)
+{
+    if (nc && nc->info->set_vnet_hdr_len) {
+        nc->vnet_hdr_len = len;
+        nc->info->set_vnet_hdr_len(nc, len);
+    }
+}
+
+/*
+ * When HOST_WORDS_BIGENDIAN is defined, forward @is_le to the client's
+ * set_vnet_le callback, or return -ENOSYS if @nc is NULL or the callback is
+ * missing.  Otherwise return 0 without calling anything.
+ */
+int qemu_set_vnet_le(NetClientState *nc, bool is_le)
+{
+#ifdef HOST_WORDS_BIGENDIAN
+ if (!nc || !nc->info->set_vnet_le) {
+ return -ENOSYS;
+ }
+
+ return nc->info->set_vnet_le(nc, is_le);
+#else
+ return 0;
+#endif
+}
+
+/*
+ * When HOST_WORDS_BIGENDIAN is defined, return 0 without calling anything.
+ * Otherwise forward @is_be to the client's set_vnet_be callback, or return
+ * -ENOSYS if @nc is NULL or the callback is missing.
+ */
+int qemu_set_vnet_be(NetClientState *nc, bool is_be)
+{
+#ifdef HOST_WORDS_BIGENDIAN
+ return 0;
+#else
+ if (!nc || !nc->info->set_vnet_be) {
+ return -ENOSYS;
+ }
+
+ return nc->info->set_vnet_be(nc, is_be);
+#endif
+}
+
+/*
+ * Return 1 if @nc is currently willing to receive, 0 otherwise.  Reception
+ * is refused while receive_disabled is set or when the optional can_receive
+ * callback says no.
+ */
+int qemu_can_receive_packet(NetClientState *nc)
+{
+    if (nc->receive_disabled) {
+        return 0;
+    }
+    if (!nc->info->can_receive) {
+        return 1;
+    }
+
+    return nc->info->can_receive(nc) ? 1 : 0;
+}
+
+/*
+ * Return non-zero if @sender may send now: never while the VM is not
+ * running, always (1) when it has no peer, otherwise whatever the peer
+ * reports through qemu_can_receive_packet().
+ */
+int qemu_can_send_packet(NetClientState *sender)
+{
+    if (!runstate_is_running()) {
+        return 0;
+    }
+
+    return sender->peer ? qemu_can_receive_packet(sender->peer) : 1;
+}
+
+/*
+ * Run a packet through the filters attached to @nc.
+ *
+ * Filters are visited in list order for NET_FILTER_DIRECTION_TX and in
+ * reverse list order for any other direction.  The walk stops at the first
+ * filter that returns non-zero and that value is returned; 0 means every
+ * filter (or none, if the list is empty) returned 0.
+ */
+static ssize_t filter_receive_iov(NetClientState *nc,
+                                  NetFilterDirection direction,
+                                  NetClientState *sender,
+                                  unsigned flags,
+                                  const struct iovec *iov,
+                                  int iovcnt,
+                                  NetPacketSent *sent_cb)
+{
+    NetFilterState *filter = NULL;
+    ssize_t verdict = 0;
+
+    if (direction == NET_FILTER_DIRECTION_TX) {
+        QTAILQ_FOREACH(filter, &nc->filters, next) {
+            verdict = qemu_netfilter_receive(filter, direction, sender, flags,
+                                             iov, iovcnt, sent_cb);
+            if (verdict != 0) {
+                break;
+            }
+        }
+    } else {
+        QTAILQ_FOREACH_REVERSE(filter, &nc->filters, next) {
+            verdict = qemu_netfilter_receive(filter, direction, sender, flags,
+                                             iov, iovcnt, sent_cb);
+            if (verdict != 0) {
+                break;
+            }
+        }
+    }
+
+    return verdict;
+}
+
+/*
+ * Flat-buffer front end for filter_receive_iov(): wraps @data/@size in a
+ * single-element iovec and forwards everything else unchanged.
+ */
+static ssize_t filter_receive(NetClientState *nc,
+                              NetFilterDirection direction,
+                              NetClientState *sender,
+                              unsigned flags,
+                              const uint8_t *data,
+                              size_t size,
+                              NetPacketSent *sent_cb)
+{
+    struct iovec single;
+
+    single.iov_base = (void *)data;
+    single.iov_len = size;
+
+    return filter_receive_iov(nc, direction, sender, flags, &single, 1,
+                              sent_cb);
+}
+
+/*
+ * Purge the packets that @nc has queued on its peer's incoming queue.
+ * Does nothing when @nc has no peer.
+ */
+void qemu_purge_queued_packets(NetClientState *nc)
+{
+    if (nc->peer) {
+        qemu_net_queue_purge(nc->peer->incoming_queue, nc);
+    }
+}
+
+/*
+ * Re-enable reception on @nc and try to deliver what is queued for it.
+ *
+ * If the peer is a hub port the hub is flushed as well.  When the incoming
+ * queue could be flushed the main loop is notified; when it could not and
+ * @purge is true, the packets still queued by the peer are purged.
+ */
+void qemu_flush_or_purge_queued_packets(NetClientState *nc, bool purge)
+{
+ /* undo the back-pressure flag set by qemu_deliver_packet_iov() */
+ nc->receive_disabled = 0;
+
+ if (nc->peer && nc->peer->info->type == NET_CLIENT_DRIVER_HUBPORT) {
+ if (net_hub_flush(nc->peer)) {
+ qemu_notify_event();
+ }
+ }
+ if (qemu_net_queue_flush(nc->incoming_queue)) {
+ /* We emptied the queue successfully, signal to the IO thread to repoll
+ * the file descriptor (for tap, for example).
+ */
+ qemu_notify_event();
+ } else if (purge) {
+ /* Unable to empty the queue, purge remaining packets */
+ qemu_net_queue_purge(nc->incoming_queue, nc->peer);
+ }
+}
+
+/* Flush the packets queued for @nc without purging what cannot be sent. */
+void qemu_flush_queued_packets(NetClientState *nc)
+{
+    const bool purge = false;
+
+    qemu_flush_or_purge_queued_packets(nc, purge);
+}
+
+/*
+ * Send a flat buffer from @sender to its peer.
+ *
+ * If the sender's link is down or it has no peer, the packet is dropped and
+ * @size is returned.  Otherwise the packet goes through the sender's
+ * TX filters and then the peer's RX filters; a non-zero result from either
+ * stage is returned as is.  If both return 0 the packet is handed to the
+ * peer's incoming queue and the result of qemu_net_queue_send() is returned.
+ */
+static ssize_t qemu_send_packet_async_with_flags(NetClientState *sender,
+ unsigned flags,
+ const uint8_t *buf, int size,
+ NetPacketSent *sent_cb)
+{
+ NetQueue *queue;
+ int ret;
+
+#ifdef DEBUG_NET
+ printf("qemu_send_packet_async:\n");
+ qemu_hexdump(stdout, "net", buf, size);
+#endif
+
+ if (sender->link_down || !sender->peer) {
+ return size;
+ }
+
+ /* Let filters handle the packet first */
+ ret = filter_receive(sender, NET_FILTER_DIRECTION_TX,
+ sender, flags, buf, size, sent_cb);
+ if (ret) {
+ return ret;
+ }
+
+ ret = filter_receive(sender->peer, NET_FILTER_DIRECTION_RX,
+ sender, flags, buf, size, sent_cb);
+ if (ret) {
+ return ret;
+ }
+
+ queue = sender->peer->incoming_queue;
+
+ return qemu_net_queue_send(queue, sender, flags, buf, size, sent_cb);
+}
+
+/*
+ * Send @buf from @sender with QEMU_NET_PACKET_FLAG_NONE, passing @sent_cb
+ * through to the send path.
+ */
+ssize_t qemu_send_packet_async(NetClientState *sender,
+                               const uint8_t *buf, int size,
+                               NetPacketSent *sent_cb)
+{
+    const unsigned flags = QEMU_NET_PACKET_FLAG_NONE;
+
+    return qemu_send_packet_async_with_flags(sender, flags, buf, size,
+                                             sent_cb);
+}
+
+/* Send @buf from @nc without a completion callback. */
+ssize_t qemu_send_packet(NetClientState *nc, const uint8_t *buf, int size)
+{
+    ssize_t sent = qemu_send_packet_async(nc, buf, size, NULL);
+
+    return sent;
+}
+
+/*
+ * Feed @buf into @nc's own incoming queue.  Returns 0 without queuing when
+ * @nc cannot receive right now.
+ */
+ssize_t qemu_receive_packet(NetClientState *nc, const uint8_t *buf, int size)
+{
+    if (qemu_can_receive_packet(nc)) {
+        return qemu_net_queue_receive(nc->incoming_queue, buf, size);
+    }
+
+    return 0;
+}
+
+/*
+ * Scatter/gather variant of qemu_receive_packet(): feed @iov into @nc's own
+ * incoming queue, or return 0 when @nc cannot receive right now.
+ */
+ssize_t qemu_receive_packet_iov(NetClientState *nc, const struct iovec *iov,
+                                int iovcnt)
+{
+    if (qemu_can_receive_packet(nc)) {
+        return qemu_net_queue_receive_iov(nc->incoming_queue, iov, iovcnt);
+    }
+
+    return 0;
+}
+
+/*
+ * Send @buf from @nc with QEMU_NET_PACKET_FLAG_RAW and no completion
+ * callback.
+ */
+ssize_t qemu_send_packet_raw(NetClientState *nc, const uint8_t *buf, int size)
+{
+    const unsigned flags = QEMU_NET_PACKET_FLAG_RAW;
+
+    return qemu_send_packet_async_with_flags(nc, flags, buf, size, NULL);
+}
+
+/*
+ * Deliver an iovec to a client through its flat-buffer receive callbacks.
+ *
+ * A single-element iovec is passed through directly.  Anything else is
+ * copied into a temporary heap buffer first; vectors larger than
+ * NET_BUFSIZE are rejected with -1.  receive_raw is used when the RAW flag
+ * is set and the callback exists, otherwise receive.
+ */
+static ssize_t nc_sendv_compat(NetClientState *nc, const struct iovec *iov,
+                               int iovcnt, unsigned flags)
+{
+    uint8_t *linear = NULL;    /* heap copy, only used when iovcnt != 1 */
+    uint8_t *payload;
+    size_t len;
+    ssize_t ret;
+
+    if (iovcnt != 1) {
+        len = iov_size(iov, iovcnt);
+        if (len > NET_BUFSIZE) {
+            return -1;
+        }
+        linear = g_malloc(len);
+        payload = linear;
+        len = iov_to_buf(iov, iovcnt, 0, linear, len);
+    } else {
+        payload = iov[0].iov_base;
+        len = iov[0].iov_len;
+    }
+
+    if ((flags & QEMU_NET_PACKET_FLAG_RAW) && nc->info->receive_raw) {
+        ret = nc->info->receive_raw(nc, payload, len);
+    } else {
+        ret = nc->info->receive(nc, payload, len);
+    }
+
+    g_free(linear);
+    return ret;
+}
+
+/*
+ * Delivery callback installed on every client's incoming queue by
+ * qemu_net_client_setup(); @opaque is the receiving client.
+ *
+ * Returns the full packet size when the receiver's link is down (the packet
+ * is dropped), 0 when reception is disabled, otherwise the result of the
+ * receiver's receive_iov callback or of nc_sendv_compat().
+ */
+static ssize_t qemu_deliver_packet_iov(NetClientState *sender,
+ unsigned flags,
+ const struct iovec *iov,
+ int iovcnt,
+ void *opaque)
+{
+ NetClientState *nc = opaque;
+ int ret;
+
+
+ if (nc->link_down) {
+ return iov_size(iov, iovcnt);
+ }
+
+ if (nc->receive_disabled) {
+ return 0;
+ }
+
+ /* raw packets always go through nc_sendv_compat(), which handles the flag */
+ if (nc->info->receive_iov && !(flags & QEMU_NET_PACKET_FLAG_RAW)) {
+ ret = nc->info->receive_iov(nc, iov, iovcnt);
+ } else {
+ ret = nc_sendv_compat(nc, iov, iovcnt, flags);
+ }
+
+ /*
+ * A receiver that takes nothing is marked as not receiving; the flag is
+ * cleared again by qemu_flush_or_purge_queued_packets().
+ */
+ if (ret == 0) {
+ nc->receive_disabled = 1;
+ }
+
+ return ret;
+}
+
+/*
+ * Scatter/gather counterpart of qemu_send_packet_async().
+ *
+ * Packets larger than NET_BUFSIZE, and packets sent while the sender's link
+ * is down or it has no peer, are dropped and their size is returned.
+ * Otherwise the packet goes through the sender's TX filters and the peer's
+ * RX filters (a non-zero result from either is returned as is) and is then
+ * handed to the peer's incoming queue with QEMU_NET_PACKET_FLAG_NONE.
+ */
+ssize_t qemu_sendv_packet_async(NetClientState *sender,
+ const struct iovec *iov, int iovcnt,
+ NetPacketSent *sent_cb)
+{
+ NetQueue *queue;
+ size_t size = iov_size(iov, iovcnt);
+ int ret;
+
+ /* oversized packet: report it as sent so the caller does not retry */
+ if (size > NET_BUFSIZE) {
+ return size;
+ }
+
+ if (sender->link_down || !sender->peer) {
+ return size;
+ }
+
+ /* Let filters handle the packet first */
+ ret = filter_receive_iov(sender, NET_FILTER_DIRECTION_TX, sender,
+ QEMU_NET_PACKET_FLAG_NONE, iov, iovcnt, sent_cb);
+ if (ret) {
+ return ret;
+ }
+
+ ret = filter_receive_iov(sender->peer, NET_FILTER_DIRECTION_RX, sender,
+ QEMU_NET_PACKET_FLAG_NONE, iov, iovcnt, sent_cb);
+ if (ret) {
+ return ret;
+ }
+
+ queue = sender->peer->incoming_queue;
+
+ return qemu_net_queue_send_iov(queue, sender,
+ QEMU_NET_PACKET_FLAG_NONE,
+ iov, iovcnt, sent_cb);
+}
+
+/* Send an iovec from @nc without a completion callback. */
+ssize_t
+qemu_sendv_packet(NetClientState *nc, const struct iovec *iov, int iovcnt)
+{
+    ssize_t sent = qemu_sendv_packet_async(nc, iov, iovcnt, NULL);
+
+    return sent;
+}
+
+/*
+ * Return the first non-NIC client on net_clients whose name equals @id, or
+ * NULL if there is none.
+ */
+NetClientState *qemu_find_netdev(const char *id)
+{
+    NetClientState *it;
+
+    QTAILQ_FOREACH(it, &net_clients, next) {
+        if (it->info->type != NET_CLIENT_DRIVER_NIC &&
+            strcmp(it->name, id) == 0) {
+            return it;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Collect the clients whose driver type is not @type and whose name equals
+ * @id (every name matches when @id is NULL).
+ *
+ * At most @max matches are stored in @ncs, but the return value is the
+ * total number of matches, which may be larger than @max.
+ */
+int qemu_find_net_clients_except(const char *id, NetClientState **ncs,
+                                 NetClientDriver type, int max)
+{
+    NetClientState *it;
+    int found = 0;
+
+    QTAILQ_FOREACH(it, &net_clients, next) {
+        if (it->info->type == type) {
+            continue;
+        }
+        if (id && strcmp(it->name, id) != 0) {
+            continue;
+        }
+        if (found < max) {
+            ncs[found] = it;
+        }
+        found++;
+    }
+
+    return found;
+}
+
+/* Return the index of the first unused nd_table slot, or -1 if all are used. */
+static int nic_get_free_idx(void)
+{
+    int slot = 0;
+
+    while (slot < MAX_NICS && nd_table[slot].used) {
+        slot++;
+    }
+
+    return slot < MAX_NICS ? slot : -1;
+}
+
+/*
+ * If @arg is a help option, print the NULL-terminated @models list to
+ * stdout and return 1.  Otherwise (including @arg == NULL) print nothing
+ * and return 0.
+ */
+int qemu_show_nic_models(const char *arg, const char *const *models)
+{
+    const char *const *entry;
+
+    if (arg == NULL || !is_help_option(arg)) {
+        return 0;
+    }
+
+    printf("Supported NIC models:\n");
+    for (entry = models; *entry; entry++) {
+        printf("%s\n", *entry);
+    }
+    return 1;
+}
+
+/*
+ * Validate @nd against the single supported @model.  Exits with status 0
+ * after printing the model list if help was requested, and with status 1 if
+ * the requested model is not @model.
+ */
+void qemu_check_nic_model(NICInfo *nd, const char *model)
+{
+    const char *models[2] = { model, NULL };
+
+    if (qemu_show_nic_models(nd->model, models)) {
+        exit(0);
+    }
+    if (qemu_find_nic_model(nd, models, model) < 0) {
+        exit(1);
+    }
+}
+
+/*
+ * Look up nd->model in the NULL-terminated @models list.
+ *
+ * When nd->model is unset it is first set to a copy of @default_model.
+ * Returns the index of the matching entry, or -1 (after reporting an error)
+ * when the model is not in the list.
+ */
+int qemu_find_nic_model(NICInfo *nd, const char * const *models,
+                        const char *default_model)
+{
+    int pos = 0;
+
+    if (!nd->model) {
+        nd->model = g_strdup(default_model);
+    }
+
+    while (models[pos]) {
+        if (!strcmp(nd->model, models[pos])) {
+            return pos;
+        }
+        pos++;
+    }
+
+    error_report("Unsupported NIC model: %s", nd->model);
+    return -1;
+}
+
+/*
+ * Init function for the legacy "nic" net client type: fills a free slot of
+ * nd_table from the options in @netdev.
+ *
+ * @netdev: options; netdev->type must be NET_CLIENT_DRIVER_NIC
+ * @name:   name copied into the table entry
+ * @peer:   used as the NIC's netdev when the options carry no netdev=;
+ *          must not be NULL in that case
+ * @errp:   receives a description of the failure
+ *
+ * Returns the nd_table index on success, -1 on error.  On error the slot
+ * stays unused and every string duplicated into it has been released.
+ */
+static int net_init_nic(const Netdev *netdev, const char *name,
+                        NetClientState *peer, Error **errp)
+{
+    int idx;
+    NICInfo *nd;
+    const NetLegacyNicOptions *nic;
+
+    assert(netdev->type == NET_CLIENT_DRIVER_NIC);
+    nic = &netdev->u.nic;
+
+    idx = nic_get_free_idx();
+    if (idx == -1 || nb_nics >= MAX_NICS) {
+        error_setg(errp, "too many NICs");
+        return -1;
+    }
+
+    nd = &nd_table[idx];
+
+    memset(nd, 0, sizeof(*nd));
+
+    if (nic->has_netdev) {
+        nd->netdev = qemu_find_netdev(nic->netdev);
+        if (!nd->netdev) {
+            error_setg(errp, "netdev '%s' not found", nic->netdev);
+            return -1;
+        }
+    } else {
+        assert(peer);
+        nd->netdev = peer;
+    }
+    nd->name = g_strdup(name);
+    if (nic->has_model) {
+        nd->model = g_strdup(nic->model);
+    }
+    if (nic->has_addr) {
+        nd->devaddr = g_strdup(nic->addr);
+    }
+
+    if (nic->has_macaddr &&
+        net_parse_macaddr(nd->macaddr.a, nic->macaddr) < 0) {
+        error_setg(errp, "invalid syntax for ethernet address");
+        goto fail;
+    }
+    if (nic->has_macaddr &&
+        is_multicast_ether_addr(nd->macaddr.a)) {
+        error_setg(errp,
+                   "NIC cannot have multicast MAC address (odd 1st byte)");
+        goto fail;
+    }
+
+    /*
+     * Validate the vector count before touching the MAC allocation table so
+     * that a failure here leaves no MAC marked as used.
+     */
+    if (nic->has_vectors && nic->vectors > 0x7ffffff) {
+        error_setg(errp, "invalid # of vectors: %"PRIu32, nic->vectors);
+        goto fail;
+    }
+
+    qemu_macaddr_default_if_unset(&nd->macaddr);
+
+    if (nic->has_vectors) {
+        nd->nvectors = nic->vectors;
+    } else {
+        nd->nvectors = DEV_NVECTORS_UNSPECIFIED;
+    }
+
+    nd->used = 1;
+    nb_nics++;
+
+    return idx;
+
+fail:
+    /*
+     * The slot is not marked used, so the next caller would memset() it and
+     * lose these pointers; release the duplicated strings now.
+     */
+    g_free(nd->name);
+    g_free(nd->model);
+    g_free(nd->devaddr);
+    memset(nd, 0, sizeof(*nd));
+    return -1;
+}
+
+
+/*
+ * Init function for each net client driver type, indexed by
+ * NetClientDriver.  Entries guarded by a CONFIG_* macro are left NULL when
+ * that macro is not defined; net_client_init1() rejects types whose entry
+ * is NULL.
+ */
+static int (* const net_client_init_fun[NET_CLIENT_DRIVER__MAX])(
+ const Netdev *netdev,
+ const char *name,
+ NetClientState *peer, Error **errp) = {
+ [NET_CLIENT_DRIVER_NIC] = net_init_nic,
+#ifdef CONFIG_SLIRP
+ [NET_CLIENT_DRIVER_USER] = net_init_slirp,
+#endif
+ [NET_CLIENT_DRIVER_TAP] = net_init_tap,
+ [NET_CLIENT_DRIVER_SOCKET] = net_init_socket,
+#ifdef CONFIG_VDE
+ [NET_CLIENT_DRIVER_VDE] = net_init_vde,
+#endif
+#ifdef CONFIG_NETMAP
+ [NET_CLIENT_DRIVER_NETMAP] = net_init_netmap,
+#endif
+#ifdef CONFIG_NET_BRIDGE
+ [NET_CLIENT_DRIVER_BRIDGE] = net_init_bridge,
+#endif
+ [NET_CLIENT_DRIVER_HUBPORT] = net_init_hubport,
+#ifdef CONFIG_VHOST_NET_USER
+ [NET_CLIENT_DRIVER_VHOST_USER] = net_init_vhost_user,
+#endif
+#ifdef CONFIG_VHOST_NET_VDPA
+ [NET_CLIENT_DRIVER_VHOST_VDPA] = net_init_vhost_vdpa,
+#endif
+#ifdef CONFIG_L2TPV3
+ [NET_CLIENT_DRIVER_L2TPV3] = net_init_l2tpv3,
+#endif
+};
+
+
+/*
+ * Create the net client described by @netdev.
+ *
+ * With @is_netdev true the type must be a backend (not NIC) with an init
+ * function.  With @is_netdev false (legacy -net) type NONE is accepted as a
+ * no-op, HUBPORT is rejected, and the new client is attached to a port on
+ * hub 0 unless it is a NIC that names its own netdev.
+ *
+ * Returns 0 on success, -1 on error with @errp set.
+ */
+static int net_client_init1(const Netdev *netdev, bool is_netdev, Error **errp)
+{
+ NetClientState *peer = NULL;
+ NetClientState *nc;
+
+ if (is_netdev) {
+ if (netdev->type == NET_CLIENT_DRIVER_NIC ||
+ !net_client_init_fun[netdev->type]) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "type",
+ "a netdev backend type");
+ return -1;
+ }
+ } else {
+ if (netdev->type == NET_CLIENT_DRIVER_NONE) {
+ return 0; /* nothing to do */
+ }
+ if (netdev->type == NET_CLIENT_DRIVER_HUBPORT ||
+ !net_client_init_fun[netdev->type]) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "type",
+ "a net backend type (maybe it is not compiled "
+ "into this binary)");
+ return -1;
+ }
+
+ /* Do not add to a hub if it's a nic with a netdev= parameter. */
+ if (netdev->type != NET_CLIENT_DRIVER_NIC ||
+ !netdev->u.nic.has_netdev) {
+ peer = net_hub_add_port(0, NULL, NULL);
+ }
+ }
+
+ /* IDs are only checked against non-NIC clients (see qemu_find_netdev) */
+ nc = qemu_find_netdev(netdev->id);
+ if (nc) {
+ error_setg(errp, "Duplicate ID '%s'", netdev->id);
+ return -1;
+ }
+
+ if (net_client_init_fun[netdev->type](netdev, netdev->id, peer, errp) < 0) {
+ /* FIXME drop when all init functions store an Error */
+ if (errp && !*errp) {
+ error_setg(errp, "Device '%s' could not be initialized",
+ NetClientDriver_str(netdev->type));
+ }
+ return -1;
+ }
+
+ if (is_netdev) {
+ /* the init function registered the client under netdev->id */
+ nc = qemu_find_netdev(netdev->id);
+ assert(nc);
+ nc->is_netdev = true;
+ }
+
+ return 0;
+}
+
+/*
+ * Print the list of netdev backend types available in this binary.
+ *
+ * Each optional entry is guarded by the same CONFIG_* macro that guards the
+ * corresponding init function in net_client_init_fun[], so the list never
+ * advertises a backend that net_client_init1() would reject.
+ */
+void show_netdevs(void)
+{
+    int idx;
+    const char *available_netdevs[] = {
+        "socket",
+        "hubport",
+        "tap",
+#ifdef CONFIG_SLIRP
+        "user",
+#endif
+#ifdef CONFIG_L2TPV3
+        "l2tpv3",
+#endif
+#ifdef CONFIG_VDE
+        "vde",
+#endif
+#ifdef CONFIG_NET_BRIDGE
+        "bridge",
+#endif
+#ifdef CONFIG_NETMAP
+        "netmap",
+#endif
+#ifdef CONFIG_VHOST_NET_USER
+        "vhost-user",
+#endif
+#ifdef CONFIG_VHOST_NET_VDPA
+        "vhost-vdpa",
+#endif
+    };
+
+    qemu_printf("Available netdev backend types:\n");
+    for (idx = 0; idx < ARRAY_SIZE(available_netdevs); idx++) {
+        qemu_printf("%s\n", available_netdevs[idx]);
+    }
+}
+
+/*
+ * Build a Netdev from @opts and create the client with net_client_init1().
+ *
+ * Before the options are visited, the convenience option "ipv6-net" is
+ * split into "ipv6-prefix" and "ipv6-prefixlen", and legacy -net options
+ * without an ID get a generated one.
+ *
+ * Returns the result of net_client_init1(), or -1 if the options could not
+ * be parsed.
+ */
+static int net_client_init(QemuOpts *opts, bool is_netdev, Error **errp)
+{
+ gchar **substrings = NULL;
+ Netdev *object = NULL;
+ int ret = -1;
+ Visitor *v = opts_visitor_new(opts);
+
+ /* Parse convenience option format ipv6-net=fec0::0[/64] */
+ const char *ip6_net = qemu_opt_get(opts, "ipv6-net");
+
+ if (ip6_net) {
+ char *prefix_addr;
+ unsigned long prefix_len = 64; /* Default 64bit prefix length. */
+
+ substrings = g_strsplit(ip6_net, "/", 2);
+ if (!substrings || !substrings[0]) {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "ipv6-net",
+ "a valid IPv6 prefix");
+ goto out;
+ }
+
+ prefix_addr = substrings[0];
+
+ /* Handle user-specified prefix length. */
+ if (substrings[1] &&
+ qemu_strtoul(substrings[1], NULL, 10, &prefix_len))
+ {
+ error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
+ "ipv6-prefixlen", "a number");
+ goto out;
+ }
+
+ /* replace the combined option by the two options it stands for */
+ qemu_opt_set(opts, "ipv6-prefix", prefix_addr, &error_abort);
+ qemu_opt_set_number(opts, "ipv6-prefixlen", prefix_len,
+ &error_abort);
+ qemu_opt_unset(opts, "ipv6-net");
+ }
+
+ /* Create an ID for -net if the user did not specify one */
+ if (!is_netdev && !qemu_opts_id(opts)) {
+ qemu_opts_set_id(opts, id_generate(ID_NET));
+ }
+
+ if (visit_type_Netdev(v, NULL, &object, errp)) {
+ ret = net_client_init1(object, is_netdev, errp);
+ }
+
+ qapi_free_Netdev(object);
+
+out:
+ g_strfreev(substrings);
+ visit_free(v);
+ return ret;
+}
+
+/*
+ * Create a netdev backend from @opts.  Failures are reported through @errp
+ * only; the return value of net_client_init() is not used.
+ */
+void netdev_add(QemuOpts *opts, Error **errp)
+{
+    const bool is_netdev = true;
+
+    net_client_init(opts, is_netdev, errp);
+}
+
+/*
+ * Create a netdev backend from @netdev.  The ID must be a well-formed
+ * identifier, otherwise an error is set and nothing is created.
+ */
+void qmp_netdev_add(Netdev *netdev, Error **errp)
+{
+    if (id_wellformed(netdev->id)) {
+        net_client_init1(netdev, true, errp);
+        return;
+    }
+
+    error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "id", "an identifier");
+}
+
+/*
+ * Delete the netdev backend named @id.
+ *
+ * Fails with ERROR_CLASS_DEVICE_NOT_FOUND when no non-NIC client has that
+ * name, and with a generic error when the client was not created as a
+ * netdev (is_netdev unset).  Any QemuOpts registered under the same ID in
+ * the "netdev" group is removed as well.
+ */
+void qmp_netdev_del(const char *id, Error **errp)
+{
+ NetClientState *nc;
+ QemuOpts *opts;
+
+ nc = qemu_find_netdev(id);
+ if (!nc) {
+ error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
+ "Device '%s' not found", id);
+ return;
+ }
+
+ if (!nc->is_netdev) {
+ error_setg(errp, "Device '%s' is not a netdev", id);
+ return;
+ }
+
+ qemu_del_net_client(nc);
+
+ /*
+ * Wart: we need to delete the QemuOpts associated with netdevs
+ * created via CLI or HMP, to avoid bogus "Duplicate ID" errors in
+ * HMP netdev_add.
+ */
+ opts = qemu_opts_find(qemu_find_opts("netdev"), id);
+ if (opts) {
+ qemu_opts_del(opts);
+ }
+}
+
+/*
+ * Print every property of filter @nf except "type" to the monitor as
+ * ",name=value", followed by a newline.  Values are rendered with a string
+ * output visitor.
+ */
+static void netfilter_print_info(Monitor *mon, NetFilterState *nf)
+{
+ char *str;
+ ObjectProperty *prop;
+ ObjectPropertyIterator iter;
+ Visitor *v;
+
+ /* generate info str */
+ object_property_iter_init(&iter, OBJECT(nf));
+ while ((prop = object_property_iter_next(&iter))) {
+ if (!strcmp(prop->name, "type")) {
+ continue;
+ }
+ /* a fresh visitor per property; its output string is freed below */
+ v = string_output_visitor_new(false, &str);
+ object_property_get(OBJECT(nf), prop->name, v, NULL);
+ visit_complete(v, &str);
+ visit_free(v);
+ monitor_printf(mon, ",%s=%s", prop->name, str);
+ g_free(str);
+ }
+ monitor_printf(mon, "\n");
+}
+
+/*
+ * Print one line describing @nc (name, queue index, driver type, info
+ * string) to the monitor, followed by one line per attached filter.
+ */
+void print_net_client(Monitor *mon, NetClientState *nc)
+{
+    NetFilterState *filter;
+    const char *type_name = NetClientDriver_str(nc->info->type);
+
+    monitor_printf(mon, "%s: index=%d,type=%s,%s\n", nc->name,
+                   nc->queue_index, type_name, nc->info_str);
+
+    if (QTAILQ_EMPTY(&nc->filters)) {
+        return;
+    }
+
+    monitor_printf(mon, "filters:\n");
+    QTAILQ_FOREACH(filter, &nc->filters, next) {
+        monitor_printf(mon, " - %s: type=%s",
+                       object_get_canonical_path_component(OBJECT(filter)),
+                       object_get_typename(OBJECT(filter)));
+        netfilter_print_info(mon, filter);
+    }
+}
+
+/*
+ * Collect rx-filter information from NICs.
+ *
+ * Without @has_name, every NIC that implements query_rx_filter contributes
+ * one entry (taken from its queue 0).  With @has_name, only the client
+ * called @name is queried; an error is set when that client is not a NIC,
+ * does not support the query, or does not exist.
+ *
+ * Returns the list of results, or NULL when there are none.
+ */
+RxFilterInfoList *qmp_query_rx_filter(bool has_name, const char *name,
+ Error **errp)
+{
+ NetClientState *nc;
+ RxFilterInfoList *filter_list = NULL, **tail = &filter_list;
+
+ QTAILQ_FOREACH(nc, &net_clients, next) {
+ RxFilterInfo *info;
+
+ if (has_name && strcmp(nc->name, name) != 0) {
+ continue;
+ }
+
+ /* only query rx-filter information of NIC */
+ if (nc->info->type != NET_CLIENT_DRIVER_NIC) {
+ if (has_name) {
+ error_setg(errp, "net client(%s) isn't a NIC", name);
+ assert(!filter_list);
+ return NULL;
+ }
+ continue;
+ }
+
+ /* only query information on queue 0 since the info is per nic,
+ * not per queue
+ */
+ if (nc->queue_index != 0)
+ continue;
+
+ if (nc->info->query_rx_filter) {
+ info = nc->info->query_rx_filter(nc);
+ QAPI_LIST_APPEND(tail, info);
+ } else if (has_name) {
+ error_setg(errp, "net client(%s) doesn't support"
+ " rx-filter querying", name);
+ assert(!filter_list);
+ return NULL;
+ }
+
+ /* a named query is satisfied by the first matching NIC */
+ if (has_name) {
+ break;
+ }
+ }
+
+ if (filter_list == NULL && has_name) {
+ error_setg(errp, "invalid net client name: %s", name);
+ }
+
+ return filter_list;
+}
+
+/*
+ * Monitor command handler that prints the network state: first the hubs
+ * (net_hub_info), then every client for which net_hub_id_for_client() does
+ * not return 0.  NICs and peerless clients get their own line; a NIC's peer
+ * is printed right after it, prefixed with " \ ".
+ */
+void hmp_info_network(Monitor *mon, const QDict *qdict)
+{
+ NetClientState *nc, *peer;
+ NetClientDriver type;
+
+ net_hub_info(mon);
+
+ QTAILQ_FOREACH(nc, &net_clients, next) {
+ peer = nc->peer;
+ type = nc->info->type;
+
+ /* Skip if already printed in hub info */
+ if (net_hub_id_for_client(nc, NULL) == 0) {
+ continue;
+ }
+
+ if (!peer || type == NET_CLIENT_DRIVER_NIC) {
+ print_net_client(mon, nc);
+ } /* else it's a netdev connected to a NIC, printed with the NIC */
+ if (peer && type == NET_CLIENT_DRIVER_NIC) {
+ monitor_printf(mon, " \\ ");
+ print_net_client(mon, peer);
+ }
+ }
+}
+
+/*
+ * Deliver @event to every filter of every net client through the filter
+ * class's handle_event method.  Stops at the first filter that reports an
+ * error and propagates that error to @errp.
+ */
+void colo_notify_filters_event(int event, Error **errp)
+{
+    NetClientState *client;
+    NetFilterState *filter;
+    Error *local_err = NULL;
+
+    QTAILQ_FOREACH(client, &net_clients, next) {
+        QTAILQ_FOREACH(filter, &client->filters, next) {
+            NetFilterClass *klass = NETFILTER_GET_CLASS(OBJECT(filter));
+
+            klass->handle_event(filter, event, &local_err);
+            if (local_err != NULL) {
+                error_propagate(errp, local_err);
+                return;
+            }
+        }
+    }
+}
+
+/*
+ * Set the link state of every net client called @name to @up.
+ *
+ * The link_status_changed callback of the first matching client is invoked.
+ * If that client has a peer, the peers' link state is changed too when the
+ * peer is a NIC, and the peer's link_status_changed callback is invoked in
+ * either case.  Fails with ERROR_CLASS_DEVICE_NOT_FOUND when no client has
+ * that name.
+ */
+void qmp_set_link(const char *name, bool up, Error **errp)
+{
+ NetClientState *ncs[MAX_QUEUE_NUM];
+ NetClientState *nc;
+ int queues, i;
+
+ /*
+ * NET_CLIENT_DRIVER__MAX matches no client's type, so nothing is excluded.
+ * NOTE(review): the returned count can exceed the MAX_QUEUE_NUM entries
+ * stored in ncs[]; the loops below index ncs[] up to that count - confirm
+ * it cannot be larger.
+ */
+ queues = qemu_find_net_clients_except(name, ncs,
+ NET_CLIENT_DRIVER__MAX,
+ MAX_QUEUE_NUM);
+
+ if (queues == 0) {
+ error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
+ "Device '%s' not found", name);
+ return;
+ }
+ nc = ncs[0];
+
+ for (i = 0; i < queues; i++) {
+ ncs[i]->link_down = !up;
+ }
+
+ if (nc->info->link_status_changed) {
+ nc->info->link_status_changed(nc);
+ }
+
+ if (nc->peer) {
+ /* Change peer link only if the peer is NIC and then notify peer.
+ * If the peer is a HUBPORT or a backend, we do not change the
+ * link status.
+ *
+ * This behavior is compatible with qemu hubs where there could be
+ * multiple clients that can still communicate with each other in
+ * disconnected mode. For now maintain this compatibility.
+ */
+ if (nc->peer->info->type == NET_CLIENT_DRIVER_NIC) {
+ for (i = 0; i < queues; i++) {
+ ncs[i]->peer->link_down = !up;
+ }
+ }
+ if (nc->peer->info->link_status_changed) {
+ nc->peer->info->link_status_changed(nc->peer);
+ }
+ }
+}
+
+/*
+ * VM run-state callback (registered in net_init_clients()).
+ *
+ * When the VM starts running, queued packets of each client's peer are
+ * flushed, provided the client has a peer and can currently send.  When
+ * the VM stops, every client's queue is flushed or purged so that no
+ * state is modified later while the VM is not running.
+ */
+static void net_vm_change_state_handler(void *opaque, bool running,
+                                        RunState state)
+{
+    NetClientState *cur;
+    NetClientState *following;
+
+    QTAILQ_FOREACH_SAFE(cur, &net_clients, next, following) {
+        if (!running) {
+            /* Complete everything that is still queued. */
+            qemu_flush_or_purge_queued_packets(cur, true);
+            continue;
+        }
+
+        /* Flush queued packets and wake up backends. */
+        if (cur->peer && qemu_can_send_packet(cur)) {
+            qemu_flush_queued_packets(cur->peer);
+        }
+    }
+}
+
+/*
+ * Tear down the net layer: clean up the COLO compare module, delete every
+ * remaining net client and unregister the VM run-state handler.
+ */
+void net_cleanup(void)
+{
+    /* Clean up the colo compare module for COLO */
+    colo_compare_cleanup();
+
+    /*
+     * Deleting one client may remove several list entries, so even
+     * QTAILQ_FOREACH_SAFE() is not safe here; always restart from the
+     * current list head instead.
+     */
+    while (!QTAILQ_EMPTY(&net_clients)) {
+        NetClientState *head = QTAILQ_FIRST(&net_clients);
+
+        if (head->info->type != NET_CLIENT_DRIVER_NIC) {
+            qemu_del_net_client(head);
+            continue;
+        }
+        qemu_del_nic(qemu_get_nic(head));
+    }
+
+    qemu_del_vm_change_state_handler(net_change_state_entry);
+}
+
+/*
+ * Emit warnings about suspicious network configuration: clients without a
+ * peer, and nd_table entries that were requested but never instantiated.
+ * Hub-specific checks are delegated to net_hub_check_clients().
+ */
+void net_check_clients(void)
+{
+    NetClientState *client;
+    int slot;
+
+    net_hub_check_clients();
+
+    QTAILQ_FOREACH(client, &net_clients, next) {
+        const char *kind;
+
+        if (client->peer) {
+            continue;
+        }
+        kind = client->info->type == NET_CLIENT_DRIVER_NIC ? "nic" : "netdev";
+        warn_report("%s %s has no peer", kind, client->name);
+    }
+
+    /*
+     * Check that all NICs requested via -net nic actually got created.
+     * NICs created via -device need no check here because they are
+     * always instantiated.
+     */
+    for (slot = 0; slot < MAX_NICS; slot++) {
+        NICInfo *nd = &nd_table[slot];
+
+        if (!nd->used || nd->instantiated) {
+            continue;
+        }
+        warn_report("requested NIC (%s, model %s) "
+                    "was not created (not supported by this machine?)",
+                    nd->name ? nd->name : "anonymous",
+                    nd->model ? nd->model : "unspecified");
+    }
+}
+
+/*
+ * qemu_opts_foreach() callback for the "net" option group: create the
+ * client with the is_netdev argument of net_client_init() set to false.
+ */
+static int net_init_client(void *dummy, QemuOpts *opts, Error **errp)
+{
+    const bool is_netdev = false;
+
+    return net_client_init(opts, is_netdev, errp);
+}
+
+/*
+ * qemu_opts_foreach() callback for the "netdev" option group.
+ *
+ * When the "type" option is a help request, the list of netdevs is shown
+ * and the process exits with status 0.  Otherwise the client is created
+ * with the is_netdev argument of net_client_init() set to true.
+ */
+static int net_init_netdev(void *dummy, QemuOpts *opts, Error **errp)
+{
+    const char *kind = qemu_opt_get(opts, "type");
+    bool wants_help = kind != NULL && is_help_option(kind);
+
+    if (wants_help) {
+        show_netdevs();
+        exit(0);
+    }
+
+    return net_client_init(opts, true, errp);
+}
+
+/*
+ * qemu_opts_foreach() callback for the convenience "--nic" parameter.
+ *
+ * Claims a free nd_table slot, moves the "model" and "mac" options out of
+ * @opts into that slot, makes sure the options carry an ID and a type
+ * (default "user"), and finally creates the netdev through
+ * net_client_init().
+ *
+ * Returns 0 on success (including type "none", which does nothing) and a
+ * non-zero value with @errp set on failure.  On failure the slot is left
+ * unmarked (used stays false) and the model string taken from @opts is
+ * released again.
+ */
+static int net_param_nic(void *dummy, QemuOpts *opts, Error **errp)
+{
+    char *mac, *nd_id;
+    int idx, ret;
+    NICInfo *ni;
+    const char *type;
+
+    type = qemu_opt_get(opts, "type");
+    if (type && g_str_equal(type, "none")) {
+        return 0; /* Nothing to do, default_net is cleared in vl.c */
+    }
+
+    idx = nic_get_free_idx();
+    if (idx == -1 || nb_nics >= MAX_NICS) {
+        error_setg(errp, "no more on-board/default NIC slots available");
+        return -1;
+    }
+
+    if (!type) {
+        qemu_opt_set(opts, "type", "user", &error_abort);
+    }
+
+    ni = &nd_table[idx];
+    memset(ni, 0, sizeof(*ni));
+    ni->model = qemu_opt_get_del(opts, "model");
+
+    /* Create an ID if the user did not specify one */
+    nd_id = g_strdup(qemu_opts_id(opts));
+    if (!nd_id) {
+        nd_id = id_generate(ID_NET);
+        qemu_opts_set_id(opts, nd_id);
+    }
+
+    /* Handle MAC address */
+    mac = qemu_opt_get_del(opts, "mac");
+    if (mac) {
+        ret = net_parse_macaddr(ni->macaddr.a, mac);
+        g_free(mac);
+        if (ret) {
+            error_setg(errp, "invalid syntax for ethernet address");
+            goto out;
+        }
+        if (is_multicast_ether_addr(ni->macaddr.a)) {
+            error_setg(errp, "NIC cannot have multicast MAC address");
+            ret = -1;
+            goto out;
+        }
+    }
+    qemu_macaddr_default_if_unset(&ni->macaddr);
+
+    ret = net_client_init(opts, true, errp);
+    if (ret == 0) {
+        ni->netdev = qemu_find_netdev(nd_id);
+        ni->used = true;
+        nb_nics++;
+    }
+
+out:
+    if (ret) {
+        /*
+         * The slot was not marked as used, so nothing else will take
+         * ownership of the model string that was moved out of @opts.
+         * Free it here instead of leaking it.
+         */
+        g_free(ni->model);
+        ni->model = NULL;
+    }
+    g_free(nd_id);
+    return ret;
+}
+
+/*
+ * Initialise the net layer: register the VM run-state handler, reset the
+ * client list and create the clients described by the "netdev", "nic" and
+ * "net" option groups, in that order.
+ *
+ * Returns 0 on success, or -1 as soon as one option group fails (later
+ * groups are then not processed).
+ */
+int net_init_clients(Error **errp)
+{
+    net_change_state_entry =
+        qemu_add_vm_change_state_handler(net_vm_change_state_handler, NULL);
+
+    QTAILQ_INIT(&net_clients);
+
+    /* Short-circuit evaluation keeps the order and the early exit. */
+    if (qemu_opts_foreach(qemu_find_opts("netdev"),
+                          net_init_netdev, NULL, errp) ||
+        qemu_opts_foreach(qemu_find_opts("nic"),
+                          net_param_nic, NULL, errp) ||
+        qemu_opts_foreach(qemu_find_opts("net"),
+                          net_init_client, NULL, errp)) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Parse @optarg into @opts_list with qemu_opts_parse_noisily().
+ *
+ * Returns 0 when parsing succeeded and -1 otherwise.
+ */
+int net_client_parse(QemuOptsList *opts_list, const char *optarg)
+{
+    return qemu_opts_parse_noisily(opts_list, optarg, true) ? 0 : -1;
+}
+
+/* From FreeBSD */
+/* XXX: optimize */
+/*
+ * Bitwise CRC over @len bytes at @p using POLYNOMIAL_BE.
+ *
+ * The register starts at 0xffffffff and is shifted left; every input byte
+ * is consumed least-significant bit first.  The raw register value is
+ * returned without a final inversion.
+ */
+uint32_t net_crc32(const uint8_t *p, int len)
+{
+    uint32_t acc = 0xffffffff;
+    int pos;
+    int bit;
+
+    for (pos = 0; pos < len; pos++) {
+        uint8_t octet = p[pos];
+
+        for (bit = 0; bit < 8; bit++) {
+            int top = (acc & 0x80000000L) ? 1 : 0;
+            int in = (octet >> bit) & 0x01;
+
+            acc <<= 1;
+            if (top ^ in) {
+                acc = (acc ^ POLYNOMIAL_BE) | 1;
+            }
+        }
+    }
+
+    return acc;
+}
+
+/*
+ * Bitwise CRC over @len bytes at @p using POLYNOMIAL_LE.
+ *
+ * The register starts at 0xffffffff and is shifted right; every input
+ * byte is consumed least-significant bit first.  The raw register value
+ * is returned without a final inversion.
+ */
+uint32_t net_crc32_le(const uint8_t *p, int len)
+{
+    uint32_t acc = 0xffffffff;
+    int pos;
+    int bit;
+
+    for (pos = 0; pos < len; pos++) {
+        uint8_t octet = p[pos];
+
+        for (bit = 0; bit < 8; bit++) {
+            int low = acc & 0x1;
+            int in = (octet >> bit) & 0x01;
+
+            acc >>= 1;
+            if (low ^ in) {
+                acc ^= POLYNOMIAL_LE;
+            }
+        }
+    }
+
+    return acc;
+}
+
+QemuOptsList qemu_netdev_opts = { /* Option group named "netdev"; net_init_clients() walks the group of that name. */
+ .name = "netdev",
+ .implied_opt_name = "type", /* NOTE(review): presumably a leading bare value is read as "type" - confirm against QemuOpts. */
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_netdev_opts.head),
+ .desc = {
+ /*
+ * The descriptor list is empty on purpose:
+ * no elements => accept any params
+ * validation will happen later
+ */
+ { /* end of list */ }
+ },
+};
+
+QemuOptsList qemu_nic_opts = { /* Option group named "nic"; net_init_clients() feeds the group of that name to net_param_nic(). */
+ .name = "nic",
+ .implied_opt_name = "type", /* NOTE(review): presumably a leading bare value is read as "type" - confirm against QemuOpts. */
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_nic_opts.head),
+ .desc = {
+ /*
+ * The descriptor list is empty on purpose:
+ * no elements => accept any params
+ * validation will happen later
+ */
+ { /* end of list */ }
+ },
+};
+
+QemuOptsList qemu_net_opts = { /* Option group named "net"; net_init_clients() feeds the group of that name to net_init_client(). */
+ .name = "net",
+ .implied_opt_name = "type", /* NOTE(review): presumably a leading bare value is read as "type" - confirm against QemuOpts. */
+ .head = QTAILQ_HEAD_INITIALIZER(qemu_net_opts.head),
+ .desc = {
+ /*
+ * The descriptor list is empty on purpose:
+ * no elements => accept any params
+ * validation will happen later
+ */
+ { /* end of list */ }
+ },
+};
+
+/*
+ * Reset a SocketReadState to the start of a new packet.
+ *
+ * @rs:       state to initialise
+ * @finalize: callback that net_fill_rstate() invokes for each complete
+ *            packet
+ * @vnet_hdr: whether the stream carries a vnet header length field after
+ *            the packet length
+ *
+ * The reassembly buffer is zeroed and all counters are cleared.
+ */
+void net_socket_rs_init(SocketReadState *rs,
+                        SocketReadStateFinalize *finalize,
+                        bool vnet_hdr)
+{
+    memset(rs->buf, 0, sizeof(rs->buf));
+
+    /* Caller-provided configuration */
+    rs->finalize = finalize;
+    rs->vnet_hdr = vnet_hdr;
+
+    /* Parser position: state 0 means "getting length" */
+    rs->state = 0;
+    rs->index = 0;
+    rs->packet_len = 0;
+    rs->vnet_hdr_len = 0;
+}
+
+/*
+ * Collect one 4-byte network-order field into the start of rs->buf.
+ *
+ * Consumes as many bytes from *buf as are still missing (at most *size)
+ * and advances *buf, *size and rs->index accordingly.
+ *
+ * Returns true once all four bytes are present; the field is then stored
+ * in host byte order in *value and rs->index is reset to 0.  Returns
+ * false while the field is still incomplete.
+ */
+static bool net_rstate_fill_be32(SocketReadState *rs, const uint8_t **buf,
+                                 int *size, uint32_t *value)
+{
+    unsigned int l = 4 - rs->index;
+    uint32_t raw;
+
+    if (l > *size) {
+        l = *size;
+    }
+    memcpy(rs->buf + rs->index, *buf, l);
+    *buf += l;
+    *size -= l;
+    rs->index += l;
+    if (rs->index != 4) {
+        return false;
+    }
+
+    /*
+     * Copy the bytes out instead of dereferencing rs->buf through a
+     * uint32_t pointer: this avoids a type-punned and possibly misaligned
+     * load.
+     */
+    memcpy(&raw, rs->buf, sizeof(raw));
+    *value = ntohl(raw);
+    rs->index = 0;
+    return true;
+}
+
+/*
+ * Feed @size bytes from @buf into the packet reassembly state @rs.
+ *
+ * The stream consists of a 4-byte big-endian packet length, an optional
+ * 4-byte big-endian vnet header length (only when rs->vnet_hdr is set)
+ * and the packet data.  rs->finalize() is called for every completed
+ * packet; parsing then restarts with the next length field.
+ *
+ * Returns
+ * 0: success
+ * -1: error occurs (packet data would not fit into rs->buf; the state is
+ *     reset to "getting length")
+ */
+int net_fill_rstate(SocketReadState *rs, const uint8_t *buf, int size)
+{
+    unsigned int l;
+    uint32_t value;
+
+    while (size > 0) {
+        /* Reassemble a packet from the network.
+         * 0 = getting length.
+         * 1 = getting vnet header length.
+         * 2 = getting data.
+         */
+        switch (rs->state) {
+        case 0:
+            if (net_rstate_fill_be32(rs, &buf, &size, &value)) {
+                /* got length */
+                rs->packet_len = value;
+                if (rs->vnet_hdr) {
+                    rs->state = 1;
+                } else {
+                    rs->state = 2;
+                    rs->vnet_hdr_len = 0;
+                }
+            }
+            break;
+        case 1:
+            if (net_rstate_fill_be32(rs, &buf, &size, &value)) {
+                /* got vnet header length */
+                rs->vnet_hdr_len = value;
+                rs->state = 2;
+            }
+            break;
+        case 2:
+            l = rs->packet_len - rs->index;
+            if (l > size) {
+                l = size;
+            }
+            if (rs->index + l <= sizeof(rs->buf)) {
+                memcpy(rs->buf + rs->index, buf, l);
+            } else {
+                fprintf(stderr, "serious error: oversized packet received, "
+                        "connection terminated.\n");
+                rs->index = rs->state = 0;
+                return -1;
+            }
+
+            rs->index += l;
+            buf += l;
+            size -= l;
+            if (rs->index >= rs->packet_len) {
+                /* Reset before the callback so the state is ready for the next packet */
+                rs->index = 0;
+                rs->state = 0;
+                assert(rs->finalize);
+                rs->finalize(rs);
+            }
+            break;
+        }
+    }
+
+    assert(size == 0);
+    return 0;
+}
diff --git a/net/netmap.c b/net/netmap.c
new file mode 100644
index 000000000..9e0cec58d
--- /dev/null
+++ b/net/netmap.c
@@ -0,0 +1,431 @@
+/*
+ * netmap access for qemu
+ *
+ * Copyright (c) 2012-2013 Luigi Rizzo
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+#include "qemu/osdep.h"
+#include <sys/ioctl.h>
+#include <net/if.h>
+#define NETMAP_WITH_LIBS
+#include <net/netmap.h>
+#include <net/netmap_user.h>
+
+#include "net/net.h"
+#include "net/tap.h"
+#include "clients.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+#include "qemu/iov.h"
+#include "qemu/cutils.h"
+#include "qemu/main-loop.h"
+
+typedef struct NetmapState { /* State kept for one netmap backend client. */
+ NetClientState nc; /* Embedded generic client; DO_UPCAST() gets back to NetmapState from it. */
+ struct nm_desc *nmd; /* Descriptor returned by nm_open(); nmd->fd is polled and used for ioctl(). */
+ char ifname[IFNAMSIZ]; /* Copy of the interface name given in the netdev options. */
+ struct netmap_ring *tx; /* TX ring 0; filled by netmap_receive_iov(). */
+ struct netmap_ring *rx; /* RX ring 0; drained by netmap_send(). */
+ bool read_poll; /* True while netmap_send is installed as the fd read handler. */
+ bool write_poll; /* True while netmap_writable is installed as the fd write handler. */
+ struct iovec iov[IOV_MAX]; /* Scratch vector that netmap_send() fills with the slots of one packet. */
+ int vnet_hdr_len; /* Current virtio-net header length. */
+} NetmapState;
+
+#ifndef __FreeBSD__
+#define pkt_copy bcopy
+#else
+/* A fast copy routine only for multiples of 64 bytes, non overlapped.
+ *
+ * Argument order follows bcopy(): source first, then destination, then
+ * length.  Outside FreeBSD pkt_copy is simply an alias for bcopy().
+ *
+ * Lengths of 1024 bytes or more are handed to bcopy().  Shorter lengths
+ * are copied in chunks of eight 64-bit words, so a length that is not a
+ * multiple of 64 is effectively rounded up to the next multiple and both
+ * buffers must have room for that.
+ */
+static inline void
+pkt_copy(const void *_src, void *_dst, int l)
+{
+ const uint64_t *src = _src;
+ uint64_t *dst = _dst;
+ if (unlikely(l >= 1024)) {
+ bcopy(src, dst, l);
+ return;
+ }
+ for (; l > 0; l -= 64) { /* one iteration moves 8 * 8 = 64 bytes */
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ *dst++ = *src++;
+ }
+}
+#endif /* __FreeBSD__ */
+
+/*
+ * Open the netmap device named by nm_opts->ifname.  Only one queue is
+ * assumed (which is the case for the VALE bridge).
+ *
+ * Returns the descriptor from nm_open(), or NULL with @errp set (including
+ * the errno text) when nm_open() fails.
+ */
+static struct nm_desc *netmap_open(const NetdevNetmapOptions *nm_opts,
+                                   Error **errp)
+{
+    struct nmreq request;
+    struct nm_desc *desc;
+
+    memset(&request, 0, sizeof(request));
+
+    desc = nm_open(nm_opts->ifname, &request, NETMAP_NO_TX_POLL, NULL);
+    if (desc != NULL) {
+        return desc;
+    }
+
+    error_setg_errno(errp, errno, "Failed to nm_open() %s",
+                     nm_opts->ifname);
+    return NULL;
+}
+
+static void netmap_send(void *opaque);
+static void netmap_writable(void *opaque);
+
+/*
+ * (Re)install the event-loop handlers for the netmap fd so that they match
+ * the current read_poll / write_poll flags; a cleared flag unregisters the
+ * corresponding handler by passing NULL.
+ */
+static void netmap_update_fd_handler(NetmapState *s)
+{
+    void (*on_readable)(void *) = s->read_poll ? netmap_send : NULL;
+    void (*on_writable)(void *) = s->write_poll ? netmap_writable : NULL;
+
+    qemu_set_fd_handler(s->nmd->fd, on_readable, on_writable, s);
+}
+
+/* Enable or disable the read handler; nothing happens if already in that state. */
+static void netmap_read_poll(NetmapState *s, bool enable)
+{
+    if (s->read_poll == enable) {
+        return;
+    }
+
+    s->read_poll = enable;
+    netmap_update_fd_handler(s);
+}
+
+/* Enable or disable the write handler; nothing happens if already in that state. */
+static void netmap_write_poll(NetmapState *s, bool enable)
+{
+    if (s->write_poll == enable) {
+        return;
+    }
+
+    s->write_poll = enable;
+    netmap_update_fd_handler(s);
+}
+
+/*
+ * NetClientInfo.poll hook: switch both the read and the write handler on
+ * or off together.  The fd handlers are only re-registered when at least
+ * one of the two flags actually changes.
+ */
+static void netmap_poll(NetClientState *nc, bool enable)
+{
+    NetmapState *state = DO_UPCAST(NetmapState, nc, nc);
+
+    if (state->read_poll == enable && state->write_poll == enable) {
+        return;
+    }
+
+    state->read_poll = enable;
+    state->write_poll = enable;
+    netmap_update_fd_handler(state);
+}
+
+/*
+ * fd write handler, called once the fd is reported writable after a poll.
+ * It first unregisters itself and then flushes the packets that were
+ * queued for this client in the meantime.
+ */
+static void netmap_writable(void *opaque)
+{
+    NetmapState *state = opaque;
+
+    netmap_write_poll(state, false);
+    qemu_flush_queued_packets(&state->nc);
+}
+/*
+ * NetClientInfo.receive_iov hook: copy the packet described by
+ * iov[0..iovcnt) into slots of the TX ring (s->tx) and hand them over
+ * with NIOCTXSYNC.
+ *
+ * Every iovec fragment is split into pieces of at most ring->nr_buf_size
+ * bytes, one piece per slot.  All slots of the packet except the last one
+ * carry NS_MOREFRAG.
+ *
+ * Returns the total number of bytes in the iovec on success.  Returns 0
+ * when the ring does not offer enough slots; in that case no slot is
+ * published (ring->head is untouched) and the write handler is enabled so
+ * that netmap_writable() can flush the queued packets later.
+ */
+static ssize_t netmap_receive_iov(NetClientState *nc,
+ const struct iovec *iov, int iovcnt)
+{
+ NetmapState *s = DO_UPCAST(NetmapState, nc, nc);
+ struct netmap_ring *ring = s->tx;
+ unsigned int tail = ring->tail; /* snapshot taken once; used as the limit below */
+ ssize_t totlen = 0;
+ uint32_t last; /* index of the most recently filled slot */
+ uint32_t idx;
+ uint8_t *dst;
+ int j;
+ uint32_t i; /* next slot to fill */
+
+ last = i = ring->head;
+
+ if (nm_ring_space(ring) < iovcnt) {
+ /* Not enough netmap slots. Tell the kernel that we have seen the new
+ * available slots (so that it notifies us again when it has more
+ * ones), but without publishing any new slots to be processed
+ * (e.g., we don't advance ring->head). */
+ ring->cur = tail;
+ netmap_write_poll(s, true);
+ return 0;
+ }
+
+ for (j = 0; j < iovcnt; j++) {
+ int iov_frag_size = iov[j].iov_len; /* bytes of this fragment still to copy */
+ int offset = 0; /* bytes of this fragment already copied */
+ int nm_frag_size;
+
+ totlen += iov_frag_size;
+
+ /* Split each iovec fragment over more netmap slots, if
+ necessary. */
+ while (iov_frag_size) {
+ nm_frag_size = MIN(iov_frag_size, ring->nr_buf_size);
+
+ if (unlikely(i == tail)) {
+ /* We ran out of netmap slots while splitting the
+ iovec fragments. The slots filled so far are not published
+ because ring->head has not been advanced. */
+ ring->cur = tail;
+ netmap_write_poll(s, true);
+ return 0;
+ }
+
+ idx = ring->slot[i].buf_idx;
+ dst = (uint8_t *)NETMAP_BUF(ring, idx);
+
+ ring->slot[i].len = nm_frag_size;
+ ring->slot[i].flags = NS_MOREFRAG;
+ pkt_copy(iov[j].iov_base + offset, dst, nm_frag_size);
+
+ last = i;
+ i = nm_ring_next(ring, i);
+
+ offset += nm_frag_size;
+ iov_frag_size -= nm_frag_size;
+ }
+ }
+ /* The last slot must not have NS_MOREFRAG set. */
+ ring->slot[last].flags &= ~NS_MOREFRAG;
+
+ /* Now update ring->head and ring->cur to publish the new slots and
+ * the new wakeup point. */
+ ring->head = ring->cur = i;
+
+ ioctl(s->nmd->fd, NIOCTXSYNC, NULL); /* NOTE(review): the ioctl result is not checked */
+
+ return totlen;
+}
+
+/*
+ * NetClientInfo.receive hook: wrap the flat buffer in a one-element iovec
+ * and let netmap_receive_iov() do the work.
+ */
+static ssize_t netmap_receive(NetClientState *nc,
+                              const uint8_t *buf, size_t size)
+{
+    struct iovec single = {
+        .iov_base = (void *)buf,
+        .iov_len = size,
+    };
+
+    return netmap_receive_iov(nc, &single, 1);
+}
+
+/*
+ * Completion callback for a send (backend --> guest) that had been
+ * queued: re-enable the fd read callback.
+ */
+static void netmap_send_completed(NetClientState *nc, ssize_t len)
+{
+    NetmapState *state = DO_UPCAST(NetmapState, nc, nc);
+
+    netmap_read_poll(state, true);
+}
+/*
+ * fd read handler: forward packets from the RX ring (s->rx) to the peer.
+ *
+ * Each packet may span several slots chained with NS_MOREFRAG; the slots
+ * are gathered into s->iov and passed to qemu_sendv_packet_async().
+ * When that call returns 0 the read handler is disabled until
+ * netmap_send_completed() runs.
+ */
+static void netmap_send(void *opaque)
+{
+ NetmapState *s = opaque;
+ struct netmap_ring *ring = s->rx;
+ unsigned int tail = ring->tail; /* snapshot taken once; slots past it are not examined */
+
+ /* Keep sending while there are available slots in the netmap
+ RX ring and the forwarding path towards the peer is open. */
+ while (ring->head != tail) {
+ uint32_t i = ring->head;
+ uint32_t idx;
+ bool morefrag;
+ int iovcnt = 0;
+ int iovsize;
+
+ /* Get a (possibly multi-slot) packet.
+ NOTE(review): iovcnt is not checked against the size of s->iov
+ (IOV_MAX entries) in this loop - confirm a packet cannot span more
+ slots than that. */
+ do {
+ idx = ring->slot[i].buf_idx;
+ morefrag = (ring->slot[i].flags & NS_MOREFRAG);
+ s->iov[iovcnt].iov_base = (void *)NETMAP_BUF(ring, idx);
+ s->iov[iovcnt].iov_len = ring->slot[i].len;
+ iovcnt++;
+ i = nm_ring_next(ring, i);
+ } while (i != tail && morefrag);
+
+ /* Advance ring->cur to tell the kernel that we have seen the slots. */
+ ring->cur = i;
+
+ if (unlikely(morefrag)) {
+ /* This is a truncated packet, so we can stop without releasing the
+ * incomplete slots by updating ring->head. We will hopefully
+ * re-read the complete packet the next time we are called. */
+ break;
+ }
+
+ iovsize = qemu_sendv_packet_async(&s->nc, s->iov, iovcnt,
+ netmap_send_completed);
+
+ /* Release the slots to the kernel. */
+ ring->head = i;
+
+ if (iovsize == 0) {
+ /* The peer does not receive anymore. Packet is queued, stop
+ * reading from the backend until netmap_send_completed(). */
+ netmap_read_poll(s, false);
+ break;
+ }
+ }
+}
+
+/*
+ * NetClientInfo.cleanup hook: purge the packets still queued for this
+ * client, unregister both fd handlers and close the netmap descriptor.
+ */
+static void netmap_cleanup(NetClientState *nc)
+{
+    NetmapState *state = DO_UPCAST(NetmapState, nc, nc);
+
+    qemu_purge_queued_packets(nc);
+
+    /* Handlers must go before the fd they refer to is closed. */
+    netmap_poll(nc, false);
+
+    nm_close(state->nmd);
+    state->nmd = NULL;
+}
+
+/* Offloading manipulation support callbacks. */
+/*
+ * Ask netmap to use a virtio-net header of @len bytes on the adapter
+ * associated with s->ifname, by issuing a NETMAP_BDG_VNET_HDR command
+ * through the NIOCREGIF ioctl.
+ *
+ * Returns the ioctl() result.
+ */
+static int netmap_fd_set_vnet_hdr_len(NetmapState *s, int len)
+{
+    struct nmreq request;
+
+    memset(&request, 0, sizeof(request));
+    request.nr_version = NETMAP_API;
+    request.nr_cmd = NETMAP_BDG_VNET_HDR;
+    request.nr_arg1 = len;
+    pstrcpy(request.nr_name, sizeof(request.nr_name), s->ifname);
+
+    return ioctl(s->nmd->fd, NIOCREGIF, &request);
+}
+
+/*
+ * NetClientInfo.has_vnet_hdr_len hook: probe whether a virtio-net header
+ * of @len bytes can be configured.
+ *
+ * The length is set for real and then the previously recorded length is
+ * put back.  Returns false when the probe fails.  A failure to restore
+ * the previous length is reported and aborts the process.
+ */
+static bool netmap_has_vnet_hdr_len(NetClientState *nc, int len)
+{
+    NetmapState *state = DO_UPCAST(NetmapState, nc, nc);
+    int saved_len = state->vnet_hdr_len;
+    int rc;
+
+    /* Probe: can the requested length be set? */
+    rc = netmap_fd_set_vnet_hdr_len(state, len);
+    if (rc != 0) {
+        return false;
+    }
+
+    /* Undo the probe. */
+    rc = netmap_fd_set_vnet_hdr_len(state, saved_len);
+    if (rc != 0) {
+        error_report("Failed to restore vnet-hdr length %d on %s: %s",
+                     saved_len, state->ifname, strerror(errno));
+        abort();
+    }
+
+    return true;
+}
+
+/*
+ * Report whether the interface accepts the basic virtio-net header.
+ * A netmap interface that supports virtio-net headers always supports
+ * UFO, so this callback doubles as the has_ufo hook.
+ */
+static bool netmap_has_vnet_hdr(NetClientState *nc)
+{
+    const int basic_hdr_len = sizeof(struct virtio_net_hdr);
+
+    return netmap_has_vnet_hdr_len(nc, basic_hdr_len);
+}
+
+static void netmap_using_vnet_hdr(NetClientState *nc, bool enable)
+{ /* No-op: the body is empty, so this hook takes no action for either value of enable. */
+}
+
+/*
+ * NetClientInfo.set_vnet_hdr_len hook: configure a virtio-net header of
+ * @len bytes.  The new length is only recorded in the state when the
+ * request succeeds; a failure is reported and leaves the state untouched.
+ */
+static void netmap_set_vnet_hdr_len(NetClientState *nc, int len)
+{
+    NetmapState *state = DO_UPCAST(NetmapState, nc, nc);
+
+    if (netmap_fd_set_vnet_hdr_len(state, len)) {
+        error_report("Unable to set vnet-hdr length %d on %s: %s",
+                     len, state->ifname, strerror(errno));
+        return;
+    }
+
+    /* Keep track of the current length. */
+    state->vnet_hdr_len = len;
+}
+
+/*
+ * NetClientInfo.set_offload hook.  The individual offload flags are not
+ * looked at: setting a virtio-net header length greater than zero enables
+ * the offloadings automatically, so a header length is configured if none
+ * is in effect yet.
+ */
+static void netmap_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
+                               int ecn, int ufo)
+{
+    NetmapState *state = DO_UPCAST(NetmapState, nc, nc);
+
+    if (state->vnet_hdr_len) {
+        return;
+    }
+
+    netmap_set_vnet_hdr_len(nc, sizeof(struct virtio_net_hdr));
+}
+
+/* NetClientInfo methods: the callback table handed to
+ * qemu_new_net_client() in net_init_netmap(). */
+static NetClientInfo net_netmap_info = {
+ .type = NET_CLIENT_DRIVER_NETMAP,
+ .size = sizeof(NetmapState), /* size of the whole per-client state, not only the embedded NetClientState */
+ .receive = netmap_receive,
+ .receive_iov = netmap_receive_iov,
+ .poll = netmap_poll,
+ .cleanup = netmap_cleanup,
+ .has_ufo = netmap_has_vnet_hdr, /* same probe as has_vnet_hdr */
+ .has_vnet_hdr = netmap_has_vnet_hdr,
+ .has_vnet_hdr_len = netmap_has_vnet_hdr_len,
+ .using_vnet_hdr = netmap_using_vnet_hdr, /* empty function */
+ .set_offload = netmap_set_offload,
+ .set_vnet_hdr_len = netmap_set_vnet_hdr_len,
+};
+
+/*
+ * The exported init function
+ *
+ * ... -net netmap,ifname="..."
+ *
+ * Opens the netmap device named in the netdev options, creates the net
+ * client and binds it to TX/RX ring 0 of the device.  Only the read
+ * handler is enabled initially.
+ *
+ * Returns 0 on success, -1 with @errp set when the device cannot be
+ * opened.
+ */
+int net_init_netmap(const Netdev *netdev,
+                    const char *name, NetClientState *peer, Error **errp)
+{
+    const NetdevNetmapOptions *opts = &netdev->u.netmap;
+    Error *local_err = NULL;
+    struct nm_desc *desc;
+    NetClientState *client;
+    NetmapState *state;
+
+    desc = netmap_open(opts, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return -1;
+    }
+
+    /* Create the object. */
+    client = qemu_new_net_client(&net_netmap_info, peer, "netmap", name);
+    state = DO_UPCAST(NetmapState, nc, client);
+
+    state->nmd = desc;
+    state->vnet_hdr_len = 0;
+    state->tx = NETMAP_TXRING(desc->nifp, 0);
+    state->rx = NETMAP_RXRING(desc->nifp, 0);
+    pstrcpy(state->ifname, sizeof(state->ifname), opts->ifname);
+
+    /* Initially only poll for reads. */
+    netmap_read_poll(state, true);
+
+    return 0;
+}
+
diff --git a/net/queue.c b/net/queue.c
new file mode 100644
index 000000000..c872d51df
--- /dev/null
+++ b/net/queue.c
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2009 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "net/queue.h"
+#include "qemu/queue.h"
+#include "net/net.h"
+
+/* The delivery handler may only return zero if it will call
+ * qemu_net_queue_flush() when it determines that it is once again able
+ * to deliver packets. It must also call qemu_net_queue_purge() in its
+ * cleanup path.
+ *
+ * If a sent callback is provided to send(), the caller must handle a
+ * zero return from the delivery handler by not sending any more packets
+ * until we have invoked the callback. Only in that case will we queue
+ * the packet.
+ *
+ * If a sent callback isn't provided, we just drop the packet to avoid
+ * unbounded queueing.
+ */
+
+struct NetPacket { /* One packet waiting in a NetQueue. */
+ QTAILQ_ENTRY(NetPacket) entry; /* linkage in NetQueue.packets */
+ NetClientState *sender; /* passed back to the deliver function and to sent_cb */
+ unsigned flags; /* passed unchanged to the deliver function */
+ int size; /* number of valid bytes in data[] */
+ NetPacketSent *sent_cb; /* optional; called after delivery, or with 0 on purge */
+ uint8_t data[]; /* packet bytes, allocated together with the header */
+};
+
+struct NetQueue { /* FIFO of packets that could not be delivered immediately. */
+ void *opaque; /* passed as last argument to deliver */
+ uint32_t nq_maxlen; /* once nq_count reaches this, packets without sent_cb are dropped */
+ uint32_t nq_count; /* number of packets currently in the list */
+ NetQueueDeliverFunc *deliver; /* callback that performs the actual delivery */
+
+ QTAILQ_HEAD(, NetPacket) packets;
+
+ unsigned delivering : 1; /* set while deliver is running; guards against re-entry */
+};
+
+/*
+ * Allocate an empty packet queue.
+ *
+ * @deliver: function used to deliver packets
+ * @opaque:  value handed to @deliver on every call
+ *
+ * The queue starts with a limit of 10000 packets (the limit only applies
+ * to packets queued without a sent callback).
+ */
+NetQueue *qemu_new_net_queue(NetQueueDeliverFunc *deliver, void *opaque)
+{
+    NetQueue *q = g_new0(NetQueue, 1);
+
+    QTAILQ_INIT(&q->packets);
+
+    q->deliver = deliver;
+    q->opaque = opaque;
+
+    q->nq_maxlen = 10000;
+    q->nq_count = 0;
+    q->delivering = 0;
+
+    return q;
+}
+
+/*
+ * Free a queue together with every packet still stored in it.  The sent
+ * callbacks of those packets are not invoked.
+ */
+void qemu_del_net_queue(NetQueue *queue)
+{
+    while (!QTAILQ_EMPTY(&queue->packets)) {
+        NetPacket *head = QTAILQ_FIRST(&queue->packets);
+
+        QTAILQ_REMOVE(&queue->packets, head, entry);
+        g_free(head);
+    }
+
+    g_free(queue);
+}
+
+/*
+ * Store a copy of a flat packet at the tail of the queue.
+ *
+ * A packet without a sent callback is silently dropped once the queue
+ * has reached its maximum length; packets with a callback are always
+ * queued.
+ */
+static void qemu_net_queue_append(NetQueue *queue,
+                                  NetClientState *sender,
+                                  unsigned flags,
+                                  const uint8_t *buf,
+                                  size_t size,
+                                  NetPacketSent *sent_cb)
+{
+    bool full = queue->nq_count >= queue->nq_maxlen;
+    NetPacket *pkt;
+
+    if (full && !sent_cb) {
+        return; /* drop if queue full and no callback */
+    }
+
+    pkt = g_malloc(sizeof(NetPacket) + size);
+    memcpy(pkt->data, buf, size);
+    pkt->size = size;
+    pkt->flags = flags;
+    pkt->sender = sender;
+    pkt->sent_cb = sent_cb;
+
+    QTAILQ_INSERT_TAIL(&queue->packets, pkt, entry);
+    queue->nq_count++;
+}
+
+/*
+ * Store a copy of a scattered packet at the tail of the queue.  The
+ * fragments are linearised into one contiguous buffer.
+ *
+ * A packet without a sent callback is silently dropped once the queue
+ * has reached its maximum length; packets with a callback are always
+ * queued.
+ */
+void qemu_net_queue_append_iov(NetQueue *queue,
+                               NetClientState *sender,
+                               unsigned flags,
+                               const struct iovec *iov,
+                               int iovcnt,
+                               NetPacketSent *sent_cb)
+{
+    NetPacket *pkt;
+    size_t total = 0;
+    int filled = 0;
+    int n;
+
+    if (queue->nq_count >= queue->nq_maxlen && !sent_cb) {
+        return; /* drop if queue full and no callback */
+    }
+
+    /* First pass: how much room is needed? */
+    for (n = 0; n < iovcnt; n++) {
+        total += iov[n].iov_len;
+    }
+
+    pkt = g_malloc(sizeof(NetPacket) + total);
+    pkt->flags = flags;
+    pkt->sender = sender;
+    pkt->sent_cb = sent_cb;
+
+    /* Second pass: copy the fragments back to back. */
+    for (n = 0; n < iovcnt; n++) {
+        size_t len = iov[n].iov_len;
+
+        memcpy(pkt->data + filled, iov[n].iov_base, len);
+        filled += len;
+    }
+    pkt->size = filled;
+
+    QTAILQ_INSERT_TAIL(&queue->packets, pkt, entry);
+    queue->nq_count++;
+}
+
+/*
+ * Deliver a flat packet through the queue's deliver function, wrapping
+ * the buffer in a one-element iovec.  The delivering flag is set for the
+ * duration of the call.
+ *
+ * Returns whatever the deliver function returns.
+ */
+static ssize_t qemu_net_queue_deliver(NetQueue *queue,
+                                      NetClientState *sender,
+                                      unsigned flags,
+                                      const uint8_t *data,
+                                      size_t size)
+{
+    struct iovec single;
+    ssize_t delivered;
+
+    single.iov_base = (void *)data;
+    single.iov_len = size;
+
+    queue->delivering = 1;
+    delivered = queue->deliver(sender, flags, &single, 1, queue->opaque);
+    queue->delivering = 0;
+
+    return delivered;
+}
+
+/*
+ * Deliver a scattered packet through the queue's deliver function.  The
+ * delivering flag is set for the duration of the call.
+ *
+ * Returns whatever the deliver function returns.
+ */
+static ssize_t qemu_net_queue_deliver_iov(NetQueue *queue,
+                                          NetClientState *sender,
+                                          unsigned flags,
+                                          const struct iovec *iov,
+                                          int iovcnt)
+{
+    ssize_t delivered;
+
+    queue->delivering = 1;
+    delivered = queue->deliver(sender, flags, iov, iovcnt, queue->opaque);
+    queue->delivering = 0;
+
+    return delivered;
+}
+
+/*
+ * Deliver a flat packet with no sender and no flags, bypassing the packet
+ * list.  Returns 0 without delivering when a delivery is already in
+ * progress on this queue.
+ */
+ssize_t qemu_net_queue_receive(NetQueue *queue,
+                               const uint8_t *data,
+                               size_t size)
+{
+    return queue->delivering
+           ? 0
+           : qemu_net_queue_deliver(queue, NULL, 0, data, size);
+}
+
+/*
+ * Deliver a scattered packet with no sender and no flags, bypassing the
+ * packet list.  Returns 0 without delivering when a delivery is already
+ * in progress on this queue.
+ */
+ssize_t qemu_net_queue_receive_iov(NetQueue *queue,
+                                   const struct iovec *iov,
+                                   int iovcnt)
+{
+    return queue->delivering
+           ? 0
+           : qemu_net_queue_deliver_iov(queue, NULL, 0, iov, iovcnt);
+}
+
+/*
+ * Send a flat packet through the queue.
+ *
+ * The packet is appended to the queue and 0 is returned when a delivery
+ * is already in progress, when the sender cannot send right now, or when
+ * the deliver function returns 0.  Otherwise the queue is flushed and the
+ * deliver function's result is returned.
+ */
+ssize_t qemu_net_queue_send(NetQueue *queue,
+                            NetClientState *sender,
+                            unsigned flags,
+                            const uint8_t *data,
+                            size_t size,
+                            NetPacketSent *sent_cb)
+{
+    bool blocked = queue->delivering || !qemu_can_send_packet(sender);
+    ssize_t ret = 0;
+
+    if (!blocked) {
+        ret = qemu_net_queue_deliver(queue, sender, flags, data, size);
+    }
+
+    /* ret is 0 either because delivery was not attempted or not accepted */
+    if (ret == 0) {
+        qemu_net_queue_append(queue, sender, flags, data, size, sent_cb);
+        return 0;
+    }
+
+    qemu_net_queue_flush(queue);
+
+    return ret;
+}
+
+/*
+ * Send a scattered packet through the queue.
+ *
+ * The packet is appended to the queue and 0 is returned when a delivery
+ * is already in progress, when the sender cannot send right now, or when
+ * the deliver function returns 0.  Otherwise the queue is flushed and the
+ * deliver function's result is returned.
+ */
+ssize_t qemu_net_queue_send_iov(NetQueue *queue,
+                                NetClientState *sender,
+                                unsigned flags,
+                                const struct iovec *iov,
+                                int iovcnt,
+                                NetPacketSent *sent_cb)
+{
+    bool blocked = queue->delivering || !qemu_can_send_packet(sender);
+    ssize_t ret = 0;
+
+    if (!blocked) {
+        ret = qemu_net_queue_deliver_iov(queue, sender, flags, iov, iovcnt);
+    }
+
+    /* ret is 0 either because delivery was not attempted or not accepted */
+    if (ret == 0) {
+        qemu_net_queue_append_iov(queue, sender, flags, iov, iovcnt, sent_cb);
+        return 0;
+    }
+
+    qemu_net_queue_flush(queue);
+
+    return ret;
+}
+
+/*
+ * Drop every queued packet whose sender is @from.  For each dropped
+ * packet that has a sent callback, the callback is invoked with a length
+ * of 0.
+ */
+void qemu_net_queue_purge(NetQueue *queue, NetClientState *from)
+{
+    NetPacket *pkt;
+    NetPacket *following;
+
+    QTAILQ_FOREACH_SAFE(pkt, &queue->packets, entry, following) {
+        if (pkt->sender != from) {
+            continue;
+        }
+
+        QTAILQ_REMOVE(&queue->packets, pkt, entry);
+        queue->nq_count--;
+        if (pkt->sent_cb) {
+            pkt->sent_cb(pkt->sender, 0);
+        }
+        g_free(pkt);
+    }
+}
+
+/*
+ * Try to deliver all queued packets in order.
+ *
+ * Returns true when the queue has been drained.  Returns false when a
+ * delivery is already in progress, or when the deliver function returns 0
+ * for a packet; that packet is put back at the head of the queue.
+ */
+bool qemu_net_queue_flush(NetQueue *queue)
+{
+    if (queue->delivering) {
+        return false;
+    }
+
+    while (!QTAILQ_EMPTY(&queue->packets)) {
+        NetPacket *head = QTAILQ_FIRST(&queue->packets);
+        int ret;
+
+        /* Take the packet off the list before attempting delivery. */
+        QTAILQ_REMOVE(&queue->packets, head, entry);
+        queue->nq_count--;
+
+        ret = qemu_net_queue_deliver(queue, head->sender, head->flags,
+                                     head->data, head->size);
+        if (ret == 0) {
+            /* Not accepted: restore it as the first packet and stop. */
+            queue->nq_count++;
+            QTAILQ_INSERT_HEAD(&queue->packets, head, entry);
+            return false;
+        }
+
+        if (head->sent_cb) {
+            head->sent_cb(head->sender, ret);
+        }
+
+        g_free(head);
+    }
+
+    return true;
+}
diff --git a/net/slirp.c b/net/slirp.c
new file mode 100644
index 000000000..ad3a838e0
--- /dev/null
+++ b/net/slirp.c
@@ -0,0 +1,1124 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/log.h"
+#include "net/slirp.h"
+
+
+#if defined(CONFIG_SLIRP_SMBD)
+#include <pwd.h>
+#include <sys/wait.h>
+#endif
+#include "net/eth.h"
+#include "net/net.h"
+#include "clients.h"
+#include "hub.h"
+#include "monitor/monitor.h"
+#include "qemu/error-report.h"
+#include "qemu/sockets.h"
+#include <libslirp.h>
+#include "chardev/char-fe.h"
+#include "sysemu/sysemu.h"
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
+#include "util.h"
+#include "migration/register.h"
+#include "migration/qemu-file-types.h"
+
/*
 * Split off the leading field of *pp, up to the first occurrence of @sep.
 *
 * On success the field (truncated to fit, always NUL-terminated) is copied
 * into @buf when @buf_size is positive, *pp is advanced to the character
 * right after the separator, and 0 is returned.
 *
 * Returns -1, leaving *pp and @buf untouched, when *pp is NULL or does not
 * contain @sep.
 */
static int get_str_sep(char *buf, int buf_size, const char **pp, int sep)
{
    const char *start = *pp;
    const char *sep_pos;

    if (start == NULL) {
        return -1;
    }

    sep_pos = strchr(start, sep);
    if (sep_pos == NULL) {
        return -1;
    }

    if (buf_size > 0) {
        /* Keep the length unsigned so very long fields cannot go negative. */
        size_t len = (size_t)(sep_pos - start);
        size_t room = (size_t)buf_size - 1;

        if (len > room) {
            len = room;
        }
        memcpy(buf, start, len);
        buf[len] = '\0';
    }

    *pp = sep_pos + 1;
    return 0;
}
+
+/* slirp network adapter */
+
+#define SLIRP_CFG_HOSTFWD 1 /* flag bit: the rule string is a host forwarding rule */
+
+struct slirp_config_str { /* one pending forwarding rule, kept on a singly linked list */
+ struct slirp_config_str *next; /* next rule in the list, NULL at the end */
+ int flags; /* SLIRP_CFG_HOSTFWD for hostfwd rules; guestfwd rules are stored with 0 */
+ char str[1024]; /* rule text, filled by pstrcpy() bounded by this size */
+};
+
+struct GuestFwd { /* state of one chardev-backed guest forwarding rule */
+ CharBackend hd; /* chardev front end; handed to slirp_add_guestfwd() as the write opaque */
+ struct in_addr server; /* address parsed from the rule, passed to slirp_socket_can_recv()/slirp_socket_recv() */
+ int port; /* port parsed from the rule (1-65535) */
+ Slirp *slirp; /* slirp instance this rule was registered with */
+};
+
+typedef struct SlirpState { /* per-netdev state of one user-mode (slirp) network backend */
+ NetClientState nc; /* embedded net client; DO_UPCAST(SlirpState, nc, ...) converts back */
+ QTAILQ_ENTRY(SlirpState) entry; /* link in the slirp_stacks list */
+ Slirp *slirp; /* handle returned by slirp_init() */
+ Notifier poll_notifier; /* main loop poll hook, see net_slirp_poll_notify() */
+ Notifier exit_notifier; /* exit hook that runs slirp_smb_cleanup() */
+#if defined(CONFIG_SLIRP_SMBD)
+ gchar *smb_dir; /* temporary samba directory, NULL when none exists */
+#endif
+ GSList *fwd; /* list of struct GuestFwd, freed with slirp_free_fwd() */
+} SlirpState;
+
+static struct slirp_config_str *slirp_configs; /* rules queued by net_init_slirp_configs() and applied in net_slirp_init() */
+static QTAILQ_HEAD(, SlirpState) slirp_stacks = /* all slirp instances; net_slirp_init() appends, net_slirp_cleanup() removes */
+ QTAILQ_HEAD_INITIALIZER(slirp_stacks);
+
+static int slirp_hostfwd(SlirpState *s, const char *redir_str, Error **errp); /* defined further down */
+static int slirp_guestfwd(SlirpState *s, const char *config_str, Error **errp); /* defined further down */
+
+#if defined(CONFIG_SLIRP_SMBD)
+static int slirp_smb(SlirpState *s, const char *exported_dir,
+ struct in_addr vserver_addr, Error **errp);
+static void slirp_smb_cleanup(SlirpState *s);
+#else
+static inline void slirp_smb_cleanup(SlirpState *s) { } /* no-op stub so callers need no #ifdef */
+#endif
+
+static ssize_t net_slirp_send_packet(const void *pkt, size_t pkt_len,
+ void *opaque)
+{
+ SlirpState *s = opaque;
+ uint8_t min_pkt[ETH_ZLEN];
+ size_t min_pktsz = sizeof(min_pkt);
+
+ if (net_peer_needs_padding(&s->nc)) {
+ if (eth_pad_short_frame(min_pkt, &min_pktsz, pkt, pkt_len)) {
+ pkt = min_pkt;
+ pkt_len = min_pktsz;
+ }
+ }
+
+ return qemu_send_packet(&s->nc, pkt, pkt_len);
+}
+
+static ssize_t net_slirp_receive(NetClientState *nc, const uint8_t *buf, size_t size)
+{
+ SlirpState *s = DO_UPCAST(SlirpState, nc, nc);
+
+ slirp_input(s->slirp, buf, size);
+
+ return size;
+}
+
+static void slirp_smb_exit(Notifier *n, void *data)
+{
+ SlirpState *s = container_of(n, SlirpState, exit_notifier);
+ slirp_smb_cleanup(s);
+}
+
+static void slirp_free_fwd(gpointer data)
+{
+ struct GuestFwd *fwd = data;
+
+ qemu_chr_fe_deinit(&fwd->hd, true);
+ g_free(data);
+}
+
+static void net_slirp_cleanup(NetClientState *nc)
+{
+ SlirpState *s = DO_UPCAST(SlirpState, nc, nc);
+
+ g_slist_free_full(s->fwd, slirp_free_fwd);
+ main_loop_poll_remove_notifier(&s->poll_notifier);
+ unregister_savevm(NULL, "slirp", s->slirp);
+ slirp_cleanup(s->slirp);
+ if (s->exit_notifier.notify) {
+ qemu_remove_exit_notifier(&s->exit_notifier);
+ }
+ slirp_smb_cleanup(s);
+ QTAILQ_REMOVE(&slirp_stacks, s, entry);
+}
+
+static NetClientInfo net_slirp_info = { /* net client description handed to qemu_new_net_client() */
+ .type = NET_CLIENT_DRIVER_USER,
+ .size = sizeof(SlirpState), /* size of the SlirpState that embeds the NetClientState */
+ .receive = net_slirp_receive, /* peer -> slirp */
+ .cleanup = net_slirp_cleanup,
+};
+
+static void net_slirp_guest_error(const char *msg, void *opaque)
+{
+ qemu_log_mask(LOG_GUEST_ERROR, "%s", msg);
+}
+
+static int64_t net_slirp_clock_get_ns(void *opaque)
+{
+ return qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
+}
+
+static void *net_slirp_timer_new(SlirpTimerCb cb,
+ void *cb_opaque, void *opaque)
+{
+ return timer_new_full(NULL, QEMU_CLOCK_VIRTUAL,
+ SCALE_MS, QEMU_TIMER_ATTR_EXTERNAL,
+ cb, cb_opaque);
+}
+
+static void net_slirp_timer_free(void *timer, void *opaque)
+{
+ timer_free(timer);
+}
+
+static void net_slirp_timer_mod(void *timer, int64_t expire_timer,
+ void *opaque)
+{
+ timer_mod(timer, expire_timer);
+}
+
/*
 * libslirp register_poll_fd callback: hand @fd to qemu_fd_register().
 * @opaque is unused.
 */
static void net_slirp_register_poll_fd(int fd, void *opaque)
{
    qemu_fd_register(fd);
}
+
/*
 * libslirp unregister_poll_fd callback.  Intentionally empty: there is no
 * qemu_fd_unregister counterpart to call.
 */
static void net_slirp_unregister_poll_fd(int fd, void *opaque)
{
    /* no qemu_fd_unregister */
}
+
/*
 * libslirp notify callback: call qemu_notify_event().
 * @opaque is unused.
 */
static void net_slirp_notify(void *opaque)
{
    qemu_notify_event();
}
+
+static const SlirpCb slirp_cb = { /* callback table passed to slirp_init() */
+ .send_packet = net_slirp_send_packet, /* slirp -> QEMU net layer */
+ .guest_error = net_slirp_guest_error,
+ .clock_get_ns = net_slirp_clock_get_ns, /* QEMU_CLOCK_VIRTUAL */
+ .timer_new = net_slirp_timer_new,
+ .timer_free = net_slirp_timer_free,
+ .timer_mod = net_slirp_timer_mod,
+ .register_poll_fd = net_slirp_register_poll_fd,
+ .unregister_poll_fd = net_slirp_unregister_poll_fd, /* empty implementation */
+ .notify = net_slirp_notify,
+};
+
+static int slirp_poll_to_gio(int events)
+{
+ int ret = 0;
+
+ if (events & SLIRP_POLL_IN) {
+ ret |= G_IO_IN;
+ }
+ if (events & SLIRP_POLL_OUT) {
+ ret |= G_IO_OUT;
+ }
+ if (events & SLIRP_POLL_PRI) {
+ ret |= G_IO_PRI;
+ }
+ if (events & SLIRP_POLL_ERR) {
+ ret |= G_IO_ERR;
+ }
+ if (events & SLIRP_POLL_HUP) {
+ ret |= G_IO_HUP;
+ }
+
+ return ret;
+}
+
+static int net_slirp_add_poll(int fd, int events, void *opaque)
+{
+ GArray *pollfds = opaque;
+ GPollFD pfd = {
+ .fd = fd,
+ .events = slirp_poll_to_gio(events),
+ };
+ int idx = pollfds->len;
+ g_array_append_val(pollfds, pfd);
+ return idx;
+}
+
+static int slirp_gio_to_poll(int events)
+{
+ int ret = 0;
+
+ if (events & G_IO_IN) {
+ ret |= SLIRP_POLL_IN;
+ }
+ if (events & G_IO_OUT) {
+ ret |= SLIRP_POLL_OUT;
+ }
+ if (events & G_IO_PRI) {
+ ret |= SLIRP_POLL_PRI;
+ }
+ if (events & G_IO_ERR) {
+ ret |= SLIRP_POLL_ERR;
+ }
+ if (events & G_IO_HUP) {
+ ret |= SLIRP_POLL_HUP;
+ }
+
+ return ret;
+}
+
+static int net_slirp_get_revents(int idx, void *opaque)
+{
+ GArray *pollfds = opaque;
+
+ return slirp_gio_to_poll(g_array_index(pollfds, GPollFD, idx).revents);
+}
+
+static void net_slirp_poll_notify(Notifier *notifier, void *data)
+{
+ MainLoopPoll *poll = data;
+ SlirpState *s = container_of(notifier, SlirpState, poll_notifier);
+
+ switch (poll->state) {
+ case MAIN_LOOP_POLL_FILL:
+ slirp_pollfds_fill(s->slirp, &poll->timeout,
+ net_slirp_add_poll, poll->pollfds);
+ break;
+ case MAIN_LOOP_POLL_OK:
+ case MAIN_LOOP_POLL_ERR:
+ slirp_pollfds_poll(s->slirp, poll->state == MAIN_LOOP_POLL_ERR,
+ net_slirp_get_revents, poll->pollfds);
+ break;
+ default:
+ g_assert_not_reached();
+ }
+}
+
+static ssize_t
+net_slirp_stream_read(void *buf, size_t size, void *opaque)
+{
+ QEMUFile *f = opaque;
+
+ return qemu_get_buffer(f, buf, size);
+}
+
+static ssize_t
+net_slirp_stream_write(const void *buf, size_t size, void *opaque)
+{
+ QEMUFile *f = opaque;
+
+ qemu_put_buffer(f, buf, size);
+ if (qemu_file_get_error(f)) {
+ return -1;
+ }
+
+ return size;
+}
+
+static int net_slirp_state_load(QEMUFile *f, void *opaque, int version_id)
+{
+ Slirp *slirp = opaque;
+
+ return slirp_state_load(slirp, version_id, net_slirp_stream_read, f);
+}
+
+static void net_slirp_state_save(QEMUFile *f, void *opaque)
+{
+ Slirp *slirp = opaque;
+
+ slirp_state_save(slirp, net_slirp_stream_write, f);
+}
+
+static SaveVMHandlers savevm_slirp_state = { /* handlers registered under the "slirp" id by net_slirp_init() */
+ .save_state = net_slirp_state_save,
+ .load_state = net_slirp_state_load,
+};
+
+static int net_slirp_init(NetClientState *peer, const char *model,
+ const char *name, int restricted,
+ bool ipv4, const char *vnetwork, const char *vhost,
+ bool ipv6, const char *vprefix6, int vprefix6_len,
+ const char *vhost6,
+ const char *vhostname, const char *tftp_export,
+ const char *bootfile, const char *vdhcp_start,
+ const char *vnameserver, const char *vnameserver6,
+ const char *smb_export, const char *vsmbserver,
+ const char **dnssearch, const char *vdomainname,
+ const char *tftp_server_name,
+ Error **errp)
+{
+ /* default settings according to historic slirp */
+ struct in_addr net = { .s_addr = htonl(0x0a000200) }; /* 10.0.2.0 */
+ struct in_addr mask = { .s_addr = htonl(0xffffff00) }; /* 255.255.255.0 */
+ struct in_addr host = { .s_addr = htonl(0x0a000202) }; /* 10.0.2.2 */
+ struct in_addr dhcp = { .s_addr = htonl(0x0a00020f) }; /* 10.0.2.15 */
+ struct in_addr dns = { .s_addr = htonl(0x0a000203) }; /* 10.0.2.3 */
+ struct in6_addr ip6_prefix;
+ struct in6_addr ip6_host;
+ struct in6_addr ip6_dns;
+#if defined(CONFIG_SLIRP_SMBD)
+ struct in_addr smbsrv = { .s_addr = 0 };
+#endif
+ NetClientState *nc;
+ SlirpState *s;
+ char buf[20];
+ uint32_t addr;
+ int shift;
+ char *end;
+ struct slirp_config_str *config;
+
+ if (!ipv4 && (vnetwork || vhost || vnameserver)) {
+ error_setg(errp, "IPv4 disabled but netmask/host/dns provided");
+ return -1;
+ }
+
+ if (!ipv6 && (vprefix6 || vhost6 || vnameserver6)) {
+ error_setg(errp, "IPv6 disabled but prefix/host6/dns6 provided");
+ return -1;
+ }
+
+ if (!ipv4 && !ipv6) {
+ /* It doesn't make sense to disable both */
+ error_setg(errp, "IPv4 and IPv6 disabled");
+ return -1;
+ }
+
+ if (vnetwork) {
+ if (get_str_sep(buf, sizeof(buf), &vnetwork, '/') < 0) {
+ if (!inet_aton(vnetwork, &net)) {
+ error_setg(errp, "Failed to parse netmask");
+ return -1;
+ }
+ addr = ntohl(net.s_addr);
+ if (!(addr & 0x80000000)) {
+ mask.s_addr = htonl(0xff000000); /* class A */
+ } else if ((addr & 0xfff00000) == 0xac100000) {
+ mask.s_addr = htonl(0xfff00000); /* priv. 172.16.0.0/12 */
+ } else if ((addr & 0xc0000000) == 0x80000000) {
+ mask.s_addr = htonl(0xffff0000); /* class B */
+ } else if ((addr & 0xffff0000) == 0xc0a80000) {
+ mask.s_addr = htonl(0xffff0000); /* priv. 192.168.0.0/16 */
+ } else if ((addr & 0xffff0000) == 0xc6120000) {
+ mask.s_addr = htonl(0xfffe0000); /* tests 198.18.0.0/15 */
+ } else if ((addr & 0xe0000000) == 0xe0000000) {
+ mask.s_addr = htonl(0xffffff00); /* class C */
+ } else {
+ mask.s_addr = htonl(0xfffffff0); /* multicast/reserved */
+ }
+ } else {
+ if (!inet_aton(buf, &net)) {
+ error_setg(errp, "Failed to parse netmask");
+ return -1;
+ }
+ shift = strtol(vnetwork, &end, 10);
+ if (*end != '\0') {
+ if (!inet_aton(vnetwork, &mask)) {
+ error_setg(errp,
+ "Failed to parse netmask (trailing chars)");
+ return -1;
+ }
+ } else if (shift < 4 || shift > 32) {
+ error_setg(errp,
+ "Invalid netmask provided (must be in range 4-32)");
+ return -1;
+ } else {
+ mask.s_addr = htonl(0xffffffff << (32 - shift));
+ }
+ }
+ net.s_addr &= mask.s_addr;
+ host.s_addr = net.s_addr | (htonl(0x0202) & ~mask.s_addr);
+ dhcp.s_addr = net.s_addr | (htonl(0x020f) & ~mask.s_addr);
+ dns.s_addr = net.s_addr | (htonl(0x0203) & ~mask.s_addr);
+ }
+
+ if (vhost && !inet_aton(vhost, &host)) {
+ error_setg(errp, "Failed to parse host");
+ return -1;
+ }
+ if ((host.s_addr & mask.s_addr) != net.s_addr) {
+ error_setg(errp, "Host doesn't belong to network");
+ return -1;
+ }
+
+ if (vnameserver && !inet_aton(vnameserver, &dns)) {
+ error_setg(errp, "Failed to parse DNS");
+ return -1;
+ }
+ if (restricted && (dns.s_addr & mask.s_addr) != net.s_addr) {
+ error_setg(errp, "DNS doesn't belong to network");
+ return -1;
+ }
+ if (dns.s_addr == host.s_addr) {
+ error_setg(errp, "DNS must be different from host");
+ return -1;
+ }
+
+ if (vdhcp_start && !inet_aton(vdhcp_start, &dhcp)) {
+ error_setg(errp, "Failed to parse DHCP start address");
+ return -1;
+ }
+ if ((dhcp.s_addr & mask.s_addr) != net.s_addr) {
+ error_setg(errp, "DHCP doesn't belong to network");
+ return -1;
+ }
+ if (dhcp.s_addr == host.s_addr || dhcp.s_addr == dns.s_addr) {
+ error_setg(errp, "DHCP must be different from host and DNS");
+ return -1;
+ }
+
+#if defined(CONFIG_SLIRP_SMBD)
+ if (vsmbserver && !inet_aton(vsmbserver, &smbsrv)) {
+ error_setg(errp, "Failed to parse SMB address");
+ return -1;
+ }
+#endif
+
+ if (!vprefix6) {
+ vprefix6 = "fec0::";
+ }
+ if (!inet_pton(AF_INET6, vprefix6, &ip6_prefix)) {
+ error_setg(errp, "Failed to parse IPv6 prefix");
+ return -1;
+ }
+
+ if (!vprefix6_len) {
+ vprefix6_len = 64;
+ }
+ if (vprefix6_len < 0 || vprefix6_len > 126) {
+ error_setg(errp,
+ "Invalid IPv6 prefix provided "
+ "(IPv6 prefix length must be between 0 and 126)");
+ return -1;
+ }
+
+ if (vhost6) {
+ if (!inet_pton(AF_INET6, vhost6, &ip6_host)) {
+ error_setg(errp, "Failed to parse IPv6 host");
+ return -1;
+ }
+ if (!in6_equal_net(&ip6_prefix, &ip6_host, vprefix6_len)) {
+ error_setg(errp, "IPv6 Host doesn't belong to network");
+ return -1;
+ }
+ } else {
+ ip6_host = ip6_prefix;
+ ip6_host.s6_addr[15] |= 2;
+ }
+
+ if (vnameserver6) {
+ if (!inet_pton(AF_INET6, vnameserver6, &ip6_dns)) {
+ error_setg(errp, "Failed to parse IPv6 DNS");
+ return -1;
+ }
+ if (restricted && !in6_equal_net(&ip6_prefix, &ip6_dns, vprefix6_len)) {
+ error_setg(errp, "IPv6 DNS doesn't belong to network");
+ return -1;
+ }
+ } else {
+ ip6_dns = ip6_prefix;
+ ip6_dns.s6_addr[15] |= 3;
+ }
+
+ if (vdomainname && !*vdomainname) {
+ error_setg(errp, "'domainname' parameter cannot be empty");
+ return -1;
+ }
+
+ if (vdomainname && strlen(vdomainname) > 255) {
+ error_setg(errp, "'domainname' parameter cannot exceed 255 bytes");
+ return -1;
+ }
+
+ if (vhostname && strlen(vhostname) > 255) {
+ error_setg(errp, "'vhostname' parameter cannot exceed 255 bytes");
+ return -1;
+ }
+
+ if (tftp_server_name && strlen(tftp_server_name) > 255) {
+ error_setg(errp, "'tftp-server-name' parameter cannot exceed 255 bytes");
+ return -1;
+ }
+
+ nc = qemu_new_net_client(&net_slirp_info, peer, model, name);
+
+ snprintf(nc->info_str, sizeof(nc->info_str),
+ "net=%s,restrict=%s", inet_ntoa(net),
+ restricted ? "on" : "off");
+
+ s = DO_UPCAST(SlirpState, nc, nc);
+
+ s->slirp = slirp_init(restricted, ipv4, net, mask, host,
+ ipv6, ip6_prefix, vprefix6_len, ip6_host,
+ vhostname, tftp_server_name,
+ tftp_export, bootfile, dhcp,
+ dns, ip6_dns, dnssearch, vdomainname,
+ &slirp_cb, s);
+ QTAILQ_INSERT_TAIL(&slirp_stacks, s, entry);
+
+ /*
+ * Make sure the current bitstream version of slirp is 4, to avoid
+ * QEMU migration incompatibilities, if upstream slirp bumped the
+ * version.
+ *
+ * FIXME: use bitfields of features? teach libslirp to save with
+ * specific version?
+ */
+ g_assert(slirp_state_version() == 4);
+ register_savevm_live("slirp", 0, slirp_state_version(),
+ &savevm_slirp_state, s->slirp);
+
+ s->poll_notifier.notify = net_slirp_poll_notify;
+ main_loop_poll_add_notifier(&s->poll_notifier);
+
+ for (config = slirp_configs; config; config = config->next) {
+ if (config->flags & SLIRP_CFG_HOSTFWD) {
+ if (slirp_hostfwd(s, config->str, errp) < 0) {
+ goto error;
+ }
+ } else {
+ if (slirp_guestfwd(s, config->str, errp) < 0) {
+ goto error;
+ }
+ }
+ }
+#if defined(CONFIG_SLIRP_SMBD)
+ if (smb_export) {
+ if (slirp_smb(s, smb_export, smbsrv, errp) < 0) {
+ goto error;
+ }
+ }
+#endif
+
+ s->exit_notifier.notify = slirp_smb_exit;
+ qemu_add_exit_notifier(&s->exit_notifier);
+ return 0;
+
+error:
+ qemu_del_net_client(nc);
+ return -1;
+}
+
+static SlirpState *slirp_lookup(Monitor *mon, const char *id)
+{
+ if (id) {
+ NetClientState *nc = qemu_find_netdev(id);
+ if (!nc) {
+ monitor_printf(mon, "unrecognized netdev id '%s'\n", id);
+ return NULL;
+ }
+ if (strcmp(nc->model, "user")) {
+ monitor_printf(mon, "invalid device specified\n");
+ return NULL;
+ }
+ return DO_UPCAST(SlirpState, nc, nc);
+ } else {
+ if (QTAILQ_EMPTY(&slirp_stacks)) {
+ monitor_printf(mon, "user mode network stack not in use\n");
+ return NULL;
+ }
+ return QTAILQ_FIRST(&slirp_stacks);
+ }
+}
+
+void hmp_hostfwd_remove(Monitor *mon, const QDict *qdict)
+{
+ struct in_addr host_addr = { .s_addr = INADDR_ANY };
+ int host_port;
+ char buf[256];
+ const char *src_str, *p;
+ SlirpState *s;
+ int is_udp = 0;
+ int err;
+ const char *arg1 = qdict_get_str(qdict, "arg1");
+ const char *arg2 = qdict_get_try_str(qdict, "arg2");
+
+ if (arg2) {
+ s = slirp_lookup(mon, arg1);
+ src_str = arg2;
+ } else {
+ s = slirp_lookup(mon, NULL);
+ src_str = arg1;
+ }
+ if (!s) {
+ return;
+ }
+
+ p = src_str;
+ if (!p || get_str_sep(buf, sizeof(buf), &p, ':') < 0) {
+ goto fail_syntax;
+ }
+
+ if (!strcmp(buf, "tcp") || buf[0] == '\0') {
+ is_udp = 0;
+ } else if (!strcmp(buf, "udp")) {
+ is_udp = 1;
+ } else {
+ goto fail_syntax;
+ }
+
+ if (get_str_sep(buf, sizeof(buf), &p, ':') < 0) {
+ goto fail_syntax;
+ }
+ if (buf[0] != '\0' && !inet_aton(buf, &host_addr)) {
+ goto fail_syntax;
+ }
+
+ if (qemu_strtoi(p, NULL, 10, &host_port)) {
+ goto fail_syntax;
+ }
+
+ err = slirp_remove_hostfwd(s->slirp, is_udp, host_addr, host_port);
+
+ monitor_printf(mon, "host forwarding rule for %s %s\n", src_str,
+ err ? "not found" : "removed");
+ return;
+
+ fail_syntax:
+ monitor_printf(mon, "invalid format\n");
+}
+
+static int slirp_hostfwd(SlirpState *s, const char *redir_str, Error **errp)
+{
+ struct in_addr host_addr = { .s_addr = INADDR_ANY };
+ struct in_addr guest_addr = { .s_addr = 0 };
+ int host_port, guest_port;
+ const char *p;
+ char buf[256];
+ int is_udp;
+ char *end;
+ const char *fail_reason = "Unknown reason";
+
+ p = redir_str;
+ if (!p || get_str_sep(buf, sizeof(buf), &p, ':') < 0) {
+ fail_reason = "No : separators";
+ goto fail_syntax;
+ }
+ if (!strcmp(buf, "tcp") || buf[0] == '\0') {
+ is_udp = 0;
+ } else if (!strcmp(buf, "udp")) {
+ is_udp = 1;
+ } else {
+ fail_reason = "Bad protocol name";
+ goto fail_syntax;
+ }
+
+ if (get_str_sep(buf, sizeof(buf), &p, ':') < 0) {
+ fail_reason = "Missing : separator";
+ goto fail_syntax;
+ }
+ if (buf[0] != '\0' && !inet_aton(buf, &host_addr)) {
+ fail_reason = "Bad host address";
+ goto fail_syntax;
+ }
+
+ if (get_str_sep(buf, sizeof(buf), &p, '-') < 0) {
+ fail_reason = "Bad host port separator";
+ goto fail_syntax;
+ }
+ host_port = strtol(buf, &end, 0);
+ if (*end != '\0' || host_port < 0 || host_port > 65535) {
+ fail_reason = "Bad host port";
+ goto fail_syntax;
+ }
+
+ if (get_str_sep(buf, sizeof(buf), &p, ':') < 0) {
+ fail_reason = "Missing guest address";
+ goto fail_syntax;
+ }
+ if (buf[0] != '\0' && !inet_aton(buf, &guest_addr)) {
+ fail_reason = "Bad guest address";
+ goto fail_syntax;
+ }
+
+ guest_port = strtol(p, &end, 0);
+ if (*end != '\0' || guest_port < 1 || guest_port > 65535) {
+ fail_reason = "Bad guest port";
+ goto fail_syntax;
+ }
+
+ if (slirp_add_hostfwd(s->slirp, is_udp, host_addr, host_port, guest_addr,
+ guest_port) < 0) {
+ error_setg(errp, "Could not set up host forwarding rule '%s'",
+ redir_str);
+ return -1;
+ }
+ return 0;
+
+ fail_syntax:
+ error_setg(errp, "Invalid host forwarding rule '%s' (%s)", redir_str,
+ fail_reason);
+ return -1;
+}
+
+void hmp_hostfwd_add(Monitor *mon, const QDict *qdict)
+{
+ const char *redir_str;
+ SlirpState *s;
+ const char *arg1 = qdict_get_str(qdict, "arg1");
+ const char *arg2 = qdict_get_try_str(qdict, "arg2");
+
+ if (arg2) {
+ s = slirp_lookup(mon, arg1);
+ redir_str = arg2;
+ } else {
+ s = slirp_lookup(mon, NULL);
+ redir_str = arg1;
+ }
+ if (s) {
+ Error *err = NULL;
+ if (slirp_hostfwd(s, redir_str, &err) < 0) {
+ error_report_err(err);
+ }
+ }
+
+}
+
+#if defined(CONFIG_SLIRP_SMBD)
+
+/* automatic user mode samba server configuration */
+static void slirp_smb_cleanup(SlirpState *s)
+{
+ int ret;
+
+ if (s->smb_dir) {
+ gchar *cmd = g_strdup_printf("rm -rf %s", s->smb_dir);
+ ret = system(cmd);
+ if (ret == -1 || !WIFEXITED(ret)) {
+ error_report("'%s' failed.", cmd);
+ } else if (WEXITSTATUS(ret)) {
+ error_report("'%s' failed. Error code: %d",
+ cmd, WEXITSTATUS(ret));
+ }
+ g_free(cmd);
+ g_free(s->smb_dir);
+ s->smb_dir = NULL;
+ }
+}
+
+static int slirp_smb(SlirpState* s, const char *exported_dir,
+ struct in_addr vserver_addr, Error **errp)
+{
+ char *smb_conf;
+ char *smb_cmdline;
+ struct passwd *passwd;
+ FILE *f;
+
+ passwd = getpwuid(geteuid());
+ if (!passwd) {
+ error_setg(errp, "Failed to retrieve user name");
+ return -1;
+ }
+
+ if (access(CONFIG_SMBD_COMMAND, F_OK)) {
+ error_setg(errp, "Could not find '%s', please install it",
+ CONFIG_SMBD_COMMAND);
+ return -1;
+ }
+
+ if (access(exported_dir, R_OK | X_OK)) {
+ error_setg(errp, "Error accessing shared directory '%s': %s",
+ exported_dir, strerror(errno));
+ return -1;
+ }
+
+ s->smb_dir = g_dir_make_tmp("qemu-smb.XXXXXX", NULL);
+ if (!s->smb_dir) {
+ error_setg(errp, "Could not create samba server dir");
+ return -1;
+ }
+ smb_conf = g_strdup_printf("%s/%s", s->smb_dir, "smb.conf");
+
+ f = fopen(smb_conf, "w");
+ if (!f) {
+ slirp_smb_cleanup(s);
+ error_setg(errp,
+ "Could not create samba server configuration file '%s'",
+ smb_conf);
+ g_free(smb_conf);
+ return -1;
+ }
+ fprintf(f,
+ "[global]\n"
+ "private dir=%s\n"
+ "interfaces=127.0.0.1\n"
+ "bind interfaces only=yes\n"
+ "pid directory=%s\n"
+ "lock directory=%s\n"
+ "state directory=%s\n"
+ "cache directory=%s\n"
+ "ncalrpc dir=%s/ncalrpc\n"
+ "log file=%s/log.smbd\n"
+ "smb passwd file=%s/smbpasswd\n"
+ "security = user\n"
+ "map to guest = Bad User\n"
+ "load printers = no\n"
+ "printing = bsd\n"
+ "disable spoolss = yes\n"
+ "usershare max shares = 0\n"
+ "[qemu]\n"
+ "path=%s\n"
+ "read only=no\n"
+ "guest ok=yes\n"
+ "force user=%s\n",
+ s->smb_dir,
+ s->smb_dir,
+ s->smb_dir,
+ s->smb_dir,
+ s->smb_dir,
+ s->smb_dir,
+ s->smb_dir,
+ s->smb_dir,
+ exported_dir,
+ passwd->pw_name
+ );
+ fclose(f);
+
+ smb_cmdline = g_strdup_printf("%s -l %s -s %s",
+ CONFIG_SMBD_COMMAND, s->smb_dir, smb_conf);
+ g_free(smb_conf);
+
+ if (slirp_add_exec(s->slirp, smb_cmdline, &vserver_addr, 139) < 0 ||
+ slirp_add_exec(s->slirp, smb_cmdline, &vserver_addr, 445) < 0) {
+ slirp_smb_cleanup(s);
+ g_free(smb_cmdline);
+ error_setg(errp, "Conflicting/invalid smbserver address");
+ return -1;
+ }
+ g_free(smb_cmdline);
+ return 0;
+}
+
+#endif /* defined(CONFIG_SLIRP_SMBD) */
+
+static int guestfwd_can_read(void *opaque)
+{
+ struct GuestFwd *fwd = opaque;
+ return slirp_socket_can_recv(fwd->slirp, fwd->server, fwd->port);
+}
+
+static void guestfwd_read(void *opaque, const uint8_t *buf, int size)
+{
+ struct GuestFwd *fwd = opaque;
+ slirp_socket_recv(fwd->slirp, fwd->server, fwd->port, buf, size);
+}
+
/*
 * Write callback given to slirp_add_guestfwd(): write @len bytes to the
 * chardev front end passed as @chr, using qemu_chr_fe_write_all().
 */
static ssize_t guestfwd_write(const void *buf, size_t len, void *chr)
{
    ssize_t written = qemu_chr_fe_write_all(chr, buf, len);

    return written;
}
+
+static int slirp_guestfwd(SlirpState *s, const char *config_str, Error **errp)
+{
+ /* TODO: IPv6 */
+ struct in_addr server = { .s_addr = 0 };
+ struct GuestFwd *fwd;
+ const char *p;
+ char buf[128];
+ char *end;
+ int port;
+
+ p = config_str;
+ if (get_str_sep(buf, sizeof(buf), &p, ':') < 0) {
+ goto fail_syntax;
+ }
+ if (strcmp(buf, "tcp") && buf[0] != '\0') {
+ goto fail_syntax;
+ }
+ if (get_str_sep(buf, sizeof(buf), &p, ':') < 0) {
+ goto fail_syntax;
+ }
+ if (buf[0] != '\0' && !inet_aton(buf, &server)) {
+ goto fail_syntax;
+ }
+ if (get_str_sep(buf, sizeof(buf), &p, '-') < 0) {
+ goto fail_syntax;
+ }
+ port = strtol(buf, &end, 10);
+ if (*end != '\0' || port < 1 || port > 65535) {
+ goto fail_syntax;
+ }
+
+ snprintf(buf, sizeof(buf), "guestfwd.tcp.%d", port);
+
+ if (g_str_has_prefix(p, "cmd:")) {
+ if (slirp_add_exec(s->slirp, &p[4], &server, port) < 0) {
+ error_setg(errp, "Conflicting/invalid host:port in guest "
+ "forwarding rule '%s'", config_str);
+ return -1;
+ }
+ } else {
+ Error *err = NULL;
+ /*
+ * FIXME: sure we want to support implicit
+ * muxed monitors here?
+ */
+ Chardev *chr = qemu_chr_new_mux_mon(buf, p, NULL);
+
+ if (!chr) {
+ error_setg(errp, "Could not open guest forwarding device '%s'",
+ buf);
+ return -1;
+ }
+
+ fwd = g_new(struct GuestFwd, 1);
+ qemu_chr_fe_init(&fwd->hd, chr, &err);
+ if (err) {
+ error_propagate(errp, err);
+ object_unparent(OBJECT(chr));
+ g_free(fwd);
+ return -1;
+ }
+
+ if (slirp_add_guestfwd(s->slirp, guestfwd_write, &fwd->hd,
+ &server, port) < 0) {
+ error_setg(errp, "Conflicting/invalid host:port in guest "
+ "forwarding rule '%s'", config_str);
+ qemu_chr_fe_deinit(&fwd->hd, true);
+ g_free(fwd);
+ return -1;
+ }
+ fwd->server = server;
+ fwd->port = port;
+ fwd->slirp = s->slirp;
+
+ qemu_chr_fe_set_handlers(&fwd->hd, guestfwd_can_read, guestfwd_read,
+ NULL, NULL, fwd, NULL, true);
+ s->fwd = g_slist_append(s->fwd, fwd);
+ }
+ return 0;
+
+ fail_syntax:
+ error_setg(errp, "Invalid guest forwarding rule '%s'", config_str);
+ return -1;
+}
+
+void hmp_info_usernet(Monitor *mon, const QDict *qdict)
+{
+ SlirpState *s;
+
+ QTAILQ_FOREACH(s, &slirp_stacks, entry) {
+ int id;
+ bool got_hub_id = net_hub_id_for_client(&s->nc, &id) == 0;
+ char *info = slirp_connection_info(s->slirp);
+ monitor_printf(mon, "Hub %d (%s):\n%s",
+ got_hub_id ? id : -1,
+ s->nc.name, info);
+ g_free(info);
+ }
+}
+
+static void
+net_init_slirp_configs(const StringList *fwd, int flags)
+{
+ while (fwd) {
+ struct slirp_config_str *config;
+
+ config = g_malloc0(sizeof(*config));
+ pstrcpy(config->str, sizeof(config->str), fwd->value->str);
+ config->flags = flags;
+ config->next = slirp_configs;
+ slirp_configs = config;
+
+ fwd = fwd->next;
+ }
+}
+
+static const char **slirp_dnssearch(const StringList *dnsname)
+{
+ const StringList *c = dnsname;
+ size_t i = 0, num_opts = 0;
+ const char **ret;
+
+ while (c) {
+ num_opts++;
+ c = c->next;
+ }
+
+ if (num_opts == 0) {
+ return NULL;
+ }
+
+ ret = g_malloc((num_opts + 1) * sizeof(*ret));
+ c = dnsname;
+ while (c) {
+ ret[i++] = c->value->str;
+ c = c->next;
+ }
+ ret[i] = NULL;
+ return ret;
+}
+
+int net_init_slirp(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp)
+{
+ struct slirp_config_str *config;
+ char *vnet;
+ int ret;
+ const NetdevUserOptions *user;
+ const char **dnssearch;
+ bool ipv4 = true, ipv6 = true;
+
+ assert(netdev->type == NET_CLIENT_DRIVER_USER);
+ user = &netdev->u.user;
+
+ if ((user->has_ipv6 && user->ipv6 && !user->has_ipv4) ||
+ (user->has_ipv4 && !user->ipv4)) {
+ ipv4 = 0;
+ }
+ if ((user->has_ipv4 && user->ipv4 && !user->has_ipv6) ||
+ (user->has_ipv6 && !user->ipv6)) {
+ ipv6 = 0;
+ }
+
+ vnet = user->has_net ? g_strdup(user->net) :
+ user->has_ip ? g_strdup_printf("%s/24", user->ip) :
+ NULL;
+
+ dnssearch = slirp_dnssearch(user->dnssearch);
+
+ /* all optional fields are initialized to "all bits zero" */
+
+ net_init_slirp_configs(user->hostfwd, SLIRP_CFG_HOSTFWD);
+ net_init_slirp_configs(user->guestfwd, 0);
+
+ ret = net_slirp_init(peer, "user", name, user->q_restrict,
+ ipv4, vnet, user->host,
+ ipv6, user->ipv6_prefix, user->ipv6_prefixlen,
+ user->ipv6_host, user->hostname, user->tftp,
+ user->bootfile, user->dhcpstart,
+ user->dns, user->ipv6_dns, user->smb,
+ user->smbserver, dnssearch, user->domainname,
+ user->tftp_server_name, errp);
+
+ while (slirp_configs) {
+ config = slirp_configs;
+ slirp_configs = config->next;
+ g_free(config);
+ }
+
+ g_free(vnet);
+ g_free(dnssearch);
+
+ return ret;
+}
diff --git a/net/socket.c b/net/socket.c
new file mode 100644
index 000000000..15b410e8d
--- /dev/null
+++ b/net/socket.c
@@ -0,0 +1,783 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+
+#include "net/net.h"
+#include "clients.h"
+#include "monitor/monitor.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "qemu/error-report.h"
+#include "qemu/option.h"
+#include "qemu/sockets.h"
+#include "qemu/iov.h"
+#include "qemu/main-loop.h"
+
+/* Per-client state of the "socket" net backend (stream and datagram flavours). */
+typedef struct NetSocketState {
+ NetClientState nc; /* generic client state; DO_UPCAST() recovers this struct from it */
+ int listen_fd; /* listening socket, -1 when this client has none */
+ int fd; /* data socket, -1 while no connection is open */
+ SocketReadState rs; /* receive state fed to net_fill_rstate(); rs.buf doubles as the datagram receive buffer */
+ unsigned int send_index; /* number of bytes sent (only SOCK_STREAM) */
+ struct sockaddr_in dgram_dst; /* contains inet host and port destination iff connectionless (SOCK_DGRAM) */
+ IOHandler *send_fn; /* differs between SOCK_STREAM/SOCK_DGRAM */
+ bool read_poll; /* waiting to receive data? */
+ bool write_poll; /* waiting to transmit data? */
+} NetSocketState;
+
+static void net_socket_accept(void *opaque);
+static void net_socket_writable(void *opaque);
+
+/*
+ * Re-register the handlers of the data socket according to the current
+ * read_poll/write_poll flags; a cleared flag passes NULL for the
+ * corresponding handler.
+ */
+static void net_socket_update_fd_handler(NetSocketState *s)
+{
+    IOHandler *on_readable = NULL;
+    IOHandler *on_writable = NULL;
+
+    if (s->read_poll) {
+        on_readable = s->send_fn;
+    }
+    if (s->write_poll) {
+        on_writable = net_socket_writable;
+    }
+
+    qemu_set_fd_handler(s->fd, on_readable, on_writable, s);
+}
+
+/* Record whether input from the data socket is wanted, then refresh its fd handlers. */
+static void net_socket_read_poll(NetSocketState *s, bool enable)
+{
+ s->read_poll = enable;
+ net_socket_update_fd_handler(s);
+}
+
+/* Record whether a writability notification is wanted, then refresh the fd handlers. */
+static void net_socket_write_poll(NetSocketState *s, bool enable)
+{
+ s->write_poll = enable;
+ net_socket_update_fd_handler(s);
+}
+
+/*
+ * Write handler of the data socket (registered while write_poll is set):
+ * turn write polling off again and call qemu_flush_queued_packets() for
+ * this client.
+ */
+static void net_socket_writable(void *opaque)
+{
+ NetSocketState *s = opaque;
+
+ net_socket_write_poll(s, false);
+
+ qemu_flush_queued_packets(&s->nc);
+}
+
+/*
+ * Stream transmit path: write one packet to the data socket, prefixed
+ * with its length as a 32-bit big-endian value.
+ *
+ * send_index remembers how many bytes of the current frame were already
+ * written.  On a short write (or EAGAIN) the offset is kept, write polling
+ * is enabled and 0 is returned.  Returns size once the whole frame is out,
+ * or -errno on any other send error.
+ */
+static ssize_t net_socket_receive(NetClientState *nc, const uint8_t *buf, size_t size)
+{
+    NetSocketState *s = DO_UPCAST(NetSocketState, nc, nc);
+    uint32_t hdr = htonl(size);
+    struct iovec frame[2];
+    size_t todo;
+    ssize_t sent;
+
+    frame[0].iov_base = &hdr;
+    frame[0].iov_len = sizeof(hdr);
+    frame[1].iov_base = (void *)buf;
+    frame[1].iov_len = size;
+
+    todo = iov_size(frame, 2) - s->send_index;
+    sent = iov_send(s->fd, frame, 2, s->send_index, todo);
+
+    if (sent == -1) {
+        if (errno != EAGAIN) {
+            s->send_index = 0;
+            return -errno;
+        }
+        sent = 0; /* nothing written; handled as a short write below */
+    }
+
+    if (sent < (ssize_t)todo) {
+        s->send_index += sent;
+        net_socket_write_poll(s, true);
+        return 0;
+    }
+
+    s->send_index = 0;
+    return size;
+}
+
+/*
+ * Datagram transmit path: send one packet, retrying while the call is
+ * interrupted (EINTR).
+ *
+ * When dgram_dst is marked AF_UNIX the packet goes out with plain send();
+ * otherwise it is addressed to dgram_dst.  On EAGAIN write polling is
+ * enabled and 0 is returned; any other outcome is returned unchanged.
+ */
+static ssize_t net_socket_receive_dgram(NetClientState *nc, const uint8_t *buf, size_t size)
+{
+    NetSocketState *s = DO_UPCAST(NetSocketState, nc, nc);
+    ssize_t ret;
+
+    for (;;) {
+        if (s->dgram_dst.sin_family == AF_UNIX) {
+            ret = send(s->fd, buf, size, 0);
+        } else {
+            ret = qemu_sendto(s->fd, buf, size, 0,
+                              (struct sockaddr *)&s->dgram_dst,
+                              sizeof(s->dgram_dst));
+        }
+        if (ret != -1 || errno != EINTR) {
+            break;
+        }
+    }
+
+    if (ret != -1 || errno != EAGAIN) {
+        return ret;
+    }
+
+    net_socket_write_poll(s, true);
+    return 0;
+}
+
+/*
+ * Completion callback passed to qemu_send_packet_async(): re-enable read
+ * polling if it is currently off.  The len argument is not used.
+ */
+static void net_socket_send_completed(NetClientState *nc, ssize_t len)
+{
+ NetSocketState *s = DO_UPCAST(NetSocketState, nc, nc);
+
+ if (!s->read_poll) {
+ net_socket_read_poll(s, true);
+ }
+}
+
+/*
+ * Finalize callback of the socket read state: pass the packet held in
+ * rs->buf (rs->packet_len bytes) to qemu_send_packet_async().  When that
+ * call returns 0, reading from the socket is paused;
+ * net_socket_send_completed() switches it back on.
+ */
+static void net_socket_rs_finalize(SocketReadState *rs)
+{
+    NetSocketState *s = container_of(rs, NetSocketState, rs);
+
+    if (qemu_send_packet_async(&s->nc, rs->buf, rs->packet_len,
+                               net_socket_send_completed) != 0) {
+        return;
+    }
+
+    net_socket_read_poll(s, false);
+}
+
+/*
+ * Tear down the current stream connection: stop polling the data socket,
+ * re-arm the accept handler when this client owns a listening socket,
+ * close the data socket and reset the per-connection state (read state,
+ * link status, info string).
+ */
+static void net_socket_close_connection(NetSocketState *s)
+{
+    net_socket_read_poll(s, false);
+    net_socket_write_poll(s, false);
+    if (s->listen_fd != -1) {
+        qemu_set_fd_handler(s->listen_fd, net_socket_accept, NULL, s);
+    }
+    closesocket(s->fd);
+
+    s->fd = -1;
+    net_socket_rs_init(&s->rs, net_socket_rs_finalize, false);
+    s->nc.link_down = true;
+    memset(s->nc.info_str, 0, sizeof(s->nc.info_str));
+}
+
+/*
+ * Read handler for stream sockets: receive whatever is available and feed
+ * it to the read state.
+ *
+ * The connection is torn down on end of stream, on a receive error other
+ * than EWOULDBLOCK, and when net_fill_rstate() reports failure.  On
+ * EWOULDBLOCK the handler returns without touching the read state; the
+ * previous code fell through and called net_fill_rstate() with a negative
+ * size and an unfilled buffer.
+ */
+static void net_socket_send(void *opaque)
+{
+    NetSocketState *s = opaque;
+    uint8_t buf1[NET_BUFSIZE];
+    int size;
+
+    size = qemu_recv(s->fd, buf1, sizeof(buf1), 0);
+    if (size < 0) {
+        if (errno != EWOULDBLOCK) {
+            net_socket_close_connection(s);
+        }
+        /* EWOULDBLOCK: no data available right now, keep the connection */
+        return;
+    }
+    if (size == 0) {
+        /* end of connection */
+        net_socket_close_connection(s);
+        return;
+    }
+
+    if (net_fill_rstate(&s->rs, buf1, size) == -1) {
+        net_socket_close_connection(s);
+    }
+}
+
+/*
+ * Read handler for datagram sockets: receive one datagram straight into
+ * rs.buf and pass it to qemu_send_packet_async().
+ *
+ * A receive error is ignored.  A zero-length result stops both read and
+ * write polling.  When qemu_send_packet_async() returns 0, reading is
+ * paused; net_socket_send_completed() switches it back on.
+ */
+static void net_socket_send_dgram(void *opaque)
+{
+    NetSocketState *s = opaque;
+    int got = qemu_recv(s->fd, s->rs.buf, sizeof(s->rs.buf), 0);
+
+    if (got > 0) {
+        if (qemu_send_packet_async(&s->nc, s->rs.buf, got,
+                                   net_socket_send_completed) == 0) {
+            net_socket_read_poll(s, false);
+        }
+        return;
+    }
+
+    if (got == 0) {
+        /* end of connection */
+        net_socket_read_poll(s, false);
+        net_socket_write_poll(s, false);
+    }
+}
+
+/*
+ * Create a non-blocking UDP socket bound to the multicast address/port in
+ * @mcastaddr and joined to that group.
+ *
+ * @localaddr: optional; when non-NULL it selects the interface used for
+ *             the group membership and for outgoing multicast packets,
+ *             otherwise INADDR_ANY is used for the membership.
+ * @errp:      receives a description of the failure.
+ *
+ * Returns the new descriptor, or -1 on error (the socket is closed again).
+ */
+static int net_socket_mcast_create(struct sockaddr_in *mcastaddr,
+                                   struct in_addr *localaddr,
+                                   Error **errp)
+{
+    struct ip_mreq imr;
+    int fd;
+    int val, ret;
+#ifdef __OpenBSD__
+    unsigned char loop;
+#else
+    int loop;
+#endif
+
+    if (!IN_MULTICAST(ntohl(mcastaddr->sin_addr.s_addr))) {
+        /* %x takes an unsigned int, so cast accordingly */
+        error_setg(errp, "specified mcastaddr %s (0x%08x) "
+                   "does not contain a multicast address",
+                   inet_ntoa(mcastaddr->sin_addr),
+                   (unsigned int)ntohl(mcastaddr->sin_addr.s_addr));
+        return -1;
+    }
+
+    fd = qemu_socket(PF_INET, SOCK_DGRAM, 0);
+    if (fd < 0) {
+        error_setg_errno(errp, errno, "can't create datagram socket");
+        return -1;
+    }
+
+    /* Allow multiple sockets to bind the same multicast ip and port by setting
+     * SO_REUSEADDR. This is the only situation where SO_REUSEADDR should be set
+     * on windows. Use socket_set_fast_reuse otherwise as it sets SO_REUSEADDR
+     * only on posix systems.
+     */
+    val = 1;
+    ret = qemu_setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val));
+    if (ret < 0) {
+        error_setg_errno(errp, errno,
+                         "can't set socket option SO_REUSEADDR");
+        goto fail;
+    }
+
+    ret = bind(fd, (struct sockaddr *)mcastaddr, sizeof(*mcastaddr));
+    if (ret < 0) {
+        error_setg_errno(errp, errno, "can't bind ip=%s to socket",
+                         inet_ntoa(mcastaddr->sin_addr));
+        goto fail;
+    }
+
+    /* Add host to multicast group */
+    imr.imr_multiaddr = mcastaddr->sin_addr;
+    if (localaddr) {
+        imr.imr_interface = *localaddr;
+    } else {
+        imr.imr_interface.s_addr = htonl(INADDR_ANY);
+    }
+
+    ret = qemu_setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP,
+                          &imr, sizeof(struct ip_mreq));
+    if (ret < 0) {
+        error_setg_errno(errp, errno,
+                         "can't add socket to multicast group %s",
+                         inet_ntoa(imr.imr_multiaddr));
+        goto fail;
+    }
+
+    /* Force mcast msgs to loopback (e.g. several QEMUs on the same host) */
+    loop = 1;
+    ret = qemu_setsockopt(fd, IPPROTO_IP, IP_MULTICAST_LOOP,
+                          &loop, sizeof(loop));
+    if (ret < 0) {
+        error_setg_errno(errp, errno,
+                         "can't force multicast message to loopback");
+        goto fail;
+    }
+
+    /* If a bind address is given, only send packets from that address */
+    if (localaddr != NULL) {
+        ret = qemu_setsockopt(fd, IPPROTO_IP, IP_MULTICAST_IF,
+                              localaddr, sizeof(*localaddr));
+        if (ret < 0) {
+            error_setg_errno(errp, errno,
+                             "can't set the default network send interface");
+            goto fail;
+        }
+    }
+
+    qemu_set_nonblock(fd);
+    return fd;
+
+fail:
+    /* fd is always valid here: every jump happens after socket creation */
+    closesocket(fd);
+    return -1;
+}
+
+/*
+ * NetClientInfo.cleanup hook: unregister the handlers of, and close, the
+ * data socket and the listening socket (whichever are open), marking both
+ * as closed (-1) afterwards.
+ */
+static void net_socket_cleanup(NetClientState *nc)
+{
+    NetSocketState *s = DO_UPCAST(NetSocketState, nc, nc);
+
+    if (s->fd != -1) {
+        net_socket_read_poll(s, false);
+        net_socket_write_poll(s, false);
+        /*
+         * closesocket(), not close(): that is what the rest of this file
+         * uses for socket descriptors, including listen_fd just below.
+         */
+        closesocket(s->fd);
+        s->fd = -1;
+    }
+    if (s->listen_fd != -1) {
+        qemu_set_fd_handler(s->listen_fd, NULL, NULL, NULL);
+        closesocket(s->listen_fd);
+        s->listen_fd = -1;
+    }
+}
+
+/* Client callbacks used for SOCK_DGRAM sockets (see net_socket_fd_init_dgram()). */
+static NetClientInfo net_dgram_socket_info = {
+ .type = NET_CLIENT_DRIVER_SOCKET,
+ .size = sizeof(NetSocketState),
+ .receive = net_socket_receive_dgram,
+ .cleanup = net_socket_cleanup,
+};
+
+/*
+ * Wrap an existing datagram socket @fd in a new net client.
+ *
+ * When @is_connected is set and @mcast names a multicast group, a fresh
+ * multicast socket is created for that group and duplicated onto @fd, and
+ * the group address becomes the destination for outgoing packets.
+ * Otherwise the destination is left as it is, except that a UNIX-domain
+ * socket is marked with AF_UNIX so the transmit path uses plain send().
+ *
+ * Returns the new state, or NULL with @errp set.  On every failure path
+ * @fd is closed.
+ */
+static NetSocketState *net_socket_fd_init_dgram(NetClientState *peer,
+                                                const char *model,
+                                                const char *name,
+                                                int fd, int is_connected,
+                                                const char *mcast,
+                                                Error **errp)
+{
+    struct sockaddr_in saddr;
+    int newfd;
+    NetClientState *nc;
+    NetSocketState *s;
+    SocketAddress *sa;
+    SocketAddressType sa_type;
+
+    sa = socket_local_address(fd, errp);
+    if (!sa) {
+        /* close fd here too, like every other failure path below */
+        goto err;
+    }
+    sa_type = sa->type;
+    qapi_free_SocketAddress(sa);
+
+    /*
+     * fd passed with a multicast address: "learn" the destination from
+     * that address and save it.  Because the descriptor may be a socket
+     * shared with a "master" process, each datagram would be received by
+     * only ONE process, so this datagram socket must be "cloned" --jjo
+     */
+
+    if (is_connected && mcast != NULL) {
+        if (parse_host_port(&saddr, mcast, errp) < 0) {
+            goto err;
+        }
+        /* must be bound */
+        if (saddr.sin_addr.s_addr == 0) {
+            error_setg(errp, "can't setup multicast destination address");
+            goto err;
+        }
+        /* clone dgram socket */
+        newfd = net_socket_mcast_create(&saddr, NULL, errp);
+        if (newfd < 0) {
+            goto err;
+        }
+        /* clone newfd to fd, close newfd */
+        if (dup2(newfd, fd) < 0) {
+            error_setg_errno(errp, errno,
+                             "can't clone multicast socket onto fd %d", fd);
+            closesocket(newfd);
+            goto err;
+        }
+        closesocket(newfd);
+    }
+
+    nc = qemu_new_net_client(&net_dgram_socket_info, peer, model, name);
+
+    s = DO_UPCAST(NetSocketState, nc, nc);
+
+    s->fd = fd;
+    s->listen_fd = -1;
+    s->send_fn = net_socket_send_dgram;
+    net_socket_rs_init(&s->rs, net_socket_rs_finalize, false);
+    net_socket_read_poll(s, true);
+
+    /* mcast: save bound address as dst */
+    if (is_connected && mcast != NULL) {
+        s->dgram_dst = saddr;
+        snprintf(nc->info_str, sizeof(nc->info_str),
+                 "socket: fd=%d (cloned mcast=%s:%d)",
+                 fd, inet_ntoa(saddr.sin_addr), ntohs(saddr.sin_port));
+    } else {
+        if (sa_type == SOCKET_ADDRESS_TYPE_UNIX) {
+            s->dgram_dst.sin_family = AF_UNIX;
+        }
+
+        snprintf(nc->info_str, sizeof(nc->info_str),
+                 "socket: fd=%d %s", fd, SocketAddressType_str(sa_type));
+    }
+
+    return s;
+
+err:
+    closesocket(fd);
+    return NULL;
+}
+
+/*
+ * Switch the client to stream reception: install net_socket_send() as
+ * the read handler and enable read polling.  Called directly, and also
+ * registered as a write handler in net_socket_fd_init_stream().
+ */
+static void net_socket_connect(void *opaque)
+{
+ NetSocketState *s = opaque;
+ s->send_fn = net_socket_send;
+ net_socket_read_poll(s, true);
+}
+
+/* Client callbacks used for stream sockets (listen=, connect= and SOCK_STREAM fds). */
+static NetClientInfo net_socket_info = {
+ .type = NET_CLIENT_DRIVER_SOCKET,
+ .size = sizeof(NetSocketState),
+ .receive = net_socket_receive,
+ .cleanup = net_socket_cleanup,
+};
+
+/*
+ * Wrap an existing stream socket @fd in a new net client.
+ *
+ * With @is_connected set, reception starts immediately; otherwise
+ * net_socket_connect() is registered as the write handler of @fd.
+ * Always returns the new state.
+ */
+static NetSocketState *net_socket_fd_init_stream(NetClientState *peer,
+                                                 const char *model,
+                                                 const char *name,
+                                                 int fd, int is_connected)
+{
+    NetClientState *client;
+    NetSocketState *state;
+
+    client = qemu_new_net_client(&net_socket_info, peer, model, name);
+    snprintf(client->info_str, sizeof(client->info_str), "socket: fd=%d", fd);
+
+    state = DO_UPCAST(NetSocketState, nc, client);
+    state->fd = fd;
+    state->listen_fd = -1;
+    net_socket_rs_init(&state->rs, net_socket_rs_finalize, false);
+
+    /* Disable Nagle algorithm on TCP sockets to reduce latency */
+    socket_set_nodelay(fd);
+
+    if (!is_connected) {
+        qemu_set_fd_handler(state->fd, NULL, net_socket_connect, state);
+        return state;
+    }
+
+    net_socket_connect(state);
+    return state;
+}
+
+/*
+ * Create a net client around socket @fd, dispatching on its SO_TYPE to
+ * the datagram or stream initializer.
+ *
+ * Returns the new state, or NULL with @errp set.  @fd is closed here when
+ * its type cannot be queried or is neither SOCK_DGRAM nor SOCK_STREAM.
+ */
+static NetSocketState *net_socket_fd_init(NetClientState *peer,
+                                          const char *model, const char *name,
+                                          int fd, int is_connected,
+                                          const char *mc, Error **errp)
+{
+    int so_type = -1;
+    /* a real socklen_t instead of an int cast to socklen_t * */
+    socklen_t optlen = sizeof(so_type);
+
+    if (getsockopt(fd, SOL_SOCKET, SO_TYPE, (char *)&so_type, &optlen) < 0) {
+        error_setg(errp, "can't get socket option SO_TYPE");
+        closesocket(fd);
+        return NULL;
+    }
+
+    switch (so_type) {
+    case SOCK_DGRAM:
+        return net_socket_fd_init_dgram(peer, model, name, fd, is_connected,
+                                        mc, errp);
+    case SOCK_STREAM:
+        return net_socket_fd_init_stream(peer, model, name, fd, is_connected);
+    default:
+        error_setg(errp, "socket type=%d for fd=%d must be either"
+                   " SOCK_DGRAM or SOCK_STREAM", so_type, fd);
+        closesocket(fd);
+        return NULL;
+    }
+}
+
+/*
+ * Read handler of the listening socket: accept one connection (retrying
+ * on EINTR), stop watching the listening socket, and start stream
+ * reception on the accepted descriptor.  Any other accept failure leaves
+ * everything unchanged.
+ */
+static void net_socket_accept(void *opaque)
+{
+    NetSocketState *s = opaque;
+    struct sockaddr_in peer_addr;
+    socklen_t addr_len;
+    int conn;
+
+    do {
+        addr_len = sizeof(peer_addr);
+        conn = qemu_accept(s->listen_fd, (struct sockaddr *)&peer_addr,
+                           &addr_len);
+        if (conn < 0 && errno != EINTR) {
+            return;
+        }
+    } while (conn < 0);
+
+    qemu_set_fd_handler(s->listen_fd, NULL, NULL, NULL);
+
+    s->fd = conn;
+    s->nc.link_down = false;
+    net_socket_connect(s);
+    snprintf(s->nc.info_str, sizeof(s->nc.info_str),
+             "socket: connection from %s:%d",
+             inet_ntoa(peer_addr.sin_addr), ntohs(peer_addr.sin_port));
+}
+
+/*
+ * Set up a "listen=" client: bind a non-blocking TCP socket to @host_str,
+ * listen on it with a backlog of 0, and create a net client whose link is
+ * down until net_socket_accept() takes a connection.
+ *
+ * Returns 0 on success, -1 with @errp set on failure.
+ */
+static int net_socket_listen_init(NetClientState *peer,
+                                  const char *model,
+                                  const char *name,
+                                  const char *host_str,
+                                  Error **errp)
+{
+    struct sockaddr_in saddr;
+    NetClientState *nc;
+    NetSocketState *s;
+    int fd;
+
+    if (parse_host_port(&saddr, host_str, errp) < 0) {
+        return -1;
+    }
+
+    fd = qemu_socket(PF_INET, SOCK_STREAM, 0);
+    if (fd < 0) {
+        error_setg_errno(errp, errno, "can't create stream socket");
+        return -1;
+    }
+    qemu_set_nonblock(fd);
+
+    socket_set_fast_reuse(fd);
+
+    if (bind(fd, (struct sockaddr *)&saddr, sizeof(saddr)) < 0) {
+        error_setg_errno(errp, errno, "can't bind ip=%s to socket",
+                         inet_ntoa(saddr.sin_addr));
+        goto fail;
+    }
+    if (listen(fd, 0) < 0) {
+        error_setg_errno(errp, errno, "can't listen on socket");
+        goto fail;
+    }
+
+    nc = qemu_new_net_client(&net_socket_info, peer, model, name);
+    s = DO_UPCAST(NetSocketState, nc, nc);
+    s->fd = -1;
+    s->listen_fd = fd;
+    s->nc.link_down = true;
+    net_socket_rs_init(&s->rs, net_socket_rs_finalize, false);
+
+    qemu_set_fd_handler(s->listen_fd, net_socket_accept, NULL, s);
+    return 0;
+
+fail:
+    closesocket(fd);
+    return -1;
+}
+
+/*
+ * Set up a "connect=" client: start a non-blocking TCP connect to
+ * @host_str and hand the socket to net_socket_fd_init().
+ *
+ * connect() is retried on EINTR/EWOULDBLOCK; EINPROGRESS, EALREADY and
+ * EINVAL are treated as "not connected yet".  Any other error fails.
+ *
+ * Returns 0 on success, -1 with @errp set on failure.
+ */
+static int net_socket_connect_init(NetClientState *peer,
+                                   const char *model,
+                                   const char *name,
+                                   const char *host_str,
+                                   Error **errp)
+{
+    struct sockaddr_in saddr;
+    NetSocketState *s;
+    int connected = 0;
+    int fd;
+
+    if (parse_host_port(&saddr, host_str, errp) < 0) {
+        return -1;
+    }
+
+    fd = qemu_socket(PF_INET, SOCK_STREAM, 0);
+    if (fd < 0) {
+        error_setg_errno(errp, errno, "can't create stream socket");
+        return -1;
+    }
+    qemu_set_nonblock(fd);
+
+    for (;;) {
+        if (connect(fd, (struct sockaddr *)&saddr, sizeof(saddr)) >= 0) {
+            connected = 1;
+            break;
+        }
+        if (errno == EINTR || errno == EWOULDBLOCK) {
+            continue;
+        }
+        if (errno == EINPROGRESS ||
+            errno == EALREADY ||
+            errno == EINVAL) {
+            break;
+        }
+        error_setg_errno(errp, errno, "can't connect socket");
+        closesocket(fd);
+        return -1;
+    }
+
+    s = net_socket_fd_init(peer, model, name, fd, connected, NULL, errp);
+    if (!s) {
+        return -1;
+    }
+
+    snprintf(s->nc.info_str, sizeof(s->nc.info_str),
+             "socket: connect to %s:%d",
+             inet_ntoa(saddr.sin_addr), ntohs(saddr.sin_port));
+    return 0;
+}
+
+/*
+ * Set up an "mcast=" client: create a multicast socket for the group in
+ * @host_str (optionally tied to the IPv4 address in @localaddr_str) and
+ * use the group address as the destination of outgoing packets.
+ *
+ * Returns 0 on success, -1 with @errp set on failure.
+ */
+static int net_socket_mcast_init(NetClientState *peer,
+                                 const char *model,
+                                 const char *name,
+                                 const char *host_str,
+                                 const char *localaddr_str,
+                                 Error **errp)
+{
+    struct sockaddr_in group;
+    struct in_addr localaddr;
+    struct in_addr *bind_addr = NULL;
+    NetSocketState *s;
+    int fd;
+
+    if (parse_host_port(&group, host_str, errp) < 0) {
+        return -1;
+    }
+
+    if (localaddr_str != NULL) {
+        if (inet_aton(localaddr_str, &localaddr) == 0) {
+            error_setg(errp, "localaddr '%s' is not a valid IPv4 address",
+                       localaddr_str);
+            return -1;
+        }
+        bind_addr = &localaddr;
+    }
+
+    fd = net_socket_mcast_create(&group, bind_addr, errp);
+    if (fd < 0) {
+        return -1;
+    }
+
+    s = net_socket_fd_init(peer, model, name, fd, 0, NULL, errp);
+    if (!s) {
+        return -1;
+    }
+
+    s->dgram_dst = group;
+
+    snprintf(s->nc.info_str, sizeof(s->nc.info_str),
+             "socket: mcast=%s:%d",
+             inet_ntoa(group.sin_addr), ntohs(group.sin_port));
+    return 0;
+}
+
+/*
+ * Set up a "udp=" client: bind a non-blocking UDP socket to @lhost and
+ * send outgoing packets to @rhost.
+ *
+ * Returns 0 on success, -1 with @errp set on failure.
+ */
+static int net_socket_udp_init(NetClientState *peer,
+                               const char *model,
+                               const char *name,
+                               const char *rhost,
+                               const char *lhost,
+                               Error **errp)
+{
+    struct sockaddr_in local, remote;
+    NetSocketState *s;
+    int fd;
+
+    if (parse_host_port(&local, lhost, errp) < 0) {
+        return -1;
+    }
+
+    if (parse_host_port(&remote, rhost, errp) < 0) {
+        return -1;
+    }
+
+    fd = qemu_socket(PF_INET, SOCK_DGRAM, 0);
+    if (fd < 0) {
+        error_setg_errno(errp, errno, "can't create datagram socket");
+        return -1;
+    }
+
+    if (socket_set_fast_reuse(fd) < 0) {
+        error_setg_errno(errp, errno,
+                         "can't set socket option SO_REUSEADDR");
+        goto fail;
+    }
+    if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
+        error_setg_errno(errp, errno, "can't bind ip=%s to socket",
+                         inet_ntoa(local.sin_addr));
+        goto fail;
+    }
+    qemu_set_nonblock(fd);
+
+    s = net_socket_fd_init(peer, model, name, fd, 0, NULL, errp);
+    if (!s) {
+        return -1;
+    }
+
+    s->dgram_dst = remote;
+
+    snprintf(s->nc.info_str, sizeof(s->nc.info_str),
+             "socket: udp=%s:%d",
+             inet_ntoa(remote.sin_addr), ntohs(remote.sin_port));
+    return 0;
+
+fail:
+    closesocket(fd);
+    return -1;
+}
+
+/*
+ * Entry point for "-netdev socket": validate the option combination and
+ * dispatch to the fd=, listen=, connect=, mcast= or udp= initializer.
+ *
+ * Exactly one of those five options must be given.  localaddr= is only
+ * accepted together with mcast= or udp=, and is mandatory for udp=.
+ *
+ * Returns 0 on success, -1 with @errp set on failure.
+ */
+int net_init_socket(const Netdev *netdev, const char *name,
+                    NetClientState *peer, Error **errp)
+{
+    const NetdevSocketOptions *sock;
+
+    assert(netdev->type == NET_CLIENT_DRIVER_SOCKET);
+    sock = &netdev->u.socket;
+
+    if (sock->has_fd + sock->has_listen + sock->has_connect + sock->has_mcast +
+        sock->has_udp != 1) {
+        /* fd= is one of the counted alternatives, so name it as well */
+        error_setg(errp, "exactly one of fd=, listen=, connect=, mcast= or"
+                   " udp= is required");
+        return -1;
+    }
+
+    if (sock->has_localaddr && !sock->has_mcast && !sock->has_udp) {
+        error_setg(errp, "localaddr= is only valid with mcast= or udp=");
+        return -1;
+    }
+
+    if (sock->has_fd) {
+        int fd, ret;
+
+        fd = monitor_fd_param(monitor_cur(), sock->fd, errp);
+        if (fd == -1) {
+            return -1;
+        }
+        ret = qemu_try_set_nonblock(fd);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "%s: Can't use file descriptor %d",
+                             name, fd);
+            return -1;
+        }
+        if (!net_socket_fd_init(peer, "socket", name, fd, 1, sock->mcast,
+                                errp)) {
+            return -1;
+        }
+        return 0;
+    }
+
+    if (sock->has_listen) {
+        if (net_socket_listen_init(peer, "socket", name, sock->listen, errp)
+            < 0) {
+            return -1;
+        }
+        return 0;
+    }
+
+    if (sock->has_connect) {
+        if (net_socket_connect_init(peer, "socket", name, sock->connect, errp)
+            < 0) {
+            return -1;
+        }
+        return 0;
+    }
+
+    if (sock->has_mcast) {
+        /* if sock->localaddr is missing, it has been initialized to "all bits
+         * zero" */
+        if (net_socket_mcast_init(peer, "socket", name, sock->mcast,
+                                  sock->localaddr, errp) < 0) {
+            return -1;
+        }
+        return 0;
+    }
+
+    assert(sock->has_udp);
+    if (!sock->has_localaddr) {
+        error_setg(errp, "localaddr= is mandatory with udp=");
+        return -1;
+    }
+    if (net_socket_udp_init(peer, "socket", name, sock->udp, sock->localaddr,
+                            errp) < 0) {
+        return -1;
+    }
+    return 0;
+}
diff --git a/net/tap-bsd.c b/net/tap-bsd.c
new file mode 100644
index 000000000..e45a6d124
--- /dev/null
+++ b/net/tap-bsd.c
@@ -0,0 +1,258 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "qapi/error.h"
+#include "tap_int.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+
+#if defined(__NetBSD__) || defined(__FreeBSD__)
+#include <sys/ioctl.h>
+#include <net/if.h>
+#include <net/if_tap.h>
+#endif
+
+#ifndef __FreeBSD__
+/*
+ * Open a tap device (non-FreeBSD BSD variant).
+ *
+ * With a non-empty @ifname, /dev/<ifname> is opened; otherwise /dev/tap0
+ * .. /dev/tap9 are tried in order, stopping at the first success or at
+ * ENXIO/ENOENT.  The resulting interface name is copied back to @ifname.
+ * *@vnet_hdr is always cleared; requesting it as required is an error.
+ * @mq_required is not examined by this variant.
+ *
+ * Returns a non-blocking descriptor, or -1 with @errp set.  The
+ * descriptor is closed on every failure after a successful open.
+ */
+int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
+             int vnet_hdr_required, int mq_required, Error **errp)
+{
+    int fd;
+#ifdef TAPGIFNAME
+    struct ifreq ifr;
+#else
+    char *dev;
+    struct stat s;
+#endif
+
+    /* if no ifname is given, always start the search from tap0/tun0. */
+    int i;
+    char dname[100];
+
+    for (i = 0; i < 10; i++) {
+        if (*ifname) {
+            snprintf(dname, sizeof dname, "/dev/%s", ifname);
+        } else {
+            snprintf(dname, sizeof dname, "/dev/tap%d", i);
+        }
+        TFR(fd = open(dname, O_RDWR));
+        if (fd >= 0) {
+            break;
+        } else if (errno == ENXIO || errno == ENOENT) {
+            break;
+        }
+        if (*ifname) {
+            break;
+        }
+    }
+    if (fd < 0) {
+        error_setg_errno(errp, errno, "could not open %s", dname);
+        return -1;
+    }
+
+#ifdef TAPGIFNAME
+    if (ioctl(fd, TAPGIFNAME, (void *)&ifr) < 0) {
+        error_setg_errno(errp, errno, "could not get tap name");
+        close(fd); /* do not leak the device descriptor */
+        return -1;
+    }
+    pstrcpy(ifname, ifname_size, ifr.ifr_name);
+#else
+    if (fstat(fd, &s) < 0) {
+        error_setg_errno(errp, errno, "could not stat %s", dname);
+        close(fd); /* do not leak the device descriptor */
+        return -1;
+    }
+    dev = devname(s.st_rdev, S_IFCHR);
+    pstrcpy(ifname, ifname_size, dev);
+#endif
+
+    if (*vnet_hdr) {
+        /* BSD doesn't have IFF_VNET_HDR */
+        *vnet_hdr = 0;
+
+        if (vnet_hdr_required && !*vnet_hdr) {
+            error_setg(errp, "vnet_hdr=1 requested, but no kernel "
+                       "support for IFF_VNET_HDR available");
+            close(fd);
+            return -1;
+        }
+    }
+    fcntl(fd, F_SETFL, O_NONBLOCK);
+    return fd;
+}
+
+#else /* __FreeBSD__ */
+
+#define PATH_NET_TAP "/dev/tap"
+
+/*
+ * Open the cloning device PATH_NET_TAP to obtain a fresh tap interface.
+ *
+ * With a non-empty @ifname the new interface is renamed to it through
+ * SIOCSIFNAME; otherwise the name reported by TAPGIFNAME is copied to
+ * @ifname.
+ *
+ * Returns the device descriptor, or -1 with @errp set (the descriptor is
+ * closed again on failure).
+ */
+static int tap_open_clone(char *ifname, int ifname_size, Error **errp)
+{
+    int fd, s, ret;
+    int saved_errno;
+    struct ifreq ifr;
+
+    TFR(fd = open(PATH_NET_TAP, O_RDWR));
+    if (fd < 0) {
+        error_setg_errno(errp, errno, "could not open %s", PATH_NET_TAP);
+        return -1;
+    }
+
+    memset(&ifr, 0, sizeof(ifr));
+
+    ret = ioctl(fd, TAPGIFNAME, (void *)&ifr);
+    if (ret < 0) {
+        error_setg_errno(errp, errno, "could not get tap interface name");
+        close(fd);
+        return -1;
+    }
+
+    if (ifname[0] != '\0') {
+        /* User requested the interface to have a specific name */
+        s = socket(AF_LOCAL, SOCK_DGRAM, 0);
+        if (s < 0) {
+            error_setg_errno(errp, errno,
+                             "could not open socket to set interface name");
+            close(fd);
+            return -1;
+        }
+        ifr.ifr_data = ifname;
+        ret = ioctl(s, SIOCSIFNAME, (void *)&ifr);
+        /* close() may overwrite errno, so remember the ioctl's value */
+        saved_errno = errno;
+        close(s);
+        if (ret < 0) {
+            error_setg_errno(errp, saved_errno,
+                             "could not set tap interface name");
+            close(fd);
+            return -1;
+        }
+    } else {
+        pstrcpy(ifname, ifname_size, ifr.ifr_name);
+    }
+
+    return fd;
+}
+
+/*
+ * Open a tap device (FreeBSD variant).
+ *
+ * A named device that already exists under /dev is used directly; if it
+ * does not exist (ENOENT) or no name was given, a new interface is cloned
+ * with tap_open_clone().  *@vnet_hdr is always cleared; requiring it, or
+ * requiring multiqueue, is an error.
+ *
+ * Returns a non-blocking descriptor, or -1 with @errp set.
+ */
+int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
+             int vnet_hdr_required, int mq_required, Error **errp)
+{
+    int fd = -1;
+
+    /* If the specified tap device already exists just use it. */
+    if (ifname[0] != '\0') {
+        char path[100];
+
+        snprintf(path, sizeof path, "/dev/%s", ifname);
+        TFR(fd = open(path, O_RDWR));
+        if (fd < 0 && errno != ENOENT) {
+            error_setg_errno(errp, errno, "could not open %s", path);
+            return -1;
+        }
+    }
+
+    if (fd < 0) {
+        /* Tap device not specified or does not exist. */
+        fd = tap_open_clone(ifname, ifname_size, errp);
+        if (fd < 0) {
+            return -1;
+        }
+    }
+
+    if (*vnet_hdr) {
+        /* BSD doesn't have IFF_VNET_HDR */
+        *vnet_hdr = 0;
+
+        if (vnet_hdr_required) {
+            error_setg(errp, "vnet_hdr=1 requested, but no kernel "
+                       "support for IFF_VNET_HDR available");
+            close(fd);
+            return -1;
+        }
+    }
+    if (mq_required) {
+        error_setg(errp, "mq_required requested, but no kernel support"
+                   " for IFF_MULTI_QUEUE available");
+        close(fd);
+        return -1;
+    }
+
+    fcntl(fd, F_SETFL, O_NONBLOCK);
+    return fd;
+}
+#endif /* __FreeBSD__ */
+
+/* BSD variant: empty body; @errp is never set. */
+void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp)
+{
+}
+
+/* BSD variant: always returns 0 (tap_open() in this file notes that BSD has no IFF_VNET_HDR). */
+int tap_probe_vnet_hdr(int fd, Error **errp)
+{
+ return 0;
+}
+
+/* BSD variant: always returns 0. */
+int tap_probe_has_ufo(int fd)
+{
+ return 0;
+}
+
+/* BSD variant: always returns 0, whatever @len is. */
+int tap_probe_vnet_hdr_len(int fd, int len)
+{
+ return 0;
+}
+
+/* BSD variant: empty body. */
+void tap_fd_set_vnet_hdr_len(int fd, int len)
+{
+}
+
+/* BSD variant: always fails with -EINVAL. */
+int tap_fd_set_vnet_le(int fd, int is_le)
+{
+ return -EINVAL;
+}
+
+/* BSD variant: always fails with -EINVAL. */
+int tap_fd_set_vnet_be(int fd, int is_be)
+{
+ return -EINVAL;
+}
+
+/* BSD variant: empty body; all offload arguments are ignored. */
+void tap_fd_set_offload(int fd, int csum, int tso4,
+ int tso6, int ecn, int ufo)
+{
+}
+
+/* BSD variant: always returns -1. */
+int tap_fd_enable(int fd)
+{
+ return -1;
+}
+
+/* BSD variant: always returns -1. */
+int tap_fd_disable(int fd)
+{
+ return -1;
+}
+
+/* BSD variant: always returns -1 and leaves @ifname untouched. */
+int tap_fd_get_ifname(int fd, char *ifname)
+{
+ return -1;
+}
+
+/* BSD variant: always returns -1. */
+int tap_fd_set_steering_ebpf(int fd, int prog_fd)
+{
+ return -1;
+}
diff --git a/net/tap-linux.c b/net/tap-linux.c
new file mode 100644
index 000000000..958476974
--- /dev/null
+++ b/net/tap-linux.c
@@ -0,0 +1,331 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2009 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "tap_int.h"
+#include "tap-linux.h"
+#include "net/tap.h"
+
+#include <net/if.h>
+#include <sys/ioctl.h>
+
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/cutils.h"
+
+#define PATH_NET_TUN "/dev/net/tun"
+
+/*
+ * Open PATH_NET_TUN and attach it to a tap interface (Linux variant).
+ *
+ * @ifname:            in: requested name, or "" to use the "tap%d"
+ *                     template; out: name stored in the request after
+ *                     TUNSETIFF.
+ * @vnet_hdr:          in: non-zero to ask for IFF_VNET_HDR; out: 1 if it
+ *                     was enabled, 0 if the kernel does not offer it.
+ * @vnet_hdr_required: fail instead of continuing without IFF_VNET_HDR.
+ * @mq_required:       fail unless IFF_MULTI_QUEUE is offered.
+ *
+ * Returns a non-blocking descriptor, or -1 with @errp set.
+ */
+int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
+             int vnet_hdr_required, int mq_required, Error **errp)
+{
+    struct ifreq ifr;
+    unsigned int features;
+    int len = sizeof(struct virtio_net_hdr);
+    int fd;
+
+    TFR(fd = open(PATH_NET_TUN, O_RDWR));
+    if (fd < 0) {
+        error_setg_errno(errp, errno, "could not open %s", PATH_NET_TUN);
+        return -1;
+    }
+
+    if (ioctl(fd, TUNGETFEATURES, &features) == -1) {
+        warn_report("TUNGETFEATURES failed: %s", strerror(errno));
+        features = 0;
+    }
+
+    memset(&ifr, 0, sizeof(ifr));
+    ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
+    if (features & IFF_ONE_QUEUE) {
+        ifr.ifr_flags |= IFF_ONE_QUEUE;
+    }
+
+    if (*vnet_hdr) {
+        *vnet_hdr = (features & IFF_VNET_HDR) ? 1 : 0;
+        if (*vnet_hdr) {
+            ifr.ifr_flags |= IFF_VNET_HDR;
+        } else if (vnet_hdr_required) {
+            error_setg(errp, "vnet_hdr=1 requested, but no kernel "
+                       "support for IFF_VNET_HDR available");
+            goto fail;
+        }
+        /*
+         * Make sure vnet header size has the default value: for a persistent
+         * tap it might have been modified e.g. by another instance of qemu.
+         * Ignore errors since old kernels do not support this ioctl: in this
+         * case the header size implicitly has the correct value.
+         */
+        ioctl(fd, TUNSETVNETHDRSZ, &len);
+    }
+
+    if (mq_required) {
+        if (!(features & IFF_MULTI_QUEUE)) {
+            error_setg(errp, "multiqueue required, but no kernel "
+                       "support for IFF_MULTI_QUEUE available");
+            goto fail;
+        }
+        ifr.ifr_flags |= IFF_MULTI_QUEUE;
+    }
+
+    pstrcpy(ifr.ifr_name, IFNAMSIZ, ifname[0] != '\0' ? ifname : "tap%d");
+
+    if (ioctl(fd, TUNSETIFF, (void *) &ifr) != 0) {
+        if (ifname[0] != '\0') {
+            error_setg_errno(errp, errno, "could not configure %s (%s)",
+                             PATH_NET_TUN, ifr.ifr_name);
+        } else {
+            error_setg_errno(errp, errno, "could not configure %s",
+                             PATH_NET_TUN);
+        }
+        goto fail;
+    }
+
+    pstrcpy(ifname, ifname_size, ifr.ifr_name);
+    fcntl(fd, F_SETFL, O_NONBLOCK);
+    return fd;
+
+fail:
+    close(fd);
+    return -1;
+}
+
+/* sndbuf implements a kind of flow control for tap.
+ * Unfortunately when it's enabled, and packets are sent
+ * to other guests on the same host, the receiver
+ * can lock up the transmitter indefinitely.
+ *
+ * To avoid packet loss, sndbuf should be set to a value lower than the tx
+ * queue capacity of any destination network interface.
+ * Ethernet NICs generally have txqueuelen=1000, so 1Mb is
+ * a good value, given a 1500 byte MTU.
+ */
+#define TAP_DEFAULT_SNDBUF 0
+
+/*
+ * Apply the sndbuf= option to the tap descriptor.
+ *
+ * The value defaults to TAP_DEFAULT_SNDBUF, is clamped to INT_MAX, and 0
+ * is replaced by INT_MAX.  A failing ioctl is reported through @errp only
+ * when the user supplied sndbuf= explicitly.
+ */
+void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp)
+{
+    int sndbuf = TAP_DEFAULT_SNDBUF;
+
+    if (tap->has_sndbuf) {
+        sndbuf = tap->sndbuf > INT_MAX ? INT_MAX : tap->sndbuf;
+    }
+    if (sndbuf == 0) {
+        sndbuf = INT_MAX;
+    }
+
+    if (ioctl(fd, TUNSETSNDBUF, &sndbuf) != -1) {
+        return;
+    }
+    if (tap->has_sndbuf) {
+        error_setg_errno(errp, errno, "TUNSETSNDBUF ioctl failed");
+    }
+}
+
+/*
+ * Query the interface flags of @fd with TUNGETIFF.
+ *
+ * Returns the IFF_VNET_HDR bit of those flags (non-zero when set), or -1
+ * with @errp set when the ioctl fails.
+ */
+int tap_probe_vnet_hdr(int fd, Error **errp)
+{
+    struct ifreq ifr;
+
+    if (ioctl(fd, TUNGETIFF, &ifr) == 0) {
+        return ifr.ifr_flags & IFF_VNET_HDR;
+    }
+
+    /* TUNGETIFF is available since kernel v2.6.27 */
+    error_setg_errno(errp, errno,
+                     "Unable to query TUNGETIFF on FD %d", fd);
+    return -1;
+}
+
+/*
+ * Probe for UFO support by issuing TUNSETOFFLOAD with
+ * TUN_F_CSUM | TUN_F_UFO.  Returns 1 if the ioctl succeeds, 0 otherwise.
+ */
+int tap_probe_has_ufo(int fd)
+{
+    unsigned offload = TUN_F_CSUM | TUN_F_UFO;
+
+    return ioctl(fd, TUNSETOFFLOAD, offload) < 0 ? 0 : 1;
+}
+
+/*
+ * Verify that we can assign the given vnet header length.
+ *
+ * The current length is read, @len is set as a trial, and the original
+ * length is then restored.  Returns 1 if @len was accepted, 0 if the
+ * length could not be read or set.  Failure to restore the original
+ * length cannot be handled and aborts the process.
+ */
+int tap_probe_vnet_hdr_len(int fd, int len)
+{
+    int orig;
+    if (ioctl(fd, TUNGETVNETHDRSZ, &orig) == -1) {
+        return 0;
+    }
+    if (ioctl(fd, TUNSETVNETHDRSZ, &len) == -1) {
+        return 0;
+    }
+    /* Restore original length: we can't handle failure. */
+    if (ioctl(fd, TUNSETVNETHDRSZ, &orig) == -1) {
+        /* the failing call is the SET ioctl, so name that one */
+        fprintf(stderr, "TUNSETVNETHDRSZ ioctl() failed: %s. Exiting.\n",
+                strerror(errno));
+        abort();
+    }
+    return 1;
+}
+
+/* Set the vnet header length of @fd to @len; a failing ioctl aborts the process. */
+void tap_fd_set_vnet_hdr_len(int fd, int len)
+{
+    if (ioctl(fd, TUNSETVNETHDRSZ, &len) != -1) {
+        return;
+    }
+
+    fprintf(stderr, "TUNSETVNETHDRSZ ioctl() failed: %s. Exiting.\n",
+            strerror(errno));
+    abort();
+}
+
+/*
+ * Issue TUNSETVNETLE on @fd with 1 or 0 according to @is_le.
+ *
+ * Returns 0 on success and -EINVAL when the ioctl fails with EINVAL
+ * (checked as "kernel does not support TUNSETVNETLE").  Any other
+ * failure is reported and aborts the process.
+ */
+int tap_fd_set_vnet_le(int fd, int is_le)
+{
+    int enable = is_le ? 1 : 0;
+
+    if (ioctl(fd, TUNSETVNETLE, &enable) == 0) {
+        return 0;
+    }
+
+    /* Check if our kernel supports TUNSETVNETLE */
+    if (errno != EINVAL) {
+        error_report("TUNSETVNETLE ioctl() failed: %s.", strerror(errno));
+        abort();
+    }
+
+    return -EINVAL;
+}
+
+/*
+ * Issue TUNSETVNETBE on @fd with 1 or 0 according to @is_be.
+ *
+ * Returns 0 on success and -EINVAL when the ioctl fails with EINVAL
+ * (checked as "kernel does not support TUNSETVNETBE").  Any other
+ * failure is reported and aborts the process.
+ */
+int tap_fd_set_vnet_be(int fd, int is_be)
+{
+    int enable = is_be ? 1 : 0;
+
+    if (ioctl(fd, TUNSETVNETBE, &enable) == 0) {
+        return 0;
+    }
+
+    /* Check if our kernel supports TUNSETVNETBE */
+    if (errno != EINVAL) {
+        error_report("TUNSETVNETBE ioctl() failed: %s.", strerror(errno));
+        abort();
+    }
+
+    return -EINVAL;
+}
+
+/*
+ * Program the offload flags of @fd with TUNSETOFFLOAD.
+ *
+ * Nothing is done when a probe with 0 fails with EINVAL.  TSO, ECN and
+ * UFO are only requested together with @csum, and ECN only together with
+ * TSO.  If the kernel rejects the set, it is retried once without UFO; a
+ * second failure is only reported on stderr.
+ */
+void tap_fd_set_offload(int fd, int csum, int tso4,
+                        int tso6, int ecn, int ufo)
+{
+    unsigned int offload = 0;
+
+    /* Check if our kernel supports TUNSETOFFLOAD */
+    if (ioctl(fd, TUNSETOFFLOAD, 0) != 0 && errno == EINVAL) {
+        return;
+    }
+
+    if (csum) {
+        offload = TUN_F_CSUM;
+        offload |= tso4 ? TUN_F_TSO4 : 0;
+        offload |= tso6 ? TUN_F_TSO6 : 0;
+        offload |= ((tso4 || tso6) && ecn) ? TUN_F_TSO_ECN : 0;
+        offload |= ufo ? TUN_F_UFO : 0;
+    }
+
+    if (ioctl(fd, TUNSETOFFLOAD, offload) == 0) {
+        return;
+    }
+
+    /* second attempt without UFO */
+    offload &= ~TUN_F_UFO;
+    if (ioctl(fd, TUNSETOFFLOAD, offload) == 0) {
+        return;
+    }
+
+    fprintf(stderr, "TUNSETOFFLOAD ioctl() failed: %s\n",
+            strerror(errno));
+}
+
+/*
+ * Enable a specific queue of tap: issue TUNSETQUEUE with
+ * IFF_ATTACH_QUEUE.  Returns the ioctl result (0 on success); a failure
+ * is also reported.
+ */
+int tap_fd_enable(int fd)
+{
+    struct ifreq req;
+    int rc;
+
+    memset(&req, 0, sizeof(req));
+    req.ifr_flags = IFF_ATTACH_QUEUE;
+
+    rc = ioctl(fd, TUNSETQUEUE, (void *) &req);
+    if (rc == 0) {
+        return 0;
+    }
+
+    error_report("could not enable queue");
+    return rc;
+}
+
+/*
+ * Disable a specific queue of tap: issue TUNSETQUEUE with
+ * IFF_DETACH_QUEUE.  Returns the ioctl result (0 on success); a failure
+ * is also reported.
+ */
+int tap_fd_disable(int fd)
+{
+    struct ifreq req;
+    int rc;
+
+    memset(&req, 0, sizeof(req));
+    req.ifr_flags = IFF_DETACH_QUEUE;
+
+    rc = ioctl(fd, TUNSETQUEUE, (void *) &req);
+    if (rc == 0) {
+        return 0;
+    }
+
+    error_report("could not disable queue");
+    return rc;
+}
+
+/*
+ * Copy the interface name of tap descriptor @fd into @ifname.
+ *
+ * The copy is bounded by sizeof(ifr.ifr_name), so @ifname must offer at
+ * least that much room.  Returns 0 on success, -1 (after reporting the
+ * error) when TUNGETIFF fails.
+ */
+int tap_fd_get_ifname(int fd, char *ifname)
+{
+    struct ifreq ifr;
+
+    if (ioctl(fd, TUNGETIFF, &ifr) == 0) {
+        pstrcpy(ifname, sizeof(ifr.ifr_name), ifr.ifr_name);
+        return 0;
+    }
+
+    error_report("TUNGETIFF ioctl() failed: %s",
+                 strerror(errno));
+    return -1;
+}
+
+/*
+ * Pass @prog_fd to the tap descriptor @fd through TUNSETSTEERINGEBPF.
+ * Returns 0 on success, -1 (after reporting the error) on failure.
+ */
+int tap_fd_set_steering_ebpf(int fd, int prog_fd)
+{
+    if (ioctl(fd, TUNSETSTEERINGEBPF, (void *) &prog_fd) == 0) {
+        return 0;
+    }
+
+    error_report("Issue while setting TUNSETSTEERINGEBPF:"
+                 " %s with fd: %d, prog_fd: %d",
+                 strerror(errno), fd, prog_fd);
+
+    return -1;
+}
diff --git a/net/tap-linux.h b/net/tap-linux.h
new file mode 100644
index 000000000..1d06fe0de
--- /dev/null
+++ b/net/tap-linux.h
@@ -0,0 +1,54 @@
+/*
+ * Universal TUN/TAP device driver.
+ * Copyright (C) 1999-2000 Maxim Krasnyansky <max_mk@yahoo.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef QEMU_TAP_LINUX_H
+#define QEMU_TAP_LINUX_H
+
+#ifdef __linux__
+
+#include <linux/ioctl.h>
+
+/*
+ * Ioctl defines: local definitions of the TUN/TAP ioctl request numbers.
+ * They are only provided when building on Linux.
+ */
+#define TUNSETIFF _IOW('T', 202, int)
+#define TUNGETFEATURES _IOR('T', 207, unsigned int)
+#define TUNSETOFFLOAD _IOW('T', 208, unsigned int)
+#define TUNGETIFF _IOR('T', 210, unsigned int)
+#define TUNSETSNDBUF _IOW('T', 212, int)
+#define TUNGETVNETHDRSZ _IOR('T', 215, int)
+#define TUNSETVNETHDRSZ _IOW('T', 216, int)
+#define TUNSETQUEUE _IOW('T', 217, int)
+#define TUNSETVNETLE _IOW('T', 220, int)
+#define TUNSETVNETBE _IOW('T', 222, int)
+#define TUNSETSTEERINGEBPF _IOR('T', 224, int)
+
+#endif /* __linux__ */
+
+/*
+ * The flag and feature constants below sit outside the __linux__ guard,
+ * so they are defined for every includer of this header.
+ */
+
+/* TUNSETIFF ifr flags */
+#define IFF_TAP 0x0002
+#define IFF_NO_PI 0x1000
+#define IFF_ONE_QUEUE 0x2000
+#define IFF_VNET_HDR 0x4000
+#define IFF_MULTI_QUEUE 0x0100
+#define IFF_ATTACH_QUEUE 0x0200
+#define IFF_DETACH_QUEUE 0x0400
+
+/* Features for GSO (TUNSETOFFLOAD). */
+#define TUN_F_CSUM 0x01 /* You can hand me unchecksummed packets. */
+#define TUN_F_TSO4 0x02 /* I can handle TSO for IPv4 packets */
+#define TUN_F_TSO6 0x04 /* I can handle TSO for IPv6 packets */
+#define TUN_F_TSO_ECN 0x08 /* I can handle TSO with ECN bits. */
+#define TUN_F_UFO 0x10 /* I can handle UFO packets */
+
+#endif /* QEMU_TAP_LINUX_H */
diff --git a/net/tap-solaris.c b/net/tap-solaris.c
new file mode 100644
index 000000000..d85224242
--- /dev/null
+++ b/net/tap-solaris.c
@@ -0,0 +1,262 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "tap_int.h"
+#include "qemu/ctype.h"
+#include "qemu/cutils.h"
+#include "qemu-common.h"
+
+#include <sys/ethernet.h>
+#include <sys/sockio.h>
+#include <netinet/arp.h>
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> // must come after ip.h
+#include <netinet/udp.h>
+#include <netinet/tcp.h>
+#include <net/if.h>
+#include <stropts.h>
+#include "qemu/error-report.h"
+
+/*
+ * Read one message from the STREAMS tap descriptor @tapfd into @buf
+ * (at most @maxlen bytes) using getmsg().
+ *
+ * Returns the data length reported by getmsg(), or -1 if getmsg() fails.
+ */
+ssize_t tap_read_packet(int tapfd, uint8_t *buf, int maxlen)
+{
+    struct strbuf data;
+    int flags = 0;
+
+    data.maxlen = maxlen;
+    data.buf = (char *)buf;
+
+    if (getmsg(tapfd, NULL, &data, &flags) < 0) {
+        return -1;
+    }
+
+    return data.len;
+}
+
+#define TUNNEWPPA       (('T'<<16) | 0x0001)
+/*
+ * Allocate TAP device, returns opened fd.
+ * Stores dev name in the first arg(must be large enough).
+ *
+ * @dev:      in: optional requested name (its first digits select the PPA);
+ *            out: "tap<ppa>" for the interface that was set up.
+ * @dev_size: size of the @dev buffer.
+ * @errp:     set on the fatal failures that make the function return -1.
+ *
+ * On a fatal failure every descriptor opened for the tap device by this
+ * call (tap_fd, if_fd, arp_fd) is closed before returning.  ip_fd is kept
+ * in a static and is closed at the start of the next call instead.
+ */
+static int tap_alloc(char *dev, size_t dev_size, Error **errp)
+{
+    /* FIXME suspicious: many errors are reported, then ignored */
+    int tap_fd, if_fd, ppa = -1;
+    static int ip_fd = 0;
+    char *ptr;
+
+    static int arp_fd = 0;
+    int ip_muxid, arp_muxid;
+    struct strioctl  strioc_if, strioc_ppa;
+    int link_type = I_PLINK;
+    struct lifreq ifr;
+    char actual_name[32] = "";
+
+    memset(&ifr, 0x0, sizeof(ifr));
+
+    if (*dev) {
+        ptr = dev;
+        while (*ptr && !qemu_isdigit((int)*ptr)) {
+            ptr++;
+        }
+        ppa = atoi(ptr);
+    }
+
+    /* Check if IP device was opened */
+    if (ip_fd) {
+        close(ip_fd);
+    }
+
+    TFR(ip_fd = open("/dev/udp", O_RDWR, 0));
+    if (ip_fd < 0) {
+        error_setg(errp, "Can't open /dev/ip (actually /dev/udp)");
+        return -1;
+    }
+
+    TFR(tap_fd = open("/dev/tap", O_RDWR, 0));
+    if (tap_fd < 0) {
+        error_setg(errp, "Can't open /dev/tap");
+        return -1;
+    }
+
+    /* Assign a new PPA and get its unit number. */
+    strioc_ppa.ic_cmd = TUNNEWPPA;
+    strioc_ppa.ic_timout = 0;
+    strioc_ppa.ic_len = sizeof(ppa);
+    strioc_ppa.ic_dp = (char *)&ppa;
+    if ((ppa = ioctl (tap_fd, I_STR, &strioc_ppa)) < 0) {
+        error_report("Can't assign new interface");
+    }
+
+    TFR(if_fd = open("/dev/tap", O_RDWR, 0));
+    if (if_fd < 0) {
+        error_setg(errp, "Can't open /dev/tap (2)");
+        goto err_close_tap;
+    }
+    if (ioctl(if_fd, I_PUSH, "ip") < 0) {
+        error_setg(errp, "Can't push IP module");
+        goto err_close_if;
+    }
+
+    if (ioctl(if_fd, SIOCGLIFFLAGS, &ifr) < 0) {
+        error_report("Can't get flags");
+    }
+
+    snprintf (actual_name, 32, "tap%d", ppa);
+    pstrcpy(ifr.lifr_name, sizeof(ifr.lifr_name), actual_name);
+
+    ifr.lifr_ppa = ppa;
+    /* Assign ppa according to the unit number returned by tun device */
+
+    if (ioctl (if_fd, SIOCSLIFNAME, &ifr) < 0) {
+        error_report("Can't set PPA %d", ppa);
+    }
+    if (ioctl(if_fd, SIOCGLIFFLAGS, &ifr) < 0) {
+        error_report("Can't get flags");
+    }
+    /* Push arp module to if_fd */
+    if (ioctl (if_fd, I_PUSH, "arp") < 0) {
+        error_report("Can't push ARP module (2)");
+    }
+
+    /* Push arp module to ip_fd */
+    if (ioctl (ip_fd, I_POP, NULL) < 0) {
+        error_report("I_POP failed");
+    }
+    if (ioctl (ip_fd, I_PUSH, "arp") < 0) {
+        error_report("Can't push ARP module (3)");
+    }
+    /* Open arp_fd */
+    TFR(arp_fd = open ("/dev/tap", O_RDWR, 0));
+    if (arp_fd < 0) {
+        error_report("Can't open %s", "/dev/tap");
+    }
+
+    /* Set ifname to arp */
+    strioc_if.ic_cmd = SIOCSLIFNAME;
+    strioc_if.ic_timout = 0;
+    strioc_if.ic_len = sizeof(ifr);
+    strioc_if.ic_dp = (char *)&ifr;
+    if (ioctl(arp_fd, I_STR, &strioc_if) < 0) {
+        error_report("Can't set ifname to arp");
+    }
+
+    if ((ip_muxid = ioctl(ip_fd, I_LINK, if_fd)) < 0) {
+        error_setg(errp, "Can't link TAP device to IP");
+        goto err_close_arp;
+    }
+
+    if ((arp_muxid = ioctl (ip_fd, link_type, arp_fd)) < 0) {
+        error_report("Can't link TAP device to ARP");
+    }
+
+    close (if_fd);
+
+    memset(&ifr, 0x0, sizeof(ifr));
+    pstrcpy(ifr.lifr_name, sizeof(ifr.lifr_name), actual_name);
+    ifr.lifr_ip_muxid  = ip_muxid;
+    ifr.lifr_arp_muxid = arp_muxid;
+
+    if (ioctl (ip_fd, SIOCSLIFMUXID, &ifr) < 0) {
+        ioctl (ip_fd, I_PUNLINK , arp_muxid);
+        ioctl (ip_fd, I_PUNLINK, ip_muxid);
+        error_report("Can't set multiplexor id");
+    }
+
+    snprintf(dev, dev_size, "tap%d", ppa);
+    return tap_fd;
+
+    /* Fatal-error unwinding: release what this call opened, newest first. */
+err_close_arp:
+    if (arp_fd >= 0) {
+        close(arp_fd);
+        arp_fd = -1;
+    }
+err_close_if:
+    close(if_fd);
+err_close_tap:
+    close(tap_fd);
+    return -1;
+}
+
+/*
+ * Open a Solaris tap device and store its name in @ifname.
+ *
+ * *@vnet_hdr is always cleared, because Solaris has no IFF_VNET_HDR; if it
+ * was requested and @vnet_hdr_required is set, the open fails.  The
+ * returned descriptor is switched to non-blocking mode.
+ *
+ * Returns the tap fd, or -1 with @errp set.
+ */
+int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
+             int vnet_hdr_required, int mq_required, Error **errp)
+{
+    char tap_name[10] = "";
+    int tap_fd = tap_alloc(tap_name, sizeof(tap_name), errp);
+
+    if (tap_fd < 0) {
+        return -1;
+    }
+    pstrcpy(ifname, ifname_size, tap_name);
+
+    if (*vnet_hdr) {
+        /* Solaris doesn't have IFF_VNET_HDR */
+        *vnet_hdr = 0;
+
+        if (vnet_hdr_required) {
+            error_setg(errp, "vnet_hdr=1 requested, but no kernel "
+                       "support for IFF_VNET_HDR available");
+            close(tap_fd);
+            return -1;
+        }
+    }
+
+    fcntl(tap_fd, F_SETFL, O_NONBLOCK);
+    return tap_fd;
+}
+
+/* Send-buffer sizing is not implemented in this backend: no-op. */
+void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp)
+{
+}
+
+/* vnet header probing: always reports 0 in this backend. */
+int tap_probe_vnet_hdr(int fd, Error **errp)
+{
+ return 0;
+}
+
+/* UFO probing: always reports 0 in this backend. */
+int tap_probe_has_ufo(int fd)
+{
+ return 0;
+}
+
+/* vnet header length probing: always reports 0, whatever @len is. */
+int tap_probe_vnet_hdr_len(int fd, int len)
+{
+ return 0;
+}
+
+/* Setting the vnet header length is a no-op in this backend. */
+void tap_fd_set_vnet_hdr_len(int fd, int len)
+{
+}
+
+/* Little-endian vnet headers cannot be configured: always -EINVAL. */
+int tap_fd_set_vnet_le(int fd, int is_le)
+{
+ return -EINVAL;
+}
+
+/* Big-endian vnet headers cannot be configured: always -EINVAL. */
+int tap_fd_set_vnet_be(int fd, int is_be)
+{
+ return -EINVAL;
+}
+
+/* Offload configuration is a no-op in this backend. */
+void tap_fd_set_offload(int fd, int csum, int tso4,
+ int tso6, int ecn, int ufo)
+{
+}
+
+/* Queue enabling is not implemented: always fails with -1. */
+int tap_fd_enable(int fd)
+{
+ return -1;
+}
+
+/* Queue disabling is not implemented: always fails with -1. */
+int tap_fd_disable(int fd)
+{
+ return -1;
+}
+
+/* Interface name lookup is not implemented: always fails with -1. */
+int tap_fd_get_ifname(int fd, char *ifname)
+{
+ return -1;
+}
+
+/* eBPF steering is not implemented: always fails with -1. */
+int tap_fd_set_steering_ebpf(int fd, int prog_fd)
+{
+ return -1;
+}
diff --git a/net/tap-stub.c b/net/tap-stub.c
new file mode 100644
index 000000000..a0fa25804
--- /dev/null
+++ b/net/tap-stub.c
@@ -0,0 +1,92 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "tap_int.h"
+
+/* Stub: tap is unavailable in this build, so opening always fails. */
+int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
+ int vnet_hdr_required, int mq_required, Error **errp)
+{
+ error_setg(errp, "tap is not supported in this build");
+ return -1;
+}
+
+/* Stub: no-op. */
+void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp)
+{
+}
+
+/* Stub: always reports 0. */
+int tap_probe_vnet_hdr(int fd, Error **errp)
+{
+ return 0;
+}
+
+/* Stub: always reports 0. */
+int tap_probe_has_ufo(int fd)
+{
+ return 0;
+}
+
+/* Stub: always reports 0, whatever @len is. */
+int tap_probe_vnet_hdr_len(int fd, int len)
+{
+ return 0;
+}
+
+/* Stub: no-op. */
+void tap_fd_set_vnet_hdr_len(int fd, int len)
+{
+}
+
+/* Stub: always fails with -EINVAL. */
+int tap_fd_set_vnet_le(int fd, int is_le)
+{
+ return -EINVAL;
+}
+
+/* Stub: always fails with -EINVAL. */
+int tap_fd_set_vnet_be(int fd, int is_be)
+{
+ return -EINVAL;
+}
+
+/* Stub: no-op. */
+void tap_fd_set_offload(int fd, int csum, int tso4,
+ int tso6, int ecn, int ufo)
+{
+}
+
+/* Stub: always fails with -1. */
+int tap_fd_enable(int fd)
+{
+ return -1;
+}
+
+/* Stub: always fails with -1. */
+int tap_fd_disable(int fd)
+{
+ return -1;
+}
+
+/* Stub: always fails with -1. */
+int tap_fd_get_ifname(int fd, char *ifname)
+{
+ return -1;
+}
+
+/* Stub: always fails with -1. */
+int tap_fd_set_steering_ebpf(int fd, int prog_fd)
+{
+ return -1;
+}
diff --git a/net/tap-win32.c b/net/tap-win32.c
new file mode 100644
index 000000000..6096972f5
--- /dev/null
+++ b/net/tap-win32.c
@@ -0,0 +1,832 @@
+/*
+ * TAP-Win32 -- A kernel driver to provide virtual tap device functionality
+ * on Windows. Originally derived from the CIPE-Win32
+ * project by Damion K. Wilson, with extensive modifications by
+ * James Yonan.
+ *
+ * All source code which derives from the CIPE-Win32 project is
+ * Copyright (C) Damion K. Wilson, 2003, and is released under the
+ * GPL version 2 (see below).
+ *
+ * All other source code is Copyright (C) James Yonan, 2003-2004,
+ * and is released under the GPL version 2 (see below).
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program (see the file COPYING included with this
+ * distribution); if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "tap_int.h"
+
+#include "qemu-common.h"
+#include "clients.h" /* net_init_tap */
+#include "net/eth.h"
+#include "net/net.h"
+#include "net/tap.h" /* tap_has_ufo, ... */
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include <windows.h>
+#include <winioctl.h>
+
+//=============
+// TAP IOCTLs
+//=============
+
+#define TAP_CONTROL_CODE(request,method) \
+ CTL_CODE (FILE_DEVICE_UNKNOWN, request, method, FILE_ANY_ACCESS)
+
+#define TAP_IOCTL_GET_MAC TAP_CONTROL_CODE (1, METHOD_BUFFERED)
+#define TAP_IOCTL_GET_VERSION TAP_CONTROL_CODE (2, METHOD_BUFFERED)
+#define TAP_IOCTL_GET_MTU TAP_CONTROL_CODE (3, METHOD_BUFFERED)
+#define TAP_IOCTL_GET_INFO TAP_CONTROL_CODE (4, METHOD_BUFFERED)
+#define TAP_IOCTL_CONFIG_POINT_TO_POINT TAP_CONTROL_CODE (5, METHOD_BUFFERED)
+#define TAP_IOCTL_SET_MEDIA_STATUS TAP_CONTROL_CODE (6, METHOD_BUFFERED)
+#define TAP_IOCTL_CONFIG_DHCP_MASQ TAP_CONTROL_CODE (7, METHOD_BUFFERED)
+#define TAP_IOCTL_GET_LOG_LINE TAP_CONTROL_CODE (8, METHOD_BUFFERED)
+#define TAP_IOCTL_CONFIG_DHCP_SET_OPT TAP_CONTROL_CODE (9, METHOD_BUFFERED)
+
+//=================
+// Registry keys
+//=================
+
+#define ADAPTER_KEY "SYSTEM\\CurrentControlSet\\Control\\Class\\{4D36E972-E325-11CE-BFC1-08002BE10318}"
+
+#define NETWORK_CONNECTIONS_KEY "SYSTEM\\CurrentControlSet\\Control\\Network\\{4D36E972-E325-11CE-BFC1-08002BE10318}"
+
+//======================
+// Filesystem prefixes
+//======================
+
+#define USERMODEDEVICEDIR "\\\\.\\Global\\"
+#define TAPSUFFIX ".tap"
+
+
+//======================
+// Compile time configuration
+//======================
+
+//#define DEBUG_TAP_WIN32
+
+/* FIXME: The asynch write path appears to be broken at
+ * present. WriteFile() ignores the lpNumberOfBytesWritten parameter
+ * for overlapped writes, with the result we return zero bytes sent,
+ * and after handling a single packet, receive is disabled for this
+ * interface. */
+/* #define TUN_ASYNCHRONOUS_WRITES 1 */
+
+#define TUN_BUFFER_SIZE 1560
+#define TUN_MAX_BUFFER_COUNT 32
+
+/*
+ * The data member "buffer" must be the first element in the tun_buffer
+ * structure. See the function, tap_win32_free_buffer.
+ *
+ * tap_win32_free_buffer() casts a pointer to the packet bytes straight back
+ * to a tun_buffer_t pointer, which only works while "buffer" is at offset 0.
+ */
+typedef struct tun_buffer_s {
+ /* Packet bytes filled in by the reader thread. */
+ unsigned char buffer [TUN_BUFFER_SIZE];
+ /* Number of valid bytes in "buffer". */
+ unsigned long read_size;
+ /* Link used by both the free list and the output queue. */
+ struct tun_buffer_s* next;
+} tun_buffer_t;
+
+/*
+ * State shared between the reader thread and the code that consumes the
+ * packets it queues.
+ */
+typedef struct tap_win32_overlapped {
+ /* Handle of the opened TAP device. */
+ HANDLE handle;
+ /* Events signalled by the overlapped read and write requests. */
+ HANDLE read_event;
+ HANDLE write_event;
+ /* Counts buffers waiting on the output queue (starts at 0). */
+ HANDLE output_queue_semaphore;
+ /* Counts buffers on the free list (starts at TUN_MAX_BUFFER_COUNT). */
+ HANDLE free_list_semaphore;
+ /* Released once per queued packet; registered as a wait object. */
+ HANDLE tap_semaphore;
+ /* Locks protecting the output queue and the free list. */
+ CRITICAL_SECTION output_queue_cs;
+ CRITICAL_SECTION free_list_cs;
+ OVERLAPPED read_overlapped;
+ OVERLAPPED write_overlapped;
+ /* Backing storage for every buffer on either list. */
+ tun_buffer_t buffers[TUN_MAX_BUFFER_COUNT];
+ tun_buffer_t* free_list;
+ /* Output queue: buffers are removed at front and appended at back. */
+ tun_buffer_t* output_queue_front;
+ tun_buffer_t* output_queue_back;
+} tap_win32_overlapped_t;
+
+/* The single instance that every tap_win32_open() call initializes. */
+static tap_win32_overlapped_t tap_overlapped;
+
+/*
+ * Pop a buffer off the free list, blocking until the free-list semaphore
+ * signals that one is available.  The returned buffer has its next link
+ * cleared.
+ */
+static tun_buffer_t* get_buffer_from_free_list(tap_win32_overlapped_t* const overlapped)
+{
+    tun_buffer_t *head;
+
+    WaitForSingleObject(overlapped->free_list_semaphore, INFINITE);
+
+    EnterCriticalSection(&overlapped->free_list_cs);
+    head = overlapped->free_list;
+    overlapped->free_list = head->next;
+    LeaveCriticalSection(&overlapped->free_list_cs);
+
+    head->next = NULL;
+    return head;
+}
+
+/*
+ * Push @buffer back onto the head of the free list, then release the
+ * free-list semaphore so a waiter can take it.
+ */
+static void put_buffer_on_free_list(tap_win32_overlapped_t* const overlapped, tun_buffer_t* const buffer)
+{
+    CRITICAL_SECTION *lock = &overlapped->free_list_cs;
+
+    EnterCriticalSection(lock);
+    buffer->next = overlapped->free_list;
+    overlapped->free_list = buffer;
+    LeaveCriticalSection(lock);
+
+    ReleaseSemaphore(overlapped->free_list_semaphore, 1, NULL);
+}
+
+/*
+ * Take the oldest buffer off the output queue.
+ *
+ * With @block non-zero the call waits until a buffer is queued; otherwise
+ * it polls and returns immediately.  Returns NULL when the wait on the
+ * output-queue semaphore did not succeed (for example on time-out).
+ */
+static tun_buffer_t* get_buffer_from_output_queue(tap_win32_overlapped_t* const overlapped, const int block)
+{
+    tun_buffer_t *head = NULL;
+    DWORD timeout = block ? INFINITE : 0L;
+    DWORD wait_rc;
+
+    wait_rc = WaitForSingleObject(overlapped->output_queue_semaphore, timeout);
+    if (wait_rc != WAIT_OBJECT_0) {
+        /* Time-out or wait failure: nothing to hand out. */
+        return head;
+    }
+
+    EnterCriticalSection(&overlapped->output_queue_cs);
+
+    head = overlapped->output_queue_front;
+    overlapped->output_queue_front = head->next;
+    if (overlapped->output_queue_front == NULL) {
+        /* The queue just became empty, so there is no tail either. */
+        overlapped->output_queue_back = NULL;
+    }
+
+    LeaveCriticalSection(&overlapped->output_queue_cs);
+
+    return head;
+}
+
+/*
+ * Non-blocking variant of get_buffer_from_output_queue(): returns NULL
+ * instead of waiting when no buffer is queued.
+ */
+static tun_buffer_t* get_buffer_from_output_queue_immediate (tap_win32_overlapped_t* const overlapped)
+{
+ return get_buffer_from_output_queue(overlapped, 0);
+}
+
+/*
+ * Append @buffer to the tail of the output queue and release the
+ * output-queue semaphore so a consumer can pick it up.
+ */
+static void put_buffer_on_output_queue(tap_win32_overlapped_t* const overlapped, tun_buffer_t* const buffer)
+{
+    EnterCriticalSection(&overlapped->output_queue_cs);
+
+    /*
+     * The new tail must always terminate the list.  The consumer advances
+     * output_queue_front through buffer->next, so a stale link left in a
+     * buffer queued onto an empty list would corrupt the queue.
+     */
+    buffer->next = NULL;
+
+    if (overlapped->output_queue_front == NULL && overlapped->output_queue_back == NULL) {
+        overlapped->output_queue_front = overlapped->output_queue_back = buffer;
+    } else {
+        overlapped->output_queue_back->next = buffer;
+        overlapped->output_queue_back = buffer;
+    }
+
+    LeaveCriticalSection(&overlapped->output_queue_cs);
+
+    ReleaseSemaphore(overlapped->output_queue_semaphore, 1, NULL);
+}
+
+
+/*
+ * Check whether @guid is the NetCfgInstanceId of one of the adapters listed
+ * under ADAPTER_KEY in the registry.
+ *
+ * Returns TRUE on a match.  Returns FALSE when no adapter matches or when
+ * enumerating or opening a subkey fails.  Every registry key opened here
+ * is closed on all paths.
+ */
+static int is_tap_win32_dev(const char *guid)
+{
+    HKEY netcard_key;
+    LONG status;
+    DWORD len;
+    int i;
+    int found = FALSE;
+
+    status = RegOpenKeyEx(
+        HKEY_LOCAL_MACHINE,
+        ADAPTER_KEY,
+        0,
+        KEY_READ,
+        &netcard_key);
+
+    if (status != ERROR_SUCCESS) {
+        return FALSE;
+    }
+
+    for (i = 0; !found; ++i) {
+        char enum_name[256];
+        char unit_string[256];
+        HKEY unit_key;
+        char component_id_string[] = "ComponentId";
+        char component_id[256];
+        char net_cfg_instance_id_string[] = "NetCfgInstanceId";
+        char net_cfg_instance_id[256];
+        DWORD data_type;
+
+        len = sizeof (enum_name);
+        status = RegEnumKeyEx(
+            netcard_key,
+            i,
+            enum_name,
+            &len,
+            NULL,
+            NULL,
+            NULL,
+            NULL);
+
+        /*
+         * ERROR_NO_MORE_ITEMS ends the walk and any other error aborts it.
+         * Either way the answer is "not found", and leaving through the
+         * common exit closes netcard_key.
+         */
+        if (status != ERROR_SUCCESS) {
+            break;
+        }
+
+        snprintf (unit_string, sizeof(unit_string), "%s\\%s",
+                  ADAPTER_KEY, enum_name);
+
+        status = RegOpenKeyEx(
+            HKEY_LOCAL_MACHINE,
+            unit_string,
+            0,
+            KEY_READ,
+            &unit_key);
+
+        if (status != ERROR_SUCCESS) {
+            break;
+        }
+
+        len = sizeof (component_id);
+        status = RegQueryValueEx(
+            unit_key,
+            component_id_string,
+            NULL,
+            &data_type,
+            (LPBYTE)component_id,
+            &len);
+
+        if (status == ERROR_SUCCESS && data_type == REG_SZ) {
+            len = sizeof (net_cfg_instance_id);
+            status = RegQueryValueEx(
+                unit_key,
+                net_cfg_instance_id_string,
+                NULL,
+                &data_type,
+                (LPBYTE)net_cfg_instance_id,
+                &len);
+
+            if (status == ERROR_SUCCESS && data_type == REG_SZ) {
+                /*
+                 * Registry strings are not guaranteed to be stored with a
+                 * terminating NUL, so terminate before calling strcmp().
+                 */
+                if (len >= sizeof (net_cfg_instance_id)) {
+                    len = sizeof (net_cfg_instance_id) - 1;
+                }
+                net_cfg_instance_id[len] = '\0';
+
+                if (/* !strcmp (component_id, TAP_COMPONENT_ID) &&*/
+                    !strcmp (net_cfg_instance_id, guid)) {
+                    found = TRUE;
+                }
+            }
+        }
+        RegCloseKey (unit_key);
+    }
+
+    RegCloseKey (netcard_key);
+    return found;
+}
+
+/*
+ * Find the GUID of a TAP-Win32 adapter by walking the subkeys of
+ * NETWORK_CONNECTIONS_KEY in the registry.
+ *
+ * @name, @name_size: output buffer that receives the adapter GUID.
+ * @actual_name, @actual_name_size: in/out connection name.
+ *     - NULL: the first TAP adapter found is accepted.
+ *     - empty string: the first TAP adapter is accepted and its connection
+ *       name is stored here.
+ *     - otherwise: only an adapter whose connection name equals this
+ *       string is accepted.
+ *
+ * Returns 0 when an adapter was selected, -1 otherwise.  Every registry
+ * key opened here is closed on all paths.
+ */
+static int get_device_guid(
+    char *name,
+    int name_size,
+    char *actual_name,
+    int actual_name_size)
+{
+    LONG status;
+    HKEY control_net_key;
+    DWORD len;
+    int i;
+    int stop = 0;
+
+    status = RegOpenKeyEx(
+        HKEY_LOCAL_MACHINE,
+        NETWORK_CONNECTIONS_KEY,
+        0,
+        KEY_READ,
+        &control_net_key);
+
+    if (status != ERROR_SUCCESS) {
+        return -1;
+    }
+
+    for (i = 0; !stop; ++i) {
+        char enum_name[256];
+        char connection_string[256];
+        HKEY connection_key;
+        char name_data[256];
+        DWORD name_type;
+        const char name_string[] = "Name";
+
+        len = sizeof (enum_name);
+        status = RegEnumKeyEx(
+            control_net_key,
+            i,
+            enum_name,
+            &len,
+            NULL,
+            NULL,
+            NULL,
+            NULL);
+
+        /*
+         * End of enumeration or a registry error: nothing was selected.
+         * Leaving through the common exit closes control_net_key.
+         */
+        if (status != ERROR_SUCCESS) {
+            break;
+        }
+
+        snprintf(connection_string,
+                 sizeof(connection_string),
+                 "%s\\%s\\Connection",
+                 NETWORK_CONNECTIONS_KEY, enum_name);
+
+        status = RegOpenKeyEx(
+            HKEY_LOCAL_MACHINE,
+            connection_string,
+            0,
+            KEY_READ,
+            &connection_key);
+
+        if (status != ERROR_SUCCESS) {
+            continue;
+        }
+
+        len = sizeof (name_data);
+        status = RegQueryValueEx(
+            connection_key,
+            name_string,
+            NULL,
+            &name_type,
+            (LPBYTE)name_data,
+            &len);
+
+        if (status == ERROR_SUCCESS && name_type == REG_SZ) {
+            /*
+             * Registry strings are not guaranteed to be stored with a
+             * terminating NUL, so terminate before using it as a C string.
+             */
+            if (len >= sizeof (name_data)) {
+                len = sizeof (name_data) - 1;
+            }
+            name_data[len] = '\0';
+
+            if (is_tap_win32_dev(enum_name)) {
+                snprintf(name, name_size, "%s", enum_name);
+                if (actual_name == NULL) {
+                    stop = 1;
+                } else if (strcmp(actual_name, "") == 0) {
+                    snprintf(actual_name, actual_name_size, "%s", name_data);
+                    stop = 1;
+                } else if (strcmp(name_data, actual_name) == 0) {
+                    stop = 1;
+                }
+            }
+        }
+
+        RegCloseKey (connection_key);
+    }
+
+    RegCloseKey (control_net_key);
+
+    if (stop == 0) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Issue TAP_IOCTL_SET_MEDIA_STATUS on @handle with @status as the value.
+ * Returns the DeviceIoControl() result; the caller in this file treats
+ * zero as failure.
+ */
+static int tap_win32_set_status(HANDLE handle, int status)
+{
+ unsigned long len = 0;
+
+ /* @status serves as both the input and the output buffer of the ioctl. */
+ return DeviceIoControl(handle, TAP_IOCTL_SET_MEDIA_STATUS,
+ &status, sizeof (status),
+ &status, sizeof (status), &len, NULL);
+}
+
+/*
+ * Initialize @overlapped for the device @handle: create the read/write
+ * events, the two list locks and the three semaphores, put every buffer
+ * on the free list and leave the output queue empty.
+ *
+ * A failure to create a semaphore is only reported on stderr.
+ */
+static void tap_win32_overlapped_init(tap_win32_overlapped_t* const overlapped, const HANDLE handle)
+{
+    unsigned i;
+
+    overlapped->handle = handle;
+
+    /* Auto-reset, initially non-signalled events for overlapped I/O. */
+    overlapped->read_event = CreateEvent(NULL, FALSE, FALSE, NULL);
+    overlapped->write_event = CreateEvent(NULL, FALSE, FALSE, NULL);
+
+    overlapped->read_overlapped.Offset = 0;
+    overlapped->read_overlapped.OffsetHigh = 0;
+    overlapped->read_overlapped.hEvent = overlapped->read_event;
+
+    overlapped->write_overlapped.Offset = 0;
+    overlapped->write_overlapped.OffsetHigh = 0;
+    overlapped->write_overlapped.hEvent = overlapped->write_event;
+
+    InitializeCriticalSection(&overlapped->output_queue_cs);
+    InitializeCriticalSection(&overlapped->free_list_cs);
+
+    /* Nothing is queued yet, so this semaphore starts at zero. */
+    overlapped->output_queue_semaphore =
+        CreateSemaphore(NULL, 0, TUN_MAX_BUFFER_COUNT, NULL);
+    if (overlapped->output_queue_semaphore == NULL) {
+        fprintf(stderr, "error creating output queue semaphore!\n");
+    }
+
+    /* Every buffer starts out free, so this semaphore starts full. */
+    overlapped->free_list_semaphore =
+        CreateSemaphore(NULL, TUN_MAX_BUFFER_COUNT, TUN_MAX_BUFFER_COUNT, NULL);
+    if (overlapped->free_list_semaphore == NULL) {
+        fprintf(stderr, "error creating free list semaphore!\n");
+    }
+
+    overlapped->output_queue_front = NULL;
+    overlapped->output_queue_back = NULL;
+    overlapped->free_list = NULL;
+
+    /* Push each buffer onto the head of the free list in index order. */
+    for (i = 0; i < TUN_MAX_BUFFER_COUNT; i++) {
+        overlapped->buffers[i].next = overlapped->free_list;
+        overlapped->free_list = &overlapped->buffers[i];
+    }
+
+    /* To count buffers, initially no-signal. */
+    overlapped->tap_semaphore = CreateSemaphore(NULL, 0, TUN_MAX_BUFFER_COUNT, NULL);
+    if (overlapped->tap_semaphore == NULL) {
+        fprintf(stderr, "error creating tap_semaphore.\n");
+    }
+}
+
+/*
+ * Write @size bytes from @buffer to the TAP device.
+ *
+ * Unless TUN_ASYNCHRONOUS_WRITES is defined (which currently triggers an
+ * #error), a write that is reported as pending is waited for before
+ * returning.
+ *
+ * Returns the byte count reported by WriteFile()/GetOverlappedResult(),
+ * or 0 on failure.
+ */
+static int tap_win32_write(tap_win32_overlapped_t *overlapped,
+ const void *buffer, unsigned long size)
+{
+ unsigned long write_size;
+ BOOL result;
+ DWORD error;
+
+#ifdef TUN_ASYNCHRONOUS_WRITES
+ /* Wait for the previous overlapped write before starting a new one. */
+ result = GetOverlappedResult( overlapped->handle, &overlapped->write_overlapped,
+ &write_size, FALSE);
+
+ if (!result && GetLastError() == ERROR_IO_INCOMPLETE)
+ WaitForSingleObject(overlapped->write_event, INFINITE);
+#endif
+
+ result = WriteFile(overlapped->handle, buffer, size,
+ &write_size, &overlapped->write_overlapped);
+
+#ifdef TUN_ASYNCHRONOUS_WRITES
+ /* FIXME: we can't sensibly set write_size here, without waiting
+ * for the IO to complete! Moreover, we can't return zero,
+ * because that will disable receive on this interface, and we
+ * also can't assume it will succeed and return the full size,
+ * because that will result in the buffer being reclaimed while
+ * the IO is in progress. */
+#error Async writes are broken. Please disable TUN_ASYNCHRONOUS_WRITES.
+#else /* !TUN_ASYNCHRONOUS_WRITES */
+ if (!result) {
+ error = GetLastError();
+ /* The write was queued: block (bWait = TRUE) until it completes. */
+ if (error == ERROR_IO_PENDING) {
+ result = GetOverlappedResult(overlapped->handle,
+ &overlapped->write_overlapped,
+ &write_size, TRUE);
+ }
+ }
+#endif
+
+ if (!result) {
+#ifdef DEBUG_TAP_WIN32
+ LPTSTR msgbuf;
+ error = GetLastError();
+ FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER|FORMAT_MESSAGE_FROM_SYSTEM,
+ NULL, error, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+ &msgbuf, 0, NULL);
+ fprintf(stderr, "Tap-Win32: Error WriteFile %d - %s\n", error, msgbuf);
+ LocalFree(msgbuf);
+#endif
+ /* Failure is reported to the caller as zero bytes written. */
+ return 0;
+ }
+
+ return write_size;
+}
+
+/*
+ * Reader thread: loops forever reading packets from the TAP device into
+ * buffers taken from the free list.  Each non-empty packet is put on the
+ * output queue and tap_semaphore is released once for it.
+ *
+ * @param is the tap_win32_overlapped_t to operate on.
+ *
+ * A failed read never queues a buffer.  read_size is reset on every
+ * iteration and only trusted when the read succeeded, so an error path
+ * cannot publish a buffer with an indeterminate or stale length.
+ */
+static DWORD WINAPI tap_win32_thread_entry(LPVOID param)
+{
+    tap_win32_overlapped_t *overlapped = (tap_win32_overlapped_t*)param;
+    unsigned long read_size;
+    BOOL result;
+    DWORD dwError;
+    tun_buffer_t* buffer = get_buffer_from_free_list(overlapped);
+
+    for (;;) {
+        read_size = 0;
+        result = ReadFile(overlapped->handle,
+                          buffer->buffer,
+                          sizeof(buffer->buffer),
+                          &read_size,
+                          &overlapped->read_overlapped);
+        if (!result) {
+            dwError = GetLastError();
+            if (dwError == ERROR_IO_PENDING) {
+                /* The read was queued: wait for it, then fetch its size. */
+                WaitForSingleObject(overlapped->read_event, INFINITE);
+                result = GetOverlappedResult( overlapped->handle, &overlapped->read_overlapped,
+                                              &read_size, FALSE);
+                if (!result) {
+#ifdef DEBUG_TAP_WIN32
+                    LPVOID lpBuffer;
+                    dwError = GetLastError();
+                    FormatMessage( FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM,
+                                   NULL, dwError, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+                                   (LPTSTR) & lpBuffer, 0, NULL );
+                    fprintf(stderr, "Tap-Win32: Error GetOverlappedResult %d - %s\n", dwError, lpBuffer);
+                    LocalFree( lpBuffer );
+#endif
+                }
+            } else {
+#ifdef DEBUG_TAP_WIN32
+                LPVOID lpBuffer;
+                FormatMessage( FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM,
+                               NULL, dwError, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+                               (LPTSTR) & lpBuffer, 0, NULL );
+                fprintf(stderr, "Tap-Win32: Error ReadFile %d - %s\n", dwError, lpBuffer);
+                LocalFree( lpBuffer );
+#endif
+            }
+        }
+
+        /* Only hand a buffer over when the read really produced data. */
+        if (result && read_size > 0) {
+            buffer->read_size = read_size;
+            put_buffer_on_output_queue(overlapped, buffer);
+            ReleaseSemaphore(overlapped->tap_semaphore, 1, NULL);
+            buffer = get_buffer_from_free_list(overlapped);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Fetch the next queued packet without blocking.
+ *
+ * On success *@pbuf points at the packet bytes and the return value is
+ * the packet length clamped to @max_size.  When nothing is queued the
+ * function returns 0 and leaves *@pbuf untouched.
+ */
+static int tap_win32_read(tap_win32_overlapped_t *overlapped,
+                          uint8_t **pbuf, int max_size)
+{
+    tun_buffer_t *pending = get_buffer_from_output_queue_immediate(overlapped);
+    int len;
+
+    if (pending == NULL) {
+        return 0;
+    }
+
+    *pbuf = pending->buffer;
+    len = (int)pending->read_size;
+
+    return len > max_size ? max_size : len;
+}
+
+/*
+ * Return the buffer whose packet bytes start at @pbuf to the free list.
+ */
+static void tap_win32_free_buffer(tap_win32_overlapped_t *overlapped,
+ uint8_t *pbuf)
+{
+ /* Valid because "buffer" is the first member of tun_buffer_t. */
+ tun_buffer_t* buffer = (tun_buffer_t*)pbuf;
+ put_buffer_on_free_list(overlapped, buffer);
+}
+
+/*
+ * Open a TAP-Win32 adapter and start the reader thread for it.
+ *
+ * @phandle: receives a pointer to the global tap_overlapped state.
+ * @preferred_name: connection name of the adapter to use, or NULL to take
+ *     the first TAP adapter found.
+ *
+ * Returns 0 on success and -1 on failure.  The device handle is closed on
+ * every failure path that occurs after it has been opened.
+ */
+static int tap_win32_open(tap_win32_overlapped_t **phandle,
+                          const char *preferred_name)
+{
+    char device_path[256];
+    char device_guid[0x100];
+    int rc;
+    HANDLE handle;
+    BOOL bret;
+    char name_buffer[0x100] = {0, };
+    struct {
+        unsigned long major;
+        unsigned long minor;
+        unsigned long debug;
+    } version;
+    DWORD version_len;
+    DWORD idThread;
+
+    if (preferred_name != NULL) {
+        snprintf(name_buffer, sizeof(name_buffer), "%s", preferred_name);
+    }
+
+    rc = get_device_guid(device_guid, sizeof(device_guid), name_buffer, sizeof(name_buffer));
+    if (rc) {
+        return -1;
+    }
+
+    snprintf (device_path, sizeof(device_path), "%s%s%s",
+              USERMODEDEVICEDIR,
+              device_guid,
+              TAPSUFFIX);
+
+    handle = CreateFile (
+        device_path,
+        GENERIC_READ | GENERIC_WRITE,
+        0,
+        0,
+        OPEN_EXISTING,
+        FILE_ATTRIBUTE_SYSTEM | FILE_FLAG_OVERLAPPED,
+        0 );
+
+    if (handle == INVALID_HANDLE_VALUE) {
+        return -1;
+    }
+
+    /* The version query doubles as a check that the driver responds. */
+    bret = DeviceIoControl(handle, TAP_IOCTL_GET_VERSION,
+                           &version, sizeof (version),
+                           &version, sizeof (version), &version_len, NULL);
+
+    if (bret == FALSE) {
+        CloseHandle(handle);
+        return -1;
+    }
+
+    if (!tap_win32_set_status(handle, TRUE)) {
+        /* Do not leak the device handle when it cannot be brought up. */
+        CloseHandle(handle);
+        return -1;
+    }
+
+    tap_win32_overlapped_init(&tap_overlapped, handle);
+
+    *phandle = &tap_overlapped;
+
+    CreateThread(NULL, 0, tap_win32_thread_entry,
+                 (LPVOID)&tap_overlapped, 0, &idThread);
+    return 0;
+}
+
+/********************************************/
+
+ /* Per-client state of the win32 tap backend. */
+ typedef struct TAPState {
+ /* Generic net client; DO_UPCAST() converts between it and TAPState. */
+ NetClientState nc;
+ /* Overlapped I/O state obtained from tap_win32_open(). */
+ tap_win32_overlapped_t *handle;
+ } TAPState;
+
+/*
+ * Cleanup callback: unregisters tap_semaphore from the main loop's wait
+ * objects.  The reader thread and the device handle are left alone (see
+ * the FIXME below).
+ */
+static void tap_cleanup(NetClientState *nc)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+
+ qemu_del_wait_object(s->handle->tap_semaphore, NULL, NULL);
+
+ /* FIXME: need to kill thread and close file handle:
+ tap_win32_close(s);
+ */
+}
+
+/*
+ * Receive callback: writes the packet to the TAP device and returns the
+ * result of tap_win32_write(), which is 0 when the write failed.
+ */
+static ssize_t tap_receive(NetClientState *nc, const uint8_t *buf, size_t size)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+
+ return tap_win32_write(s->handle, buf, size);
+}
+
+/*
+ * Wait-object callback: takes one queued packet (if any), pads it to the
+ * minimum Ethernet frame size when the peer needs padding, passes it to
+ * qemu_send_packet() and returns the buffer to the free list.
+ */
+static void tap_win32_send(void *opaque)
+{
+    TAPState *s = opaque;
+    uint8_t padded[ETH_ZLEN];
+    size_t padded_len = sizeof(padded);
+    uint8_t *pkt, *queued;
+    int len;
+
+    len = tap_win32_read(s->handle, &pkt, 4096);
+    if (len <= 0) {
+        return;
+    }
+
+    /* Remember the queue buffer: pkt may be redirected to the padded copy. */
+    queued = pkt;
+
+    if (net_peer_needs_padding(&s->nc) &&
+        eth_pad_short_frame(padded, &padded_len, pkt, len)) {
+        pkt = padded;
+        len = padded_len;
+    }
+
+    qemu_send_packet(&s->nc, pkt, len);
+    tap_win32_free_buffer(s->handle, queued);
+}
+
+/* UFO is never reported as available by this backend. */
+static bool tap_has_ufo(NetClientState *nc)
+{
+ return false;
+}
+
+/* vnet headers are never reported as available by this backend. */
+static bool tap_has_vnet_hdr(NetClientState *nc)
+{
+ return false;
+}
+
+/* vnet header length probing: always reports 0, whatever @len is. */
+int tap_probe_vnet_hdr_len(int fd, int len)
+{
+ return 0;
+}
+
+/* Setting the vnet header length on an fd is a no-op here. */
+void tap_fd_set_vnet_hdr_len(int fd, int len)
+{
+}
+
+/* Little-endian vnet headers cannot be configured: always -EINVAL. */
+int tap_fd_set_vnet_le(int fd, int is_le)
+{
+ return -EINVAL;
+}
+
+/* Big-endian vnet headers cannot be configured: always -EINVAL. */
+int tap_fd_set_vnet_be(int fd, int is_be)
+{
+ return -EINVAL;
+}
+
+/* No-op: the request is ignored. */
+static void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr)
+{
+}
+
+/* No-op: offload settings are ignored. */
+static void tap_set_offload(NetClientState *nc, int csum, int tso4,
+ int tso6, int ecn, int ufo)
+{
+}
+
+/* This backend has no vhost-net instance: always NULL. */
+struct vhost_net *tap_get_vhost_net(NetClientState *nc)
+{
+ return NULL;
+}
+
+/* No vnet header length is ever reported as supported. */
+static bool tap_has_vnet_hdr_len(NetClientState *nc, int len)
+{
+ return false;
+}
+
+/*
+ * Aborts when called.  NOTE(review): presumably unreachable because
+ * tap_has_vnet_hdr_len() always returns false - confirm against callers.
+ */
+static void tap_set_vnet_hdr_len(NetClientState *nc, int len)
+{
+ abort();
+}
+
+/*
+ * Net client callbacks of the win32 tap backend.  Apart from receive and
+ * cleanup, every callback wired up here is one of the fixed-result or
+ * no-op helpers defined earlier in this file.
+ */
+static NetClientInfo net_tap_win32_info = {
+ .type = NET_CLIENT_DRIVER_TAP,
+ .size = sizeof(TAPState),
+ .receive = tap_receive,
+ .cleanup = tap_cleanup,
+ .has_ufo = tap_has_ufo,
+ .has_vnet_hdr = tap_has_vnet_hdr,
+ .has_vnet_hdr_len = tap_has_vnet_hdr_len,
+ .using_vnet_hdr = tap_using_vnet_hdr,
+ .set_offload = tap_set_offload,
+ .set_vnet_hdr_len = tap_set_vnet_hdr_len,
+};
+
+/*
+ * Open the TAP adapter named @ifname, create a net client for it and
+ * register tap_semaphore with the main loop so that tap_win32_send() runs
+ * for queued packets.
+ *
+ * Returns 0 on success, -1 if the adapter could not be opened.
+ */
+static int tap_win32_init(NetClientState *peer, const char *model,
+                          const char *name, const char *ifname)
+{
+    tap_win32_overlapped_t *tap;
+    NetClientState *client;
+    TAPState *state;
+
+    if (tap_win32_open(&tap, ifname) < 0) {
+        printf("tap: Could not open '%s'\n", ifname);
+        return -1;
+    }
+
+    client = qemu_new_net_client(&net_tap_win32_info, peer, model, name);
+    state = DO_UPCAST(TAPState, nc, client);
+
+    state->handle = tap;
+    snprintf(state->nc.info_str, sizeof(state->nc.info_str),
+             "tap: ifname=%s", ifname);
+
+    qemu_add_wait_object(tap->tap_semaphore, tap_win32_send, state);
+
+    return 0;
+}
+
+/*
+ * Create a win32 tap net client from the tap options in @netdev.
+ *
+ * An interface name is mandatory.  Returns 0 on success and -1 on failure;
+ * @errp is not set yet (see the FIXME).
+ */
+int net_init_tap(const Netdev *netdev, const char *name,
+                 NetClientState *peer, Error **errp)
+{
+    /* FIXME error_setg(errp, ...) on failure */
+    const NetdevTapOptions *opts;
+
+    assert(netdev->type == NET_CLIENT_DRIVER_TAP);
+    opts = &netdev->u.tap;
+
+    if (!opts->has_ifname) {
+        error_report("tap: no interface name");
+        return -1;
+    }
+
+    return tap_win32_init(peer, "tap", name, opts->ifname) == -1 ? -1 : 0;
+}
+
+/* Not implemented in the win32 backend: aborts when called. */
+int tap_enable(NetClientState *nc)
+{
+ abort();
+}
+
+/* Not implemented in the win32 backend: aborts when called. */
+int tap_disable(NetClientState *nc)
+{
+ abort();
+}
diff --git a/net/tap.c b/net/tap.c
new file mode 100644
index 000000000..f716be3e3
--- /dev/null
+++ b/net/tap.c
@@ -0,0 +1,1056 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2009 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "tap_int.h"
+
+
+#include <sys/ioctl.h>
+#include <sys/wait.h>
+#include <sys/socket.h>
+#include <net/if.h>
+
+#include "net/eth.h"
+#include "net/net.h"
+#include "clients.h"
+#include "monitor/monitor.h"
+#include "sysemu/sysemu.h"
+#include "qapi/error.h"
+#include "qemu-common.h"
+#include "qemu/cutils.h"
+#include "qemu/error-report.h"
+#include "qemu/main-loop.h"
+#include "qemu/sockets.h"
+
+#include "net/tap.h"
+
+#include "net/vhost_net.h"
+
+/* Per-queue state of a tap net client. */
+typedef struct TAPState {
+ /* Generic client state; DO_UPCAST(TAPState, nc, ...) maps back from it. */
+ NetClientState nc;
+ /* Tap file descriptor; tap_cleanup() closes it and stores -1. */
+ int fd;
+ /* Script run by tap_exit_notify(); empty string means none. */
+ char down_script[1024];
+ /* Argument for down_script (the interface name). */
+ char down_script_arg[128];
+ /* Buffer tap_send() reads host packets into. */
+ uint8_t buf[NET_BUFSIZE];
+ /* Whether tap_send() should be installed as the read handler. */
+ bool read_poll;
+ /* Whether tap_writable() should be installed as the write handler. */
+ bool write_poll;
+ /*
+  * Set through tap_using_vnet_hdr(). While false with a non-zero
+  * host_vnet_hdr_len, this file strips the header on read and prepends
+  * a zeroed one on write.
+  */
+ bool using_vnet_hdr;
+ /* Result of tap_probe_has_ufo() taken in net_tap_fd_init(). */
+ bool has_ufo;
+ /* Gates both fd handlers; toggled by tap_enable()/tap_disable(). */
+ bool enabled;
+ /* vhost-net backend created by net_init_tap_one(), or NULL. */
+ VHostNetState *vhost_net;
+ /* Length of the vnet header on the tap fd; 0 when there is none. */
+ unsigned host_vnet_hdr_len;
+ /* Exit notifier; its callback is tap_exit_notify(). */
+ Notifier exit;
+} TAPState;
+
+static void launch_script(const char *setup_script, const char *ifname,
+ int fd, Error **errp);
+
+static void tap_send(void *opaque);
+static void tap_writable(void *opaque);
+
+/*
+ * (Re)register the fd handlers for the tap descriptor.
+ * A handler is installed only when the queue is enabled and the matching
+ * poll flag is set; otherwise NULL is registered for that direction.
+ */
+static void tap_update_fd_handler(TAPState *s)
+{
+    void (*on_readable)(void *) = NULL;
+    void (*on_writable)(void *) = NULL;
+
+    if (s->enabled) {
+        if (s->read_poll) {
+            on_readable = tap_send;
+        }
+        if (s->write_poll) {
+            on_writable = tap_writable;
+        }
+    }
+
+    qemu_set_fd_handler(s->fd, on_readable, on_writable, s);
+}
+
+/* Record whether reads should be polled, then refresh the fd handlers. */
+static void tap_read_poll(TAPState *s, bool enable)
+{
+ s->read_poll = enable;
+ tap_update_fd_handler(s);
+}
+
+/* Record whether writability should be polled, then refresh the fd handlers. */
+static void tap_write_poll(TAPState *s, bool enable)
+{
+ s->write_poll = enable;
+ tap_update_fd_handler(s);
+}
+
+/*
+ * Write-ready callback for the tap fd: stop polling for writability and
+ * flush the packets queued for this client.
+ */
+static void tap_writable(void *opaque)
+{
+ TAPState *s = opaque;
+
+ tap_write_poll(s, false);
+
+ qemu_flush_queued_packets(&s->nc);
+}
+
+/*
+ * Write one packet, given as an iovec array, to the tap fd.
+ *
+ * The write is retried while it is interrupted (EINTR). If the fd would
+ * block (EAGAIN), write polling is turned on and 0 is returned. Otherwise
+ * the writev() result is returned as is (byte count, or -1 on error).
+ */
+static ssize_t tap_write_packet(TAPState *s, const struct iovec *iov, int iovcnt)
+{
+    ssize_t written;
+
+    for (;;) {
+        written = writev(s->fd, iov, iovcnt);
+        if (written != -1 || errno != EINTR) {
+            break;
+        }
+    }
+
+    if (written == -1 && errno == EAGAIN) {
+        tap_write_poll(s, true);
+        return 0;
+    }
+
+    return written;
+}
+
+/*
+ * Send a scatter/gather packet from the peer to the tap fd.
+ *
+ * When the tap fd carries a vnet header that the peer does not produce,
+ * a zero-filled header of host_vnet_hdr_len bytes is placed in front of
+ * the caller's vectors.
+ */
+static ssize_t tap_receive_iov(NetClientState *nc, const struct iovec *iov,
+                               int iovcnt)
+{
+    TAPState *s = DO_UPCAST(TAPState, nc, nc);
+    struct iovec with_hdr[iovcnt + 1];
+    struct virtio_net_hdr_mrg_rxbuf zero_hdr = { };
+
+    if (!s->host_vnet_hdr_len || s->using_vnet_hdr) {
+        return tap_write_packet(s, iov, iovcnt);
+    }
+
+    with_hdr[0].iov_base = &zero_hdr;
+    with_hdr[0].iov_len = s->host_vnet_hdr_len;
+    memcpy(&with_hdr[1], iov, iovcnt * sizeof(*iov));
+
+    return tap_write_packet(s, with_hdr, iovcnt + 1);
+}
+
+/*
+ * Send a raw frame to the tap fd. Whenever the fd carries a vnet header
+ * (host_vnet_hdr_len != 0) a zero-filled header of that length is written
+ * in front of the frame.
+ */
+static ssize_t tap_receive_raw(NetClientState *nc, const uint8_t *buf, size_t size)
+{
+    TAPState *s = DO_UPCAST(TAPState, nc, nc);
+    struct virtio_net_hdr_mrg_rxbuf zero_hdr = { };
+    struct iovec vec[2];
+    int used = 0;
+
+    if (s->host_vnet_hdr_len) {
+        vec[used].iov_base = &zero_hdr;
+        vec[used].iov_len = s->host_vnet_hdr_len;
+        used++;
+    }
+
+    vec[used].iov_base = (char *)buf;
+    vec[used].iov_len = size;
+    used++;
+
+    return tap_write_packet(s, vec, used);
+}
+
+/*
+ * Send a linear packet from the peer to the tap fd.
+ * If the fd carries a vnet header the peer does not produce, the packet is
+ * routed through tap_receive_raw(), which prepends a zeroed header.
+ */
+static ssize_t tap_receive(NetClientState *nc, const uint8_t *buf, size_t size)
+{
+    TAPState *s = DO_UPCAST(TAPState, nc, nc);
+    struct iovec single;
+
+    if (s->host_vnet_hdr_len && !s->using_vnet_hdr) {
+        return tap_receive_raw(nc, buf, size);
+    }
+
+    single.iov_base = (char *)buf;
+    single.iov_len = size;
+
+    return tap_write_packet(s, &single, 1);
+}
+
+#ifndef __sun__
+/*
+ * Read from @tapfd into @buf (at most @maxlen bytes) with a plain read()
+ * and return its result. Not compiled when __sun__ is defined.
+ */
+ssize_t tap_read_packet(int tapfd, uint8_t *buf, int maxlen)
+{
+ return read(tapfd, buf, maxlen);
+}
+#endif
+
+/*
+ * Completion callback that tap_send() hands to qemu_send_packet_async():
+ * turns read polling back on. @len is not used.
+ */
+static void tap_send_completed(NetClientState *nc, ssize_t len)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+ tap_read_poll(s, true);
+}
+
+/*
+ * Read handler for the tap fd: pull packets from the host side and forward
+ * them to the peer.
+ *
+ * The loop ends when the read returns nothing or fails, when a packet is
+ * too short to hold anything beyond the vnet header, when the peer cannot
+ * accept the packet right now (read polling is then switched off until
+ * tap_send_completed() runs), when sending fails, or after 50 packets.
+ */
+static void tap_send(void *opaque)
+{
+    TAPState *s = opaque;
+    int size;
+    int packets = 0;
+
+    while (true) {
+        uint8_t *buf = s->buf;
+        uint8_t min_pkt[ETH_ZLEN];
+        size_t min_pktsz = sizeof(min_pkt);
+
+        size = tap_read_packet(s->fd, s->buf, sizeof(s->buf));
+        if (size <= 0) {
+            break;
+        }
+
+        /*
+         * A read that does not extend past the vnet header carries no
+         * frame. Reject it here: stripping the header below would
+         * otherwise leave a zero or wrapped-around size to be forwarded.
+         */
+        if (s->host_vnet_hdr_len &&
+            (unsigned)size <= s->host_vnet_hdr_len) {
+            break;
+        }
+
+        if (s->host_vnet_hdr_len && !s->using_vnet_hdr) {
+            /* The peer does not want the header: skip over it. */
+            buf += s->host_vnet_hdr_len;
+            size -= s->host_vnet_hdr_len;
+        }
+
+        if (net_peer_needs_padding(&s->nc)) {
+            if (eth_pad_short_frame(min_pkt, &min_pktsz, buf, size)) {
+                buf = min_pkt;
+                size = min_pktsz;
+            }
+        }
+
+        size = qemu_send_packet_async(&s->nc, buf, size, tap_send_completed);
+        if (size == 0) {
+            tap_read_poll(s, false);
+            break;
+        } else if (size < 0) {
+            break;
+        }
+
+        /*
+         * When the host keeps receiving more packets while tap_send() is
+         * running we can hog the QEMU global mutex.  Limit the number of
+         * packets that are processed per tap_send() callback to prevent
+         * stalling the guest.
+         */
+        packets++;
+        if (packets >= 50) {
+            break;
+        }
+    }
+}
+
+/* Return the UFO capability recorded in the state (s->has_ufo). */
+static bool tap_has_ufo(NetClientState *nc)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+
+ assert(nc->info->type == NET_CLIENT_DRIVER_TAP);
+
+ return s->has_ufo;
+}
+
+/* True when a non-zero vnet header length is recorded for the tap fd. */
+static bool tap_has_vnet_hdr(NetClientState *nc)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+
+ assert(nc->info->type == NET_CLIENT_DRIVER_TAP);
+
+ return !!s->host_vnet_hdr_len;
+}
+
+/* True when tap_probe_vnet_hdr_len() returns non-zero for @len on the tap fd. */
+static bool tap_has_vnet_hdr_len(NetClientState *nc, int len)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+
+ assert(nc->info->type == NET_CLIENT_DRIVER_TAP);
+
+ return !!tap_probe_vnet_hdr_len(s->fd, len);
+}
+
+/*
+ * Apply a new vnet header length to the tap fd and record it.
+ * @len must equal the size of one of the three virtio-net header structs
+ * checked by the assertion below.
+ */
+static void tap_set_vnet_hdr_len(NetClientState *nc, int len)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+
+ assert(nc->info->type == NET_CLIENT_DRIVER_TAP);
+ assert(len == sizeof(struct virtio_net_hdr_mrg_rxbuf) ||
+ len == sizeof(struct virtio_net_hdr) ||
+ len == sizeof(struct virtio_net_hdr_v1_hash));
+
+ tap_fd_set_vnet_hdr_len(s->fd, len);
+ s->host_vnet_hdr_len = len;
+}
+
+/*
+ * Record whether the peer handles the vnet header itself.
+ * The assertion requires @using_vnet_hdr to match whether the tap fd has a
+ * header at all (host_vnet_hdr_len != 0).
+ */
+static void tap_using_vnet_hdr(NetClientState *nc, bool using_vnet_hdr)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+
+ assert(nc->info->type == NET_CLIENT_DRIVER_TAP);
+ assert(!!s->host_vnet_hdr_len == using_vnet_hdr);
+
+ s->using_vnet_hdr = using_vnet_hdr;
+}
+
+/* Forward @is_le to tap_fd_set_vnet_le() for the tap fd; returns its result. */
+static int tap_set_vnet_le(NetClientState *nc, bool is_le)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+
+ return tap_fd_set_vnet_le(s->fd, is_le);
+}
+
+/* Forward @is_be to tap_fd_set_vnet_be() for the tap fd; returns its result. */
+static int tap_set_vnet_be(NetClientState *nc, bool is_be)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+
+ return tap_fd_set_vnet_be(s->fd, is_be);
+}
+
+/*
+ * Pass the offload flags through to the tap fd.
+ * Does nothing when the fd is negative (tap_cleanup() stores -1 after
+ * closing it).
+ */
+static void tap_set_offload(NetClientState *nc, int csum, int tso4,
+ int tso6, int ecn, int ufo)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+ if (s->fd < 0) {
+ return;
+ }
+
+ tap_fd_set_offload(s->fd, csum, tso4, tso6, ecn, ufo);
+}
+
+/*
+ * Exit notifier callback: run the configured down script, if there is one,
+ * with the stored argument, and report any failure. @data is not used.
+ */
+static void tap_exit_notify(Notifier *notifier, void *data)
+{
+    TAPState *s = container_of(notifier, TAPState, exit);
+    Error *local_err = NULL;
+
+    if (!s->down_script[0]) {
+        return;
+    }
+
+    launch_script(s->down_script, s->down_script_arg, s->fd, &local_err);
+    if (local_err) {
+        error_report_err(local_err);
+    }
+}
+
+/*
+ * Tear down a tap client: release the vhost-net backend if present, drop
+ * queued packets, run the down script once and unregister the exit
+ * notifier, remove both fd handlers, then close the fd and mark it as -1.
+ */
+static void tap_cleanup(NetClientState *nc)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+
+ if (s->vhost_net) {
+ vhost_net_cleanup(s->vhost_net);
+ g_free(s->vhost_net);
+ s->vhost_net = NULL;
+ }
+
+ qemu_purge_queued_packets(nc);
+
+ /* Run the down script now; the notifier is removed right after. */
+ tap_exit_notify(&s->exit, NULL);
+ qemu_remove_exit_notifier(&s->exit);
+
+ tap_read_poll(s, false);
+ tap_write_poll(s, false);
+ close(s->fd);
+ s->fd = -1;
+}
+
+/* Switch both read and write polling of the tap fd on or off together. */
+static void tap_poll(NetClientState *nc, bool enable)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+ tap_read_poll(s, enable);
+ tap_write_poll(s, enable);
+}
+
+/*
+ * Hand @prog_fd to tap_fd_set_steering_ebpf() for the tap fd.
+ * Returns true when that call returns 0.
+ */
+static bool tap_set_steering_ebpf(NetClientState *nc, int prog_fd)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+ assert(nc->info->type == NET_CLIENT_DRIVER_TAP);
+
+ return tap_fd_set_steering_ebpf(s->fd, prog_fd) == 0;
+}
+
+/* Return the tap file descriptor of @nc, which must be a TAP client. */
+int tap_get_fd(NetClientState *nc)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+ assert(nc->info->type == NET_CLIENT_DRIVER_TAP);
+ return s->fd;
+}
+
+/* fd support */
+
+/*
+ * Callback table for tap clients; handed to qemu_new_net_client() by
+ * net_tap_fd_init().
+ */
+static NetClientInfo net_tap_info = {
+ .type = NET_CLIENT_DRIVER_TAP,
+ .size = sizeof(TAPState),
+ .receive = tap_receive,
+ .receive_raw = tap_receive_raw,
+ .receive_iov = tap_receive_iov,
+ .poll = tap_poll,
+ .cleanup = tap_cleanup,
+ .has_ufo = tap_has_ufo,
+ .has_vnet_hdr = tap_has_vnet_hdr,
+ .has_vnet_hdr_len = tap_has_vnet_hdr_len,
+ .using_vnet_hdr = tap_using_vnet_hdr,
+ .set_offload = tap_set_offload,
+ .set_vnet_hdr_len = tap_set_vnet_hdr_len,
+ .set_vnet_le = tap_set_vnet_le,
+ .set_vnet_be = tap_set_vnet_be,
+ .set_steering_ebpf = tap_set_steering_ebpf,
+};
+
+/*
+ * Create a tap net client around the already-open descriptor @fd.
+ *
+ * @vnet_hdr non-zero means the fd carries a vnet header; its length is
+ * initialised to sizeof(struct virtio_net_hdr). The new client starts
+ * enabled, with all offloads cleared, read polling on, no vhost-net
+ * backend and an exit notifier that runs tap_exit_notify().
+ *
+ * Returns the new state.
+ */
+static TAPState *net_tap_fd_init(NetClientState *peer,
+ const char *model,
+ const char *name,
+ int fd,
+ int vnet_hdr)
+{
+ NetClientState *nc;
+ TAPState *s;
+
+ nc = qemu_new_net_client(&net_tap_info, peer, model, name);
+
+ s = DO_UPCAST(TAPState, nc, nc);
+
+ s->fd = fd;
+ s->host_vnet_hdr_len = vnet_hdr ? sizeof(struct virtio_net_hdr) : 0;
+ s->using_vnet_hdr = false;
+ s->has_ufo = tap_probe_has_ufo(s->fd);
+ s->enabled = true;
+ tap_set_offload(&s->nc, 0, 0, 0, 0, 0);
+ /*
+ * Make sure host header length is set correctly in tap:
+ * it might have been modified by another instance of qemu.
+ */
+ if (tap_probe_vnet_hdr_len(s->fd, s->host_vnet_hdr_len)) {
+ tap_fd_set_vnet_hdr_len(s->fd, s->host_vnet_hdr_len);
+ }
+ tap_read_poll(s, true);
+ s->vhost_net = NULL;
+
+ s->exit.notify = tap_exit_notify;
+ qemu_add_exit_notifier(&s->exit);
+
+ return s;
+}
+
+/*
+ * Run @setup_script with @ifname as its only argument and wait for it.
+ *
+ * The child closes every descriptor from 3 upwards except @fd before
+ * exec'ing the script. @errp is set if fork() fails or if the script does
+ * not exit normally with status 0.
+ */
+static void launch_script(const char *setup_script, const char *ifname,
+                          int fd, Error **errp)
+{
+    char *script_argv[3] = { (char *)setup_script, (char *)ifname, NULL };
+    int child, wstatus;
+
+    /* try to launch network script */
+    child = fork();
+    if (child < 0) {
+        error_setg_errno(errp, errno, "could not launch network script %s",
+                         setup_script);
+        return;
+    }
+
+    if (child == 0) {
+        int limit = sysconf(_SC_OPEN_MAX);
+        int n;
+
+        for (n = 3; n < limit; n++) {
+            if (n == fd) {
+                continue;
+            }
+            close(n);
+        }
+        execv(setup_script, script_argv);
+        _exit(1);
+    }
+
+    while (waitpid(child, &wstatus, 0) != child) {
+        /* loop */
+    }
+
+    if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus) != 0) {
+        error_setg(errp, "network script %s failed with status %d",
+                   setup_script, wstatus);
+    }
+}
+
+/*
+ * Receive one file descriptor sent as SCM_RIGHTS ancillary data on the
+ * socket @c.
+ *
+ * Returns the descriptor on success. If recvmsg() reads nothing or fails,
+ * its return value (0 or -1) is passed through unchanged. If data arrived
+ * without a well-formed SCM_RIGHTS control message, returns -1 with errno
+ * set to EINVAL rather than handing back an indeterminate value.
+ */
+static int recv_fd(int c)
+{
+    int fd;
+    uint8_t msgbuf[CMSG_SPACE(sizeof(fd))];
+    struct msghdr msg = {
+        .msg_control = msgbuf,
+        .msg_controllen = sizeof(msgbuf),
+    };
+    struct cmsghdr *cmsg;
+    struct iovec iov;
+    uint8_t req[1];
+    ssize_t len;
+
+    cmsg = CMSG_FIRSTHDR(&msg);
+    cmsg->cmsg_level = SOL_SOCKET;
+    cmsg->cmsg_type = SCM_RIGHTS;
+    cmsg->cmsg_len = CMSG_LEN(sizeof(fd));
+    msg.msg_controllen = cmsg->cmsg_len;
+
+    iov.iov_base = req;
+    iov.iov_len = sizeof(req);
+
+    msg.msg_iov = &iov;
+    msg.msg_iovlen = 1;
+
+    len = recvmsg(c, &msg, 0);
+    if (len <= 0) {
+        return len;
+    }
+
+    /* recvmsg() rewrote msg: inspect what was actually delivered. */
+    cmsg = CMSG_FIRSTHDR(&msg);
+    if (!cmsg ||
+        cmsg->cmsg_level != SOL_SOCKET ||
+        cmsg->cmsg_type != SCM_RIGHTS ||
+        cmsg->cmsg_len != CMSG_LEN(sizeof(fd))) {
+        errno = EINVAL;
+        return -1;
+    }
+
+    memcpy(&fd, CMSG_DATA(cmsg), sizeof(fd));
+    return fd;
+}
+
+/*
+ * Run the bridge helper and collect the tap file descriptor it sends back
+ * over a socketpair.
+ *
+ * @helper: helper executable, or a shell command line if it contains a
+ *          space or tab; NULL selects the relocated DEFAULT_BRIDGE_HELPER.
+ * @bridge: bridge name, passed as --br= unless a helper command line
+ *          already contains "--br=".
+ * @errp:   set on failure.
+ *
+ * SIGCHLD is blocked while the helper runs; the previous signal mask is
+ * restored on every return path of the parent, including early failures.
+ *
+ * Returns the value obtained from recv_fd() on success, or -1 on failure.
+ * NOTE(review): recv_fd() returns 0 when the helper closes the socket
+ * without sending anything; if the helper then also exits with status 0
+ * that 0 is returned as a descriptor.
+ */
+static int net_bridge_run_helper(const char *helper, const char *bridge,
+                                 Error **errp)
+{
+    sigset_t oldmask, mask;
+    g_autofree char *default_helper = NULL;
+    int pid, status;
+    char *args[5];
+    char **parg;
+    int sv[2];
+
+    sigemptyset(&mask);
+    sigaddset(&mask, SIGCHLD);
+    sigprocmask(SIG_BLOCK, &mask, &oldmask);
+
+    if (!helper) {
+        helper = default_helper = get_relocated_path(DEFAULT_BRIDGE_HELPER);
+    }
+
+    if (socketpair(PF_UNIX, SOCK_STREAM, 0, sv) == -1) {
+        error_setg_errno(errp, errno, "socketpair() failed");
+        /* Do not leave SIGCHLD blocked in the caller. */
+        sigprocmask(SIG_SETMASK, &oldmask, NULL);
+        return -1;
+    }
+
+    /* try to launch bridge helper */
+    pid = fork();
+    if (pid < 0) {
+        error_setg_errno(errp, errno, "Can't fork bridge helper");
+        /* Nobody will use the socketpair: release it and the mask. */
+        close(sv[0]);
+        close(sv[1]);
+        sigprocmask(SIG_SETMASK, &oldmask, NULL);
+        return -1;
+    }
+    if (pid == 0) {
+        int open_max = sysconf(_SC_OPEN_MAX), i;
+        char *fd_buf = NULL;
+        char *br_buf = NULL;
+        char *helper_cmd = NULL;
+
+        /* Keep stdio and the child's end of the socketpair only. */
+        for (i = 3; i < open_max; i++) {
+            if (i != sv[1]) {
+                close(i);
+            }
+        }
+
+        fd_buf = g_strdup_printf("%s%d", "--fd=", sv[1]);
+
+        if (strrchr(helper, ' ') || strrchr(helper, '\t')) {
+            /* assume helper is a command */
+
+            if (strstr(helper, "--br=") == NULL) {
+                br_buf = g_strdup_printf("%s%s", "--br=", bridge);
+            }
+
+            helper_cmd = g_strdup_printf("%s %s %s %s", helper,
+                            "--use-vnet", fd_buf, br_buf ? br_buf : "");
+
+            parg = args;
+            *parg++ = (char *)"sh";
+            *parg++ = (char *)"-c";
+            *parg++ = helper_cmd;
+            *parg++ = NULL;
+
+            execv("/bin/sh", args);
+            g_free(helper_cmd);
+        } else {
+            /* assume helper is just the executable path name */
+
+            br_buf = g_strdup_printf("%s%s", "--br=", bridge);
+
+            parg = args;
+            *parg++ = (char *)helper;
+            *parg++ = (char *)"--use-vnet";
+            *parg++ = fd_buf;
+            *parg++ = br_buf;
+            *parg++ = NULL;
+
+            execv(helper, args);
+        }
+        /* Only reached when execv() failed. */
+        g_free(fd_buf);
+        g_free(br_buf);
+        _exit(1);
+
+    } else {
+        int fd;
+        int saved_errno;
+
+        close(sv[1]);
+
+        do {
+            fd = recv_fd(sv[0]);
+        } while (fd == -1 && errno == EINTR);
+        /* close() and waitpid() below may overwrite errno. */
+        saved_errno = errno;
+
+        close(sv[0]);
+
+        while (waitpid(pid, &status, 0) != pid) {
+            /* loop */
+        }
+        sigprocmask(SIG_SETMASK, &oldmask, NULL);
+        if (fd < 0) {
+            error_setg_errno(errp, saved_errno,
+                             "failed to recv file descriptor");
+            return -1;
+        }
+        if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+            error_setg(errp, "bridge helper failed");
+            return -1;
+        }
+        return fd;
+    }
+}
+
+/*
+ * Create a "bridge" net client: run the bridge helper to obtain a tap fd,
+ * make it non-blocking, probe it for vnet header support and wrap it in a
+ * tap client.
+ *
+ * When no helper is given, the relocated DEFAULT_BRIDGE_HELPER is used;
+ * it is resolved here so that the info string always names a real helper.
+ *
+ * Returns 0 on success, -1 on failure with @errp set by the failing step.
+ */
+int net_init_bridge(const Netdev *netdev, const char *name,
+                    NetClientState *peer, Error **errp)
+{
+    const NetdevBridgeOptions *bridge;
+    g_autofree char *default_helper = NULL;
+    const char *helper, *br;
+    TAPState *s;
+    int fd, vnet_hdr;
+
+    assert(netdev->type == NET_CLIENT_DRIVER_BRIDGE);
+    bridge = &netdev->u.bridge;
+    br = bridge->has_br ? bridge->br : DEFAULT_BRIDGE_INTERFACE;
+    if (bridge->has_helper) {
+        helper = bridge->helper;
+    } else {
+        /* Never hand a NULL string to the "%s" in snprintf() below. */
+        helper = default_helper = get_relocated_path(DEFAULT_BRIDGE_HELPER);
+    }
+
+    fd = net_bridge_run_helper(helper, br, errp);
+    if (fd == -1) {
+        return -1;
+    }
+
+    qemu_set_nonblock(fd);
+    vnet_hdr = tap_probe_vnet_hdr(fd, errp);
+    if (vnet_hdr < 0) {
+        close(fd);
+        return -1;
+    }
+    s = net_tap_fd_init(peer, "bridge", name, fd, vnet_hdr);
+
+    snprintf(s->nc.info_str, sizeof(s->nc.info_str), "helper=%s,br=%s", helper,
+             br);
+
+    return 0;
+}
+
+/*
+ * Open a tap device and optionally run its setup script.
+ *
+ * *@vnet_hdr is seeded from the vnet_hdr option when given (and is then
+ * required), otherwise it is set to 1 and not required; it is passed by
+ * pointer to tap_open(). A script is run unless @setup_script is NULL,
+ * empty or "no".
+ *
+ * Returns the tap fd, or -1 on failure; a script failure is propagated
+ * to @errp and the fd is closed.
+ */
+static int net_tap_init(const NetdevTapOptions *tap, int *vnet_hdr,
+                        const char *setup_script, char *ifname,
+                        size_t ifname_sz, int mq_required, Error **errp)
+{
+    Error *local_err = NULL;
+    int hdr_required;
+    int tapfd;
+
+    if (!tap->has_vnet_hdr) {
+        *vnet_hdr = 1;
+        hdr_required = 0;
+    } else {
+        *vnet_hdr = tap->vnet_hdr;
+        hdr_required = *vnet_hdr;
+    }
+
+    TFR(tapfd = tap_open(ifname, ifname_sz, vnet_hdr, hdr_required,
+                         mq_required, errp));
+    if (tapfd < 0) {
+        return -1;
+    }
+
+    if (!setup_script ||
+        setup_script[0] == '\0' ||
+        strcmp(setup_script, "no") == 0) {
+        return tapfd;
+    }
+
+    launch_script(setup_script, ifname, tapfd, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        close(tapfd);
+        return -1;
+    }
+
+    return tapfd;
+}
+
+#define MAX_TAP_QUEUES 1024
+
+/*
+ * Create one tap queue on top of the already-open @fd and, when requested,
+ * attach a kernel vhost-net backend to it.
+ *
+ * The info string depends on how the fd was obtained (fd=/fds=, helper=,
+ * or ifname/script). In the last case a down script other than "no" is
+ * recorded together with @ifname for use at teardown.
+ *
+ * vhost is used when vhost= is set to true, or, if vhost= is absent, when
+ * @vhostfdname is given or vhostforce is set. vhost setup failures are
+ * reported through @errp only when vhostforce is set; otherwise they are
+ * downgraded to warnings and the function returns without setting @errp.
+ * Failing to make a passed-in vhost fd non-blocking always sets @errp.
+ */
+static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer,
+                             const char *model, const char *name,
+                             const char *ifname, const char *script,
+                             const char *downscript, const char *vhostfdname,
+                             int vnet_hdr, int fd, Error **errp)
+{
+    Error *err = NULL;
+    TAPState *s = net_tap_fd_init(peer, model, name, fd, vnet_hdr);
+    int vhostfd;
+
+    tap_set_sndbuf(s->fd, tap, &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+
+    if (tap->has_fd || tap->has_fds) {
+        snprintf(s->nc.info_str, sizeof(s->nc.info_str), "fd=%d", fd);
+    } else if (tap->has_helper) {
+        snprintf(s->nc.info_str, sizeof(s->nc.info_str), "helper=%s",
+                 tap->helper);
+    } else {
+        snprintf(s->nc.info_str, sizeof(s->nc.info_str),
+                 "ifname=%s,script=%s,downscript=%s", ifname, script,
+                 downscript);
+
+        if (strcmp(downscript, "no") != 0) {
+            snprintf(s->down_script, sizeof(s->down_script), "%s", downscript);
+            snprintf(s->down_script_arg, sizeof(s->down_script_arg),
+                     "%s", ifname);
+        }
+    }
+
+    if (tap->has_vhost ? tap->vhost :
+        vhostfdname || (tap->has_vhostforce && tap->vhostforce)) {
+        VhostNetOptions options;
+
+        options.backend_type = VHOST_BACKEND_TYPE_KERNEL;
+        options.net_backend = &s->nc;
+        if (tap->has_poll_us) {
+            options.busyloop_timeout = tap->poll_us;
+        } else {
+            options.busyloop_timeout = 0;
+        }
+
+        if (vhostfdname) {
+            int ret;
+
+            vhostfd = monitor_fd_param(monitor_cur(), vhostfdname, &err);
+            if (vhostfd == -1) {
+                if (tap->has_vhostforce && tap->vhostforce) {
+                    error_propagate(errp, err);
+                } else {
+                    warn_report_err(err);
+                }
+                return;
+            }
+            ret = qemu_try_set_nonblock(vhostfd);
+            if (ret < 0) {
+                /* Name the descriptor that failed: the vhost fd. */
+                error_setg_errno(errp, -ret, "%s: Can't use file descriptor %d",
+                                 name, vhostfd);
+                return;
+            }
+        } else {
+            vhostfd = open("/dev/vhost-net", O_RDWR);
+            if (vhostfd < 0) {
+                if (tap->has_vhostforce && tap->vhostforce) {
+                    error_setg_errno(errp, errno,
+                                     "tap: open vhost char device failed");
+                } else {
+                    warn_report("tap: open vhost char device failed: %s",
+                                strerror(errno));
+                }
+                return;
+            }
+            qemu_set_nonblock(vhostfd);
+        }
+        options.opaque = (void *)(uintptr_t)vhostfd;
+        options.nvqs = 2;
+
+        s->vhost_net = vhost_net_init(&options);
+        if (!s->vhost_net) {
+            if (tap->has_vhostforce && tap->vhostforce) {
+                error_setg(errp, VHOST_NET_INIT_FAILED);
+            } else {
+                warn_report(VHOST_NET_INIT_FAILED);
+            }
+            return;
+        }
+    } else if (vhostfdname) {
+        error_setg(errp, "vhostfd(s)= is not valid without vhost");
+    }
+}
+
+/*
+ * Split the ':'-separated list @str into newly allocated strings stored
+ * in @fds, storing at most @max entries.
+ *
+ * Returns the number of entries stored. An empty @str yields 0; a trailing
+ * ':' does not produce a final empty entry.
+ */
+static int get_fds(char *str, char *fds[], int max)
+{
+    char *const end = str + strlen(str);
+    char *cursor = str;
+    int count = 0;
+
+    while (count < max && cursor < end) {
+        char *colon = strchr(cursor, ':');
+
+        if (!colon) {
+            /* last element: take the rest of the string */
+            fds[count++] = g_strdup(cursor);
+            break;
+        }
+
+        fds[count++] = g_strndup(cursor, colon - cursor);
+        cursor = colon + 1;
+    }
+
+    return count;
+}
+
+/*
+ * Create the tap net client(s) described by the netdev options.
+ *
+ * Four mutually exclusive ways of obtaining the tap descriptor(s) exist:
+ *   fd=      a single descriptor passed in by the caller;
+ *   fds=     a ':'-separated list of descriptors, one queue each,
+ *            optionally paired with vhostfds=;
+ *   helper=  a descriptor obtained from the bridge helper;
+ *   default  open the tap device here, one queue per "queues", running
+ *            the setup script for the first queue only.
+ * Options that make no sense for the selected mode are rejected.
+ *
+ * Returns 0 on success and -1 on failure with @errp set.
+ */
+int net_init_tap(const Netdev *netdev, const char *name,
+                 NetClientState *peer, Error **errp)
+{
+    const NetdevTapOptions *tap;
+    int fd, vnet_hdr = 0, i = 0, queues;
+    /* for the no-fd, no-helper case */
+    const char *script;
+    const char *downscript;
+    Error *err = NULL;
+    const char *vhostfdname;
+    char ifname[128];
+    int ret = 0;
+
+    assert(netdev->type == NET_CLIENT_DRIVER_TAP);
+    tap = &netdev->u.tap;
+    queues = tap->has_queues ? tap->queues : 1;
+    vhostfdname = tap->has_vhostfd ? tap->vhostfd : NULL;
+    script = tap->has_script ? tap->script : NULL;
+    downscript = tap->has_downscript ? tap->downscript : NULL;
+
+    /* QEMU hubs do not support multiqueue tap, in this case peer is set.
+     * For -netdev, peer is always NULL. */
+    if (peer && (tap->has_queues || tap->has_fds || tap->has_vhostfds)) {
+        error_setg(errp, "Multiqueue tap cannot be used with hubs");
+        return -1;
+    }
+
+    if (tap->has_fd) {
+        if (tap->has_ifname || tap->has_script || tap->has_downscript ||
+            tap->has_vnet_hdr || tap->has_helper || tap->has_queues ||
+            tap->has_fds || tap->has_vhostfds) {
+            error_setg(errp, "ifname=, script=, downscript=, vnet_hdr=, "
+                       "helper=, queues=, fds=, and vhostfds= "
+                       "are invalid with fd=");
+            return -1;
+        }
+
+        fd = monitor_fd_param(monitor_cur(), tap->fd, errp);
+        if (fd == -1) {
+            return -1;
+        }
+
+        ret = qemu_try_set_nonblock(fd);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "%s: Can't use file descriptor %d",
+                             name, fd);
+            close(fd);
+            return -1;
+        }
+
+        vnet_hdr = tap_probe_vnet_hdr(fd, errp);
+        if (vnet_hdr < 0) {
+            close(fd);
+            return -1;
+        }
+
+        net_init_tap_one(tap, peer, "tap", name, NULL,
+                         script, downscript,
+                         vhostfdname, vnet_hdr, fd, &err);
+        if (err) {
+            error_propagate(errp, err);
+            close(fd);
+            return -1;
+        }
+    } else if (tap->has_fds) {
+        char **fds;
+        char **vhost_fds;
+        int nfds = 0, nvhosts = 0;
+
+        if (tap->has_ifname || tap->has_script || tap->has_downscript ||
+            tap->has_vnet_hdr || tap->has_helper || tap->has_queues ||
+            tap->has_vhostfd) {
+            error_setg(errp, "ifname=, script=, downscript=, vnet_hdr=, "
+                       "helper=, queues=, and vhostfd= "
+                       "are invalid with fds=");
+            return -1;
+        }
+
+        fds = g_new0(char *, MAX_TAP_QUEUES);
+        vhost_fds = g_new0(char *, MAX_TAP_QUEUES);
+
+        nfds = get_fds(tap->fds, fds, MAX_TAP_QUEUES);
+        if (tap->has_vhostfds) {
+            nvhosts = get_fds(tap->vhostfds, vhost_fds, MAX_TAP_QUEUES);
+            if (nfds != nvhosts) {
+                error_setg(errp, "The number of fds passed does not match "
+                           "the number of vhostfds passed");
+                ret = -1;
+                goto free_fail;
+            }
+        }
+
+        for (i = 0; i < nfds; i++) {
+            fd = monitor_fd_param(monitor_cur(), fds[i], errp);
+            if (fd == -1) {
+                ret = -1;
+                goto free_fail;
+            }
+
+            ret = qemu_try_set_nonblock(fd);
+            if (ret < 0) {
+                error_setg_errno(errp, -ret, "%s: Can't use file descriptor %d",
+                                 name, fd);
+                /* Report failure as -1 like every other error path. */
+                ret = -1;
+                goto free_fail;
+            }
+
+            if (i == 0) {
+                vnet_hdr = tap_probe_vnet_hdr(fd, errp);
+                if (vnet_hdr < 0) {
+                    /*
+                     * ret is 0 here (qemu_try_set_nonblock() succeeded);
+                     * without this the failure would be returned as
+                     * success while errp is set.
+                     */
+                    ret = -1;
+                    goto free_fail;
+                }
+            } else if (vnet_hdr != tap_probe_vnet_hdr(fd, NULL)) {
+                error_setg(errp,
+                           "vnet_hdr not consistent across given tap fds");
+                ret = -1;
+                goto free_fail;
+            }
+
+            net_init_tap_one(tap, peer, "tap", name, ifname,
+                             script, downscript,
+                             tap->has_vhostfds ? vhost_fds[i] : NULL,
+                             vnet_hdr, fd, &err);
+            if (err) {
+                error_propagate(errp, err);
+                ret = -1;
+                goto free_fail;
+            }
+        }
+
+free_fail:
+        /* Reached on success as well: the name lists are always freed. */
+        for (i = 0; i < nvhosts; i++) {
+            g_free(vhost_fds[i]);
+        }
+        for (i = 0; i < nfds; i++) {
+            g_free(fds[i]);
+        }
+        g_free(fds);
+        g_free(vhost_fds);
+        return ret;
+    } else if (tap->has_helper) {
+        if (tap->has_ifname || tap->has_script || tap->has_downscript ||
+            tap->has_vnet_hdr || tap->has_queues || tap->has_vhostfds) {
+            error_setg(errp, "ifname=, script=, downscript=, vnet_hdr=, "
+                       "queues=, and vhostfds= are invalid with helper=");
+            return -1;
+        }
+
+        fd = net_bridge_run_helper(tap->helper,
+                                   tap->has_br ?
+                                   tap->br : DEFAULT_BRIDGE_INTERFACE,
+                                   errp);
+        if (fd == -1) {
+            return -1;
+        }
+
+        qemu_set_nonblock(fd);
+        vnet_hdr = tap_probe_vnet_hdr(fd, errp);
+        if (vnet_hdr < 0) {
+            close(fd);
+            return -1;
+        }
+
+        net_init_tap_one(tap, peer, "bridge", name, ifname,
+                         script, downscript, vhostfdname,
+                         vnet_hdr, fd, &err);
+        if (err) {
+            error_propagate(errp, err);
+            close(fd);
+            return -1;
+        }
+    } else {
+        g_autofree char *default_script = NULL;
+        g_autofree char *default_downscript = NULL;
+        if (tap->has_vhostfds) {
+            error_setg(errp, "vhostfds= is invalid if fds= wasn't specified");
+            return -1;
+        }
+
+        if (!script) {
+            script = default_script = get_relocated_path(DEFAULT_NETWORK_SCRIPT);
+        }
+        if (!downscript) {
+            downscript = default_downscript =
+                                 get_relocated_path(DEFAULT_NETWORK_DOWN_SCRIPT);
+        }
+
+        if (tap->has_ifname) {
+            pstrcpy(ifname, sizeof ifname, tap->ifname);
+        } else {
+            ifname[0] = '\0';
+        }
+
+        for (i = 0; i < queues; i++) {
+            /* Only the first queue runs the setup and down scripts. */
+            fd = net_tap_init(tap, &vnet_hdr, i >= 1 ? "no" : script,
+                              ifname, sizeof ifname, queues > 1, errp);
+            if (fd == -1) {
+                return -1;
+            }
+
+            if (queues > 1 && i == 0 && !tap->has_ifname) {
+                if (tap_fd_get_ifname(fd, ifname)) {
+                    error_setg(errp, "Fail to get ifname");
+                    close(fd);
+                    return -1;
+                }
+            }
+
+            net_init_tap_one(tap, peer, "tap", name, ifname,
+                             i >= 1 ? "no" : script,
+                             i >= 1 ? "no" : downscript,
+                             vhostfdname, vnet_hdr, fd, &err);
+            if (err) {
+                error_propagate(errp, err);
+                close(fd);
+                return -1;
+            }
+        }
+    }
+
+    return 0;
+}
+
+/* Return the vhost-net backend of the TAP client @nc, or NULL if none is set. */
+VHostNetState *tap_get_vhost_net(NetClientState *nc)
+{
+ TAPState *s = DO_UPCAST(TAPState, nc, nc);
+ assert(nc->info->type == NET_CLIENT_DRIVER_TAP);
+ return s->vhost_net;
+}
+
+/*
+ * Enable a tap queue. A queue that is already enabled is left alone.
+ * Otherwise the fd is enabled and, on success, the queue is marked enabled
+ * and its fd handlers are refreshed.
+ *
+ * Returns 0 on success or when already enabled, else the result of
+ * tap_fd_enable().
+ */
+int tap_enable(NetClientState *nc)
+{
+    TAPState *s = DO_UPCAST(TAPState, nc, nc);
+    int rc;
+
+    if (s->enabled) {
+        return 0;
+    }
+
+    rc = tap_fd_enable(s->fd);
+    if (rc != 0) {
+        return rc;
+    }
+
+    s->enabled = true;
+    tap_update_fd_handler(s);
+    return 0;
+}
+
+/*
+ * Disable a tap queue. A queue that is already disabled is left alone.
+ * Otherwise the fd is disabled and, on success, queued packets are purged,
+ * the queue is marked disabled and its fd handlers are refreshed.
+ *
+ * Returns 0 on success or when already disabled, else the result of
+ * tap_fd_disable().
+ */
+int tap_disable(NetClientState *nc)
+{
+    TAPState *s = DO_UPCAST(TAPState, nc, nc);
+    int rc;
+
+    if (!s->enabled) {
+        return 0;
+    }
+
+    rc = tap_fd_disable(s->fd);
+    if (rc != 0) {
+        return rc;
+    }
+
+    qemu_purge_queued_packets(nc);
+    s->enabled = false;
+    tap_update_fd_handler(s);
+    return 0;
+}
diff --git a/net/tap_int.h b/net/tap_int.h
new file mode 100644
index 000000000..547f8a5a2
--- /dev/null
+++ b/net/tap_int.h
@@ -0,0 +1,49 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ * Copyright (c) 2009 Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef NET_TAP_INT_H
+#define NET_TAP_INT_H
+
+#include "qapi/qapi-types-net.h"
+
+/*
+ * Low-level tap helpers declared for net/tap.c. The notes below describe
+ * only how net/tap.c uses them.
+ */
+
+/* net/tap.c treats a negative return as failure, otherwise as the tap fd. */
+int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
+ int vnet_hdr_required, int mq_required, Error **errp);
+
+/* net/tap.c supplies a read()-based definition unless __sun__ is defined. */
+ssize_t tap_read_packet(int tapfd, uint8_t *buf, int maxlen);
+
+void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp);
+/* Callers in net/tap.c treat a negative return as an error. */
+int tap_probe_vnet_hdr(int fd, Error **errp);
+/* Non-zero return is taken to mean that @len is usable on @fd. */
+int tap_probe_vnet_hdr_len(int fd, int len);
+int tap_probe_has_ufo(int fd);
+void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo);
+void tap_fd_set_vnet_hdr_len(int fd, int len);
+int tap_fd_set_vnet_le(int fd, int vnet_is_le);
+int tap_fd_set_vnet_be(int fd, int vnet_is_be);
+/* tap_enable()/tap_disable() treat a return of 0 as success. */
+int tap_fd_enable(int fd);
+int tap_fd_disable(int fd);
+/* net_init_tap() treats a non-zero return as failure. */
+int tap_fd_get_ifname(int fd, char *ifname);
+/* tap_set_steering_ebpf() treats a return of 0 as success. */
+int tap_fd_set_steering_ebpf(int fd, int prog_fd);
+
+#endif /* NET_TAP_INT_H */
diff --git a/net/trace-events b/net/trace-events
new file mode 100644
index 000000000..d7a17256c
--- /dev/null
+++ b/net/trace-events
@@ -0,0 +1,24 @@
+# See docs/devel/tracing.rst for syntax documentation.
+
+# announce.c
+qemu_announce_self_iter(const char *id, const char *name, const char *mac, int skip) "%s:%s:%s skip: %d"
+qemu_announce_timer_del(bool free_named, bool free_timer, char *id) "free named: %d free timer: %d id: %s"
+
+# vhost-user.c
+vhost_user_event(const char *chr, int event) "chr: %s got event: %d"
+
+# colo.c
+colo_proxy_main(const char *chr) ": %s"
+
+# colo-compare.c
+colo_compare_main(const char *chr) ": %s"
+colo_compare_drop_packet(const char *queue, const char *chr) ": %s: %s"
+colo_compare_udp_miscompare(const char *sta, int size) ": %s = %d"
+colo_compare_icmp_miscompare(const char *sta, int size) ": %s = %d"
+colo_compare_ip_info(int psize, const char *sta, const char *stb, int ssize, const char *stc, const char *std) "ppkt size = %d, ip_src = %s, ip_dst = %s, spkt size = %d, ip_src = %s, ip_dst = %s"
+colo_old_packet_check_found(int64_t old_time) "%" PRId64
+colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int hdlen, int pdlen, int offset, int flags) "%s: seq/ack= %u/%u hdlen= %d pdlen= %d offset= %d flags=%d"
+
+# filter-rewriter.c
+colo_filter_rewriter_pkt_info(const char *func, const char *src, const char *dst, uint32_t seq, uint32_t ack, uint32_t flag) "%s: src/dst: %s/%s p: seq/ack=%u/%u flags=0x%x"
+colo_filter_rewriter_conn_offset(uint32_t offset) ": offset=%u"
diff --git a/net/trace.h b/net/trace.h
new file mode 100644
index 000000000..18d42c29b
--- /dev/null
+++ b/net/trace.h
@@ -0,0 +1 @@
+#include "trace/trace-net.h"
diff --git a/net/util.c b/net/util.c
new file mode 100644
index 000000000..0b3dbfe5d
--- /dev/null
+++ b/net/util.c
@@ -0,0 +1,59 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "util.h"
+
+/*
+ * Parse a MAC address string into @macaddr (6 bytes).
+ *
+ * Two forms are accepted:
+ *  - a single number (any base understood by strtol() with base 0) in the
+ *    range 0..0xFFFFFF: only bytes 3..5 of @macaddr are written, the first
+ *    three bytes are left untouched;
+ *  - six hexadecimal octets separated by ':' or '-'.
+ *
+ * Each octet must contain at least one digit and fit in one byte; an empty
+ * string is rejected.
+ *
+ * Returns 0 on success, -1 on a malformed string.
+ */
+int net_parse_macaddr(uint8_t *macaddr, const char *p)
+{
+    int i;
+    char *last_char;
+    long int offset;
+
+    errno = 0;
+    offset = strtol(p, &last_char, 0);
+    /* last_char != p: at least one digit was consumed. */
+    if (errno == 0 && last_char != p && *last_char == '\0' &&
+        offset >= 0 && offset <= 0xFFFFFF) {
+        macaddr[3] = (offset & 0xFF0000) >> 16;
+        macaddr[4] = (offset & 0xFF00) >> 8;
+        macaddr[5] = offset & 0xFF;
+        return 0;
+    }
+
+    for (i = 0; i < 6; i++) {
+        long int octet = strtol(p, &last_char, 16);
+
+        /* Reject empty, negative and over-wide octets. */
+        if (last_char == p || octet < 0 || octet > 0xFF) {
+            return -1;
+        }
+        macaddr[i] = octet;
+        p = last_char;
+
+        if (i == 5) {
+            if (*p != '\0') {
+                return -1;
+            }
+        } else {
+            if (*p != ':' && *p != '-') {
+                return -1;
+            }
+            p++;
+        }
+    }
+
+    return 0;
+}
diff --git a/net/util.h b/net/util.h
new file mode 100644
index 000000000..358185fd5
--- /dev/null
+++ b/net/util.h
@@ -0,0 +1,86 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef QEMU_NET_UTIL_H
+#define QEMU_NET_UTIL_H
+
+
+/*
+ * Structure of an IPv4 internet header, naked of options.
+ *
+ * The version and header-length nibbles share the first byte; their
+ * declaration order is swapped depending on HOST_WORDS_BIGENDIAN.  The
+ * struct is declared QEMU_PACKED.
+ * NOTE(review): multi-byte fields are presumably kept in network byte
+ * order -- confirm against the users of this struct.
+ */
+struct ip {
+#ifdef HOST_WORDS_BIGENDIAN
+ uint8_t ip_v:4, /* version */
+ ip_hl:4; /* header length */
+#else
+ uint8_t ip_hl:4, /* header length */
+ ip_v:4; /* version */
+#endif
+ uint8_t ip_tos; /* type of service */
+ uint16_t ip_len; /* total length */
+ uint16_t ip_id; /* identification */
+ uint16_t ip_off; /* fragment offset field */
+#define IP_DF 0x4000 /* don't fragment flag */
+#define IP_MF 0x2000 /* more fragments flag */
+#define IP_OFFMASK 0x1fff /* mask for fragmenting bits */
+ uint8_t ip_ttl; /* time to live */
+ uint8_t ip_p; /* protocol */
+ uint16_t ip_sum; /* checksum */
+ struct in_addr ip_src, ip_dst; /* source and dest address */
+} QEMU_PACKED;
+
+/*
+ * Report whether the IPv6 addresses @a and @b agree in their leading
+ * @prefix_len bits.
+ *
+ * Whole bytes of the prefix are compared with memcmp(); a trailing partial
+ * byte, if any, is compared on its most significant bits only.
+ */
+static inline bool in6_equal_net(const struct in6_addr *a,
+                                 const struct in6_addr *b,
+                                 int prefix_len)
+{
+    const int whole_bytes = prefix_len / 8;
+    const int extra_bits = prefix_len % 8;
+
+    if (memcmp(a, b, whole_bytes) != 0) {
+        return false;
+    }
+
+    if (extra_bits == 0) {
+        /* The prefix ends on a byte boundary: nothing left to compare. */
+        return true;
+    }
+
+    return (a->s6_addr[whole_bytes] >> (8 - extra_bits)) ==
+           (b->s6_addr[whole_bytes] >> (8 - extra_bits));
+}
+
+/* TCP connection states; the numeric ordering is relied on by the range comments below. */
+#define TCPS_CLOSED 0 /* closed */
+#define TCPS_LISTEN 1 /* listening for connection */
+#define TCPS_SYN_SENT 2 /* active, have sent syn */
+#define TCPS_SYN_RECEIVED 3 /* have sent and received syn */
+/* states < TCPS_ESTABLISHED are those where connections not established */
+#define TCPS_ESTABLISHED 4 /* established */
+#define TCPS_CLOSE_WAIT 5 /* rcvd fin, waiting for close */
+/* states > TCPS_CLOSE_WAIT are those where user has closed */
+#define TCPS_FIN_WAIT_1 6 /* have closed, sent fin */
+#define TCPS_CLOSING 7 /* closed xchd FIN; await FIN ACK */
+#define TCPS_LAST_ACK 8 /* had fin and close; await FIN ACK */
+/* states > TCPS_CLOSE_WAIT && < TCPS_FIN_WAIT_2 await ACK of FIN */
+#define TCPS_FIN_WAIT_2 9 /* have closed, fin is acked */
+#define TCPS_TIME_WAIT 10 /* in 2*msl quiet wait after close */
+
+/*
+ * Parse the MAC address string @p into the 6-byte buffer @macaddr.
+ *
+ * @p is either a single integer in [0, 0xFFFFFF], which sets only
+ * macaddr[3..5], or six hexadecimal groups separated by ':' or '-'.
+ * Returns 0 on success and -1 on malformed input.
+ */
+int net_parse_macaddr(uint8_t *macaddr, const char *p);
+
+#endif /* QEMU_NET_UTIL_H */
diff --git a/net/vde.c b/net/vde.c
new file mode 100644
index 000000000..99189cccb
--- /dev/null
+++ b/net/vde.c
@@ -0,0 +1,129 @@
+/*
+ * QEMU System Emulator
+ *
+ * Copyright (c) 2003-2008 Fabrice Bellard
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu/osdep.h"
+
+#include <libvdeplug.h>
+
+#include "net/net.h"
+#include "clients.h"
+#include "qemu-common.h"
+#include "qemu/option.h"
+#include "qemu/main-loop.h"
+#include "qapi/error.h"
+
+/* Per-client state of a VDE network backend. */
+typedef struct VDEState {
+ NetClientState nc; /* generic net client; DO_UPCAST() recovers the VDEState from it */
+ VDECONN *vde; /* connection handle returned by vde_open() */
+} VDEState;
+
+/*
+ * Read handler for the VDE data fd: fetch one packet with vde_recv() and
+ * pass it to qemu_send_packet().  Nothing is forwarded when vde_recv()
+ * returns zero or a negative value.
+ */
+static void vde_to_qemu(void *opaque)
+{
+    VDEState *state = opaque;
+    uint8_t packet[NET_BUFSIZE];
+    int len = vde_recv(state->vde, (char *)packet, sizeof(packet), 0);
+
+    if (len <= 0) {
+        return;
+    }
+
+    qemu_send_packet(&state->nc, packet, len);
+}
+
+/*
+ * .receive callback: push @size bytes from @buf out through the VDE
+ * connection, retrying for as long as vde_send() fails with EINTR.
+ *
+ * Returns whatever the last vde_send() call returned.
+ */
+static ssize_t vde_receive(NetClientState *nc, const uint8_t *buf, size_t size)
+{
+    VDEState *state = DO_UPCAST(VDEState, nc, nc);
+    ssize_t sent;
+
+    for (;;) {
+        sent = vde_send(state->vde, (const char *)buf, size, 0);
+        if (sent >= 0 || errno != EINTR) {
+            return sent;
+        }
+    }
+}
+
+/*
+ * .cleanup callback: unregister the fd handler installed on the VDE data
+ * fd, then close the VDE connection.
+ */
+static void vde_cleanup(NetClientState *nc)
+{
+    VDEState *state = DO_UPCAST(VDEState, nc, nc);
+    VDECONN *conn = state->vde;
+
+    /* Drop the handler first, while the data fd is still valid. */
+    qemu_set_fd_handler(vde_datafd(conn), NULL, NULL, NULL);
+    vde_close(conn);
+}
+
+/*
+ * Net client description for the VDE backend: driver type, size of the
+ * per-client state and callbacks.  Passed to qemu_new_net_client() by
+ * net_vde_init().
+ */
+static NetClientInfo net_vde_info = {
+ .type = NET_CLIENT_DRIVER_VDE,
+ .size = sizeof(VDEState),
+ .receive = vde_receive,
+ .cleanup = vde_cleanup,
+};
+
+/*
+ * Open a VDE connection to @sock and create the net client bound to it.
+ *
+ * @port, @group and @mode are forwarded to vde_open() through
+ * struct vde_open_args.  On success the new client's info_str is filled
+ * in and vde_to_qemu() is installed as read handler on the data fd.
+ *
+ * Returns 0 on success; on failure sets @errp and returns -1.
+ */
+static int net_vde_init(NetClientState *peer, const char *model,
+                        const char *name, const char *sock,
+                        int port, const char *group, int mode, Error **errp)
+{
+    /* The const qualifiers are cast away for the vde_open() interface. */
+    struct vde_open_args open_args = {
+        .port = port,
+        .group = (char *)group,
+        .mode = mode,
+    };
+    NetClientState *client;
+    VDEState *state;
+    VDECONN *conn;
+
+    conn = vde_open((char *)sock, (char *)"QEMU", &open_args);
+    if (conn == NULL) {
+        error_setg_errno(errp, errno, "Could not open vde");
+        return -1;
+    }
+
+    client = qemu_new_net_client(&net_vde_info, peer, model, name);
+    snprintf(client->info_str, sizeof(client->info_str), "sock=%s,fd=%d",
+             sock, vde_datafd(conn));
+
+    state = DO_UPCAST(VDEState, nc, client);
+    state->vde = conn;
+
+    qemu_set_fd_handler(vde_datafd(conn), vde_to_qemu, NULL, state);
+
+    return 0;
+}
+
+/*
+ * Netdev entry point for "-netdev vde": unpack the VDE options and create
+ * the backend.  The mode defaults to 0700 when the user gave none.
+ *
+ * Returns 0 on success, -1 with @errp set on failure.
+ */
+int net_init_vde(const Netdev *netdev, const char *name,
+                 NetClientState *peer, Error **errp)
+{
+    const NetdevVdeOptions *opts;
+    int mode;
+
+    assert(netdev->type == NET_CLIENT_DRIVER_VDE);
+    opts = &netdev->u.vde;
+
+    /* missing optional values have been initialized to "all bits zero" */
+    mode = opts->has_mode ? opts->mode : 0700;
+
+    if (net_vde_init(peer, "vde", name, opts->sock, opts->port, opts->group,
+                     mode, errp) == -1) {
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/net/vhost-user-stub.c b/net/vhost-user-stub.c
new file mode 100644
index 000000000..52ab4e13f
--- /dev/null
+++ b/net/vhost-user-stub.c
@@ -0,0 +1,23 @@
+/*
+ * vhost-user-stub.c
+ *
+ * Copyright (c) 2018 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "clients.h"
+#include "net/vhost_net.h"
+#include "net/vhost-user.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
+
+/*
+ * Stub implementation of the vhost-user netdev entry point: it never
+ * creates a backend, always sets @errp and returns -1.
+ */
+int net_init_vhost_user(const Netdev *netdev, const char *name,
+ NetClientState *peer, Error **errp)
+{
+ error_setg(errp, "vhost-user requires frontend driver virtio-net-*");
+ return -1;
+}
diff --git a/net/vhost-user.c b/net/vhost-user.c
new file mode 100644
index 000000000..b1a0247b5
--- /dev/null
+++ b/net/vhost-user.c
@@ -0,0 +1,438 @@
+/*
+ * vhost-user.c
+ *
+ * Copyright (c) 2013 Virtual Open Systems Sarl.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "clients.h"
+#include "net/vhost_net.h"
+#include "net/vhost-user.h"
+#include "hw/virtio/vhost-user.h"
+#include "chardev/char-fe.h"
+#include "qapi/error.h"
+#include "qapi/qapi-commands-net.h"
+#include "qemu/config-file.h"
+#include "qemu/error-report.h"
+#include "qemu/option.h"
+#include "trace.h"
+
+/* Per-queue state of a vhost-user net client. */
+typedef struct NetVhostUserState {
+ NetClientState nc; /* generic net client; DO_UPCAST() recovers this struct from it */
+ CharBackend chr; /* only queue index 0 */
+ VhostUserState *vhost_user; /* one object allocated in net_vhost_user_init(), pointer stored in every queue */
+ VHostNetState *vhost_net; /* set by vhost_user_start(), freed in net_vhost_user_cleanup() */
+ guint watch; /* G_IO_HUP watch id on chr, 0 when no watch is installed */
+ uint64_t acked_features; /* features saved from vhost_net on stop/close */
+ bool started; /* set once a CHR_EVENT_OPENED has been handled successfully */
+} NetVhostUserState;
+
+/*
+ * Return the vhost_net instance attached to the vhost-user client @nc.
+ * @nc must be a vhost-user net client (asserted).
+ */
+VHostNetState *vhost_user_get_vhost_net(NetClientState *nc)
+{
+    NetVhostUserState *state;
+
+    state = DO_UPCAST(NetVhostUserState, nc, nc);
+    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER);
+
+    return state->vhost_net;
+}
+
+/*
+ * Return the acked feature bits last saved for the vhost-user client @nc.
+ * @nc must be a vhost-user net client (asserted).
+ */
+uint64_t vhost_user_get_acked_features(NetClientState *nc)
+{
+    NetVhostUserState *state;
+
+    state = DO_UPCAST(NetVhostUserState, nc, nc);
+    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER);
+
+    return state->acked_features;
+}
+
+/*
+ * Clean up the vhost_net instance of the first @queues clients in @ncs.
+ *
+ * Before each instance is cleaned up its acked features are saved in the
+ * client state, unless the value read back is zero.  The vhost_net
+ * pointers themselves are neither freed nor cleared here.
+ */
+static void vhost_user_stop(int queues, NetClientState *ncs[])
+{
+    int idx;
+
+    for (idx = 0; idx < queues; idx++) {
+        NetVhostUserState *state;
+        uint64_t acked;
+
+        assert(ncs[idx]->info->type == NET_CLIENT_DRIVER_VHOST_USER);
+
+        state = DO_UPCAST(NetVhostUserState, nc, ncs[idx]);
+        if (!state->vhost_net) {
+            continue;
+        }
+
+        /* save acked features */
+        acked = vhost_net_get_acked_features(state->vhost_net);
+        if (acked) {
+            state->acked_features = acked;
+        }
+        vhost_net_cleanup(state->vhost_net);
+    }
+}
+
+/*
+ * Create a vhost_net instance for each of the @queues clients in @ncs,
+ * all sharing the vhost-user backend @be.
+ *
+ * The first instance is used to query the maximum queue count; asking for
+ * more queues than that is an error.  A client's previous vhost_net, if
+ * any, is cleaned up and freed before being replaced.
+ *
+ * Returns 0 on success.  On failure the instance being set up is released,
+ * the queues handled so far go through vhost_user_stop(), and -1 is
+ * returned.
+ */
+static int vhost_user_start(int queues, NetClientState *ncs[],
+                            VhostUserState *be)
+{
+    VhostNetOptions options;
+    struct vhost_net *net = NULL;
+    int i;
+
+    options.backend_type = VHOST_BACKEND_TYPE_USER;
+
+    for (i = 0; i < queues; i++) {
+        NetVhostUserState *state;
+
+        assert(ncs[i]->info->type == NET_CLIENT_DRIVER_VHOST_USER);
+
+        state = DO_UPCAST(NetVhostUserState, nc, ncs[i]);
+
+        options.net_backend = ncs[i];
+        options.opaque = be;
+        options.busyloop_timeout = 0;
+        options.nvqs = 2;
+        net = vhost_net_init(&options);
+        if (net == NULL) {
+            error_report("failed to init vhost_net for queue %d", i);
+            goto fail;
+        }
+
+        if (i == 0) {
+            int max_queues = vhost_net_get_max_queues(net);
+
+            if (max_queues < queues) {
+                error_report("you are asking more queues than supported: %d",
+                             max_queues);
+                goto fail;
+            }
+        }
+
+        /* Release whatever an earlier start left behind on this queue. */
+        if (state->vhost_net) {
+            vhost_net_cleanup(state->vhost_net);
+            g_free(state->vhost_net);
+        }
+        state->vhost_net = net;
+    }
+
+    return 0;
+
+fail:
+    if (net) {
+        vhost_net_cleanup(net);
+        g_free(net);
+    }
+    vhost_user_stop(i, ncs);
+    return -1;
+}
+
+/*
+ * .receive callback of the vhost-user client.
+ *
+ * In case of RARP (message size is 60) notify the backend to send a fake
+ * RARP.  This fake RARP will be sent by the backend only for guests
+ * without the GUEST_ANNOUNCE capability.  A notification failure is
+ * reported on stderr once and silently ignored afterwards.
+ *
+ * Always returns @size.
+ */
+static ssize_t vhost_user_receive(NetClientState *nc, const uint8_t *buf,
+                                  size_t size)
+{
+    static int display_rarp_failure = 1;
+    NetVhostUserState *state;
+    char mac_addr[6];
+    int rc;
+
+    if (size != 60) {
+        return size;
+    }
+
+    state = DO_UPCAST(NetVhostUserState, nc, nc);
+
+    /* extract guest mac address from the RARP message */
+    memcpy(mac_addr, buf + 6, 6);
+
+    rc = vhost_net_notify_migration_done(state->vhost_net, mac_addr);
+    if (rc != 0 && display_rarp_failure) {
+        fprintf(stderr,
+                "Vhost user backend fails to broadcast fake RARP\n");
+        fflush(stderr);
+        display_rarp_failure = 0;
+    }
+
+    return size;
+}
+
+/*
+ * .cleanup callback of the vhost-user client.
+ *
+ * Every queue releases its vhost_net instance.  Queue 0 additionally
+ * removes the hang-up watch, deinitializes the chardev frontend and
+ * releases the VhostUserState.  Finally the queued packets of @nc are
+ * purged.
+ */
+static void net_vhost_user_cleanup(NetClientState *nc)
+{
+    NetVhostUserState *state = DO_UPCAST(NetVhostUserState, nc, nc);
+    const bool first_queue = (nc->queue_index == 0);
+
+    if (state->vhost_net) {
+        vhost_net_cleanup(state->vhost_net);
+        g_free(state->vhost_net);
+        state->vhost_net = NULL;
+    }
+
+    if (first_queue) {
+        if (state->watch) {
+            g_source_remove(state->watch);
+            state->watch = 0;
+        }
+
+        qemu_chr_fe_deinit(&state->chr, true);
+
+        if (state->vhost_user) {
+            vhost_user_cleanup(state->vhost_user);
+            g_free(state->vhost_user);
+            state->vhost_user = NULL;
+        }
+    }
+
+    qemu_purge_queued_packets(nc);
+}
+
+/*
+ * Shared .set_vnet_be / .set_vnet_le callback; deliberately a no-op.
+ *
+ * If the server supports VHOST_USER_PROTOCOL_F_CROSS_ENDIAN, it will get
+ * the vnet header endianness from there.  If it doesn't, negotiation
+ * fails.
+ *
+ * Always returns 0.
+ */
+static int vhost_user_set_vnet_endianness(NetClientState *nc,
+                                          bool enable)
+{
+    return 0;
+}
+
+/*
+ * .has_vnet_hdr callback: unconditionally true for a vhost-user client.
+ * @nc must be a vhost-user net client (asserted).
+ */
+static bool vhost_user_has_vnet_hdr(NetClientState *nc)
+{
+    const bool has_hdr = true;
+
+    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER);
+
+    return has_hdr;
+}
+
+/*
+ * .has_ufo callback: unconditionally true for a vhost-user client.
+ * @nc must be a vhost-user net client (asserted).
+ */
+static bool vhost_user_has_ufo(NetClientState *nc)
+{
+    const bool has_ufo = true;
+
+    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_USER);
+
+    return has_ufo;
+}
+
+/*
+ * .check_peer_type callback: accept only peers whose class name starts
+ * with "virtio-net-".
+ *
+ * Returns true when the peer is acceptable; otherwise sets @errp and
+ * returns false.
+ */
+static bool vhost_user_check_peer_type(NetClientState *nc, ObjectClass *oc,
+                                       Error **errp)
+{
+    const char *class_name = object_class_get_name(oc);
+
+    if (g_str_has_prefix(class_name, "virtio-net-")) {
+        return true;
+    }
+
+    error_setg(errp, "vhost-user requires frontend driver virtio-net-*");
+    return false;
+}
+
+/*
+ * Net client description for the vhost-user backend: driver type, size of
+ * the per-queue state and callbacks.  Passed to qemu_new_net_client() by
+ * net_vhost_user_init().  Both endianness setters share one no-op handler.
+ */
+static NetClientInfo net_vhost_user_info = {
+ .type = NET_CLIENT_DRIVER_VHOST_USER,
+ .size = sizeof(NetVhostUserState),
+ .receive = vhost_user_receive,
+ .cleanup = net_vhost_user_cleanup,
+ .has_vnet_hdr = vhost_user_has_vnet_hdr,
+ .has_ufo = vhost_user_has_ufo,
+ .set_vnet_be = vhost_user_set_vnet_endianness,
+ .set_vnet_le = vhost_user_set_vnet_endianness,
+ .check_peer_type = vhost_user_check_peer_type,
+};
+
+/*
+ * Callback of the G_IO_HUP watch installed on the vhost-user chardev:
+ * disconnect the chardev frontend.
+ *
+ * Returns TRUE; the watch is removed explicitly elsewhere through
+ * g_source_remove().
+ */
+static gboolean net_vhost_user_watch(void *do_not_use, GIOCondition cond,
+                                     void *opaque)
+{
+    NetVhostUserState *state = opaque;
+    CharBackend *backend = &state->chr;
+
+    qemu_chr_fe_disconnect(backend);
+
+    return TRUE;
+}
+
+static void net_vhost_user_event(void *opaque, QEMUChrEvent event);
+
+/*
+ * Bottom half scheduled by net_vhost_user_event() on CHR_EVENT_CLOSED.
+ *
+ * @opaque is the netdev name.  The acked features of every queue that
+ * still has a vhost_net are saved, the link is set down, and the chardev
+ * event handler (removed when the close was first seen) is installed
+ * again on queue 0.
+ */
+static void chr_closed_bh(void *opaque)
+{
+    const char *name = opaque;
+    NetClientState *ncs[MAX_QUEUE_NUM];
+    NetVhostUserState *s;
+    Error *err = NULL;
+    int queues;
+    int i;
+
+    queues = qemu_find_net_clients_except(name, ncs,
+                                          NET_CLIENT_DRIVER_NIC,
+                                          MAX_QUEUE_NUM);
+    assert(queues < MAX_QUEUE_NUM);
+
+    s = DO_UPCAST(NetVhostUserState, nc, ncs[0]);
+
+    /* Walk from the last queue down, so that s ends up on queue 0. */
+    i = queues;
+    while (i-- > 0) {
+        s = DO_UPCAST(NetVhostUserState, nc, ncs[i]);
+
+        if (s->vhost_net) {
+            s->acked_features = vhost_net_get_acked_features(s->vhost_net);
+        }
+    }
+
+    qmp_set_link(name, false, &err);
+
+    qemu_chr_fe_set_handlers(&s->chr, NULL, NULL, net_vhost_user_event,
+                             NULL, opaque, NULL, true);
+
+    if (err) {
+        error_report_err(err);
+    }
+}
+
+/*
+ * Chardev event handler of the vhost-user netdev; @opaque is the netdev
+ * name.
+ *
+ * CHR_EVENT_OPENED: start vhost on all queues, watch the chardev for
+ * hang-up, set the link up and mark the backend started.  If the start
+ * fails the chardev is disconnected instead.
+ *
+ * CHR_EVENT_CLOSED: when a watch is installed, remove it, detach the
+ * chardev handlers and defer the remaining work to chr_closed_bh().
+ *
+ * The other events listed in the switch are ignored.
+ */
+static void net_vhost_user_event(void *opaque, QEMUChrEvent event)
+{
+    const char *name = opaque;
+    NetClientState *ncs[MAX_QUEUE_NUM];
+    NetVhostUserState *s;
+    AioContext *ctx;
+    Chardev *chr;
+    Error *err = NULL;
+    int queues;
+
+    queues = qemu_find_net_clients_except(name, ncs,
+                                          NET_CLIENT_DRIVER_NIC,
+                                          MAX_QUEUE_NUM);
+    assert(queues < MAX_QUEUE_NUM);
+
+    s = DO_UPCAST(NetVhostUserState, nc, ncs[0]);
+    chr = qemu_chr_fe_get_driver(&s->chr);
+    trace_vhost_user_event(chr->label, event);
+
+    switch (event) {
+    case CHR_EVENT_OPENED:
+        if (vhost_user_start(queues, ncs, s->vhost_user) < 0) {
+            qemu_chr_fe_disconnect(&s->chr);
+            return;
+        }
+        s->watch = qemu_chr_fe_add_watch(&s->chr, G_IO_HUP,
+                                         net_vhost_user_watch, s);
+        qmp_set_link(name, true, &err);
+        s->started = true;
+        break;
+
+    case CHR_EVENT_CLOSED:
+        /*
+         * a close event may happen during a read/write, but vhost
+         * code assumes the vhost_dev remains setup, so delay the
+         * stop & clear to idle.
+         * FIXME: better handle failure in vhost code, remove bh
+         */
+        if (!s->watch) {
+            break;
+        }
+        ctx = qemu_get_current_aio_context();
+
+        g_source_remove(s->watch);
+        s->watch = 0;
+        qemu_chr_fe_set_handlers(&s->chr, NULL, NULL, NULL, NULL,
+                                 NULL, NULL, false);
+
+        aio_bh_schedule_oneshot(ctx, chr_closed_bh, opaque);
+        break;
+
+    case CHR_EVENT_BREAK:
+    case CHR_EVENT_MUX_IN:
+    case CHR_EVENT_MUX_OUT:
+        /* Ignore */
+        break;
+    }
+
+    if (err) {
+        error_report_err(err);
+    }
+}
+
+/*
+ * Create @queues vhost-user net clients named @name on top of chardev
+ * @chr and wait until the backend has been started.
+ *
+ * One VhostUserState is allocated and its pointer stored in every queue.
+ * Only the first client (queue 0) initializes the chardev frontend and the
+ * vhost-user state.  The function then loops: wait for the chardev to
+ * connect, install net_vhost_user_event() as event handler, repeat until
+ * the state of queue 0 is marked started.
+ *
+ * Returns 0 on success.  On failure the error is reported, the
+ * VhostUserState is released, the first client is deleted and -1 is
+ * returned.
+ */
+static int net_vhost_user_init(NetClientState *peer, const char *device,
+                               const char *name, Chardev *chr,
+                               int queues)
+{
+    Error *err = NULL;
+    NetClientState *nc;
+    NetClientState *nc0 = NULL;
+    NetVhostUserState *s = NULL;
+    VhostUserState *user;
+    int i;
+
+    assert(name);
+    assert(queues > 0);
+
+    user = g_new0(struct VhostUserState, 1);
+
+    for (i = 0; i < queues; i++) {
+        nc = qemu_new_net_client(&net_vhost_user_info, peer, device, name);
+        snprintf(nc->info_str, sizeof(nc->info_str), "vhost-user%d to %s",
+                 i, chr->label);
+        nc->queue_index = i;
+
+        if (!nc0) {
+            /* First client: it alone owns the chardev frontend. */
+            nc0 = nc;
+            s = DO_UPCAST(NetVhostUserState, nc, nc);
+            if (!qemu_chr_fe_init(&s->chr, chr, &err) ||
+                !vhost_user_init(user, &s->chr, &err)) {
+                error_report_err(err);
+                goto fail;
+            }
+        }
+
+        s = DO_UPCAST(NetVhostUserState, nc, nc);
+        s->vhost_user = user;
+    }
+
+    s = DO_UPCAST(NetVhostUserState, nc, nc0);
+    do {
+        if (qemu_chr_fe_wait_connected(&s->chr, &err) < 0) {
+            error_report_err(err);
+            goto fail;
+        }
+        qemu_chr_fe_set_handlers(&s->chr, NULL, NULL,
+                                 net_vhost_user_event, NULL, nc0->name, NULL,
+                                 true);
+    } while (!s->started);
+
+    assert(s->vhost_net);
+
+    return 0;
+
+fail:
+    if (user) {
+        vhost_user_cleanup(user);
+        g_free(user);
+        if (s) {
+            /* Keep the later cleanup callback away from the freed state. */
+            s->vhost_user = NULL;
+        }
+    }
+    if (nc0) {
+        qemu_del_net_client(nc0);
+    }
+
+    return -1;
+}
+
+/*
+ * Look up the chardev named in @opts and check that it is usable for
+ * vhost-user: it must exist, be reconnectable and support FD passing.
+ *
+ * Returns the chardev, or NULL with @errp set.
+ */
+static Chardev *net_vhost_claim_chardev(
+    const NetdevVhostUserOptions *opts, Error **errp)
+{
+    const char *id = opts->chardev;
+    Chardev *chr = qemu_chr_find(id);
+
+    if (!chr) {
+        error_setg(errp, "chardev \"%s\" not found", id);
+        return NULL;
+    }
+
+    if (!qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_RECONNECTABLE)) {
+        error_setg(errp, "chardev \"%s\" is not reconnectable", id);
+        return NULL;
+    }
+
+    if (!qemu_chr_has_feature(chr, QEMU_CHAR_FEATURE_FD_PASS)) {
+        error_setg(errp, "chardev \"%s\" does not support FD passing", id);
+        return NULL;
+    }
+
+    return chr;
+}
+
+/*
+ * Netdev entry point for "-netdev vhost-user": validate the chardev and
+ * the queue count (default 1, allowed range [1, MAX_QUEUE_NUM]), then
+ * create the clients.
+ *
+ * Returns 0 on success and -1 on failure.
+ */
+int net_init_vhost_user(const Netdev *netdev, const char *name,
+                        NetClientState *peer, Error **errp)
+{
+    const NetdevVhostUserOptions *opts;
+    Chardev *chr;
+    int queues = 1;
+
+    assert(netdev->type == NET_CLIENT_DRIVER_VHOST_USER);
+    opts = &netdev->u.vhost_user;
+
+    chr = net_vhost_claim_chardev(opts, errp);
+    if (chr == NULL) {
+        return -1;
+    }
+
+    if (opts->has_queues) {
+        queues = opts->queues;
+    }
+    if (queues < 1 || queues > MAX_QUEUE_NUM) {
+        error_setg(errp,
+                   "vhost-user number of queues must be in range [1, %d]",
+                   MAX_QUEUE_NUM);
+        return -1;
+    }
+
+    return net_vhost_user_init(peer, "vhost_user", name, chr, queues);
+}
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
new file mode 100644
index 000000000..25dd6dd97
--- /dev/null
+++ b/net/vhost-vdpa.c
@@ -0,0 +1,315 @@
+/*
+ * vhost-vdpa.c
+ *
+ * Copyright(c) 2017-2018 Intel Corporation.
+ * Copyright(c) 2020 Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "clients.h"
+#include "net/vhost_net.h"
+#include "net/vhost-vdpa.h"
+#include "hw/virtio/vhost-vdpa.h"
+#include "qemu/config-file.h"
+#include "qemu/error-report.h"
+#include "qemu/option.h"
+#include "qapi/error.h"
+#include <linux/vhost.h>
+#include <sys/ioctl.h>
+#include <err.h>
+#include "standard-headers/linux/virtio_net.h"
+#include "monitor/monitor.h"
+#include "hw/virtio/vhost.h"
+
+/*
+ * Per-client state of a vhost-vdpa net backend.
+ * TODO: multiqueue support still needs to be added here.
+ */
+typedef struct VhostVDPAState {
+ NetClientState nc; /* generic net client; DO_UPCAST() recovers this struct from it */
+ struct vhost_vdpa vhost_vdpa; /* device_fd and index are filled in by net_vhost_vdpa_init() */
+ VHostNetState *vhost_net; /* set by vhost_vdpa_add(), freed in vhost_vdpa_cleanup() */
+ bool started;
+} VhostVDPAState;
+
+/*
+ * Virtio feature bit numbers listed for the vhost-vdpa net backend; the
+ * list is terminated by VHOST_INVALID_FEATURE_BIT.
+ * NOTE(review): presumably consumed by the vhost feature negotiation code
+ * outside this file -- confirm.
+ * NOTE(review): VIRTIO_NET_F_RSS and VIRTIO_NET_F_GUEST_ANNOUNCE each
+ * appear twice below; the second occurrences look redundant -- confirm
+ * before removing them.
+ */
+const int vdpa_feature_bits[] = {
+ VIRTIO_F_NOTIFY_ON_EMPTY,
+ VIRTIO_RING_F_INDIRECT_DESC,
+ VIRTIO_RING_F_EVENT_IDX,
+ VIRTIO_F_ANY_LAYOUT,
+ VIRTIO_F_VERSION_1,
+ VIRTIO_NET_F_CSUM,
+ VIRTIO_NET_F_GUEST_CSUM,
+ VIRTIO_NET_F_GSO,
+ VIRTIO_NET_F_GUEST_TSO4,
+ VIRTIO_NET_F_GUEST_TSO6,
+ VIRTIO_NET_F_GUEST_ECN,
+ VIRTIO_NET_F_GUEST_UFO,
+ VIRTIO_NET_F_HOST_TSO4,
+ VIRTIO_NET_F_HOST_TSO6,
+ VIRTIO_NET_F_HOST_ECN,
+ VIRTIO_NET_F_HOST_UFO,
+ VIRTIO_NET_F_MRG_RXBUF,
+ VIRTIO_NET_F_MTU,
+ VIRTIO_NET_F_CTRL_RX,
+ VIRTIO_NET_F_CTRL_RX_EXTRA,
+ VIRTIO_NET_F_CTRL_VLAN,
+ VIRTIO_NET_F_GUEST_ANNOUNCE,
+ VIRTIO_NET_F_CTRL_MAC_ADDR,
+ VIRTIO_NET_F_RSS,
+ VIRTIO_NET_F_MQ,
+ VIRTIO_NET_F_CTRL_VQ,
+ VIRTIO_F_IOMMU_PLATFORM,
+ VIRTIO_F_RING_PACKED,
+ VIRTIO_NET_F_RSS,
+ VIRTIO_NET_F_HASH_REPORT,
+ VIRTIO_NET_F_GUEST_ANNOUNCE,
+ VIRTIO_NET_F_STATUS,
+ VHOST_INVALID_FEATURE_BIT
+};
+
+/*
+ * Return the vhost_net instance attached to the vhost-vdpa client @nc.
+ * @nc must be a vhost-vdpa net client (asserted).
+ */
+VHostNetState *vhost_vdpa_get_vhost_net(NetClientState *nc)
+{
+    VhostVDPAState *state;
+
+    state = DO_UPCAST(VhostVDPAState, nc, nc);
+    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
+
+    return state->vhost_net;
+}
+
+/*
+ * Check that the vDPA device behind @net is a virtio network device.
+ *
+ * Returns 0 when the device id is VIRTIO_ID_NET, the backend's error code
+ * when the id cannot be queried, and -ENOTSUP for any other device type.
+ */
+static int vhost_vdpa_net_check_device_id(struct vhost_net *net)
+{
+    uint32_t device_id = 0;
+    int ret;
+    struct vhost_dev *hdev;
+
+    hdev = (struct vhost_dev *)&net->dev;
+    ret = hdev->vhost_ops->vhost_get_device_id(hdev, &device_id);
+    if (ret) {
+        /* device_id was not filled in; do not look at it. */
+        return ret;
+    }
+    if (device_id != VIRTIO_ID_NET) {
+        return -ENOTSUP;
+    }
+    return 0;
+}
+
+/*
+ * Create the vhost_net instance for the vhost-vdpa client @ncs and check
+ * that the underlying device is a network device.
+ *
+ * @be is handed to vhost_net_init() as the backend opaque pointer and
+ * @nvqs as the number of virtqueues.  @queue_pair_index is not used here.
+ *
+ * Returns 0 on success, -1 on failure.  On failure the client state does
+ * not keep a pointer to the vhost_net instance.
+ */
+static int vhost_vdpa_add(NetClientState *ncs, void *be,
+                          int queue_pair_index, int nvqs)
+{
+    VhostNetOptions options;
+    struct vhost_net *net = NULL;
+    VhostVDPAState *s;
+    int ret;
+
+    options.backend_type = VHOST_BACKEND_TYPE_VDPA;
+    assert(ncs->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
+    s = DO_UPCAST(VhostVDPAState, nc, ncs);
+    options.net_backend = ncs;
+    options.opaque = be;
+    options.busyloop_timeout = 0;
+    options.nvqs = nvqs;
+
+    net = vhost_net_init(&options);
+    if (!net) {
+        error_report("failed to init vhost_net for queue");
+        goto err_init;
+    }
+    s->vhost_net = net;
+    ret = vhost_vdpa_net_check_device_id(net);
+    if (ret) {
+        goto err_check;
+    }
+    return 0;
+err_check:
+    /*
+     * Forget the instance before freeing it: vhost_vdpa_cleanup() cleans
+     * up and frees whatever s->vhost_net still points to, which would be
+     * a second cleanup and a double free of net.
+     */
+    s->vhost_net = NULL;
+    vhost_net_cleanup(net);
+    g_free(net);
+err_init:
+    return -1;
+}
+
+/*
+ * .cleanup callback of the vhost-vdpa client: release the vhost_net
+ * instance, if any, and close the vDPA device fd when it is valid.
+ */
+static void vhost_vdpa_cleanup(NetClientState *nc)
+{
+    VhostVDPAState *state = DO_UPCAST(VhostVDPAState, nc, nc);
+    struct vhost_vdpa *vdpa = &state->vhost_vdpa;
+
+    if (state->vhost_net) {
+        vhost_net_cleanup(state->vhost_net);
+        g_free(state->vhost_net);
+        state->vhost_net = NULL;
+    }
+
+    if (vdpa->device_fd >= 0) {
+        qemu_close(vdpa->device_fd);
+        /* Mark the fd as closed so that it is not closed again from here. */
+        vdpa->device_fd = -1;
+    }
+}
+
+/*
+ * .has_vnet_hdr callback: unconditionally true for a vhost-vdpa client.
+ * @nc must be a vhost-vdpa net client (asserted).
+ */
+static bool vhost_vdpa_has_vnet_hdr(NetClientState *nc)
+{
+    const bool has_hdr = true;
+
+    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
+
+    return has_hdr;
+}
+
+/*
+ * .has_ufo callback: report whether VIRTIO_NET_F_HOST_UFO is still set
+ * after passing it through vhost_net_get_features() for this client.
+ * @nc must be a vhost-vdpa net client (asserted).
+ */
+static bool vhost_vdpa_has_ufo(NetClientState *nc)
+{
+    const uint64_t host_ufo = 1ULL << VIRTIO_NET_F_HOST_UFO;
+    VhostVDPAState *state;
+    uint64_t supported;
+
+    assert(nc->info->type == NET_CLIENT_DRIVER_VHOST_VDPA);
+    state = DO_UPCAST(VhostVDPAState, nc, nc);
+
+    supported = vhost_net_get_features(state->vhost_net, host_ufo);
+
+    return (supported & host_ufo) != 0;
+}
+
+/*
+ * .check_peer_type callback: accept only peers whose class name starts
+ * with "virtio-net-".
+ *
+ * Returns true when the peer is acceptable; otherwise sets @errp and
+ * returns false.
+ */
+static bool vhost_vdpa_check_peer_type(NetClientState *nc, ObjectClass *oc,
+                                       Error **errp)
+{
+    const char *class_name = object_class_get_name(oc);
+
+    if (g_str_has_prefix(class_name, "virtio-net-")) {
+        return true;
+    }
+
+    error_setg(errp, "vhost-vdpa requires frontend driver virtio-net-*");
+    return false;
+}
+
+/**
+ * Dummy receive in case qemu falls back to userland tap networking.
+ *
+ * The packet in @buf is not looked at; the callback always returns 0.
+ */
+static ssize_t vhost_vdpa_receive(NetClientState *nc, const uint8_t *buf,
+                                  size_t size)
+{
+    const ssize_t consumed = 0;
+
+    return consumed;
+}
+
+/*
+ * Net client description for the vhost-vdpa backend: driver type, size of
+ * the per-client state and callbacks.  Used by net_vhost_vdpa_init() for
+ * both datapath and control clients.
+ */
+static NetClientInfo net_vhost_vdpa_info = {
+ .type = NET_CLIENT_DRIVER_VHOST_VDPA,
+ .size = sizeof(VhostVDPAState),
+ .receive = vhost_vdpa_receive,
+ .cleanup = vhost_vdpa_cleanup,
+ .has_vnet_hdr = vhost_vdpa_has_vnet_hdr,
+ .has_ufo = vhost_vdpa_has_ufo,
+ .check_peer_type = vhost_vdpa_check_peer_type,
+};
+
+/*
+ * Create one vhost-vdpa net client on top of the already opened device fd
+ * @vdpa_device_fd.
+ *
+ * @is_datapath selects between a regular net client and a control client.
+ * @queue_pair_index is stored as the vhost_vdpa index and @nvqs is the
+ * number of virtqueues handed to vhost_vdpa_add().
+ *
+ * Returns the new client, or NULL after deleting it when vhost_vdpa_add()
+ * fails.
+ */
+static NetClientState *net_vhost_vdpa_init(NetClientState *peer,
+                                           const char *device,
+                                           const char *name,
+                                           int vdpa_device_fd,
+                                           int queue_pair_index,
+                                           int nvqs,
+                                           bool is_datapath)
+{
+    NetClientState *client = NULL;
+    VhostVDPAState *state;
+    int rc = 0;
+
+    assert(name);
+
+    if (!is_datapath) {
+        client = qemu_new_net_control_client(&net_vhost_vdpa_info, peer,
+                                             device, name);
+    } else {
+        client = qemu_new_net_client(&net_vhost_vdpa_info, peer, device,
+                                     name);
+    }
+    snprintf(client->info_str, sizeof(client->info_str), TYPE_VHOST_VDPA);
+
+    state = DO_UPCAST(VhostVDPAState, nc, client);
+    state->vhost_vdpa.device_fd = vdpa_device_fd;
+    state->vhost_vdpa.index = queue_pair_index;
+
+    rc = vhost_vdpa_add(client, (void *)&state->vhost_vdpa,
+                        queue_pair_index, nvqs);
+    if (rc) {
+        qemu_del_net_client(client);
+        return NULL;
+    }
+
+    return client;
+}
+
+/*
+ * Query the vhost-vDPA device behind @fd for its queue configuration.
+ *
+ * *@has_cvq is set to 1 when the device offers VIRTIO_NET_F_CTRL_VQ and to
+ * 0 otherwise.  When the device offers VIRTIO_NET_F_MQ, the
+ * max_virtqueue_pairs field is read from its config space.
+ *
+ * Returns the maximum number of queue pairs (1 without VIRTIO_NET_F_MQ),
+ * or a negative value with @errp set when one of the ioctls fails.
+ */
+static int vhost_vdpa_get_max_queue_pairs(int fd, int *has_cvq, Error **errp)
+{
+    unsigned long config_size = offsetof(struct vhost_vdpa_config, buf);
+    g_autofree struct vhost_vdpa_config *config = NULL;
+    __virtio16 *max_queue_pairs;
+    uint64_t features;
+    int saved_errno;
+    int ret;
+
+    ret = ioctl(fd, VHOST_GET_FEATURES, &features);
+    if (ret) {
+        saved_errno = errno;
+        error_setg(errp, "Fail to query features from vhost-vDPA device");
+        return saved_errno > 0 ? -saved_errno : -1;
+    }
+
+    /* features is 64 bits wide, so build the masks as 64-bit values. */
+    if (features & (1ULL << VIRTIO_NET_F_CTRL_VQ)) {
+        *has_cvq = 1;
+    } else {
+        *has_cvq = 0;
+    }
+
+    if (features & (1ULL << VIRTIO_NET_F_MQ)) {
+        config = g_malloc0(config_size + sizeof(*max_queue_pairs));
+        config->off = offsetof(struct virtio_net_config, max_virtqueue_pairs);
+        config->len = sizeof(*max_queue_pairs);
+
+        ret = ioctl(fd, VHOST_VDPA_GET_CONFIG, config);
+        if (ret) {
+            /*
+             * ioctl() reports failure as -1, so negating its return value
+             * would yield +1 and look like "one queue pair" to the caller.
+             * Return a negative errno value instead.
+             */
+            saved_errno = errno;
+            error_setg(errp, "Fail to get config from vhost-vDPA device");
+            return saved_errno > 0 ? -saved_errno : -1;
+        }
+
+        max_queue_pairs = (__virtio16 *)&config->buf;
+
+        return lduw_le_p(max_queue_pairs);
+    }
+
+    return 1;
+}
+
+/*
+ * Netdev entry point for "-netdev vhost-vdpa".
+ *
+ * Opens the vhost-vDPA character device named by the vhostdev option,
+ * queries the number of queue pairs and whether a control virtqueue is
+ * offered, then creates one datapath client per queue pair (two
+ * virtqueues each) and, if offered, one control client (one virtqueue).
+ *
+ * Returns 0 on success and a negative value on failure.
+ */
+int net_init_vhost_vdpa(const Netdev *netdev, const char *name,
+                        NetClientState *peer, Error **errp)
+{
+    const NetdevVhostVDPAOptions *opts;
+    NetClientState **ncs;
+    NetClientState *nc;
+    int vdpa_device_fd;
+    int queue_pairs;
+    int has_cvq = 0;
+    int i;
+
+    assert(netdev->type == NET_CLIENT_DRIVER_VHOST_VDPA);
+    opts = &netdev->u.vhost_vdpa;
+    if (!opts->vhostdev) {
+        error_setg(errp, "vdpa character device not specified with vhostdev");
+        return -1;
+    }
+
+    vdpa_device_fd = qemu_open(opts->vhostdev, O_RDWR, errp);
+    if (vdpa_device_fd == -1) {
+        return -errno;
+    }
+
+    queue_pairs = vhost_vdpa_get_max_queue_pairs(vdpa_device_fd,
+                                                 &has_cvq, errp);
+    if (queue_pairs < 0) {
+        qemu_close(vdpa_device_fd);
+        return queue_pairs;
+    }
+
+    ncs = g_malloc0(sizeof(*ncs) * queue_pairs);
+
+    for (i = 0; i < queue_pairs; i++) {
+        ncs[i] = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
+                                     vdpa_device_fd, i, 2, true);
+        if (ncs[i] == NULL) {
+            goto fail;
+        }
+    }
+
+    if (has_cvq) {
+        /* The control client takes the index after the last queue pair. */
+        nc = net_vhost_vdpa_init(peer, TYPE_VHOST_VDPA, name,
+                                 vdpa_device_fd, i, 1, false);
+        if (nc == NULL) {
+            goto fail;
+        }
+    }
+
+    g_free(ncs);
+    return 0;
+
+fail:
+    /*
+     * NOTE(review): vhost_vdpa_cleanup() also closes device_fd, and
+     * net_vhost_vdpa_init() already deletes the client it failed on.  If
+     * qemu_del_net_client() runs that cleanup for every client sharing
+     * this name, the fd may be closed more than once on this path, and
+     * ncs[0] may already be gone -- confirm against qemu_del_net_client().
+     */
+    if (i) {
+        qemu_del_net_client(ncs[0]);
+    }
+    qemu_close(vdpa_device_fd);
+    g_free(ncs);
+
+    return -1;
+}