aboutsummaryrefslogtreecommitdiffstats
path: root/tools
diff options
context:
space:
mode:
Diffstat (limited to 'tools')
-rwxr-xr-x tools/ebpf/Makefile.ebpf 21
-rw-r--r-- tools/ebpf/rss.bpf.c 571
-rw-r--r-- tools/meson.build 35
-rw-r--r-- tools/vhost-user-rng/50-qemu-rng.json.in 5
-rw-r--r-- tools/vhost-user-rng/main.c 407
-rw-r--r-- tools/vhost-user-rng/meson.build 10
-rw-r--r-- tools/virtiofsd/50-qemu-virtiofsd.json.in 5
-rw-r--r-- tools/virtiofsd/buffer.c 350
-rw-r--r-- tools/virtiofsd/fuse_common.h 832
-rw-r--r-- tools/virtiofsd/fuse_i.h 100
-rw-r--r-- tools/virtiofsd/fuse_log.c 39
-rw-r--r-- tools/virtiofsd/fuse_log.h 73
-rw-r--r-- tools/virtiofsd/fuse_lowlevel.c 2614
-rw-r--r-- tools/virtiofsd/fuse_lowlevel.h 1975
-rw-r--r-- tools/virtiofsd/fuse_misc.h 59
-rw-r--r-- tools/virtiofsd/fuse_opt.c 446
-rw-r--r-- tools/virtiofsd/fuse_opt.h 272
-rw-r--r-- tools/virtiofsd/fuse_signals.c 93
-rw-r--r-- tools/virtiofsd/fuse_virtio.c 1079
-rw-r--r-- tools/virtiofsd/fuse_virtio.h 33
-rw-r--r-- tools/virtiofsd/helper.c 405
-rw-r--r-- tools/virtiofsd/meson.build 18
-rw-r--r-- tools/virtiofsd/passthrough_helpers.h 51
-rw-r--r-- tools/virtiofsd/passthrough_ll.c 4090
-rw-r--r-- tools/virtiofsd/passthrough_seccomp.c 177
-rw-r--r-- tools/virtiofsd/passthrough_seccomp.h 15
26 files changed, 13775 insertions, 0 deletions
diff --git a/tools/ebpf/Makefile.ebpf b/tools/ebpf/Makefile.ebpf
new file mode 100755
index 000000000..8f327ae3b
--- /dev/null
+++ b/tools/ebpf/Makefile.ebpf
@@ -0,0 +1,21 @@
+# Build rules for the eBPF RSS steering program (rss.bpf.c).
+# Building rss.bpf.o also regenerates rss.bpf.skeleton.h with bpftool and
+# copies it to ../../ebpf/.
+OBJS = rss.bpf.o
+
+# Tools; each may be overridden from the environment or the command line.
+LLC ?= llc
+CLANG ?= clang
+# Expands (in the shell) to the output of "$(CLANG) -print-file-name=include".
+INC_FLAGS = `$(CLANG) -print-file-name=include`
+# -emit-llvm makes clang produce LLVM output that is piped to $(LLC) below.
+EXTRA_CFLAGS ?= -O2 -emit-llvm -fno-stack-protector
+
+all: $(OBJS)
+
+.PHONY: clean
+
+# Removes only the object files; the generated rss.bpf.skeleton.h is kept.
+clean:
+ rm -f $(OBJS)
+
+# Compile the C source with clang, pipe the result to llc to obtain a BPF
+# object, then generate the skeleton header from rss.bpf.o and copy it to
+# ../../ebpf/. LINUXINCLUDE is not set in this file; it is taken from the
+# environment or the make command line if provided.
+$(OBJS): %.o:%.c
+ $(CLANG) $(INC_FLAGS) \
+ -D__KERNEL__ -D__ASM_SYSREG_H \
+ -I../include $(LINUXINCLUDE) \
+ $(EXTRA_CFLAGS) -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
+ bpftool gen skeleton rss.bpf.o > rss.bpf.skeleton.h
+ cp rss.bpf.skeleton.h ../../ebpf/
diff --git a/tools/ebpf/rss.bpf.c b/tools/ebpf/rss.bpf.c
new file mode 100644
index 000000000..e85ec55f9
--- /dev/null
+++ b/tools/ebpf/rss.bpf.c
@@ -0,0 +1,571 @@
+/*
+ * eBPF RSS program
+ *
+ * Developed by Daynix Computing LTD (http://www.daynix.com)
+ *
+ * Authors:
+ * Andrew Melnychenko <andrew@daynix.com>
+ * Yuri Benditovich <yuri.benditovich@daynix.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ * Prepare:
+ * Requires llvm, clang, bpftool, linux kernel tree
+ *
+ * Build rss.bpf.skeleton.h:
+ * make -f Makefile.ebpf clean all
+ */
+
+#include <stddef.h>
+#include <stdbool.h>
+#include <linux/bpf.h>
+
+#include <linux/in.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+
+#include <linux/udp.h>
+#include <linux/tcp.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+#include <linux/virtio_net.h>
+
+/* Number of entries in the tap_rss_map_indirection_table array map. */
+#define INDIRECTION_TABLE_SIZE 128
+/*
+ * Size in bytes of the buffer that holds the Toeplitz hash input and of the
+ * key bytes that follow the first 32 key bits. The largest input built in
+ * this file is two IPv6 addresses plus two 16-bit ports (16 + 16 + 2 + 2).
+ */
+#define HASH_CALCULATION_BUFFER_SIZE 36
+
+/*
+ * Value type of the single entry in tap_rss_map_configurations.
+ * The structure is packed, so its fields are laid out without padding.
+ */
+struct rss_config_t {
+ /* When zero, the steering program returns default_queue without hashing. */
+ __u8 redirect;
+ /* Not read by the code in this file. */
+ __u8 populate_hash;
+ /* Bitmask tested against the VIRTIO_NET_RSS_HASH_TYPE_* flags. */
+ __u32 hash_types;
+ /* Modulus applied to the hash to obtain the indirection table index. */
+ __u16 indirections_len;
+ /* Queue returned when redirection is off or no table entry is found. */
+ __u16 default_queue;
+} __attribute__((packed));
+
+/* Value type of the single entry in tap_rss_map_toeplitz_key. */
+struct toeplitz_key_data_t {
+ /* Initial 32-bit key window XORed into the hash by net_toeplitz_add(). */
+ __u32 leftmost_32_bits;
+ /* Key bytes shifted into the window, most significant bit first. */
+ __u8 next_byte[HASH_CALCULATION_BUFFER_SIZE];
+};
+
+/*
+ * Per-packet parsing result filled in by parse_packet() and consumed by
+ * calculate_rss_hash().
+ */
+struct packet_hash_info_t {
+ /* Flags: each is set to a non-zero value when the condition was detected. */
+ __u8 is_ipv4;
+ __u8 is_ipv6;
+ __u8 is_udp;
+ __u8 is_tcp;
+ /* Set when in6_ext_src / in6_ext_dst below were filled in. */
+ __u8 is_ipv6_ext_src;
+ __u8 is_ipv6_ext_dst;
+ /* When set, parse_packet() does not look at the L4 header. */
+ __u8 is_fragmented;
+
+ /* Ports copied unchanged from the TCP or UDP header. */
+ __u16 src_port;
+ __u16 dst_port;
+
+ /* IPv4 and IPv6 addresses share storage; only one family is used. */
+ union {
+ struct {
+ __be32 in_src;
+ __be32 in_dst;
+ };
+
+ struct {
+ struct in6_addr in6_src;
+ struct in6_addr in6_dst;
+ /* Address read from a destination-options HAO option. */
+ struct in6_addr in6_ext_src;
+ /* Address read from a type 2 routing header. */
+ struct in6_addr in6_ext_dst;
+ };
+ };
+};
+
+/* Single-entry array map holding the struct rss_config_t configuration. */
+struct bpf_map_def SEC("maps")
+tap_rss_map_configurations = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(struct rss_config_t),
+ .max_entries = 1,
+};
+
+/* Single-entry array map holding the Toeplitz key. */
+struct bpf_map_def SEC("maps")
+tap_rss_map_toeplitz_key = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(struct toeplitz_key_data_t),
+ .max_entries = 1,
+};
+
+/*
+ * Indirection table: maps (hash % indirections_len) to a 16-bit queue
+ * index; holds INDIRECTION_TABLE_SIZE entries.
+ */
+struct bpf_map_def SEC("maps")
+tap_rss_map_indirection_table = {
+ .type = BPF_MAP_TYPE_ARRAY,
+ .key_size = sizeof(__u32),
+ .value_size = sizeof(__u16),
+ .max_entries = INDIRECTION_TABLE_SIZE,
+};
+
+/*
+ * Append @size bytes from @ptr to @rss_input at offset *@bytes_written and
+ * advance *@bytes_written by @size. No bounds check is performed here; the
+ * caller must make sure the chunk fits into @rss_input.
+ */
+static inline void net_rx_rss_add_chunk(__u8 *rss_input, size_t *bytes_written,
+                                        const void *ptr, size_t size)
+{
+    __u8 *dst = rss_input + *bytes_written;
+
+    __builtin_memcpy(dst, ptr, size);
+    *bytes_written = *bytes_written + size;
+}
+
+/*
+ * Fold @input into the Toeplitz hash accumulated in *@result using @key.
+ *
+ * The loop always walks HASH_CALCULATION_BUFFER_SIZE bytes of @input and of
+ * key->next_byte; @len is accepted but not consulted. Zero input bits never
+ * change the accumulator.
+ */
+static inline
+void net_toeplitz_add(__u32 *result,
+                      __u8 *input,
+                      __u32 len,
+                      struct toeplitz_key_data_t *key)
+{
+    __u32 hash = *result;
+    __u32 key_window = key->leftmost_32_bits;
+    __u32 idx;
+
+    for (idx = 0; idx < HASH_CALCULATION_BUFFER_SIZE; idx++) {
+        __u8 data = input[idx];
+        __u8 next_key = key->next_byte[idx];
+        __u8 shift;
+
+        for (shift = 0; shift < 8; shift++) {
+            /* Every set input bit (MSB first) XORs in the key window. */
+            if (data & 0x80) {
+                hash ^= key_window;
+            }
+
+            /* Slide the window left, pulling in the next key bit. */
+            key_window = (key_window << 1) | (__u32)(next_key >> 7);
+
+            data <<= 1;
+            next_key <<= 1;
+        }
+    }
+
+    *result = hash;
+}
+
+
+/*
+ * Return 1 when @hdr_type is one of the next-header values that
+ * parse_ipv6_ext() walks over as an extension header, 0 otherwise.
+ */
+static inline int ip6_extension_header_type(__u8 hdr_type)
+{
+    if (hdr_type == IPPROTO_HOPOPTS ||
+        hdr_type == IPPROTO_ROUTING ||
+        hdr_type == IPPROTO_FRAGMENT ||
+        hdr_type == IPPROTO_ICMPV6 ||
+        hdr_type == IPPROTO_NONE ||
+        hdr_type == IPPROTO_DSTOPTS ||
+        hdr_type == IPPROTO_MH) {
+        return 1;
+    }
+
+    return 0;
+}
+/*
+ * According to
+ * https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml
+ * we expect that there will be no more than 11 extension headers in an IPv6
+ * packet; there are also 27 TLV options for the Destination and Hop-by-hop
+ * extensions.
+ * We need to choose a reasonable maximum number of extensions/options to
+ * check when looking for the ext src/dst.
+ */
+#define IP6_EXTENSIONS_COUNT 11
+#define IP6_OPTIONS_COUNT 30
+
+/*
+ * Walk the chain of IPv6 extension headers starting at *l4_offset (relative
+ * to the network header) whose type is *l4_protocol.
+ *
+ * On return *l4_protocol and *l4_offset describe the first header that
+ * ip6_extension_header_type() does not accept, or the position reached after
+ * IP6_EXTENSIONS_COUNT headers. While walking, @info is updated:
+ *  - a type 2 routing header with one segment left fills in6_ext_dst;
+ *  - a destination-options HAO option fills in6_ext_src;
+ *  - a fragment header sets is_fragmented.
+ *
+ * Returns 0 on success or the non-zero error reported by
+ * bpf_skb_load_bytes_relative().
+ */
+static inline int parse_ipv6_ext(struct __sk_buff *skb,
+ struct packet_hash_info_t *info,
+ __u8 *l4_protocol, size_t *l4_offset)
+{
+ int err = 0;
+
+ /* Nothing to do when the first header is not an extension header. */
+ if (!ip6_extension_header_type(*l4_protocol)) {
+ return 0;
+ }
+
+ struct ipv6_opt_hdr ext_hdr = {};
+
+ /* Bounded loop: at most IP6_EXTENSIONS_COUNT headers are inspected. */
+ for (unsigned int i = 0; i < IP6_EXTENSIONS_COUNT; ++i) {
+
+ /* Generic part of the header: next header type and length. */
+ err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_hdr,
+ sizeof(ext_hdr), BPF_HDR_START_NET);
+ if (err) {
+ goto error;
+ }
+
+ if (*l4_protocol == IPPROTO_ROUTING) {
+ struct ipv6_rt_hdr ext_rt = {};
+
+ err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_rt,
+ sizeof(ext_rt), BPF_HDR_START_NET);
+ if (err) {
+ goto error;
+ }
+
+ /*
+ * Only a type 2 routing header that carries exactly one address
+ * and has one segment left provides the extended destination.
+ */
+ if ((ext_rt.type == IPV6_SRCRT_TYPE_2) &&
+ (ext_rt.hdrlen == sizeof(struct in6_addr) / 8) &&
+ (ext_rt.segments_left == 1)) {
+
+ err = bpf_skb_load_bytes_relative(skb,
+ *l4_offset + offsetof(struct rt2_hdr, addr),
+ &info->in6_ext_dst, sizeof(info->in6_ext_dst),
+ BPF_HDR_START_NET);
+ if (err) {
+ goto error;
+ }
+
+ info->is_ipv6_ext_dst = 1;
+ }
+
+ } else if (*l4_protocol == IPPROTO_DSTOPTS) {
+ /* Type/length prefix shared by the TLV options. */
+ struct ipv6_opt_t {
+ __u8 type;
+ __u8 length;
+ } __attribute__((packed)) opt = {};
+
+ /* Options start right after the generic extension header. */
+ size_t opt_offset = sizeof(ext_hdr);
+
+ /* Bounded scan of at most IP6_OPTIONS_COUNT options for HAO. */
+ for (unsigned int j = 0; j < IP6_OPTIONS_COUNT; ++j) {
+ err = bpf_skb_load_bytes_relative(skb, *l4_offset + opt_offset,
+ &opt, sizeof(opt), BPF_HDR_START_NET);
+ if (err) {
+ goto error;
+ }
+
+ if (opt.type == IPV6_TLV_HAO) {
+ err = bpf_skb_load_bytes_relative(skb,
+ *l4_offset + opt_offset
+ + offsetof(struct ipv6_destopt_hao, addr),
+ &info->in6_ext_src, sizeof(info->in6_ext_src),
+ BPF_HDR_START_NET);
+ if (err) {
+ goto error;
+ }
+
+ info->is_ipv6_ext_src = 1;
+ break;
+ }
+
+ /* Pad1 is a single byte; other options are type + length + data. */
+ opt_offset += (opt.type == IPV6_TLV_PAD1) ?
+ 1 : opt.length + sizeof(opt);
+
+ /*
+ * NOTE(review): the bound here is hdrlen * 8 while the header is
+ * skipped below as (hdrlen + 1) * 8 bytes; confirm that stopping
+ * 8 bytes early is intended.
+ */
+ if (opt_offset + 1 >= ext_hdr.hdrlen * 8) {
+ break;
+ }
+ }
+ } else if (*l4_protocol == IPPROTO_FRAGMENT) {
+ info->is_fragmented = true;
+ }
+
+ /* Advance to the next header: length is (hdrlen + 1) * 8 bytes. */
+ *l4_protocol = ext_hdr.nexthdr;
+ *l4_offset += (ext_hdr.hdrlen + 1) * 8;
+
+ if (!ip6_extension_header_type(ext_hdr.nexthdr)) {
+ return 0;
+ }
+ }
+
+ return 0;
+error:
+ return err;
+}
+
+/*
+ * Return the EtherType of @skb in network byte order, looking past one
+ * 802.1Q tag (4 bytes) or an 802.1ad tag followed by a second tag (8 bytes).
+ * Returns 0 when the bytes cannot be loaded.
+ */
+static __be16 parse_eth_type(struct __sk_buff *skb)
+{
+    /* The EtherType follows the two 6-byte MAC addresses. */
+    unsigned int type_offset = 12;
+    __be16 eth_type = 0;
+    __u16 host_type;
+
+    if (bpf_skb_load_bytes_relative(skb, type_offset, &eth_type,
+                                    sizeof(eth_type), BPF_HDR_START_MAC)) {
+        return 0;
+    }
+
+    host_type = bpf_ntohs(eth_type);
+    if (host_type != ETH_P_8021AD && host_type != ETH_P_8021Q) {
+        return eth_type;
+    }
+
+    /* Skip the VLAN tag(s) and read the encapsulated EtherType. */
+    type_offset += (host_type == ETH_P_8021AD) ? 8 : 4;
+    if (bpf_skb_load_bytes_relative(skb, type_offset, &eth_type,
+                                    sizeof(eth_type), BPF_HDR_START_MAC)) {
+        return 0;
+    }
+
+    return eth_type;
+}
+
+/*
+ * Extract from @skb the fields needed for RSS hashing and store them in
+ * @info: the L3 family, the source/destination addresses, the fragment
+ * state and, for unfragmented TCP/UDP packets, the L4 ports.
+ *
+ * Returns 0 on success, -1 when an argument is NULL or the EtherType cannot
+ * be determined, or the error reported by bpf_skb_load_bytes_relative().
+ * Packets that are neither IPv4 nor IPv6 succeed with no flags set.
+ */
+static inline int parse_packet(struct __sk_buff *skb,
+                               struct packet_hash_info_t *info)
+{
+    size_t l4_offset = 0;
+    __u8 l4_protocol = 0;
+    __u16 l3_protocol;
+    int err;
+
+    if (!info || !skb) {
+        return -1;
+    }
+
+    l3_protocol = bpf_ntohs(parse_eth_type(skb));
+    if (l3_protocol == 0) {
+        return -1;
+    }
+
+    if (l3_protocol == ETH_P_IP) {
+        struct iphdr ip = {};
+
+        info->is_ipv4 = 1;
+
+        err = bpf_skb_load_bytes_relative(skb, 0, &ip, sizeof(ip),
+                                          BPF_HDR_START_NET);
+        if (err) {
+            return err;
+        }
+
+        info->in_src = ip.saddr;
+        info->in_dst = ip.daddr;
+        /*
+         * frag_off carries the flag bits as well as the fragment offset.
+         * Only "more fragments" (0x2000) or a non-zero offset (0x1fff)
+         * identify a fragment; "don't fragment" (0x4000) on its own must
+         * not disable L4 hashing.
+         */
+        info->is_fragmented = !!(ip.frag_off & bpf_htons(0x3fff));
+
+        l4_protocol = ip.protocol;
+        /* ihl counts 32-bit words. */
+        l4_offset = ip.ihl * 4;
+    } else if (l3_protocol == ETH_P_IPV6) {
+        struct ipv6hdr ip6 = {};
+
+        info->is_ipv6 = 1;
+
+        err = bpf_skb_load_bytes_relative(skb, 0, &ip6, sizeof(ip6),
+                                          BPF_HDR_START_NET);
+        if (err) {
+            return err;
+        }
+
+        info->in6_src = ip6.saddr;
+        info->in6_dst = ip6.daddr;
+
+        l4_protocol = ip6.nexthdr;
+        l4_offset = sizeof(ip6);
+
+        /* Skip extension headers; may set the ext addresses/fragment flag. */
+        err = parse_ipv6_ext(skb, info, &l4_protocol, &l4_offset);
+        if (err) {
+            return err;
+        }
+    }
+
+    /* Ports are only meaningful for unfragmented packets. */
+    if (l4_protocol != 0 && !info->is_fragmented) {
+        if (l4_protocol == IPPROTO_TCP) {
+            struct tcphdr tcp = {};
+
+            info->is_tcp = 1;
+
+            err = bpf_skb_load_bytes_relative(skb, l4_offset, &tcp,
+                                              sizeof(tcp), BPF_HDR_START_NET);
+            if (err) {
+                return err;
+            }
+
+            info->src_port = tcp.source;
+            info->dst_port = tcp.dest;
+        } else if (l4_protocol == IPPROTO_UDP) { /* TODO: add udplite? */
+            struct udphdr udp = {};
+
+            info->is_udp = 1;
+
+            err = bpf_skb_load_bytes_relative(skb, l4_offset, &udp,
+                                              sizeof(udp), BPF_HDR_START_NET);
+            if (err) {
+                return err;
+            }
+
+            info->src_port = udp.source;
+            info->dst_port = udp.dest;
+        }
+    }
+
+    return 0;
+}
+
+/* Append the IPv4 source and destination addresses to the hash input. */
+static inline void rss_add_ipv4_addrs(__u8 *rss_input, size_t *bytes_written,
+                                      struct packet_hash_info_t *info)
+{
+    net_rx_rss_add_chunk(rss_input, bytes_written,
+                         &info->in_src, sizeof(info->in_src));
+    net_rx_rss_add_chunk(rss_input, bytes_written,
+                         &info->in_dst, sizeof(info->in_dst));
+}
+
+/*
+ * Append the IPv6 source and destination addresses to the hash input. When
+ * @ext_enabled is non-zero, an extension-provided address replaces the
+ * header address on the side where the packet supplied one.
+ */
+static inline void rss_add_ipv6_addrs(__u8 *rss_input, size_t *bytes_written,
+                                      struct packet_hash_info_t *info,
+                                      __u32 ext_enabled)
+{
+    if (info->is_ipv6_ext_src && ext_enabled) {
+        net_rx_rss_add_chunk(rss_input, bytes_written,
+                             &info->in6_ext_src, sizeof(info->in6_ext_src));
+    } else {
+        net_rx_rss_add_chunk(rss_input, bytes_written,
+                             &info->in6_src, sizeof(info->in6_src));
+    }
+
+    if (info->is_ipv6_ext_dst && ext_enabled) {
+        net_rx_rss_add_chunk(rss_input, bytes_written,
+                             &info->in6_ext_dst, sizeof(info->in6_ext_dst));
+    } else {
+        net_rx_rss_add_chunk(rss_input, bytes_written,
+                             &info->in6_dst, sizeof(info->in6_dst));
+    }
+}
+
+/* Append the L4 source and destination ports to the hash input. */
+static inline void rss_add_ports(__u8 *rss_input, size_t *bytes_written,
+                                 struct packet_hash_info_t *info)
+{
+    net_rx_rss_add_chunk(rss_input, bytes_written,
+                         &info->src_port, sizeof(info->src_port));
+    net_rx_rss_add_chunk(rss_input, bytes_written,
+                         &info->dst_port, sizeof(info->dst_port));
+}
+
+/*
+ * Compute the Toeplitz hash of @skb according to config->hash_types using
+ * the key @toe.
+ *
+ * Returns 0 when the packet cannot be parsed or when no enabled hash type
+ * matches it; otherwise returns the computed hash.
+ */
+static inline __u32 calculate_rss_hash(struct __sk_buff *skb,
+        struct rss_config_t *config, struct toeplitz_key_data_t *toe)
+{
+    __u8 rss_input[HASH_CALCULATION_BUFFER_SIZE] = {};
+    struct packet_hash_info_t packet_info = {};
+    size_t bytes_written = 0;
+    __u32 result = 0;
+
+    if (parse_packet(skb, &packet_info)) {
+        return 0;
+    }
+
+    if (packet_info.is_ipv4) {
+        if ((packet_info.is_tcp &&
+             (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4)) ||
+            (packet_info.is_udp &&
+             (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4))) {
+            /* L4 hash: addresses followed by ports. */
+            rss_add_ipv4_addrs(rss_input, &bytes_written, &packet_info);
+            rss_add_ports(rss_input, &bytes_written, &packet_info);
+        } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) {
+            /* L3-only hash. */
+            rss_add_ipv4_addrs(rss_input, &bytes_written, &packet_info);
+        }
+    } else if (packet_info.is_ipv6) {
+        if (packet_info.is_tcp &&
+            (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6)) {
+            rss_add_ipv6_addrs(rss_input, &bytes_written, &packet_info,
+                    config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX);
+            rss_add_ports(rss_input, &bytes_written, &packet_info);
+        } else if (packet_info.is_udp &&
+                   (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6)) {
+            rss_add_ipv6_addrs(rss_input, &bytes_written, &packet_info,
+                    config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX);
+            rss_add_ports(rss_input, &bytes_written, &packet_info);
+        } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) {
+            rss_add_ipv6_addrs(rss_input, &bytes_written, &packet_info,
+                    config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX);
+        }
+    }
+
+    /* Nothing was selected for hashing: leave the result at 0. */
+    if (bytes_written) {
+        net_toeplitz_add(&result, rss_input, bytes_written, toe);
+    }
+
+    return result;
+}
+
+/*
+ * Steering entry point: return the queue index selected for @skb.
+ *
+ * Returns -1 when the configuration or the Toeplitz key map entry is
+ * missing. Otherwise returns config->default_queue when redirection is
+ * disabled, when the hash is 0, when the indirection table length is 0 or
+ * when the table lookup fails; in all other cases returns the queue stored
+ * in the indirection table at (hash % indirections_len).
+ */
+SEC("tun_rss_steering")
+int tun_rss_steering_prog(struct __sk_buff *skb)
+{
+    struct rss_config_t *config;
+    struct toeplitz_key_data_t *toe;
+    __u32 key = 0;
+    __u32 hash = 0;
+    __u16 indirections_len;
+
+    config = bpf_map_lookup_elem(&tap_rss_map_configurations, &key);
+    toe = bpf_map_lookup_elem(&tap_rss_map_toeplitz_key, &key);
+
+    if (!config || !toe) {
+        return -1;
+    }
+
+    if (!config->redirect) {
+        return config->default_queue;
+    }
+
+    /* Read once so the zero check and the modulo see the same value. */
+    indirections_len = config->indirections_len;
+
+    hash = calculate_rss_hash(skb, config, toe);
+    /* A zero table length would make the modulo below a division by zero. */
+    if (hash && indirections_len) {
+        __u32 table_idx = hash % indirections_len;
+        __u16 *queue;
+
+        queue = bpf_map_lookup_elem(&tap_rss_map_indirection_table,
+                                    &table_idx);
+        if (queue) {
+            return *queue;
+        }
+    }
+
+    return config->default_queue;
+}
+
+char _license[] SEC("license") = "GPL v2";
diff --git a/tools/meson.build b/tools/meson.build
new file mode 100644
index 000000000..1dd3e204d
--- /dev/null
+++ b/tools/meson.build
@@ -0,0 +1,35 @@
+# Default: build virtiofsd only on Linux, with tools enabled, seccomp and
+# libcap-ng available, and vhost-user support configured.
+have_virtiofsd = (targetos == 'linux' and
+ have_tools and
+ seccomp.found() and
+ libcap_ng.found() and
+ 'CONFIG_VHOST_USER' in config_host)
+
+# The 'virtiofsd' option overrides the default: when explicitly enabled, a
+# missing requirement is a configure error; when explicitly disabled (or
+# when system emulation is not built) virtiofsd is skipped.
+if get_option('virtiofsd').enabled()
+ if not have_virtiofsd
+ if targetos != 'linux'
+ error('virtiofsd requires Linux')
+ elif not seccomp.found() or not libcap_ng.found()
+ error('virtiofsd requires libcap-ng-devel and seccomp-devel')
+ elif 'CONFIG_VHOST_USER' not in config_host
+ error('virtiofsd needs vhost-user support')
+ else
+ # Disabled all the tools but virtiofsd.
+ have_virtiofsd = true
+ endif
+ endif
+elif get_option('virtiofsd').disabled() or not have_system
+ have_virtiofsd = false
+endif
+
+if have_virtiofsd
+ subdir('virtiofsd')
+endif
+
+# vhost-user-rng is built when system emulation and tools are enabled and
+# CONFIG_LINUX is set in config_host.
+have_virtiorng = (have_system and
+ have_tools and
+ 'CONFIG_LINUX' in config_host)
+
+if have_virtiorng
+ subdir('vhost-user-rng')
+endif
+
diff --git a/tools/vhost-user-rng/50-qemu-rng.json.in b/tools/vhost-user-rng/50-qemu-rng.json.in
new file mode 100644
index 000000000..64198f163
--- /dev/null
+++ b/tools/vhost-user-rng/50-qemu-rng.json.in
@@ -0,0 +1,5 @@
+{
+ "description": "QEMU vhost-user-rng",
+ "type": "bridge",
+ "binary": "@libexecdir@/vhost-user-rng"
+}
diff --git a/tools/vhost-user-rng/main.c b/tools/vhost-user-rng/main.c
new file mode 100644
index 000000000..4823bce18
--- /dev/null
+++ b/tools/vhost-user-rng/main.c
@@ -0,0 +1,407 @@
+/*
+ * VIRTIO RNG Emulation via vhost-user
+ *
+ * Copyright (c) 2021 Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#define G_LOG_DOMAIN "vhost-user-rng"
+#define G_LOG_USE_STRUCTURED 1
+
+#include <glib.h>
+#include <gio/gio.h>
+#include <gio/gunixsocketaddress.h>
+#include <glib-unix.h>
+#include <glib/gstdio.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <string.h>
+#include <inttypes.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+#include <endian.h>
+#include <assert.h>
+
+#include "qemu/cutils.h"
+#include "subprojects/libvhost-user/libvhost-user-glib.h"
+#include "subprojects/libvhost-user/libvhost-user.h"
+
+#ifndef container_of
+/*
+ * Given @ptr, a pointer to the field @member inside an object of type
+ * @type, return a pointer to the enclosing object. The temporary __mptr
+ * makes the compiler check that @ptr has the type of @member.
+ */
+#define container_of(ptr, type, member) ({ \
+ const typeof(((type *) 0)->member) * __mptr = (ptr); \
+ (type *) ((char *) __mptr - offsetof(type, member)); })
+#endif
+
+/* State of one vhost-user RNG backend instance. */
+typedef struct {
+ /* libvhost-user-glib device; dev.parent is the VuDev given to callbacks. */
+ VugDev dev;
+ /* One-shot expiry value derived from period_ms in setup_timer(). */
+ struct itimerspec ts;
+ /* Timer whose expiry runs check_rate_limit(). */
+ timer_t rate_limit_timer;
+ /* Held while quota_remaining and activate_timer are used by the handlers. */
+ pthread_mutex_t rng_mutex;
+ /* Signalled by check_rate_limit() when the quota had reached zero. */
+ pthread_cond_t rng_cond;
+ /* Bytes that may still be handed to the guest in the current period. */
+ int64_t quota_remaining;
+ /* True when the timer has to be armed before serving the next request. */
+ bool activate_timer;
+ /* Main loop run by main() and stopped by the quit paths. */
+ GMainLoop *loop;
+} VuRNG;
+
+/* Command line state, filled in by the option parser (see options[]). */
+static gboolean print_cap, verbose;
+static gchar *source_path, *socket_path;
+/* source_fd is the opened RNG source; socket_fd stays -1 unless --fd is given. */
+static gint source_fd, socket_fd = -1;
+
+/* Defaults tailored on virtio-rng.c */
+/* Length of one rate-limit period in milliseconds. */
+static uint32_t period_ms = 1 << 16;
+/* Number of bytes that may be transferred per period. */
+static uint64_t max_bytes = INT64_MAX;
+
+/*
+ * Timer expiry callback: start a new rate-limit period.
+ *
+ * Refills the quota, asks vu_rng_handle_requests() to re-arm the timer
+ * before it serves the next request and, if the quota had been used up
+ * (so the request handler may be waiting on us), wakes it up.
+ */
+static void check_rate_limit(union sigval sv)
+{
+    VuRNG *rng = sv.sival_ptr;
+    bool quota_was_exhausted;
+
+    pthread_mutex_lock(&rng->rng_mutex);
+    /* Remember whether the handler may be blocked before refilling. */
+    quota_was_exhausted = (rng->quota_remaining == 0);
+    rng->quota_remaining = max_bytes;
+    rng->activate_timer = true;
+    pthread_mutex_unlock(&rng->rng_mutex);
+
+    if (quota_was_exhausted) {
+        pthread_cond_signal(&rng->rng_cond);
+    }
+}
+
+/*
+ * Prepare the rate-limit timer of @rng.
+ *
+ * Fills rng->ts with a one-shot expiry of period_ms milliseconds and
+ * creates (but does not arm) a CLOCK_MONOTONIC timer whose expiry calls
+ * check_rate_limit() with @rng as its argument. A timer_create() failure
+ * is reported on stderr.
+ */
+static void setup_timer(VuRNG *rng)
+{
+    struct sigevent sev;
+    int ret;
+
+    memset(&rng->ts, 0, sizeof(struct itimerspec));
+    rng->ts.it_value.tv_sec = period_ms / 1000;
+    rng->ts.it_value.tv_nsec = (period_ms % 1000) * 1000000;
+
+    /* Do not hand uninitialized fields to timer_create(). */
+    memset(&sev, 0, sizeof(sev));
+
+    /*
+     * Call function check_rate_limit() as if it was the start of
+     * a new thread when the timer expires.
+     */
+    sev.sigev_notify = SIGEV_THREAD;
+    sev.sigev_notify_function = check_rate_limit;
+    sev.sigev_value.sival_ptr = rng;
+    /* Needs to be NULL if default attributes are to be used. */
+    sev.sigev_notify_attributes = NULL;
+
+    ret = timer_create(CLOCK_MONOTONIC, &sev, &rng->rate_limit_timer);
+    if (ret < 0) {
+        fprintf(stderr, "timer_create() failed\n");
+    }
+}
+
+
+/* Virtio helpers */
+/*
+ * get_features callback: this backend advertises no device-specific
+ * feature bits. Logs the call when running verbosely.
+ */
+static uint64_t rng_get_features(VuDev *dev)
+{
+    const uint64_t features = 0;
+
+    if (verbose) {
+        g_info("%s: replying", __func__);
+    }
+
+    return features;
+}
+
+/*
+ * set_features callback: no feature is handled by this backend, so any
+ * non-zero @features value is only logged (when running verbosely).
+ */
+static void rng_set_features(VuDev *dev, uint64_t features)
+{
+    if (!verbose || !features) {
+        return;
+    }
+
+    g_autoptr(GString) report = g_string_new("Requested un-handled feature");
+
+    g_string_append_printf(report, " 0x%" PRIx64 "", features);
+    g_info("%s: %s", __func__, report->str);
+}
+
+/*
+ * Queue handler: serve every pending request on queue @qidx of @dev.
+ *
+ * For each popped element, up to in_sg[0].iov_len bytes are read from
+ * source_fd into the first input buffer, limited by the quota left in the
+ * current rate-limit period. When the quota is used up the function blocks
+ * until check_rate_limit() starts a new period. A failed read completes the
+ * request with a length of 0. The guest is notified once all elements have
+ * been processed.
+ */
+static void vu_rng_handle_requests(VuDev *dev, int qidx)
+{
+    VuRNG *rng = container_of(dev, VuRNG, dev.parent);
+    VuVirtq *vq = vu_get_queue(dev, qidx);
+    VuVirtqElement *elem;
+    size_t to_read;
+    ssize_t len;
+    int ret;
+
+    if (verbose) {
+        g_info("%s: processing queue %d", __func__, qidx);
+    }
+
+    for (;;) {
+        /* Get element in the vhost virtqueue */
+        elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement));
+        if (!elem) {
+            break;
+        }
+
+        /* Without an input buffer there is nowhere to put entropy. */
+        if (elem->in_num == 0) {
+            vu_queue_push(dev, vq, elem, 0);
+            free(elem);
+            continue;
+        }
+
+        if (verbose) {
+            g_info("%s: in_sg[0].iov_base: %p", __func__,
+                   elem->in_sg[0].iov_base);
+        }
+
+        /* Get the amount of entropy to read from the vhost server */
+        to_read = elem->in_sg[0].iov_len;
+
+        pthread_mutex_lock(&rng->rng_mutex);
+
+        /*
+         * We have consumed all entropy available for this time slice.
+         * Wait for the timer (check_rate_limit()) to tell us about the
+         * start of a new time slice. The predicate is re-checked in a
+         * loop because pthread_cond_wait() may wake up spuriously.
+         */
+        while (rng->quota_remaining == 0) {
+            pthread_cond_wait(&rng->rng_cond, &rng->rng_mutex);
+        }
+
+        /* Start the timer if the last time slice has expired */
+        if (rng->activate_timer) {
+            rng->activate_timer = false;
+            ret = timer_settime(rng->rate_limit_timer, 0, &rng->ts, NULL);
+            if (ret < 0) {
+                fprintf(stderr, "timer_settime() failed\n");
+            }
+        }
+
+        /* Make sure we don't read more than what is available */
+        if ((uint64_t)rng->quota_remaining < to_read) {
+            to_read = rng->quota_remaining;
+        }
+
+        len = read(source_fd, elem->in_sg[0].iov_base, to_read);
+
+        /* Simply return 0 if an error occurs */
+        if (len < 0) {
+            len = 0;
+        }
+
+        rng->quota_remaining -= len;
+
+        pthread_mutex_unlock(&rng->rng_mutex);
+
+        vu_queue_push(dev, vq, elem, len);
+        free(elem);
+    }
+
+    vu_queue_notify(dev, vq);
+}
+
+/*
+ * queue_set_started callback: install the request handler on queue 0 when
+ * it is started and remove it when it is stopped. Other queue indexes are
+ * only logged.
+ */
+static void
+vu_rng_queue_set_started(VuDev *dev, int qidx, bool started)
+{
+    VuVirtq *queue = vu_get_queue(dev, qidx);
+
+    g_debug("queue started %d:%d\n", qidx, started);
+
+    if (qidx != 0) {
+        return;
+    }
+
+    if (started) {
+        vu_set_queue_handler(dev, queue, vu_rng_handle_requests);
+    } else {
+        vu_set_queue_handler(dev, queue, NULL);
+    }
+}
+
+/*
+ * process_msg callback. Only VHOST_USER_NONE is handled here: it stops the
+ * main loop and 1 is returned. For every other request 0 is returned and
+ * the message is left to the libvhost library itself.
+ */
+static int rng_process_msg(VuDev *dev, VhostUserMsg *msg, int *do_reply)
+{
+    if (msg->request != VHOST_USER_NONE) {
+        return 0;
+    }
+
+    VuRNG *rng = container_of(dev, VuRNG, dev.parent);
+
+    g_main_loop_quit(rng->loop);
+    return 1;
+}
+
+/* Device callbacks handed to vug_init() in main(). */
+static const VuDevIface vuiface = {
+ .set_features = rng_set_features,
+ .get_features = rng_get_features,
+ .queue_set_started = vu_rng_queue_set_started,
+ .process_msg = rng_process_msg,
+};
+
+/*
+ * Signal source callback: report the signal and stop the main loop passed
+ * in @user_data. Returns true, so the source stays installed.
+ */
+static gboolean hangup(gpointer user_data)
+{
+    g_printerr("%s: caught hangup/quit signal, quitting", __func__);
+    g_main_loop_quit((GMainLoop *) user_data);
+
+    return true;
+}
+
+/*
+ * Panic callback given to vug_init(): log @msg as critical and terminate
+ * the process with a failure status. @dev is not used.
+ */
+static void panic(VuDev *dev, const char *msg)
+{
+    (void) dev;
+
+    g_critical("%s\n", msg);
+    exit(EXIT_FAILURE);
+}
+
+/*
+ * Print vhost-user.json backend program capabilities to stdout.
+ *
+ * The members are separated by a comma and the array element is quoted so
+ * that the output is well-formed JSON.
+ */
+static void print_capabilities(void)
+{
+    printf("{\n");
+    printf(" \"type\": \"RNG\",\n");
+    printf(" \"filename\": [ \"RNG source\" ]\n");
+    printf("}\n");
+}
+
+/*
+ * Command line options. Each entry stores its parsed value directly in the
+ * file-scope variable whose address it lists.
+ * NOTE(review): "period" is parsed as G_OPTION_ARG_INT but stored in the
+ * uint32_t period_ms, and "max-bytes" as G_OPTION_ARG_INT64 into the
+ * uint64_t max_bytes - negative inputs end up as large unsigned values.
+ */
+static GOptionEntry options[] = {
+ { "socket-path", 's', 0, G_OPTION_ARG_FILENAME, &socket_path,
+ "Location of vhost-user Unix domain socket, incompatible with --fd",
+ "PATH" },
+ { "fd", 'f', 0, G_OPTION_ARG_INT, &socket_fd,
+ "Specify the backend file-descriptor, incompatible with --socket-path",
+ "FD" },
+ { "period", 'p', 0, G_OPTION_ARG_INT, &period_ms,
+ "Time needed (in ms) to transfer a maximum amount of byte", NULL },
+ { "max-bytes", 'm', 0, G_OPTION_ARG_INT64, &max_bytes,
+ "Maximum amount of byte that can be transferred in a period", NULL },
+ { "filename", 'n', 0, G_OPTION_ARG_FILENAME, &source_path,
+ "RNG source, defaults to /dev/urandom", "PATH" },
+ { "print-capabilities", 'c', 0, G_OPTION_ARG_NONE, &print_cap,
+ "Output to stdout the backend capabilities in JSON format and exit",
+ NULL},
+ { "verbose", 'v', 0, G_OPTION_ARG_NONE, &verbose,
+ "Be more verbose in output", NULL},
+ /* Terminating entry. */
+ { NULL }
+};
+
+/*
+ * Entry point of the vhost-user RNG backend.
+ *
+ * Parses the command line, obtains the vhost-user connection (either by
+ * listening on --socket-path and accepting one client, or by wrapping the
+ * descriptor given with --fd), opens the RNG source, initializes
+ * libvhost-user-glib and the rate limiter, and runs the GLib main loop
+ * until it is stopped by SIGHUP/SIGINT or by rng_process_msg().
+ *
+ * Every setup failure is reported on stderr and terminates the process
+ * with a failure status. Returns 0 after a clean shutdown.
+ */
+int main(int argc, char *argv[])
+{
+    GError *error = NULL;
+    GOptionContext *context;
+    g_autoptr(GSocket) socket = NULL;
+    char default_source[] = "/dev/urandom";
+    char *source = default_source;
+    VuRNG rng;
+
+    context = g_option_context_new("vhost-user emulation of RNG device");
+    g_option_context_add_main_entries(context, options, "vhost-user-rng");
+    if (!g_option_context_parse(context, &argc, &argv, &error)) {
+        g_printerr("option parsing failed: %s\n", error->message);
+        exit(1);
+    }
+
+    if (print_cap) {
+        print_capabilities();
+        exit(0);
+    }
+
+    if (!socket_path && socket_fd < 0) {
+        g_printerr("Please specify either --fd or --socket-path\n");
+        exit(EXIT_FAILURE);
+    }
+
+    /* Descriptor 0 is a valid value for --fd, hence ">= 0". */
+    if (socket_path && socket_fd >= 0) {
+        g_printerr("Either --fd or --socket-path, not both\n");
+        exit(EXIT_FAILURE);
+    }
+
+    if (max_bytes > INT64_MAX) {
+        g_printerr("'max-bytes' parameter must be non-negative, "
+                   "and less than 2^63\n");
+        exit(EXIT_FAILURE);
+    }
+
+    /*
+     * The option is parsed as a signed int into an unsigned variable, so
+     * a negative value shows up here as a value above INT32_MAX.
+     */
+    if (period_ms == 0 || period_ms > INT32_MAX) {
+        g_printerr("'period' parameter expects a positive integer\n");
+        exit(EXIT_FAILURE);
+    }
+
+    /*
+     * Now create a vhost-user socket that we will receive messages
+     * on. Once we have our handler set up we can enter the glib main
+     * loop.
+     */
+    if (socket_path) {
+        g_autoptr(GSocketAddress) addr = g_unix_socket_address_new(socket_path);
+        g_autoptr(GSocket) bind_socket = g_socket_new(G_SOCKET_FAMILY_UNIX,
+                                                      G_SOCKET_TYPE_STREAM,
+                                                      G_SOCKET_PROTOCOL_DEFAULT,
+                                                      &error);
+
+        if (!bind_socket) {
+            g_printerr("Failed to create socket (%s).\n", error->message);
+            exit(EXIT_FAILURE);
+        }
+        if (!g_socket_bind(bind_socket, addr, false, &error)) {
+            g_printerr("Failed to bind to socket at %s (%s).\n",
+                       socket_path, error->message);
+            exit(EXIT_FAILURE);
+        }
+        if (!g_socket_listen(bind_socket, &error)) {
+            g_printerr("Failed to listen on socket %s (%s).\n",
+                       socket_path, error->message);
+            unlink(socket_path);
+            exit(EXIT_FAILURE);
+        }
+        g_message("awaiting connection to %s", socket_path);
+        socket = g_socket_accept(bind_socket, NULL, &error);
+        if (!socket) {
+            g_printerr("Failed to accept on socket %s (%s).\n",
+                       socket_path, error->message);
+            unlink(socket_path);
+            exit(EXIT_FAILURE);
+        }
+    } else {
+        socket = g_socket_new_from_fd(socket_fd, &error);
+        if (!socket) {
+            g_printerr("Failed to connect to FD %d (%s).\n",
+                       socket_fd, error->message);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    /* Overwrite default RNG source with what user provided, if any */
+    if (source_path) {
+        source = source_path;
+    }
+
+    /* The source is only ever read from, so read-only access suffices. */
+    source_fd = open(source, O_RDONLY);
+    if (source_fd < 0) {
+        g_printerr("Failed to open RNG source %s\n", source);
+        g_socket_close(socket, &error);
+        if (socket_path) {
+            unlink(socket_path);
+        }
+        exit(EXIT_FAILURE);
+    }
+
+    /*
+     * Create the main loop first so all the various sources can be
+     * added. As well as catching signals we need to ensure vug_init
+     * can add its GSource watches.
+     */
+    rng.loop = g_main_loop_new(NULL, FALSE);
+
+    /* catch exit signals */
+    g_unix_signal_add(SIGHUP, hangup, rng.loop);
+    g_unix_signal_add(SIGINT, hangup, rng.loop);
+
+    if (!vug_init(&rng.dev, 1, g_socket_get_fd(socket),
+                  panic, &vuiface)) {
+        g_printerr("Failed to initialize libvhost-user-glib.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    rng.quota_remaining = max_bytes;
+    rng.activate_timer = true;
+    pthread_mutex_init(&rng.rng_mutex, NULL);
+    pthread_cond_init(&rng.rng_cond, NULL);
+    setup_timer(&rng);
+
+    if (verbose) {
+        g_info("period_ms: %" PRIu32 " tv_sec: %ld tv_nsec: %ld\n",
+               period_ms, (long)rng.ts.it_value.tv_sec,
+               (long)rng.ts.it_value.tv_nsec);
+    }
+
+    g_message("entering main loop, awaiting messages");
+    g_main_loop_run(rng.loop);
+    g_message("finished main loop, cleaning up");
+
+    g_main_loop_unref(rng.loop);
+    vug_deinit(&rng.dev);
+    timer_delete(rng.rate_limit_timer);
+    close(source_fd);
+    /* In --fd mode there is no socket file of ours to remove. */
+    if (socket_path) {
+        unlink(socket_path);
+    }
+
+    return 0;
+}
diff --git a/tools/vhost-user-rng/meson.build b/tools/vhost-user-rng/meson.build
new file mode 100644
index 000000000..4bcc4ad87
--- /dev/null
+++ b/tools/vhost-user-rng/meson.build
@@ -0,0 +1,10 @@
+# Build the vhost-user-rng backend from main.c and install it into
+# libexecdir.
+executable('vhost-user-rng', files(
+ 'main.c'),
+ dependencies: [qemuutil, glib, gio, rt],
+ install: true,
+ install_dir: get_option('libexecdir'))
+
+# Generate the backend description (its "binary" entry uses @libexecdir@)
+# and install it into the 'vhost-user' directory under qemu_datadir.
+configure_file(input: '50-qemu-rng.json.in',
+ output: '50-qemu-rng.json',
+ configuration: config_host,
+ install_dir: qemu_datadir / 'vhost-user')
diff --git a/tools/virtiofsd/50-qemu-virtiofsd.json.in b/tools/virtiofsd/50-qemu-virtiofsd.json.in
new file mode 100644
index 000000000..9bcd86f8d
--- /dev/null
+++ b/tools/virtiofsd/50-qemu-virtiofsd.json.in
@@ -0,0 +1,5 @@
+{
+ "description": "QEMU virtiofsd vhost-user-fs",
+ "type": "fs",
+ "binary": "@libexecdir@/virtiofsd"
+}
diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c
new file mode 100644
index 000000000..b5f04be35
--- /dev/null
+++ b/tools/virtiofsd/buffer.c
@@ -0,0 +1,350 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2010 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * Functions for dealing with `struct fuse_buf` and `struct
+ * fuse_bufvec`.
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#include "qemu/osdep.h"
+#include "fuse_i.h"
+#include "fuse_lowlevel.h"
+
+/*
+ * Return the total number of bytes described by the buffers in @bufv.
+ *
+ * A buffer whose size is SIZE_MAX makes the total SIZE_MAX. The result
+ * saturates there: sizes of any later buffers are not added on top,
+ * which would wrap the sum around to a small value.
+ */
+size_t fuse_buf_size(const struct fuse_bufvec *bufv)
+{
+    size_t i;
+    size_t size = 0;
+
+    for (i = 0; i < bufv->count; i++) {
+        /* Stop at the sentinel; SIZE_MAX + anything would overflow. */
+        if (bufv->buf[i].size == SIZE_MAX) {
+            return SIZE_MAX;
+        }
+        size += bufv->buf[i].size;
+    }
+
+    return size;
+}
+
+static ssize_t fuse_buf_writev(struct fuse_buf *out_buf,
+ struct fuse_bufvec *in_buf)
+{
+ ssize_t res, i, j;
+ size_t iovcnt = in_buf->count;
+ struct iovec *iov;
+ int fd = out_buf->fd;
+
+ iov = g_try_new0(struct iovec, iovcnt);
+ if (!iov) {
+ return -ENOMEM;
+ }
+
+ for (i = 0, j = 0; i < iovcnt; i++) {
+ /* Skip the buf with 0 size */
+ if (in_buf->buf[i].size) {
+ iov[j].iov_base = in_buf->buf[i].mem;
+ iov[j].iov_len = in_buf->buf[i].size;
+ j++;
+ }
+ }
+
+ if (out_buf->flags & FUSE_BUF_FD_SEEK) {
+ res = pwritev(fd, iov, iovcnt, out_buf->pos);
+ } else {
+ res = writev(fd, iov, iovcnt);
+ }
+
+ if (res == -1) {
+ res = -errno;
+ }
+
+ g_free(iov);
+ return res;
+}
+
+static size_t min_size(size_t s1, size_t s2)
+{
+ return s1 < s2 ? s1 : s2;
+}
+
+static ssize_t fuse_buf_write(const struct fuse_buf *dst, size_t dst_off,
+ const struct fuse_buf *src, size_t src_off,
+ size_t len)
+{
+ ssize_t res = 0;
+ size_t copied = 0;
+
+ while (len) {
+ if (dst->flags & FUSE_BUF_FD_SEEK) {
+ res = pwrite(dst->fd, (char *)src->mem + src_off, len,
+ dst->pos + dst_off);
+ } else {
+ res = write(dst->fd, (char *)src->mem + src_off, len);
+ }
+ if (res == -1) {
+ if (!copied) {
+ return -errno;
+ }
+ break;
+ }
+ if (res == 0) {
+ break;
+ }
+
+ copied += res;
+ if (!(dst->flags & FUSE_BUF_FD_RETRY)) {
+ break;
+ }
+
+ src_off += res;
+ dst_off += res;
+ len -= res;
+ }
+
+ return copied;
+}
+
+static ssize_t fuse_buf_read(const struct fuse_buf *dst, size_t dst_off,
+ const struct fuse_buf *src, size_t src_off,
+ size_t len)
+{
+ ssize_t res = 0;
+ size_t copied = 0;
+
+ while (len) {
+ if (src->flags & FUSE_BUF_FD_SEEK) {
+ res = pread(src->fd, (char *)dst->mem + dst_off, len,
+ src->pos + src_off);
+ } else {
+ res = read(src->fd, (char *)dst->mem + dst_off, len);
+ }
+ if (res == -1) {
+ if (!copied) {
+ return -errno;
+ }
+ break;
+ }
+ if (res == 0) {
+ break;
+ }
+
+ copied += res;
+ if (!(src->flags & FUSE_BUF_FD_RETRY)) {
+ break;
+ }
+
+ dst_off += res;
+ src_off += res;
+ len -= res;
+ }
+
+ return copied;
+}
+
+static ssize_t fuse_buf_fd_to_fd(const struct fuse_buf *dst, size_t dst_off,
+ const struct fuse_buf *src, size_t src_off,
+ size_t len)
+{
+ char buf[4096];
+ struct fuse_buf tmp = {
+ .size = sizeof(buf),
+ .flags = 0,
+ };
+ ssize_t res;
+ size_t copied = 0;
+
+ tmp.mem = buf;
+
+ while (len) {
+ size_t this_len = min_size(tmp.size, len);
+ size_t read_len;
+
+ res = fuse_buf_read(&tmp, 0, src, src_off, this_len);
+ if (res < 0) {
+ if (!copied) {
+ return res;
+ }
+ break;
+ }
+ if (res == 0) {
+ break;
+ }
+
+ read_len = res;
+ res = fuse_buf_write(dst, dst_off, &tmp, 0, read_len);
+ if (res < 0) {
+ if (!copied) {
+ return res;
+ }
+ break;
+ }
+ if (res == 0) {
+ break;
+ }
+
+ copied += res;
+
+ if (res < this_len) {
+ break;
+ }
+
+ dst_off += res;
+ src_off += res;
+ len -= res;
+ }
+
+ return copied;
+}
+
+static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off,
+ const struct fuse_buf *src, size_t src_off,
+ size_t len)
+{
+ int src_is_fd = src->flags & FUSE_BUF_IS_FD;
+ int dst_is_fd = dst->flags & FUSE_BUF_IS_FD;
+
+ if (!src_is_fd && !dst_is_fd) {
+ char *dstmem = (char *)dst->mem + dst_off;
+ char *srcmem = (char *)src->mem + src_off;
+
+ if (dstmem != srcmem) {
+ if (dstmem + len <= srcmem || srcmem + len <= dstmem) {
+ memcpy(dstmem, srcmem, len);
+ } else {
+ memmove(dstmem, srcmem, len);
+ }
+ }
+
+ return len;
+ } else if (!src_is_fd) {
+ return fuse_buf_write(dst, dst_off, src, src_off, len);
+ } else if (!dst_is_fd) {
+ return fuse_buf_read(dst, dst_off, src, src_off, len);
+ } else {
+ return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len);
+ }
+}
+
+static const struct fuse_buf *fuse_bufvec_current(struct fuse_bufvec *bufv)
+{
+ if (bufv->idx < bufv->count) {
+ return &bufv->buf[bufv->idx];
+ } else {
+ return NULL;
+ }
+}
+
+static int fuse_bufvec_advance(struct fuse_bufvec *bufv, size_t len)
+{
+ const struct fuse_buf *buf = fuse_bufvec_current(bufv);
+
+ if (!buf) {
+ return 0;
+ }
+
+ bufv->off += len;
+ assert(bufv->off <= buf->size);
+ if (bufv->off == buf->size) {
+ assert(bufv->idx < bufv->count);
+ bufv->idx++;
+ if (bufv->idx == bufv->count) {
+ return 0;
+ }
+ bufv->off = 0;
+ }
+ return 1;
+}
+
+ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv)
+{
+ size_t copied = 0, i;
+
+ if (dstv == srcv) {
+ return fuse_buf_size(dstv);
+ }
+
+ /*
+ * use writev to improve bandwidth when all the
+ * src buffers are already mapped by the daemon
+ * process
+ */
+ for (i = 0; i < srcv->count; i++) {
+ if (srcv->buf[i].flags & FUSE_BUF_IS_FD) {
+ break;
+ }
+ }
+ if ((i == srcv->count) && (dstv->count == 1) &&
+ (dstv->idx == 0) &&
+ (dstv->buf[0].flags & FUSE_BUF_IS_FD)) {
+ dstv->buf[0].pos += dstv->off;
+ return fuse_buf_writev(&dstv->buf[0], srcv);
+ }
+
+ for (;;) {
+ const struct fuse_buf *src = fuse_bufvec_current(srcv);
+ const struct fuse_buf *dst = fuse_bufvec_current(dstv);
+ size_t src_len;
+ size_t dst_len;
+ size_t len;
+ ssize_t res;
+
+ if (src == NULL || dst == NULL) {
+ break;
+ }
+
+ src_len = src->size - srcv->off;
+ dst_len = dst->size - dstv->off;
+ len = min_size(src_len, dst_len);
+
+ res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len);
+ if (res < 0) {
+ if (!copied) {
+ return res;
+ }
+ break;
+ }
+ copied += res;
+
+ if (!fuse_bufvec_advance(srcv, res) ||
+ !fuse_bufvec_advance(dstv, res)) {
+ break;
+ }
+
+ if (res < len) {
+ break;
+ }
+ }
+
+ return copied;
+}
+
+void *fuse_mbuf_iter_advance(struct fuse_mbuf_iter *iter, size_t len)
+{
+ void *ptr;
+
+ if (len > iter->size - iter->pos) {
+ return NULL;
+ }
+
+ ptr = iter->mem + iter->pos;
+ iter->pos += len;
+ return ptr;
+}
+
+const char *fuse_mbuf_iter_advance_str(struct fuse_mbuf_iter *iter)
+{
+ const char *str = iter->mem + iter->pos;
+ size_t remaining = iter->size - iter->pos;
+ size_t i;
+
+ for (i = 0; i < remaining; i++) {
+ if (str[i] == '\0') {
+ iter->pos += i + 1;
+ return str;
+ }
+ }
+ return NULL;
+}
diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h
new file mode 100644
index 000000000..0c2665b97
--- /dev/null
+++ b/tools/virtiofsd/fuse_common.h
@@ -0,0 +1,832 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB.
+ */
+
+/** @file */
+
+#if !defined(FUSE_H_) && !defined(FUSE_LOWLEVEL_H_)
+#error \
+ "Never include <fuse_common.h> directly; use <fuse.h> or <fuse_lowlevel.h> instead."
+#endif
+
+#ifndef FUSE_COMMON_H_
+#define FUSE_COMMON_H_
+
+#include "fuse_log.h"
+#include "fuse_opt.h"
+
+/** Major version of FUSE library interface */
+#define FUSE_MAJOR_VERSION 3
+
+/** Minor version of FUSE library interface */
+#define FUSE_MINOR_VERSION 2
+
+#define FUSE_MAKE_VERSION(maj, min) ((maj) * 10 + (min))
+#define FUSE_VERSION FUSE_MAKE_VERSION(FUSE_MAJOR_VERSION, FUSE_MINOR_VERSION)
+
+/**
+ * Information about an open file.
+ *
+ * File Handles are created by the open, opendir, and create methods and closed
+ * by the release and releasedir methods. Multiple file handles may be
+ * concurrently open for the same file. Generally, a client will create one
+ * file handle per file descriptor, though in some cases multiple file
+ * descriptors can share a single file handle.
+ */
+struct fuse_file_info {
+ /** Open flags. Available in open() and release() */
+ int flags;
+
+ /*
+ * In case of a write operation indicates if this was caused
+ * by a delayed write from the page cache. If so, then the
+ * context's pid, uid, and gid fields will not be valid, and
+ * the *fh* value may not match the *fh* value that would
+ * have been sent with the corresponding individual write
+ * requests if write caching had been disabled.
+ */
+ unsigned int writepage:1;
+
+ /** Can be filled in by open, to use direct I/O on this file. */
+ unsigned int direct_io:1;
+
+ /*
+ * Can be filled in by open. It signals the kernel that any
+ * currently cached file data (i.e., data that the filesystem
+ * provided the last time the file was open) need not be
+ * invalidated. Has no effect when set in other contexts (in
+ * particular it does nothing when set by opendir()).
+ */
+ unsigned int keep_cache:1;
+
+ /*
+ * Indicates a flush operation. Set in flush operation, also
+ * maybe set in highlevel lock operation and lowlevel release
+ * operation.
+ */
+ unsigned int flush:1;
+
+ /*
+ * Can be filled in by open, to indicate that the file is not
+ * seekable.
+ */
+ unsigned int nonseekable:1;
+
+ /*
+ * Indicates that flock locks for this file should be
+ * released. If set, lock_owner shall contain a valid value.
+ * May only be set in ->release().
+ */
+ unsigned int flock_release:1;
+
+ /*
+ * Can be filled in by opendir. It signals the kernel to
+ * enable caching of entries returned by readdir(). Has no
+ * effect when set in other contexts (in particular it does
+ * nothing when set by open()).
+ */
+ unsigned int cache_readdir:1;
+
+ /* Indicates that suid/sgid bits should be removed upon write */
+ unsigned int kill_priv:1;
+
+
+ /** Padding. Reserved for future use. */
+ unsigned int padding:24;
+ unsigned int padding2:32;
+
+ /*
+ * File handle id. May be filled in by filesystem in create,
+ * open, and opendir(). Available in most other file operations on the
+ * same file handle.
+ */
+ uint64_t fh;
+
+ /** Lock owner id. Available in locking operations and flush */
+ uint64_t lock_owner;
+
+ /*
+ * Requested poll events. Available in ->poll. Only set on kernels
+ * which support it. If unsupported, this field is set to zero.
+ */
+ uint32_t poll_events;
+};
+
+/*
+ * Capability bits for 'fuse_conn_info.capable' and 'fuse_conn_info.want'
+ */
+
+/**
+ * Indicates that the filesystem supports asynchronous read requests.
+ *
+ * If this capability is not requested/available, the kernel will
+ * ensure that there is at most one pending read request per
+ * file-handle at any time, and will attempt to order read requests by
+ * increasing offset.
+ *
+ * This feature is enabled by default when supported by the kernel.
+ */
+#define FUSE_CAP_ASYNC_READ (1 << 0)
+
+/**
+ * Indicates that the filesystem supports "remote" locking.
+ *
+ * This feature is enabled by default when supported by the kernel,
+ * and if getlk() and setlk() handlers are implemented.
+ */
+#define FUSE_CAP_POSIX_LOCKS (1 << 1)
+
+/**
+ * Indicates that the filesystem supports the O_TRUNC open flag. If
+ * disabled, and an application specifies O_TRUNC, fuse first calls
+ * truncate() and then open() with O_TRUNC filtered out.
+ *
+ * This feature is enabled by default when supported by the kernel.
+ */
+#define FUSE_CAP_ATOMIC_O_TRUNC (1 << 3)
+
+/**
+ * Indicates that the filesystem supports lookups of "." and "..".
+ *
+ * This feature is disabled by default.
+ */
+#define FUSE_CAP_EXPORT_SUPPORT (1 << 4)
+
+/**
+ * Indicates that the kernel should not apply the umask to the
+ * file mode on create operations.
+ *
+ * This feature is disabled by default.
+ */
+#define FUSE_CAP_DONT_MASK (1 << 6)
+
+/**
+ * Indicates that libfuse should try to use splice() when writing to
+ * the fuse device. This may improve performance.
+ *
+ * This feature is disabled by default.
+ */
+#define FUSE_CAP_SPLICE_WRITE (1 << 7)
+
+/**
+ * Indicates that libfuse should try to move pages instead of copying when
+ * writing to / reading from the fuse device. This may improve performance.
+ *
+ * This feature is disabled by default.
+ */
+#define FUSE_CAP_SPLICE_MOVE (1 << 8)
+
+/**
+ * Indicates that libfuse should try to use splice() when reading from
+ * the fuse device. This may improve performance.
+ *
+ * This feature is enabled by default when supported by the kernel and
+ * if the filesystem implements a write_buf() handler.
+ */
+#define FUSE_CAP_SPLICE_READ (1 << 9)
+
+/**
+ * If set, the calls to flock(2) will be emulated using POSIX locks and must
+ * then be handled by the filesystem's setlock() handler.
+ *
+ * If not set, flock(2) calls will be handled by the FUSE kernel module
+ * internally (so any access that does not go through the kernel cannot be taken
+ * into account).
+ *
+ * This feature is enabled by default when supported by the kernel and
+ * if the filesystem implements a flock() handler.
+ */
+#define FUSE_CAP_FLOCK_LOCKS (1 << 10)
+
+/**
+ * Indicates that the filesystem supports ioctl's on directories.
+ *
+ * This feature is enabled by default when supported by the kernel.
+ */
+#define FUSE_CAP_IOCTL_DIR (1 << 11)
+
+/**
+ * Traditionally, while a file is open the FUSE kernel module only
+ * asks the filesystem for an update of the file's attributes when a
+ * client attempts to read beyond EOF. This is unsuitable for
+ * e.g. network filesystems, where the file contents may change
+ * without the kernel knowing about it.
+ *
+ * If this flag is set, FUSE will check the validity of the attributes
+ * on every read. If the attributes are no longer valid (i.e., if the
+ * *attr_timeout* passed to fuse_reply_attr() or set in `struct
+ * fuse_entry_param` has passed), it will first issue a `getattr`
+ * request. If the new mtime differs from the previous value, any
+ * cached file *contents* will be invalidated as well.
+ *
+ * This flag should always be set when available. If all file changes
+ * go through the kernel, *attr_timeout* should be set to a very large
+ * number to avoid unnecessary getattr() calls.
+ *
+ * This feature is enabled by default when supported by the kernel.
+ */
+#define FUSE_CAP_AUTO_INVAL_DATA (1 << 12)
+
+/**
+ * Indicates that the filesystem supports readdirplus.
+ *
+ * This feature is enabled by default when supported by the kernel and if the
+ * filesystem implements a readdirplus() handler.
+ */
+#define FUSE_CAP_READDIRPLUS (1 << 13)
+
+/**
+ * Indicates that the filesystem supports adaptive readdirplus.
+ *
+ * If FUSE_CAP_READDIRPLUS is not set, this flag has no effect.
+ *
+ * If FUSE_CAP_READDIRPLUS is set and this flag is not set, the kernel
+ * will always issue readdirplus() requests to retrieve directory
+ * contents.
+ *
+ * If FUSE_CAP_READDIRPLUS is set and this flag is set, the kernel
+ * will issue both readdir() and readdirplus() requests, depending on
+ * how much information is expected to be required.
+ *
+ * As of Linux 4.20, the algorithm is as follows: when userspace
+ * starts to read directory entries, issue a READDIRPLUS request to
+ * the filesystem. If any entry attributes have been looked up by the
+ * time userspace requests the next batch of entries continue with
+ * READDIRPLUS, otherwise switch to plain READDIR. This will result
+ * in e.g. plain "ls" triggering READDIRPLUS first then READDIR after
+ * that because it doesn't do lookups. "ls -l" should result in all
+ * READDIRPLUS, except if dentries are already cached.
+ *
+ * This feature is enabled by default when supported by the kernel and
+ * if the filesystem implements both a readdirplus() and a readdir()
+ * handler.
+ */
+#define FUSE_CAP_READDIRPLUS_AUTO (1 << 14)
+
+/**
+ * Indicates that the filesystem supports asynchronous direct I/O submission.
+ *
+ * If this capability is not requested/available, the kernel will ensure that
+ * there is at most one pending read and one pending write request per direct
+ * I/O file-handle at any time.
+ *
+ * This feature is enabled by default when supported by the kernel.
+ */
+#define FUSE_CAP_ASYNC_DIO (1 << 15)
+
+/**
+ * Indicates that writeback caching should be enabled. This means that
+ * individual write requests may be buffered and merged in the kernel
+ * before they are sent to the filesystem.
+ *
+ * This feature is disabled by default.
+ */
+#define FUSE_CAP_WRITEBACK_CACHE (1 << 16)
+
+/**
+ * Indicates support for zero-message opens. If this flag is set in
+ * the `capable` field of the `fuse_conn_info` structure, then the
+ * filesystem may return `ENOSYS` from the open() handler to indicate
+ * success. Further attempts to open files will be handled in the
+ * kernel. (If this flag is not set, returning ENOSYS will be treated
+ * as an error and signaled to the caller).
+ *
+ * Setting (or unsetting) this flag in the `want` field has *no
+ * effect*.
+ */
+#define FUSE_CAP_NO_OPEN_SUPPORT (1 << 17)
+
+/**
+ * Indicates support for parallel directory operations. If this flag
+ * is unset, the FUSE kernel module will ensure that lookup() and
+ * readdir() requests are never issued concurrently for the same
+ * directory.
+ *
+ * This feature is enabled by default when supported by the kernel.
+ */
+#define FUSE_CAP_PARALLEL_DIROPS (1 << 18)
+
+/**
+ * Indicates support for POSIX ACLs.
+ *
+ * If this feature is enabled, the kernel will cache and have
+ * responsibility for enforcing ACLs. ACL will be stored as xattrs and
+ * passed to userspace, which is responsible for updating the ACLs in
+ * the filesystem, keeping the file mode in sync with the ACL, and
+ * ensuring inheritance of default ACLs when new filesystem nodes are
+ * created. Note that this requires that the file system is able to
+ * parse and interpret the xattr representation of ACLs.
+ *
+ * Enabling this feature implicitly turns on the
+ * ``default_permissions`` mount option (even if it was not passed to
+ * mount(2)).
+ *
+ * This feature is disabled by default.
+ */
+#define FUSE_CAP_POSIX_ACL (1 << 19)
+
+/**
+ * Indicates that the filesystem is responsible for unsetting
+ * setuid and setgid bits when a file is written, truncated, or
+ * its owner is changed.
+ *
+ * This feature is enabled by default when supported by the kernel.
+ */
+#define FUSE_CAP_HANDLE_KILLPRIV (1 << 20)
+
+/**
+ * Indicates support for zero-message opendirs. If this flag is set in
+ * the `capable` field of the `fuse_conn_info` structure, then the filesystem
+ * may return `ENOSYS` from the opendir() handler to indicate success. Further
+ * opendir and releasedir messages will be handled in the kernel. (If this
+ * flag is not set, returning ENOSYS will be treated as an error and signalled
+ * to the caller.)
+ *
+ * Setting (or unsetting) this flag in the `want` field has *no effect*.
+ */
+#define FUSE_CAP_NO_OPENDIR_SUPPORT (1 << 24)
+
+/**
+ * Indicates that the kernel supports the FUSE_ATTR_SUBMOUNT flag.
+ *
+ * Setting (or unsetting) this flag in the `want` field has *no effect*.
+ */
+#define FUSE_CAP_SUBMOUNTS (1 << 27)
+
+/**
+ * Indicates that the filesystem is responsible for clearing
+ * security.capability xattr and clearing setuid and setgid bits. Following
+ * are the rules.
+ * - clear "security.capability" on write, truncate and chown unconditionally
+ * - clear suid/sgid if following is true. Note, sgid is cleared only if
+ * group executable bit is set.
+ * o setattr has FATTR_SIZE and FATTR_KILL_SUIDGID set.
+ * o setattr has FATTR_UID or FATTR_GID
+ * o open has O_TRUNC and FUSE_OPEN_KILL_SUIDGID
+ * o create has O_TRUNC and FUSE_OPEN_KILL_SUIDGID flag set.
+ * o write has FUSE_WRITE_KILL_SUIDGID
+ */
+#define FUSE_CAP_HANDLE_KILLPRIV_V2 (1 << 28)
+
+/**
+ * Indicates that file server supports extended struct fuse_setxattr_in
+ */
+#define FUSE_CAP_SETXATTR_EXT (1 << 29)
+
+/**
+ * Ioctl flags
+ *
+ * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine
+ * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed
+ * FUSE_IOCTL_RETRY: retry with new iovecs
+ * FUSE_IOCTL_DIR: is a directory
+ *
+ * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs
+ */
+#define FUSE_IOCTL_COMPAT (1 << 0)
+#define FUSE_IOCTL_UNRESTRICTED (1 << 1)
+#define FUSE_IOCTL_RETRY (1 << 2)
+#define FUSE_IOCTL_DIR (1 << 4)
+
+#define FUSE_IOCTL_MAX_IOV 256
+
+/**
+ * Connection information, passed to the ->init() method
+ *
+ * Some of the elements are read-write, these can be changed to
+ * indicate the value requested by the filesystem. The requested
+ * value must usually be smaller than the indicated value.
+ */
+struct fuse_conn_info {
+ /**
+ * Major version of the protocol (read-only)
+ */
+ unsigned proto_major;
+
+ /**
+ * Minor version of the protocol (read-only)
+ */
+ unsigned proto_minor;
+
+ /**
+ * Maximum size of the write buffer
+ */
+ unsigned max_write;
+
+ /**
+ * Maximum size of read requests. A value of zero indicates no
+ * limit. However, even if the filesystem does not specify a
+ * limit, the maximum size of read requests will still be
+ * limited by the kernel.
+ *
+ * NOTE: For the time being, the maximum size of read requests
+ * must be set both here *and* passed to fuse_session_new()
+ * using the ``-o max_read=<n>`` mount option. At some point
+ * in the future, specifying the mount option will no longer
+ * be necessary.
+ */
+ unsigned max_read;
+
+ /**
+ * Maximum readahead
+ */
+ unsigned max_readahead;
+
+ /**
+ * Capability flags that the kernel supports (read-only)
+ */
+ unsigned capable;
+
+ /**
+ * Capability flags that the filesystem wants to enable.
+ *
+ * libfuse attempts to initialize this field with
+ * reasonable default values before calling the init() handler.
+ */
+ unsigned want;
+
+ /**
+ * Maximum number of pending "background" requests. A
+ * background request is any type of request for which the
+ * total number is not limited by other means. As of kernel
+ * 4.8, only two types of requests fall into this category:
+ *
+ * 1. Read-ahead requests
+ * 2. Asynchronous direct I/O requests
+ *
+ * Read-ahead requests are generated (if max_readahead is
+ * non-zero) by the kernel to preemptively fill its caches
+ * when it anticipates that userspace will soon read more
+ * data.
+ *
+ * Asynchronous direct I/O requests are generated if
+ * FUSE_CAP_ASYNC_DIO is enabled and userspace submits a large
+ * direct I/O request. In this case the kernel will internally
+ * split it up into multiple smaller requests and submit them
+ * to the filesystem concurrently.
+ *
+ * Note that the following requests are *not* background
+ * requests: writeback requests (limited by the kernel's
+ * flusher algorithm), regular (i.e., synchronous and
+ * buffered) userspace read/write requests (limited to one per
+ * thread), asynchronous read requests (Linux's io_submit(2)
+ * call actually blocks, so these are also limited to one per
+ * thread).
+ */
+ unsigned max_background;
+
+ /**
+ * Kernel congestion threshold parameter. If the number of pending
+ * background requests exceeds this number, the FUSE kernel module will
+ * mark the filesystem as "congested". This instructs the kernel to
+ * expect that queued requests will take some time to complete, and to
+ * adjust its algorithms accordingly (e.g. by putting a waiting thread
+ * to sleep instead of using a busy-loop).
+ */
+ unsigned congestion_threshold;
+
+ /**
+ * When FUSE_CAP_WRITEBACK_CACHE is enabled, the kernel is responsible
+ * for updating mtime and ctime when write requests are received. The
+ * updated values are passed to the filesystem with setattr() requests.
+ * However, if the filesystem does not support the full resolution of
+ * the kernel timestamps (nanoseconds), the mtime and ctime values used
+ * by kernel and filesystem will differ (and result in an apparent
+ * change of times after a cache flush).
+ *
+ * To prevent this problem, this variable can be used to inform the
+ * kernel about the timestamp granularity supported by the file-system.
+ * The value should be a power of 10. The default is 1, i.e. full
+ * nano-second resolution. Filesystems supporting only second resolution
+ * should set this to 1000000000.
+ */
+ unsigned time_gran;
+
+ /**
+ * For future use.
+ */
+ unsigned reserved[22];
+};
+
+struct fuse_session;
+struct fuse_pollhandle;
+struct fuse_conn_info_opts;
+
+/**
+ * This function parses several command-line options that can be used
+ * to override elements of struct fuse_conn_info. The pointer returned
+ * by this function should be passed to the
+ * fuse_apply_conn_info_opts() method by the file system's init()
+ * handler.
+ *
+ * Before using this function, think twice if you really want these
+ * parameters to be adjustable from the command line. In most cases,
+ * they should be determined by the file system internally.
+ *
+ * The following options are recognized:
+ *
+ * -o max_write=N sets conn->max_write
+ * -o max_readahead=N sets conn->max_readahead
+ * -o max_background=N sets conn->max_background
+ * -o congestion_threshold=N sets conn->congestion_threshold
+ * -o async_read sets FUSE_CAP_ASYNC_READ in conn->want
+ * -o sync_read unsets FUSE_CAP_ASYNC_READ in conn->want
+ * -o atomic_o_trunc sets FUSE_CAP_ATOMIC_O_TRUNC in conn->want
+ * -o no_remote_lock Equivalent to -o no_remote_flock,no_remote_posix_lock
+ * -o no_remote_flock Unsets FUSE_CAP_FLOCK_LOCKS in conn->want
+ * -o no_remote_posix_lock Unsets FUSE_CAP_POSIX_LOCKS in conn->want
+ * -o [no_]splice_write (un-)sets FUSE_CAP_SPLICE_WRITE in conn->want
+ * -o [no_]splice_move (un-)sets FUSE_CAP_SPLICE_MOVE in conn->want
+ * -o [no_]splice_read (un-)sets FUSE_CAP_SPLICE_READ in conn->want
+ * -o [no_]auto_inval_data (un-)sets FUSE_CAP_AUTO_INVAL_DATA in conn->want
+ * -o readdirplus=no unsets FUSE_CAP_READDIRPLUS in conn->want
+ * -o readdirplus=yes sets FUSE_CAP_READDIRPLUS and unsets
+ *    FUSE_CAP_READDIRPLUS_AUTO in conn->want
+ * -o readdirplus=auto sets FUSE_CAP_READDIRPLUS and
+ *    FUSE_CAP_READDIRPLUS_AUTO in conn->want
+ * -o [no_]async_dio (un-)sets FUSE_CAP_ASYNC_DIO in conn->want
+ * -o [no_]writeback_cache (un-)sets FUSE_CAP_WRITEBACK_CACHE in conn->want
+ * -o time_gran=N sets conn->time_gran
+ *
+ * Known options will be removed from *args*, unknown options will be
+ * passed through unchanged.
+ *
+ * @param args argument vector (input+output)
+ * @return parsed options
+ **/
+struct fuse_conn_info_opts *fuse_parse_conn_info_opts(struct fuse_args *args);
+
+/**
+ * This function applies the (parsed) parameters in *opts* to the
+ * *conn* pointer. It may modify the following fields: want,
+ * max_write, max_readahead, congestion_threshold, max_background,
+ * time_gran. A field is only set (or unset) if the corresponding
+ * option has been explicitly set.
+ */
+void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts,
+ struct fuse_conn_info *conn);
+
+/**
+ * Go into the background
+ *
+ * @param foreground if true, stay in the foreground
+ * @return 0 on success, -1 on failure
+ */
+int fuse_daemonize(int foreground);
+
+/**
+ * Get the version of the library
+ *
+ * @return the version
+ */
+int fuse_version(void);
+
+/**
+ * Get the full package version string of the library
+ *
+ * @return the package version
+ */
+const char *fuse_pkgversion(void);
+
+/**
+ * Destroy poll handle
+ *
+ * @param ph the poll handle
+ */
+void fuse_pollhandle_destroy(struct fuse_pollhandle *ph);
+
+/*
+ * Data buffer
+ */
+
+/**
+ * Buffer flags
+ */
+enum fuse_buf_flags {
+ /**
+ * Buffer contains a file descriptor
+ *
+ * If this flag is set, the .fd field is valid, otherwise the
+ * .mem field is valid.
+ */
+ FUSE_BUF_IS_FD = (1 << 1),
+
+ /**
+ * Seek on the file descriptor
+ *
+ * If this flag is set then the .pos field is valid and is
+ * used to seek to the given offset before performing
+ * operation on file descriptor.
+ */
+ FUSE_BUF_FD_SEEK = (1 << 2),
+
+ /**
+ * Retry operation on file descriptor
+ *
+ * If this flag is set then retry operation on file descriptor
+ * until .size bytes have been copied or an error or EOF is
+ * detected.
+ */
+ FUSE_BUF_FD_RETRY = (1 << 3),
+};
+
+/**
+ * Single data buffer
+ *
+ * Generic data buffer for I/O, extended attributes, etc... Data may
+ * be supplied as a memory pointer or as a file descriptor
+ */
+struct fuse_buf {
+ /**
+ * Size of data in bytes
+ */
+ size_t size;
+
+ /**
+ * Buffer flags
+ */
+ enum fuse_buf_flags flags;
+
+ /**
+ * Memory pointer
+ *
+ * Used unless FUSE_BUF_IS_FD flag is set.
+ */
+ void *mem;
+
+ /**
+ * File descriptor
+ *
+ * Used if FUSE_BUF_IS_FD flag is set.
+ */
+ int fd;
+
+ /**
+ * File position
+ *
+ * Used if FUSE_BUF_FD_SEEK flag is set.
+ */
+ off_t pos;
+};
+
+/**
+ * Data buffer vector
+ *
+ * An array of data buffers, each containing a memory pointer or a
+ * file descriptor.
+ *
+ * Allocate dynamically to add more than one buffer.
+ */
+struct fuse_bufvec {
+ /**
+ * Number of buffers in the array
+ */
+ size_t count;
+
+ /**
+ * Index of current buffer within the array
+ */
+ size_t idx;
+
+ /**
+ * Current offset within the current buffer
+ */
+ size_t off;
+
+ /**
+ * Array of buffers
+ */
+ struct fuse_buf buf[1];
+};
+
+/* Initialize bufvec with a single buffer of given size */
+#define FUSE_BUFVEC_INIT(size__) \
+ ((struct fuse_bufvec){ /* .count= */ 1, \
+ /* .idx = */ 0, \
+ /* .off = */ 0, /* .buf = */ \
+ { /* [0] = */ { \
+ /* .size = */ (size__), \
+ /* .flags = */ (enum fuse_buf_flags)0, \
+ /* .mem = */ NULL, \
+ /* .fd = */ -1, \
+ /* .pos = */ 0, \
+ } } })
+
+/**
+ * Get total size of data in a fuse buffer vector
+ *
+ * @param bufv buffer vector
+ * @return size of data
+ */
+size_t fuse_buf_size(const struct fuse_bufvec *bufv);
+
+/**
+ * Copy data from one buffer vector to another
+ *
+ * @param dst destination buffer vector
+ * @param src source buffer vector
+ * @return actual number of bytes copied or -errno on error
+ */
+ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src);
+
+/**
+ * Memory buffer iterator
+ *
+ */
+struct fuse_mbuf_iter {
+ /**
+ * Data pointer
+ */
+ void *mem;
+
+ /**
+ * Total length, in bytes
+ */
+ size_t size;
+
+ /**
+ * Offset from start of buffer
+ */
+ size_t pos;
+};
+
+/*
+ * Initialize memory buffer iterator from a fuse_buf.
+ *
+ * @fbuf is a pointer to a struct fuse_buf. It is parenthesized in the
+ * expansion so that arguments such as &buf or bufs + i bind correctly.
+ * Note that @fbuf is evaluated twice, so it must not have side effects.
+ */
+#define FUSE_MBUF_ITER_INIT(fbuf) \
+    ((struct fuse_mbuf_iter){     \
+        .mem = (fbuf)->mem,       \
+        .size = (fbuf)->size,     \
+        .pos = 0,                 \
+    })
+
+/**
+ * Consume bytes from a memory buffer iterator
+ *
+ * @param iter memory buffer iterator
+ * @param len number of bytes to consume
+ * @return pointer to start of consumed bytes or
+ * NULL if advancing beyond end of buffer
+ */
+void *fuse_mbuf_iter_advance(struct fuse_mbuf_iter *iter, size_t len);
+
+/**
+ * Consume a NUL-terminated string from a memory buffer iterator
+ *
+ * @param iter memory buffer iterator
+ * @return pointer to the string or
+ * NULL if advancing beyond end of buffer or there is no NUL-terminator
+ */
+const char *fuse_mbuf_iter_advance_str(struct fuse_mbuf_iter *iter);
+
+/*
+ * Signal handling
+ */
+/**
+ * Exit session on HUP, TERM and INT signals and ignore PIPE signal
+ *
+ * Stores session in a global variable. May only be called once per
+ * process until fuse_remove_signal_handlers() is called.
+ *
+ * Once one of the HUP, TERM or INT signals arrives, the signal handler calls
+ * fuse_session_exit().
+ *
+ * @param se the session to exit
+ * @return 0 on success, -1 on failure
+ *
+ * See also:
+ * fuse_remove_signal_handlers()
+ */
+int fuse_set_signal_handlers(struct fuse_session *se);
+
+/**
+ * Restore default signal handlers
+ *
+ * Resets global session. After this fuse_set_signal_handlers() may
+ * be called again.
+ *
+ * @param se the same session as given in fuse_set_signal_handlers()
+ *
+ * See also:
+ * fuse_set_signal_handlers()
+ */
+void fuse_remove_signal_handlers(struct fuse_session *se);
+
+/*
+ * Compatibility stuff
+ */
+
+#if !defined(FUSE_USE_VERSION) || FUSE_USE_VERSION < 30
+#error only API version 30 or greater is supported
+#endif
+
+
+/*
+ * This interface uses 64 bit off_t.
+ *
+ * On 32bit systems please add -D_FILE_OFFSET_BITS=64 to your compile flags!
+ */
+QEMU_BUILD_BUG_ON(sizeof(off_t) != 8);
+
+#endif /* FUSE_COMMON_H_ */
diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h
new file mode 100644
index 000000000..492e00218
--- /dev/null
+++ b/tools/virtiofsd/fuse_i.h
@@ -0,0 +1,100 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#ifndef FUSE_I_H
+#define FUSE_I_H
+
+#define FUSE_USE_VERSION 31
+#include "fuse_lowlevel.h"
+
+struct fv_VuDev;
+struct fv_QueueInfo;
+
+/*
+ * State of one FUSE request.
+ *
+ * Instances are allocated by fuse_ll_alloc_req() and released through
+ * fuse_free_req().
+ */
+struct fuse_req {
+ /* Owning session; set when the request is allocated */
+ struct fuse_session *se;
+ /* Request id; copied into fuse_out_header.unique when replying */
+ uint64_t unique;
+ /* Reference count: starts at 1, the request is destroyed at 0 */
+ int ctr;
+ /* Per-request mutex, initialised at allocation, destroyed with the request */
+ pthread_mutex_t lock;
+ struct fuse_ctx ctx;
+ /* Channel handed to fuse_send_msg() for the reply; may be NULL */
+ struct fuse_chan *ch;
+ int interrupted;
+ /* Checked by fuse_reply_ioctl_retry() to reject 64-bit ioctls on 32-bit */
+ unsigned int ioctl_64bit:1;
+ union {
+ struct {
+ uint64_t unique;
+ } i;
+ /* func/data are reset to NULL by fuse_free_req() */
+ struct {
+ fuse_interrupt_func_t func;
+ void *data;
+ } ni;
+ } u;
+ /* Circular doubly-linked list links (a lone request points at itself) */
+ struct fuse_req *next;
+ struct fuse_req *prev;
+};
+
+/*
+ * Bookkeeping entry for a notification, carrying a reply callback.
+ *
+ * NOTE(review): next/prev presumably chain entries onto
+ * fuse_session.notify_list in the same way as struct fuse_req -- confirm
+ * against the users of this struct.
+ */
+struct fuse_notify_req {
+ uint64_t unique;
+ /* Callback taking this entry, a request, an inode and data/buffer args */
+ void (*reply)(struct fuse_notify_req *, fuse_req_t, fuse_ino_t,
+ const void *, const struct fuse_buf *);
+ struct fuse_notify_req *next;
+ struct fuse_notify_req *prev;
+};
+
+/*
+ * Per-session state shared by all requests of one FUSE filesystem instance.
+ *
+ * Only the fields whose use is visible in fuse_lowlevel.c are described
+ * here; the remaining ones are left undocumented on purpose.
+ */
+struct fuse_session {
+ char *mountpoint;
+ volatile int exited;
+ int fd;
+ int debug;
+ int deny_others;
+ /* Filesystem operation callbacks dispatched by the do_*() handlers */
+ struct fuse_lowlevel_ops op;
+ int got_init;
+ struct cuse_data *cuse_data;
+ void *userdata;
+ uid_t owner;
+ struct fuse_conn_info conn;
+ /*
+ * Embedded requests. NOTE(review): presumably list heads for the
+ * next/prev chains of struct fuse_req -- confirm where they are set up.
+ */
+ struct fuse_req list;
+ struct fuse_req interrupts;
+ /* Held by fuse_free_req() while unlinking a request and dropping its ref */
+ pthread_mutex_t lock;
+ pthread_rwlock_t init_rwlock;
+ int got_destroy;
+ int broken_splice_nonblock;
+ uint64_t notify_ctr;
+ struct fuse_notify_req notify_list;
+ size_t bufsize;
+ int error;
+ /* NOTE(review): vu_* look like vhost-user socket state -- confirm */
+ char *vu_socket_path;
+ char *vu_socket_group;
+ int vu_listen_fd;
+ int vu_socketfd;
+ struct fv_VuDev *virtio_dev;
+ int thread_pool_size;
+};
+
+/*
+ * Channel a reply is sent on; fuse_send_msg() forwards it to
+ * virtio_send_msg().
+ */
+struct fuse_chan {
+ pthread_mutex_t lock;
+ int ctr;
+ int fd;
+ /* Opaque queue information owned by the virtio transport code */
+ struct fv_QueueInfo *qi;
+};
+
+int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov,
+ int count);
+void fuse_free_req(fuse_req_t req);
+
+void fuse_session_process_buf_int(struct fuse_session *se,
+ struct fuse_bufvec *bufv,
+ struct fuse_chan *ch);
+
+
+#define FUSE_MAX_MAX_PAGES 256
+#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32
+
+/* room needed in buffer to accommodate header */
+#define FUSE_BUFFER_HEADER_SIZE 0x1000
+
+#endif
diff --git a/tools/virtiofsd/fuse_log.c b/tools/virtiofsd/fuse_log.c
new file mode 100644
index 000000000..745d88cd2
--- /dev/null
+++ b/tools/virtiofsd/fuse_log.c
@@ -0,0 +1,39 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * Logging API.
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#include "qemu/osdep.h"
+#include "fuse_log.h"
+
+
+/*
+ * Default log handler: format the message to stderr.
+ *
+ * The severity level is accepted for signature compatibility but not used.
+ */
+static void default_log_func(__attribute__((unused)) enum fuse_log_level level,
+                             const char *fmt, va_list ap)
+{
+    (void)vfprintf(stderr, fmt, ap);
+}
+
+/* Currently installed log handler; never NULL. */
+static fuse_log_func_t log_func = default_log_func;
+
+/*
+ * Install @func as the process-wide log handler.
+ * Passing NULL restores the default stderr handler.
+ */
+void fuse_set_log_func(fuse_log_func_t func)
+{
+    log_func = func ? func : default_log_func;
+}
+
+/*
+ * Emit a log message through the currently installed handler.
+ *
+ * @param level severity passed unchanged to the handler
+ * @param fmt   printf-style format string, followed by its arguments
+ */
+void fuse_log(enum fuse_log_level level, const char *fmt, ...)
+{
+    va_list args;
+
+    va_start(args, fmt);
+    (*log_func)(level, fmt, args);
+    va_end(args);
+}
diff --git a/tools/virtiofsd/fuse_log.h b/tools/virtiofsd/fuse_log.h
new file mode 100644
index 000000000..8d7091bd4
--- /dev/null
+++ b/tools/virtiofsd/fuse_log.h
@@ -0,0 +1,73 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB.
+ */
+
+#ifndef FUSE_LOG_H_
+#define FUSE_LOG_H_
+
+/** @file
+ *
+ * This file defines the logging interface of FUSE
+ */
+
+
+/**
+ * Log severity level
+ *
+ * These levels correspond to syslog(2) log levels since they are widely used.
+ *
+ * The enumerators take the implicit values 0 (FUSE_LOG_EMERG) through
+ * 7 (FUSE_LOG_DEBUG), in the order listed.
+ */
+enum fuse_log_level {
+ FUSE_LOG_EMERG,
+ FUSE_LOG_ALERT,
+ FUSE_LOG_CRIT,
+ FUSE_LOG_ERR,
+ FUSE_LOG_WARNING,
+ FUSE_LOG_NOTICE,
+ FUSE_LOG_INFO,
+ FUSE_LOG_DEBUG
+};
+
+/**
+ * Log message handler function.
+ *
+ * This function must be thread-safe. It may be called from any libfuse
+ * function, including fuse_parse_cmdline() and other functions invoked before
+ * a FUSE filesystem is created.
+ *
+ * Install a custom log message handler function using fuse_set_log_func().
+ *
+ * @param level log severity level
+ * @param fmt sprintf-style format string including newline
+ * @param ap format string arguments
+ */
+typedef void (*fuse_log_func_t)(enum fuse_log_level level, const char *fmt,
+ va_list ap);
+
+/**
+ * Install a custom log handler function.
+ *
+ * Log messages are emitted by libfuse functions to report errors and debug
+ * information. Messages are printed to stderr by default but this can be
+ * overridden by installing a custom log message handler function.
+ *
+ * The log message handler function is global and affects all FUSE filesystems
+ * created within this process.
+ *
+ * @param func a custom log message handler function or NULL to revert to
+ * the default
+ */
+void fuse_set_log_func(fuse_log_func_t func);
+
+/**
+ * Emit a log message
+ *
+ * @param level severity level (FUSE_LOG_ERR, FUSE_LOG_DEBUG, etc)
+ * @param fmt sprintf-style format string including newline
+ */
+void fuse_log(enum fuse_log_level level, const char *fmt, ...);
+
+#endif /* FUSE_LOG_H_ */
diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
new file mode 100644
index 000000000..e4679c73a
--- /dev/null
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -0,0 +1,2614 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * Implementation of (most of) the low-level FUSE API. The session loop
+ * functions are implemented in separate files.
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#include "qemu/osdep.h"
+#include "fuse_i.h"
+#include "standard-headers/linux/fuse.h"
+#include "fuse_misc.h"
+#include "fuse_opt.h"
+#include "fuse_virtio.h"
+
+#include <sys/file.h>
+
+#define THREAD_POOL_SIZE 0
+
+#define OFFSET_MAX 0x7fffffffffffffffLL
+
+/*
+ * Handle pairing a 64-bit value with a session.
+ *
+ * NOTE(review): kh is presumably the kernel poll handle received with a
+ * poll request -- confirm against the poll handling code.
+ */
+struct fuse_pollhandle {
+ uint64_t kh;
+ struct fuse_session *se;
+};
+
+/* Page size as reported by getpagesize(), cached by the constructor below. */
+static size_t pagesize;
+
+/* Constructor hook: record the page size before any other code here runs. */
+static void __attribute__((constructor)) fuse_ll_init_pagesize(void)
+{
+    pagesize = getpagesize();
+}
+
+/*
+ * Translate a struct stat into the FUSE wire attribute structure.
+ *
+ * Every field of @attr not listed below ends up zero.
+ */
+static void convert_stat(const struct stat *stbuf, struct fuse_attr *attr)
+{
+    memset(attr, 0, sizeof(*attr));
+
+    /* identity and ownership */
+    attr->ino = stbuf->st_ino;
+    attr->mode = stbuf->st_mode;
+    attr->nlink = stbuf->st_nlink;
+    attr->uid = stbuf->st_uid;
+    attr->gid = stbuf->st_gid;
+    attr->rdev = stbuf->st_rdev;
+
+    /* size information */
+    attr->size = stbuf->st_size;
+    attr->blksize = stbuf->st_blksize;
+    attr->blocks = stbuf->st_blocks;
+
+    /* timestamps: seconds and nanoseconds */
+    attr->atime = stbuf->st_atime;
+    attr->atimensec = ST_ATIM_NSEC(stbuf);
+    attr->mtime = stbuf->st_mtime;
+    attr->mtimensec = ST_MTIM_NSEC(stbuf);
+    attr->ctime = stbuf->st_ctime;
+    attr->ctimensec = ST_CTIM_NSEC(stbuf);
+}
+
+/*
+ * Copy the attributes carried by a SETATTR request into a struct stat.
+ *
+ * Only the fields assigned below are written; the caller is expected to
+ * have initialised the rest of @stbuf.
+ */
+static void convert_attr(const struct fuse_setattr_in *attr, struct stat *stbuf)
+{
+    /* mode, ownership and size */
+    stbuf->st_mode = attr->mode;
+    stbuf->st_uid = attr->uid;
+    stbuf->st_gid = attr->gid;
+    stbuf->st_size = attr->size;
+
+    /* timestamps: seconds first, then the nanosecond parts */
+    stbuf->st_atime = attr->atime;
+    ST_ATIM_NSEC_SET(stbuf, attr->atimensec);
+    stbuf->st_mtime = attr->mtime;
+    ST_MTIM_NSEC_SET(stbuf, attr->mtimensec);
+    stbuf->st_ctime = attr->ctime;
+    ST_CTIM_NSEC_SET(stbuf, attr->ctimensec);
+}
+
+/*
+ * Sum the iov_len of the first @count entries of @iov.
+ * Returns 0 when @count is 0.
+ */
+static size_t iov_length(const struct iovec *iov, size_t count)
+{
+    const struct iovec *end = iov + count;
+    size_t total = 0;
+
+    while (iov != end) {
+        total += iov->iov_len;
+        iov++;
+    }
+    return total;
+}
+
+/* Turn @req into a one-element circular list (both links point at itself). */
+static void list_init_req(struct fuse_req *req)
+{
+    req->prev = req;
+    req->next = req;
+}
+
+/*
+ * Unlink @req from its list by joining its two neighbours.
+ * The links stored in @req itself are left untouched.
+ */
+static void list_del_req(struct fuse_req *req)
+{
+    struct fuse_req *before = req->prev;
+    struct fuse_req *after = req->next;
+
+    after->prev = before;
+    before->next = after;
+}
+
+/* Insert @req into a circular list immediately before @next. */
+static void list_add_req(struct fuse_req *req, struct fuse_req *next)
+{
+    struct fuse_req *tail = next->prev;
+
+    /* link the new node to both neighbours ... */
+    req->prev = tail;
+    req->next = next;
+    /* ... then splice it in */
+    tail->next = req;
+    next->prev = req;
+}
+
+/* Final teardown of a request: destroy its mutex, then free its memory. */
+static void destroy_req(fuse_req_t req)
+{
+    struct fuse_req *victim = req;
+
+    pthread_mutex_destroy(&victim->lock);
+    g_free(victim);
+}
+
+/*
+ * Drop one reference to @req.
+ *
+ * Under the session lock the request is unlinked from its list, its
+ * interrupt callback and channel are cleared, and the reference count is
+ * decremented. The request is destroyed once the count reaches zero.
+ */
+void fuse_free_req(fuse_req_t req)
+{
+    struct fuse_session *se = req->se;
+    int remaining;
+
+    pthread_mutex_lock(&se->lock);
+    list_del_req(req);
+    req->u.ni.func = NULL;
+    req->u.ni.data = NULL;
+    req->ch = NULL;
+    remaining = --req->ctr;
+    pthread_mutex_unlock(&se->lock);
+
+    if (remaining == 0) {
+        destroy_req(req);
+    }
+}
+
+/*
+ * Allocate a zeroed request bound to @se.
+ *
+ * The new request holds one reference, forms a one-element list and has an
+ * initialised mutex. Returns NULL (after logging) if allocation fails.
+ */
+static struct fuse_req *fuse_ll_alloc_req(struct fuse_session *se)
+{
+    struct fuse_req *req = g_try_new0(struct fuse_req, 1);
+
+    if (!req) {
+        fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate request\n");
+        return NULL;
+    }
+
+    req->se = se;
+    req->ctr = 1;
+    list_init_req(req);
+    fuse_mutex_init(&req->lock);
+    return req;
+}
+
+/*
+ * Finalise the reply header and send a reply or notification.
+ *
+ * iov[0] must point at a struct fuse_out_header. Its len field is set here
+ * to the total size of all @count segments before the message is logged
+ * and handed to the virtio transport. A header with unique == 0 is logged
+ * as a notification.
+ *
+ * Only virtio sessions are supported; any other session aborts.
+ *
+ * NOTE(review): the earlier comment stated "If *ch* is NULL, send via
+ * session master fd"; confirm how virtio_send_msg() treats a NULL @ch.
+ *
+ * @param se    session the message belongs to
+ * @param ch    channel passed through to virtio_send_msg()
+ * @param iov   segments, iov[0] being the out header
+ * @param count number of entries in @iov
+ * @return the result of virtio_send_msg()
+ */
+static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch,
+                         struct iovec *iov, int count)
+{
+    struct fuse_out_header *out = iov[0].iov_base;
+
+    out->len = iov_length(iov, count);
+    /* out->len is unsigned: print it with %u in every branch */
+    if (out->unique == 0) {
+        fuse_log(FUSE_LOG_DEBUG, "NOTIFY: code=%d length=%u\n", out->error,
+                 out->len);
+    } else if (out->error) {
+        fuse_log(FUSE_LOG_DEBUG,
+                 " unique: %llu, error: %i (%s), outsize: %u\n",
+                 (unsigned long long)out->unique, out->error,
+                 strerror(-out->error), out->len);
+    } else {
+        fuse_log(FUSE_LOG_DEBUG, " unique: %llu, success, outsize: %u\n",
+                 (unsigned long long)out->unique, out->len);
+    }
+
+    if (fuse_lowlevel_is_virtio(se)) {
+        return virtio_send_msg(se, ch, iov, count);
+    }
+
+    abort(); /* virtio should have taken it before here */
+    return 0;
+}
+
+
+/*
+ * Send a reply for @req without releasing the request.
+ *
+ * iov[0] is overwritten to point at a header built here; the caller fills
+ * iov[1..count-1]. @error must be 0 or a negated errno above -1000; any
+ * other value is logged and replaced by -ERANGE.
+ *
+ * @return the result of fuse_send_msg()
+ */
+int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov,
+                               int count)
+{
+    struct fuse_out_header out = {
+        .unique = req->unique,
+    };
+
+    if (error > 0 || error <= -1000) {
+        fuse_log(FUSE_LOG_ERR, "fuse: bad error value: %i\n", error);
+        out.error = -ERANGE;
+    } else {
+        out.error = error;
+    }
+
+    iov[0].iov_len = sizeof(struct fuse_out_header);
+    iov[0].iov_base = &out;
+
+    return fuse_send_msg(req->se, req->ch, iov, count);
+}
+
+/*
+ * Send a reply and then release the request, whatever the send result.
+ * Returns the result of the send.
+ */
+static int send_reply_iov(fuse_req_t req, int error, struct iovec *iov,
+                          int count)
+{
+    int rc = fuse_send_reply_iov_nofree(req, error, iov, count);
+
+    fuse_free_req(req);
+    return rc;
+}
+
+/*
+ * Reply with @error and an optional payload of @argsize bytes at @arg.
+ *
+ * Slot 0 of the vector is reserved for the header filled in further down;
+ * the payload slot is only used when @argsize is non-zero.
+ */
+static int send_reply(fuse_req_t req, int error, const void *arg,
+                      size_t argsize)
+{
+    struct iovec iov[2];
+    int used = 1;
+
+    if (argsize != 0) {
+        iov[used].iov_base = (void *)arg;
+        iov[used].iov_len = argsize;
+        used++;
+    }
+    return send_reply_iov(req, error, iov, used);
+}
+
+/*
+ * Reply successfully with the data described by @iov.
+ *
+ * The caller's vector is copied into a temporary one with a free leading
+ * slot for the reply header. Replies with ENOMEM if that temporary vector
+ * cannot be allocated.
+ */
+int fuse_reply_iov(fuse_req_t req, const struct iovec *iov, int count)
+{
+    g_autofree struct iovec *padded_iov = g_try_new(struct iovec, count + 1);
+
+    if (!padded_iov) {
+        return fuse_reply_err(req, ENOMEM);
+    }
+
+    /* slot 0 stays free for the header */
+    memcpy(&padded_iov[1], iov, count * sizeof(struct iovec));
+
+    return send_reply_iov(req, 0, padded_iov, count + 1);
+}
+
+
+/*
+ * Append one directory entry to a readdir reply buffer.
+ *
+ * `buf` is allowed to be NULL so that the proper size may be computed and
+ * allocated by the caller. Nothing is written when `buf` is NULL or when
+ * the padded entry does not fit in `bufsize`.
+ *
+ * Returns the padded size of the entry in every case.
+ */
+size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize,
+                         const char *name, const struct stat *stbuf, off_t off)
+{
+    (void)req;
+    size_t name_len = strlen(name);
+    size_t raw_len = FUSE_NAME_OFFSET + name_len;
+    size_t padded_len = FUSE_DIRENT_ALIGN(raw_len);
+    struct fuse_dirent *dirent;
+
+    if (buf != NULL && padded_len <= bufsize) {
+        dirent = (struct fuse_dirent *)buf;
+        dirent->ino = stbuf->st_ino;
+        dirent->off = off;
+        dirent->namelen = name_len;
+        dirent->type = (stbuf->st_mode & S_IFMT) >> 12;
+        memcpy(dirent->name, name, name_len);
+        /* zero the alignment padding after the name */
+        memset(dirent->name + name_len, 0, padded_len - raw_len);
+    }
+
+    return padded_len;
+}
+
+/*
+ * Translate a struct statvfs into the FUSE wire statfs structure.
+ * Fields of @kstatfs not listed below end up zero.
+ */
+static void convert_statfs(const struct statvfs *stbuf,
+                           struct fuse_kstatfs *kstatfs)
+{
+    memset(kstatfs, 0, sizeof(*kstatfs));
+
+    kstatfs->bsize = stbuf->f_bsize;
+    kstatfs->frsize = stbuf->f_frsize;
+    kstatfs->blocks = stbuf->f_blocks;
+    kstatfs->bfree = stbuf->f_bfree;
+    kstatfs->bavail = stbuf->f_bavail;
+    kstatfs->files = stbuf->f_files;
+    kstatfs->ffree = stbuf->f_ffree;
+    kstatfs->namelen = stbuf->f_namemax;
+}
+
+/* Send a successful (error 0) reply carrying @argsize bytes from @arg. */
+static int send_reply_ok(fuse_req_t req, const void *arg, size_t argsize)
+{
+    const int no_error = 0;
+
+    return send_reply(req, no_error, arg, argsize);
+}
+
+/*
+ * Reply with an error and no payload.
+ *
+ * @param err positive errno value (0 for success); it is negated for the
+ *            wire format
+ */
+int fuse_reply_err(fuse_req_t req, int err)
+{
+    int wire_error = -err;
+
+    return send_reply(req, wire_error, NULL, 0);
+}
+
+/* Finish a request without sending any reply: only release it. */
+void fuse_reply_none(fuse_req_t req)
+{
+    fuse_free_req(req);
+    return;
+}
+
+/*
+ * Whole-second part of a timeout given in seconds.
+ *
+ * Negative values and NaN map to 0; values at or above ULONG_MAX saturate
+ * to ULONG_MAX.
+ *
+ * The upper bound uses ">=" on purpose: (double)ULONG_MAX rounds up to 2^64
+ * on hosts with a 64-bit unsigned long, and converting a double equal to
+ * 2^64 to unsigned long is undefined behaviour.
+ */
+static unsigned long calc_timeout_sec(double t)
+{
+    /* written as a negated test so that NaN is caught as well */
+    if (!(t >= 0.0)) {
+        return 0;
+    }
+    if (t >= (double)ULONG_MAX) {
+        return ULONG_MAX;
+    }
+    return (unsigned long)t;
+}
+
+/*
+ * Nanosecond part of a timeout given in seconds.
+ *
+ * The fraction left after removing calc_timeout_sec(t) is scaled to
+ * nanoseconds and clamped to the range 0..999999999. A negative or NaN
+ * fraction yields 0, so NaN never reaches the integer conversion.
+ */
+static unsigned int calc_timeout_nsec(double t)
+{
+    double f = t - (double)calc_timeout_sec(t);
+
+    /* written as a negated test so that NaN is caught as well */
+    if (!(f >= 0.0)) {
+        return 0;
+    }
+    if (f >= 0.999999999) {
+        return 999999999;
+    }
+    return (unsigned int)(f * 1.0e9);
+}
+
+/*
+ * Fill a FUSE entry reply from the filesystem's entry parameters.
+ *
+ * @arg is fully reinitialised: identity, entry and attribute timeouts
+ * split into seconds/nanoseconds, the converted attributes and finally
+ * the attribute flags.
+ */
+static void fill_entry(struct fuse_entry_out *arg,
+                       const struct fuse_entry_param *e)
+{
+    memset(arg, 0, sizeof(*arg));
+
+    arg->nodeid = e->ino;
+    arg->generation = e->generation;
+
+    arg->entry_valid = calc_timeout_sec(e->entry_timeout);
+    arg->entry_valid_nsec = calc_timeout_nsec(e->entry_timeout);
+    arg->attr_valid = calc_timeout_sec(e->attr_timeout);
+    arg->attr_valid_nsec = calc_timeout_nsec(e->attr_timeout);
+
+    convert_stat(&e->attr, &arg->attr);
+    /* convert_stat() resets the flags, so set them afterwards */
+    arg->attr.flags = e->attr_flags;
+}
+
+/*
+ * Append one "readdirplus" entry (entry reply + dirent) to a buffer.
+ *
+ * `buf` is allowed to be NULL so that the proper size may be computed and
+ * allocated by the caller. Nothing is written when `buf` is NULL or when
+ * the padded entry does not fit in `bufsize`.
+ *
+ * Returns the padded size of the entry in every case.
+ */
+size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize,
+                              const char *name,
+                              const struct fuse_entry_param *e, off_t off)
+{
+    (void)req;
+    size_t name_len = strlen(name);
+    size_t raw_len = FUSE_NAME_OFFSET_DIRENTPLUS + name_len;
+    size_t padded_len = FUSE_DIRENT_ALIGN(raw_len);
+
+    if (buf != NULL && padded_len <= bufsize) {
+        struct fuse_direntplus *dp = (struct fuse_direntplus *)buf;
+        struct fuse_dirent *dirent = &dp->dirent;
+
+        memset(&dp->entry_out, 0, sizeof(dp->entry_out));
+        fill_entry(&dp->entry_out, e);
+
+        dirent->ino = e->attr.st_ino;
+        dirent->off = off;
+        dirent->namelen = name_len;
+        dirent->type = (e->attr.st_mode & S_IFMT) >> 12;
+        memcpy(dirent->name, name, name_len);
+        /* zero the alignment padding after the name */
+        memset(dirent->name + name_len, 0, padded_len - raw_len);
+    }
+
+    return padded_len;
+}
+
+/*
+ * Fill an open reply from the file info returned by the filesystem.
+ *
+ * The file handle is copied and the FOPEN_* bits matching the set options
+ * are OR-ed into arg->open_flags; bits already present there are kept, so
+ * callers zero @arg first.
+ */
+static void fill_open(struct fuse_open_out *arg, const struct fuse_file_info *f)
+{
+    arg->fh = f->fh;
+
+    arg->open_flags |= f->direct_io ? FOPEN_DIRECT_IO : 0;
+    arg->open_flags |= f->keep_cache ? FOPEN_KEEP_CACHE : 0;
+    arg->open_flags |= f->cache_readdir ? FOPEN_CACHE_DIR : 0;
+    arg->open_flags |= f->nonseekable ? FOPEN_NONSEEKABLE : 0;
+}
+
+/* Reply to a lookup-style request with a directory entry. */
+int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e)
+{
+    struct fuse_entry_out out;
+
+    memset(&out, 0, sizeof(out));
+    fill_entry(&out, e);
+
+    return send_reply_ok(req, &out, sizeof(out));
+}
+
+/*
+ * Reply to a create request: an entry reply immediately followed by an
+ * open reply, sent as one contiguous payload.
+ */
+int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e,
+                      const struct fuse_file_info *f)
+{
+    char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)];
+    const size_t entry_len = sizeof(struct fuse_entry_out);
+    const size_t open_len = sizeof(struct fuse_open_out);
+
+    memset(buf, 0, sizeof(buf));
+    fill_entry((struct fuse_entry_out *)buf, e);
+    fill_open((struct fuse_open_out *)(buf + entry_len), f);
+
+    return send_reply_ok(req, buf, entry_len + open_len);
+}
+
+/*
+ * Reply with file attributes.
+ *
+ * @param attr         attributes to report
+ * @param attr_timeout validity of the attributes, in seconds
+ */
+int fuse_reply_attr(fuse_req_t req, const struct stat *attr,
+                    double attr_timeout)
+{
+    struct fuse_attr_out out;
+
+    memset(&out, 0, sizeof(out));
+    convert_stat(attr, &out.attr);
+    out.attr_valid = calc_timeout_sec(attr_timeout);
+    out.attr_valid_nsec = calc_timeout_nsec(attr_timeout);
+
+    return send_reply_ok(req, &out, sizeof(out));
+}
+
+/*
+ * Reply with the target of a symbolic link.
+ * The string is sent without its terminating NUL.
+ */
+int fuse_reply_readlink(fuse_req_t req, const char *linkname)
+{
+    size_t len = strlen(linkname);
+
+    return send_reply_ok(req, linkname, len);
+}
+
+/* Reply to an open request with the handle and flags taken from @f. */
+int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *f)
+{
+    struct fuse_open_out out;
+
+    /* fill_open() only ORs flags in, so start from all zeroes */
+    memset(&out, 0, sizeof(out));
+    fill_open(&out, f);
+
+    return send_reply_ok(req, &out, sizeof(out));
+}
+
+/* Reply to a write request with the number of bytes written. */
+int fuse_reply_write(fuse_req_t req, size_t count)
+{
+    struct fuse_write_out out;
+
+    memset(&out, 0, sizeof(out));
+    out.size = count;
+    return send_reply_ok(req, &out, sizeof(out));
+}
+
+/* Reply successfully with @size bytes of data taken from @buf. */
+int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size)
+{
+    const void *payload = buf;
+
+    return send_reply_ok(req, payload, size);
+}
+
+/*
+ * Send a reply whose payload is described by a buffer vector.
+ *
+ * Two layouts are handled:
+ *  - a single, unconsumed memory buffer, which is appended to @iov and
+ *    sent without copying;
+ *  - on virtio sessions, a single buffer whose flags are exactly
+ *    FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK, which is delegated to
+ *    virtio_send_data_iov().
+ * Anything else aborts.
+ *
+ * @iov must have room for one entry beyond @iov_count.
+ */
+static int fuse_send_data_iov_fallback(struct fuse_session *se,
+                                       struct fuse_chan *ch, struct iovec *iov,
+                                       int iov_count, struct fuse_bufvec *buf,
+                                       size_t len)
+{
+    const int fd_seek_flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
+
+    /* Optimize common case */
+    if (buf->count == 1 && buf->idx == 0 && buf->off == 0) {
+        if (!(buf->buf[0].flags & FUSE_BUF_IS_FD)) {
+            /*
+             * FIXME: also avoid memory copy if there are multiple buffers
+             * but none of them contain an fd
+             */
+            struct iovec *slot = &iov[iov_count];
+
+            slot->iov_base = buf->buf[0].mem;
+            slot->iov_len = len;
+            return fuse_send_msg(se, ch, iov, iov_count + 1);
+        }
+    }
+
+    if (fuse_lowlevel_is_virtio(se) && buf->count == 1 &&
+        buf->buf[0].flags == fd_seek_flags) {
+        return virtio_send_data_iov(se, ch, iov, iov_count, buf, len);
+    }
+
+    abort(); /* Will have taken vhost path */
+    return 0;
+}
+
+/*
+ * Send the data described by @buf after the segments already in @iov.
+ * The payload length is taken from fuse_buf_size().
+ */
+static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
+                              struct iovec *iov, int iov_count,
+                              struct fuse_bufvec *buf)
+{
+    return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf,
+                                       fuse_buf_size(buf));
+}
+
+/*
+ * Reply with the data described by a buffer vector.
+ *
+ * A positive result from the send is treated as an errno value and turned
+ * into an error reply. Otherwise the request is released and the result
+ * (zero or negative) is returned as is.
+ */
+int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv)
+{
+    struct fuse_out_header out = {
+        .unique = req->unique,
+    };
+    struct iovec iov[2];
+    int rc;
+
+    iov[0].iov_len = sizeof(struct fuse_out_header);
+    iov[0].iov_base = &out;
+
+    rc = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv);
+    if (rc > 0) {
+        return fuse_reply_err(req, rc);
+    }
+
+    fuse_free_req(req);
+    return rc;
+}
+
+/* Reply with filesystem statistics converted from @stbuf. */
+int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf)
+{
+    struct fuse_statfs_out out;
+
+    memset(&out, 0, sizeof(out));
+    convert_statfs(stbuf, &out.st);
+    return send_reply_ok(req, &out, sizeof(out));
+}
+
+/* Reply to an xattr size query with the required buffer size. */
+int fuse_reply_xattr(fuse_req_t req, size_t count)
+{
+    struct fuse_getxattr_out out;
+
+    memset(&out, 0, sizeof(out));
+    out.size = count;
+    return send_reply_ok(req, &out, sizeof(out));
+}
+
+/*
+ * Reply with the description of a file lock.
+ *
+ * The range is only reported for locks other than F_UNLCK. A length of 0
+ * is reported as an end of OFFSET_MAX; otherwise the end is the last byte
+ * covered (start + len - 1).
+ */
+int fuse_reply_lock(fuse_req_t req, const struct flock *lock)
+{
+    struct fuse_lk_out out;
+
+    memset(&out, 0, sizeof(out));
+    out.lk.type = lock->l_type;
+    out.lk.pid = lock->l_pid;
+
+    if (lock->l_type != F_UNLCK) {
+        out.lk.start = lock->l_start;
+        out.lk.end = lock->l_len ? lock->l_start + lock->l_len - 1
+                                 : OFFSET_MAX;
+    }
+
+    return send_reply_ok(req, &out, sizeof(out));
+}
+
+/* Reply to a bmap request with the block index @idx. */
+int fuse_reply_bmap(fuse_req_t req, uint64_t idx)
+{
+    struct fuse_bmap_out out;
+
+    memset(&out, 0, sizeof(out));
+    out.block = idx;
+    return send_reply_ok(req, &out, sizeof(out));
+}
+
+/*
+ * Convert an array of struct iovec into a newly allocated array of
+ * struct fuse_ioctl_iovec (base address as integer, plus length).
+ *
+ * Returns NULL if the allocation fails; otherwise the caller owns the
+ * result and releases it with g_free().
+ */
+static struct fuse_ioctl_iovec *fuse_ioctl_iovec_copy(const struct iovec *iov,
+                                                      size_t count)
+{
+    struct fuse_ioctl_iovec *copy = g_try_new(struct fuse_ioctl_iovec, count);
+    size_t n;
+
+    if (copy == NULL) {
+        return NULL;
+    }
+
+    for (n = 0; n != count; n++) {
+        copy[n].base = (uintptr_t)iov[n].iov_base;
+        copy[n].len = iov[n].iov_len;
+    }
+    return copy;
+}
+
+/*
+ * Reply to an ioctl with a retry request describing the input and output
+ * vectors to use on the next attempt.
+ *
+ * Replies with EINVAL for a 64-bit ioctl on a host with 32-bit pointers,
+ * and with ENOMEM if a vector cannot be converted. The converted vectors
+ * are freed automatically on return.
+ */
+int fuse_reply_ioctl_retry(fuse_req_t req, const struct iovec *in_iov,
+                           size_t in_count, const struct iovec *out_iov,
+                           size_t out_count)
+{
+    g_autofree struct fuse_ioctl_iovec *in_fiov = NULL;
+    g_autofree struct fuse_ioctl_iovec *out_fiov = NULL;
+    struct fuse_ioctl_out arg;
+    struct iovec iov[4];
+    size_t count = 1; /* slot 0 is reserved for the reply header */
+
+    /* Can't handle non-compat 64bit ioctls on 32bit */
+    if (sizeof(void *) == 4 && req->ioctl_64bit) {
+        return fuse_reply_err(req, EINVAL);
+    }
+
+    memset(&arg, 0, sizeof(arg));
+    arg.flags |= FUSE_IOCTL_RETRY;
+    arg.in_iovs = in_count;
+    arg.out_iovs = out_count;
+    iov[count].iov_base = &arg;
+    iov[count].iov_len = sizeof(arg);
+    count++;
+
+    if (in_count) {
+        in_fiov = fuse_ioctl_iovec_copy(in_iov, in_count);
+        if (!in_fiov) {
+            return fuse_reply_err(req, ENOMEM);
+        }
+
+        iov[count].iov_base = (void *)in_fiov;
+        iov[count].iov_len = sizeof(in_fiov[0]) * in_count;
+        count++;
+    }
+
+    if (out_count) {
+        out_fiov = fuse_ioctl_iovec_copy(out_iov, out_count);
+        if (!out_fiov) {
+            return fuse_reply_err(req, ENOMEM);
+        }
+
+        iov[count].iov_base = (void *)out_fiov;
+        iov[count].iov_len = sizeof(out_fiov[0]) * out_count;
+        count++;
+    }
+
+    return send_reply_iov(req, 0, iov, count);
+}
+
+/*
+ * Reply to an ioctl with its result code and an optional data buffer.
+ * The data segment is only sent when @size is non-zero.
+ */
+int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size)
+{
+    struct fuse_ioctl_out arg;
+    struct iovec iov[3];
+    size_t used = 1; /* slot 0 is reserved for the reply header */
+
+    memset(&arg, 0, sizeof(arg));
+    arg.result = result;
+
+    iov[used].iov_base = &arg;
+    iov[used].iov_len = sizeof(arg);
+    used++;
+
+    if (size != 0) {
+        iov[used].iov_base = (char *)buf;
+        iov[used].iov_len = size;
+        used++;
+    }
+
+    return send_reply_iov(req, 0, iov, used);
+}
+
+/*
+ * Reply to an ioctl with its result code followed by the data in @iov.
+ *
+ * A temporary vector is built with two leading slots (reply header and
+ * ioctl result) ahead of a copy of the caller's entries. Replies with
+ * ENOMEM if that vector cannot be allocated.
+ */
+int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov,
+                         int count)
+{
+    g_autofree struct iovec *padded_iov = g_try_new(struct iovec, count + 2);
+    struct fuse_ioctl_out arg;
+
+    if (!padded_iov) {
+        return fuse_reply_err(req, ENOMEM);
+    }
+
+    memset(&arg, 0, sizeof(arg));
+    arg.result = result;
+
+    padded_iov[1].iov_base = &arg;
+    padded_iov[1].iov_len = sizeof(arg);
+    memcpy(padded_iov + 2, iov, count * sizeof(struct iovec));
+
+    return send_reply_iov(req, 0, padded_iov, count + 2);
+}
+
+/* Reply to a poll request with the returned event mask. */
+int fuse_reply_poll(fuse_req_t req, unsigned revents)
+{
+    struct fuse_poll_out out;
+
+    memset(&out, 0, sizeof(out));
+    out.revents = revents;
+    return send_reply_ok(req, &out, sizeof(out));
+}
+
+/* Reply to an lseek request with the resulting offset. */
+int fuse_reply_lseek(fuse_req_t req, off_t off)
+{
+    struct fuse_lseek_out out;
+
+    memset(&out, 0, sizeof(out));
+    out.offset = off;
+    return send_reply_ok(req, &out, sizeof(out));
+}
+
+/*
+ * LOOKUP: the payload is a NUL-terminated name.
+ * Replies EINVAL on a malformed payload, ENOSYS without a handler.
+ */
+static void do_lookup(fuse_req_t req, fuse_ino_t nodeid,
+                      struct fuse_mbuf_iter *iter)
+{
+    const char *name = fuse_mbuf_iter_advance_str(iter);
+
+    if (name == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+    if (!req->se->op.lookup) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.lookup(req, nodeid, name);
+}
+
+/*
+ * FORGET: the payload is a struct fuse_forget_in.
+ * Replies EINVAL on a short payload. Without a handler the request is
+ * simply released without a reply.
+ */
+static void do_forget(fuse_req_t req, fuse_ino_t nodeid,
+                      struct fuse_mbuf_iter *iter)
+{
+    struct fuse_forget_in *in = fuse_mbuf_iter_advance(iter, sizeof(*in));
+
+    if (in == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+    if (!req->se->op.forget) {
+        fuse_reply_none(req);
+        return;
+    }
+
+    req->se->op.forget(req, nodeid, in->nlookup);
+}
+
+/*
+ * BATCH_FORGET: a struct fuse_batch_forget_in followed by arg->count
+ * struct fuse_forget_data entries.
+ *
+ * No reply is ever sent: on any parse failure the request is just
+ * released. The entries go to op.forget_multi when present; otherwise
+ * op.forget is called once per entry with a freshly allocated dummy
+ * request, stopping early if such an allocation fails.
+ */
+static void do_batch_forget(fuse_req_t req, fuse_ino_t nodeid,
+                            struct fuse_mbuf_iter *iter)
+{
+    struct fuse_batch_forget_in *arg;
+    struct fuse_forget_data *forgets;
+    size_t scount;
+    unsigned int i;
+
+    (void)nodeid;
+
+    arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+    if (arg == NULL) {
+        fuse_reply_none(req);
+        return;
+    }
+
+    /*
+     * Prevent integer overflow. The compiler emits the following warning
+     * unless we use the scount local variable:
+     *
+     * error: comparison is always false due to limited range of data type
+     * [-Werror=type-limits]
+     *
+     * This may be true on 64-bit hosts but we need this check for 32-bit
+     * hosts.
+     */
+    scount = arg->count;
+    if (scount > SIZE_MAX / sizeof(forgets[0])) {
+        fuse_reply_none(req);
+        return;
+    }
+
+    forgets = fuse_mbuf_iter_advance(iter, arg->count * sizeof(forgets[0]));
+    if (forgets == NULL) {
+        fuse_reply_none(req);
+        return;
+    }
+
+    if (req->se->op.forget_multi) {
+        req->se->op.forget_multi(req, arg->count, forgets);
+        return;
+    }
+
+    if (req->se->op.forget) {
+        for (i = 0; i < arg->count; i++) {
+            struct fuse_req *dummy_req = fuse_ll_alloc_req(req->se);
+
+            if (dummy_req == NULL) {
+                break;
+            }
+
+            dummy_req->unique = req->unique;
+            dummy_req->ctx = req->ctx;
+            dummy_req->ch = NULL;
+
+            req->se->op.forget(dummy_req, forgets[i].ino, forgets[i].nlookup);
+        }
+    }
+
+    /* reached with or without a forget handler */
+    fuse_reply_none(req);
+}
+
+/*
+ * GETATTR: the payload is a struct fuse_getattr_in.
+ *
+ * A file info carrying the file handle is passed to the handler only when
+ * FUSE_GETATTR_FH is set; otherwise the handler receives NULL.
+ * Replies EINVAL on a short payload, ENOSYS without a handler.
+ */
+static void do_getattr(fuse_req_t req, fuse_ino_t nodeid,
+                       struct fuse_mbuf_iter *iter)
+{
+    struct fuse_getattr_in *in = fuse_mbuf_iter_advance(iter, sizeof(*in));
+    struct fuse_file_info fi_store;
+    struct fuse_file_info *fi = NULL;
+
+    if (in == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    if (in->getattr_flags & FUSE_GETATTR_FH) {
+        memset(&fi_store, 0, sizeof(fi_store));
+        fi_store.fh = in->fh;
+        fi = &fi_store;
+    }
+
+    if (!req->se->op.getattr) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+    req->se->op.getattr(req, nodeid, fi);
+}
+
+/*
+ * SETATTR: the payload is a struct fuse_setattr_in.
+ *
+ * The handler check comes first, so a missing handler yields ENOSYS even
+ * for a malformed payload. The requested attributes are converted into a
+ * struct stat, FATTR_FH is turned into a file info, and the valid mask is
+ * restricted to the FUSE_SET_ATTR_* bits listed below.
+ */
+static void do_setattr(fuse_req_t req, fuse_ino_t nodeid,
+                       struct fuse_mbuf_iter *iter)
+{
+    struct fuse_setattr_in *in;
+    struct fuse_file_info fi_store;
+    struct fuse_file_info *fi = NULL;
+    struct stat st;
+
+    if (!req->se->op.setattr) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    in = fuse_mbuf_iter_advance(iter, sizeof(*in));
+    if (in == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&st, 0, sizeof(st));
+    convert_attr(in, &st);
+
+    if (in->valid & FATTR_FH) {
+        in->valid &= ~FATTR_FH;
+        memset(&fi_store, 0, sizeof(fi_store));
+        fi_store.fh = in->fh;
+        fi = &fi_store;
+    }
+
+    in->valid &= FUSE_SET_ATTR_MODE | FUSE_SET_ATTR_UID |
+                 FUSE_SET_ATTR_GID | FUSE_SET_ATTR_SIZE |
+                 FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME |
+                 FUSE_SET_ATTR_ATIME_NOW | FUSE_SET_ATTR_MTIME_NOW |
+                 FUSE_SET_ATTR_CTIME | FUSE_SET_ATTR_KILL_SUIDGID;
+
+    req->se->op.setattr(req, nodeid, &st, in->valid, fi);
+}
+
+/*
+ * ACCESS: the payload is a struct fuse_access_in.
+ * Replies EINVAL on a short payload, ENOSYS without a handler.
+ */
+static void do_access(fuse_req_t req, fuse_ino_t nodeid,
+                      struct fuse_mbuf_iter *iter)
+{
+    struct fuse_access_in *in = fuse_mbuf_iter_advance(iter, sizeof(*in));
+
+    if (in == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+    if (!req->se->op.access) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.access(req, nodeid, in->mask);
+}
+
+/* READLINK: no payload is read. Replies ENOSYS without a handler. */
+static void do_readlink(fuse_req_t req, fuse_ino_t nodeid,
+                        struct fuse_mbuf_iter *iter)
+{
+    (void)iter;
+
+    if (!req->se->op.readlink) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+    req->se->op.readlink(req, nodeid);
+}
+
+/*
+ * MKNOD: a struct fuse_mknod_in followed by a NUL-terminated name.
+ *
+ * The umask from the request is stored in the request context before the
+ * handler is looked up. Replies EINVAL on a malformed payload, ENOSYS
+ * without a handler.
+ */
+static void do_mknod(fuse_req_t req, fuse_ino_t nodeid,
+                     struct fuse_mbuf_iter *iter)
+{
+    struct fuse_mknod_in *in = fuse_mbuf_iter_advance(iter, sizeof(*in));
+    const char *name = fuse_mbuf_iter_advance_str(iter);
+
+    if (in == NULL || name == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    req->ctx.umask = in->umask;
+
+    if (!req->se->op.mknod) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+    req->se->op.mknod(req, nodeid, name, in->mode, in->rdev);
+}
+
+/*
+ * MKDIR: a struct fuse_mkdir_in followed by a NUL-terminated name.
+ *
+ * The umask from the request is stored in the request context before the
+ * handler is looked up. Replies EINVAL on a malformed payload, ENOSYS
+ * without a handler.
+ */
+static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid,
+                     struct fuse_mbuf_iter *iter)
+{
+    struct fuse_mkdir_in *in = fuse_mbuf_iter_advance(iter, sizeof(*in));
+    const char *name = fuse_mbuf_iter_advance_str(iter);
+
+    if (in == NULL || name == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    req->ctx.umask = in->umask;
+
+    if (!req->se->op.mkdir) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+    req->se->op.mkdir(req, nodeid, name, in->mode);
+}
+
+/*
+ * UNLINK: the payload is a NUL-terminated name.
+ * Replies EINVAL on a malformed payload, ENOSYS without a handler.
+ */
+static void do_unlink(fuse_req_t req, fuse_ino_t nodeid,
+                      struct fuse_mbuf_iter *iter)
+{
+    const char *name = fuse_mbuf_iter_advance_str(iter);
+
+    if (name == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+    if (!req->se->op.unlink) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.unlink(req, nodeid, name);
+}
+
+/*
+ * RMDIR: the payload is a NUL-terminated name.
+ * Replies EINVAL on a malformed payload, ENOSYS without a handler.
+ */
+static void do_rmdir(fuse_req_t req, fuse_ino_t nodeid,
+                     struct fuse_mbuf_iter *iter)
+{
+    const char *name = fuse_mbuf_iter_advance_str(iter);
+
+    if (name == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+    if (!req->se->op.rmdir) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.rmdir(req, nodeid, name);
+}
+
+/*
+ * SYMLINK: two NUL-terminated strings, the new name then the link target.
+ * The handler receives them in the order (target, parent, name).
+ * Replies EINVAL on a malformed payload, ENOSYS without a handler.
+ */
+static void do_symlink(fuse_req_t req, fuse_ino_t nodeid,
+                       struct fuse_mbuf_iter *iter)
+{
+    const char *name = fuse_mbuf_iter_advance_str(iter);
+    const char *target = fuse_mbuf_iter_advance_str(iter);
+
+    if (name == NULL || target == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+    if (!req->se->op.symlink) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.symlink(req, target, nodeid, name);
+}
+
+/*
+ * RENAME: a struct fuse_rename_in followed by the old and new names.
+ * The rename handler is called with flags 0.
+ * Replies EINVAL on a malformed payload, ENOSYS without a handler.
+ */
+static void do_rename(fuse_req_t req, fuse_ino_t nodeid,
+                      struct fuse_mbuf_iter *iter)
+{
+    struct fuse_rename_in *in = fuse_mbuf_iter_advance(iter, sizeof(*in));
+    const char *from = fuse_mbuf_iter_advance_str(iter);
+    const char *to = fuse_mbuf_iter_advance_str(iter);
+
+    if (in == NULL || from == NULL || to == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+    if (!req->se->op.rename) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.rename(req, nodeid, from, in->newdir, to, 0);
+}
+
+/*
+ * RENAME2: a struct fuse_rename2_in followed by the old and new names.
+ * Uses the same rename handler as RENAME but forwards the request flags.
+ * Replies EINVAL on a malformed payload, ENOSYS without a handler.
+ */
+static void do_rename2(fuse_req_t req, fuse_ino_t nodeid,
+                       struct fuse_mbuf_iter *iter)
+{
+    struct fuse_rename2_in *in = fuse_mbuf_iter_advance(iter, sizeof(*in));
+    const char *from = fuse_mbuf_iter_advance_str(iter);
+    const char *to = fuse_mbuf_iter_advance_str(iter);
+
+    if (in == NULL || from == NULL || to == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+    if (!req->se->op.rename) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.rename(req, nodeid, from, in->newdir, to, in->flags);
+}
+
+/*
+ * LINK: a struct fuse_link_in followed by the new name.
+ * The handler receives (existing node, new parent, new name).
+ * Replies EINVAL on a malformed payload, ENOSYS without a handler.
+ */
+static void do_link(fuse_req_t req, fuse_ino_t nodeid,
+                    struct fuse_mbuf_iter *iter)
+{
+    struct fuse_link_in *in = fuse_mbuf_iter_advance(iter, sizeof(*in));
+    const char *name = fuse_mbuf_iter_advance_str(iter);
+
+    if (in == NULL || name == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+    if (!req->se->op.link) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.link(req, in->oldnodeid, nodeid, name);
+}
+
+/*
+ * CREATE: a struct fuse_create_in followed by a NUL-terminated name.
+ *
+ * The handler check comes first, so a missing handler yields ENOSYS even
+ * for a malformed payload. The open flags, the kill-suid/sgid request and
+ * the umask are taken from the payload.
+ */
+static void do_create(fuse_req_t req, fuse_ino_t nodeid,
+                      struct fuse_mbuf_iter *iter)
+{
+    struct fuse_create_in *in;
+    struct fuse_file_info fi;
+    const char *name;
+
+    if (!req->se->op.create) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    in = fuse_mbuf_iter_advance(iter, sizeof(*in));
+    name = fuse_mbuf_iter_advance_str(iter);
+    if (in == NULL || name == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.flags = in->flags;
+    fi.kill_priv = in->open_flags & FUSE_OPEN_KILL_SUIDGID;
+
+    req->ctx.umask = in->umask;
+
+    req->se->op.create(req, nodeid, name, in->mode, &fi);
+}
+
+/*
+ * OPEN: the payload is a struct fuse_open_in.
+ *
+ * Requests carrying O_CREAT or O_TMPFILE are rejected with EINVAL, as is
+ * a short payload. Without an open handler the request is answered
+ * directly with a default open reply.
+ */
+static void do_open(fuse_req_t req, fuse_ino_t nodeid,
+                    struct fuse_mbuf_iter *iter)
+{
+    struct fuse_open_in *in = fuse_mbuf_iter_advance(iter, sizeof(*in));
+    struct fuse_file_info fi;
+
+    if (in == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    /* File creation is handled by do_create() or do_mknod() */
+    if ((in->flags & (O_CREAT | O_TMPFILE)) != 0) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.flags = in->flags;
+    fi.kill_priv = in->open_flags & FUSE_OPEN_KILL_SUIDGID;
+
+    if (!req->se->op.open) {
+        fuse_reply_open(req, &fi);
+        return;
+    }
+    req->se->op.open(req, nodeid, &fi);
+}
+
+/*
+ * READ: the payload is a struct fuse_read_in.
+ *
+ * The handler check comes first, so a missing handler yields ENOSYS even
+ * for a malformed payload; a short payload yields EINVAL.
+ */
+static void do_read(fuse_req_t req, fuse_ino_t nodeid,
+                    struct fuse_mbuf_iter *iter)
+{
+    struct fuse_read_in *in;
+    struct fuse_file_info fi;
+
+    if (!req->se->op.read) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    in = fuse_mbuf_iter_advance(iter, sizeof(*in));
+    if (in == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.fh = in->fh;
+    fi.flags = in->flags;
+    fi.lock_owner = in->lock_owner;
+
+    req->se->op.read(req, nodeid, in->size, in->offset, &fi);
+}
+
+/*
+ * WRITE: a struct fuse_write_in followed by arg->size bytes of data.
+ *
+ * Replies EINVAL when either the header or the data is shorter than
+ * announced, ENOSYS without a handler.
+ */
+static void do_write(fuse_req_t req, fuse_ino_t nodeid,
+                     struct fuse_mbuf_iter *iter)
+{
+    struct fuse_write_in *in;
+    struct fuse_file_info fi;
+    const char *data;
+
+    in = fuse_mbuf_iter_advance(iter, sizeof(*in));
+    if (in == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    data = fuse_mbuf_iter_advance(iter, in->size);
+    if (data == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.fh = in->fh;
+    fi.flags = in->flags;
+    fi.lock_owner = in->lock_owner;
+    fi.writepage = (in->write_flags & FUSE_WRITE_CACHE) != 0;
+    fi.kill_priv = !!(in->write_flags & FUSE_WRITE_KILL_PRIV);
+
+    if (!req->se->op.write) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+    req->se->op.write(req, nodeid, data, in->size, in->offset, &fi);
+}
+
+/*
+ * Handle a FUSE_WRITE request through the filesystem's write_buf callback.
+ *
+ * @ibufv is the whole request.  With a single element, the data sits
+ * right after struct fuse_write_in inside that element, so a one-entry
+ * vector pointing just past the headers is built on the stack.  With
+ * several elements, the first one holds only headers and is zero-sized
+ * in place so that the remaining elements are the data.
+ *
+ * Replies EINVAL if the write header cannot be consumed from @iter and
+ * EIO if the resulting buffer size differs from arg->size.
+ */
+static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid,
+                         struct fuse_mbuf_iter *iter, struct fuse_bufvec *ibufv)
+{
+    struct fuse_session *se = req->se;
+    struct fuse_bufvec single = {
+        .buf[0] = ibufv->buf[0],
+        .count = 1,
+    };
+    struct fuse_bufvec *data = ibufv;
+    struct fuse_write_in *arg;
+    const size_t hdr_len = sizeof(*arg);
+    struct fuse_file_info fi;
+
+    memset(&fi, 0, sizeof(fi));
+
+    arg = fuse_mbuf_iter_advance(iter, hdr_len);
+    if (arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    fi.fh = arg->fh;
+    fi.flags = arg->flags;
+    fi.lock_owner = arg->lock_owner;
+    fi.kill_priv = !!(arg->write_flags & FUSE_WRITE_KILL_PRIV);
+    fi.writepage = !!(arg->write_flags & FUSE_WRITE_CACHE);
+
+    if (ibufv->count != 1) {
+        /*
+         * Input bufv contains the headers in the first element
+         * and the data in the rest, we need to skip that first element
+         */
+        ibufv->buf[0].size = 0;
+    } else {
+        assert(!(single.buf[0].flags & FUSE_BUF_IS_FD));
+        single.buf[0].mem = (char *)arg + hdr_len;
+        single.buf[0].size -= sizeof(struct fuse_in_header) + hdr_len;
+        data = &single;
+    }
+
+    if (fuse_buf_size(data) != arg->size) {
+        fuse_log(FUSE_LOG_ERR,
+                 "fuse: do_write_buf: buffer size doesn't match arg->size\n");
+        fuse_reply_err(req, EIO);
+        return;
+    }
+
+    se->op.write_buf(req, nodeid, data, arg->offset, &fi);
+}
+
+/*
+ * Handle a FUSE_FLUSH request.
+ *
+ * Parses struct fuse_flush_in (EINVAL if it cannot be consumed), marks the
+ * file info as a flush, and calls the filesystem's flush callback, or
+ * replies ENOSYS when none is registered.
+ */
+static void do_flush(fuse_req_t req, fuse_ino_t nodeid,
+                     struct fuse_mbuf_iter *iter)
+{
+    struct fuse_flush_in *in_arg =
+        fuse_mbuf_iter_advance(iter, sizeof(*in_arg));
+    struct fuse_file_info fi;
+
+    if (in_arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.flush = 1;
+    fi.lock_owner = in_arg->lock_owner;
+    fi.fh = in_arg->fh;
+
+    if (!req->se->op.flush) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.flush(req, nodeid, &fi);
+}
+
+/*
+ * Handle a FUSE_RELEASE request.
+ *
+ * Parses struct fuse_release_in (EINVAL if it cannot be consumed) and
+ * forwards the handle, flags and the flush / flock-unlock indications to
+ * the filesystem's release callback.  Without a callback the request is
+ * answered with error 0.
+ */
+static void do_release(fuse_req_t req, fuse_ino_t nodeid,
+                       struct fuse_mbuf_iter *iter)
+{
+    struct fuse_release_in *in_arg =
+        fuse_mbuf_iter_advance(iter, sizeof(*in_arg));
+    struct fuse_file_info fi;
+
+    if (in_arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.fh = in_arg->fh;
+    fi.flags = in_arg->flags;
+    fi.lock_owner = in_arg->lock_owner;
+    fi.flush = !!(in_arg->release_flags & FUSE_RELEASE_FLUSH);
+    fi.flock_release = !!(in_arg->release_flags & FUSE_RELEASE_FLOCK_UNLOCK);
+
+    if (!req->se->op.release) {
+        fuse_reply_err(req, 0);
+        return;
+    }
+
+    req->se->op.release(req, nodeid, &fi);
+}
+
+/*
+ * Handle a FUSE_FSYNC request.
+ *
+ * Parses struct fuse_fsync_in (EINVAL if it cannot be consumed).  Bit 0 of
+ * fsync_flags is passed on as the datasync argument.  A file handle of
+ * (uint64_t)-1 is passed to the callback as a NULL file info pointer.
+ * Replies ENOSYS when the filesystem has no fsync callback.
+ */
+static void do_fsync(fuse_req_t req, fuse_ino_t nodeid,
+                     struct fuse_mbuf_iter *iter)
+{
+    struct fuse_fsync_in *in_arg =
+        fuse_mbuf_iter_advance(iter, sizeof(*in_arg));
+    struct fuse_file_info fi;
+    struct fuse_file_info *fip;
+    int datasync;
+
+    if (in_arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+    datasync = in_arg->fsync_flags & 1;
+
+    memset(&fi, 0, sizeof(fi));
+    fi.fh = in_arg->fh;
+
+    if (!req->se->op.fsync) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    fip = (fi.fh == (uint64_t)-1) ? NULL : &fi;
+    req->se->op.fsync(req, nodeid, datasync, fip);
+}
+
+/*
+ * Handle a FUSE_OPENDIR request.
+ *
+ * Parses struct fuse_open_in (EINVAL if it cannot be consumed) and calls
+ * the filesystem's opendir callback.  Without a callback the open is
+ * acknowledged directly via fuse_reply_open().
+ */
+static void do_opendir(fuse_req_t req, fuse_ino_t nodeid,
+                       struct fuse_mbuf_iter *iter)
+{
+    struct fuse_open_in *in_arg =
+        fuse_mbuf_iter_advance(iter, sizeof(*in_arg));
+    struct fuse_file_info fi;
+
+    if (in_arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.flags = in_arg->flags;
+
+    if (!req->se->op.opendir) {
+        fuse_reply_open(req, &fi);
+        return;
+    }
+
+    req->se->op.opendir(req, nodeid, &fi);
+}
+
+/*
+ * Handle a FUSE_READDIR request.
+ *
+ * Parses struct fuse_read_in (EINVAL if it cannot be consumed) and passes
+ * its size, offset and file handle to the filesystem's readdir callback,
+ * or replies ENOSYS when none is registered.
+ */
+static void do_readdir(fuse_req_t req, fuse_ino_t nodeid,
+                       struct fuse_mbuf_iter *iter)
+{
+    struct fuse_read_in *in_arg =
+        fuse_mbuf_iter_advance(iter, sizeof(*in_arg));
+    struct fuse_file_info fi;
+
+    if (in_arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.fh = in_arg->fh;
+
+    if (!req->se->op.readdir) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.readdir(req, nodeid, in_arg->size, in_arg->offset, &fi);
+}
+
+/*
+ * Handle a FUSE_READDIRPLUS request.
+ *
+ * Same request layout as do_readdir() (struct fuse_read_in), but the
+ * filesystem's readdirplus callback is invoked.  Replies EINVAL if the
+ * header cannot be consumed and ENOSYS without a callback.
+ */
+static void do_readdirplus(fuse_req_t req, fuse_ino_t nodeid,
+                           struct fuse_mbuf_iter *iter)
+{
+    struct fuse_read_in *in_arg =
+        fuse_mbuf_iter_advance(iter, sizeof(*in_arg));
+    struct fuse_file_info fi;
+
+    if (in_arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.fh = in_arg->fh;
+
+    if (!req->se->op.readdirplus) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.readdirplus(req, nodeid, in_arg->size, in_arg->offset, &fi);
+}
+
+/*
+ * Handle a FUSE_RELEASEDIR request.
+ *
+ * Parses struct fuse_release_in (EINVAL if it cannot be consumed) and
+ * calls the filesystem's releasedir callback.  Without a callback the
+ * request is answered with error 0.
+ */
+static void do_releasedir(fuse_req_t req, fuse_ino_t nodeid,
+                          struct fuse_mbuf_iter *iter)
+{
+    struct fuse_release_in *in_arg =
+        fuse_mbuf_iter_advance(iter, sizeof(*in_arg));
+    struct fuse_file_info fi;
+
+    if (in_arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.fh = in_arg->fh;
+    fi.flags = in_arg->flags;
+
+    if (!req->se->op.releasedir) {
+        fuse_reply_err(req, 0);
+        return;
+    }
+
+    req->se->op.releasedir(req, nodeid, &fi);
+}
+
+/*
+ * Handle a FUSE_FSYNCDIR request.
+ *
+ * Parses struct fuse_fsync_in (EINVAL if it cannot be consumed); bit 0 of
+ * fsync_flags becomes the datasync argument of the filesystem's fsyncdir
+ * callback.  Replies ENOSYS when no callback is registered.
+ */
+static void do_fsyncdir(fuse_req_t req, fuse_ino_t nodeid,
+                        struct fuse_mbuf_iter *iter)
+{
+    struct fuse_fsync_in *in_arg =
+        fuse_mbuf_iter_advance(iter, sizeof(*in_arg));
+    struct fuse_file_info fi;
+    int datasync;
+
+    if (in_arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+    datasync = in_arg->fsync_flags & 1;
+
+    memset(&fi, 0, sizeof(fi));
+    fi.fh = in_arg->fh;
+
+    if (!req->se->op.fsyncdir) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.fsyncdir(req, nodeid, datasync, &fi);
+}
+
+/*
+ * Handle a FUSE_STATFS request.
+ *
+ * The request carries no body, so @iter is not read.  The filesystem's
+ * statfs callback is used when present; otherwise a struct statvfs with
+ * only f_namemax (255) and f_bsize (512) set is returned.
+ */
+static void do_statfs(fuse_req_t req, fuse_ino_t nodeid,
+                      struct fuse_mbuf_iter *iter)
+{
+    struct statvfs fallback = {
+        .f_namemax = 255,
+        .f_bsize = 512,
+    };
+
+    (void)iter;
+
+    if (!req->se->op.statfs) {
+        fuse_reply_statfs(req, &fallback);
+        return;
+    }
+
+    req->se->op.statfs(req, nodeid);
+}
+
+/*
+ * Handle a FUSE_SETXATTR request.
+ *
+ * The header is the full struct fuse_setxattr_in when FUSE_CAP_SETXATTR_EXT
+ * is set in se->conn.want, and the shorter FUSE_COMPAT_SETXATTR_IN_SIZE
+ * prefix otherwise; in the latter case setxattr_flags is passed as 0.
+ * The header is followed by the NUL-terminated attribute name and
+ * arg->size bytes of value.  Replies EINVAL if any part cannot be consumed
+ * and ENOSYS when the filesystem has no setxattr callback.
+ */
+static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid,
+                        struct fuse_mbuf_iter *iter)
+{
+    const bool ext = req->se->conn.want & FUSE_CAP_SETXATTR_EXT;
+    struct fuse_setxattr_in *arg;
+    const char *name;
+    const char *value;
+    uint32_t xflags;
+
+    arg = ext ? fuse_mbuf_iter_advance(iter, sizeof(*arg))
+              : fuse_mbuf_iter_advance(iter, FUSE_COMPAT_SETXATTR_IN_SIZE);
+    name = fuse_mbuf_iter_advance_str(iter);
+    if (arg == NULL || name == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    value = fuse_mbuf_iter_advance(iter, arg->size);
+    if (value == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    if (!req->se->op.setxattr) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    /* setxattr_flags only exists in the extended header layout */
+    xflags = ext ? arg->setxattr_flags : 0;
+    req->se->op.setxattr(req, nodeid, name, value, arg->size, arg->flags,
+                         xflags);
+}
+
+/*
+ * Handle a FUSE_GETXATTR request.
+ *
+ * The message is a struct fuse_getxattr_in followed by the NUL-terminated
+ * attribute name; EINVAL is returned if either cannot be consumed.  The
+ * name and arg->size are passed to the filesystem's getxattr callback, or
+ * ENOSYS is returned when there is none.
+ */
+static void do_getxattr(fuse_req_t req, fuse_ino_t nodeid,
+                        struct fuse_mbuf_iter *iter)
+{
+    struct fuse_getxattr_in *arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+    const char *name = fuse_mbuf_iter_advance_str(iter);
+
+    if (arg == NULL || name == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    if (!req->se->op.getxattr) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.getxattr(req, nodeid, name, arg->size);
+}
+
+/*
+ * Handle a FUSE_LISTXATTR request.
+ *
+ * Parses struct fuse_getxattr_in (EINVAL if it cannot be consumed) and
+ * passes arg->size to the filesystem's listxattr callback, or replies
+ * ENOSYS when none is registered.
+ */
+static void do_listxattr(fuse_req_t req, fuse_ino_t nodeid,
+                         struct fuse_mbuf_iter *iter)
+{
+    struct fuse_getxattr_in *arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+
+    if (arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    if (!req->se->op.listxattr) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.listxattr(req, nodeid, arg->size);
+}
+
+/*
+ * Handle a FUSE_REMOVEXATTR request.
+ *
+ * The body is just the NUL-terminated attribute name (EINVAL if it cannot
+ * be consumed), which is handed to the filesystem's removexattr callback;
+ * ENOSYS is returned when there is none.
+ */
+static void do_removexattr(fuse_req_t req, fuse_ino_t nodeid,
+                           struct fuse_mbuf_iter *iter)
+{
+    const char *attr = fuse_mbuf_iter_advance_str(iter);
+
+    if (attr == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    if (!req->se->op.removexattr) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.removexattr(req, nodeid, attr);
+}
+
+/*
+ * Fill @flock from the wire-format lock description @fl.
+ *
+ * The result is always relative to SEEK_SET.  An end of OFFSET_MAX is
+ * translated to l_len == 0; any other end yields the inclusive length
+ * end - start + 1.
+ */
+static void convert_fuse_file_lock(struct fuse_file_lock *fl,
+                                   struct flock *flock)
+{
+    memset(flock, 0, sizeof(*flock));
+    flock->l_pid = fl->pid;
+    flock->l_type = fl->type;
+    flock->l_whence = SEEK_SET;
+    flock->l_start = fl->start;
+    flock->l_len = (fl->end == OFFSET_MAX) ? 0 : fl->end - fl->start + 1;
+}
+
+/*
+ * Handle a FUSE_GETLK request.
+ *
+ * Parses struct fuse_lk_in (EINVAL if it cannot be consumed), converts the
+ * embedded lock description to a struct flock and calls the filesystem's
+ * getlk callback, or replies ENOSYS when none is registered.
+ */
+static void do_getlk(fuse_req_t req, fuse_ino_t nodeid,
+                     struct fuse_mbuf_iter *iter)
+{
+    struct fuse_lk_in *in_arg = fuse_mbuf_iter_advance(iter, sizeof(*in_arg));
+    struct fuse_file_info fi;
+    struct flock flock;
+
+    if (in_arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.lock_owner = in_arg->owner;
+    fi.fh = in_arg->fh;
+
+    convert_fuse_file_lock(&in_arg->lk, &flock);
+
+    if (!req->se->op.getlk) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.getlk(req, nodeid, &fi, &flock);
+}
+
+/*
+ * Shared implementation of FUSE_SETLK and FUSE_SETLKW.
+ *
+ * @sleep is non-zero for the blocking variant.  Parses struct fuse_lk_in
+ * (EINVAL if it cannot be consumed).  When FUSE_LK_FLOCK is set in
+ * lk_flags the lock type is mapped to a flock() operation (LOCK_NB is
+ * added when @sleep is zero) and the flock callback is used; otherwise
+ * the lock is converted to a struct flock and given to the setlk
+ * callback.  A missing callback yields ENOSYS.
+ */
+static void do_setlk_common(fuse_req_t req, fuse_ino_t nodeid,
+                            struct fuse_mbuf_iter *iter, int sleep)
+{
+    struct fuse_lk_in *arg;
+    struct fuse_file_info fi;
+    struct flock flock;
+    int op;
+
+    arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+    if (arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.lock_owner = arg->owner;
+    fi.fh = arg->fh;
+
+    if (!(arg->lk_flags & FUSE_LK_FLOCK)) {
+        /* POSIX-style record lock */
+        convert_fuse_file_lock(&arg->lk, &flock);
+        if (req->se->op.setlk) {
+            req->se->op.setlk(req, nodeid, &fi, &flock, sleep);
+        } else {
+            fuse_reply_err(req, ENOSYS);
+        }
+        return;
+    }
+
+    /* BSD flock(): unknown lock types leave the operation at 0 */
+    if (arg->lk.type == F_RDLCK) {
+        op = LOCK_SH;
+    } else if (arg->lk.type == F_WRLCK) {
+        op = LOCK_EX;
+    } else if (arg->lk.type == F_UNLCK) {
+        op = LOCK_UN;
+    } else {
+        op = 0;
+    }
+    if (!sleep) {
+        op |= LOCK_NB;
+    }
+
+    if (req->se->op.flock) {
+        req->se->op.flock(req, nodeid, &fi, op);
+    } else {
+        fuse_reply_err(req, ENOSYS);
+    }
+}
+
+/* Handle FUSE_SETLK: the non-blocking form of do_setlk_common(). */
+static void do_setlk(fuse_req_t req, fuse_ino_t nodeid,
+                     struct fuse_mbuf_iter *iter)
+{
+    const int sleep = 0;
+
+    do_setlk_common(req, nodeid, iter, sleep);
+}
+
+/* Handle FUSE_SETLKW: the blocking form of do_setlk_common(). */
+static void do_setlkw(fuse_req_t req, fuse_ino_t nodeid,
+                      struct fuse_mbuf_iter *iter)
+{
+    const int sleep = 1;
+
+    do_setlk_common(req, nodeid, iter, sleep);
+}
+
+/*
+ * Look for the request that the interrupt request @req refers to
+ * (req->u.i.unique).
+ *
+ * Called with se->lock held (see do_interrupt()) and returns with it
+ * held, although the lock is dropped and re-taken while an in-flight
+ * request is being marked.
+ *
+ * Returns 1 if a request with that id is on se->list (it is then flagged
+ * as interrupted and its interrupt callback, if any, is run) or if an
+ * interrupt for the same id is already queued on se->interrupts.
+ * Returns 0 otherwise.
+ */
+static int find_interrupted(struct fuse_session *se, struct fuse_req *req)
+{
+    struct fuse_req *curr;
+
+    for (curr = se->list.next; curr != &se->list; curr = curr->next) {
+        fuse_interrupt_func_t cb;
+        void *cb_data;
+
+        if (curr->unique != req->u.i.unique) {
+            continue;
+        }
+
+        /* Bump ctr before se->lock is released; it is dropped below */
+        curr->ctr++;
+        pthread_mutex_unlock(&se->lock);
+
+        /* Ugh, ugly locking */
+        pthread_mutex_lock(&curr->lock);
+        pthread_mutex_lock(&se->lock);
+        curr->interrupted = 1;
+        cb = curr->u.ni.func;
+        cb_data = curr->u.ni.data;
+        pthread_mutex_unlock(&se->lock);
+        if (cb) {
+            cb(curr, cb_data);
+        }
+        pthread_mutex_unlock(&curr->lock);
+
+        pthread_mutex_lock(&se->lock);
+        curr->ctr--;
+        if (!curr->ctr) {
+            destroy_req(curr);
+        }
+
+        return 1;
+    }
+
+    curr = se->interrupts.next;
+    while (curr != &se->interrupts) {
+        if (curr->u.i.unique == req->u.i.unique) {
+            return 1;
+        }
+        curr = curr->next;
+    }
+
+    return 0;
+}
+
+/*
+ * Handle a FUSE_INTERRUPT request.
+ *
+ * Parses struct fuse_interrupt_in (EINVAL if it cannot be consumed) and
+ * records the target id in req->u.i.unique.  Under se->lock, if
+ * find_interrupted() reports a match the interrupt request itself is
+ * destroyed; otherwise it is queued on se->interrupts.  @nodeid is unused.
+ */
+static void do_interrupt(fuse_req_t req, fuse_ino_t nodeid,
+                         struct fuse_mbuf_iter *iter)
+{
+    struct fuse_session *se = req->se;
+    struct fuse_interrupt_in *in_arg;
+
+    (void)nodeid;
+
+    in_arg = fuse_mbuf_iter_advance(iter, sizeof(*in_arg));
+    if (in_arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n",
+             (unsigned long long)in_arg->unique);
+
+    req->u.i.unique = in_arg->unique;
+
+    pthread_mutex_lock(&se->lock);
+    if (!find_interrupted(se, req)) {
+        list_add_req(req, &se->interrupts);
+    } else {
+        destroy_req(req);
+    }
+    pthread_mutex_unlock(&se->lock);
+}
+
+/*
+ * Match a newly received request @req against the queued interrupts.
+ *
+ * If an interrupt for req->unique is queued on se->interrupts, @req is
+ * flagged as interrupted, the queued entry is unlinked and freed, and
+ * NULL is returned.  Otherwise the first queued interrupt, if any, is
+ * unlinked, re-initialised as a list node and returned to the caller;
+ * NULL is returned when the queue is empty.
+ */
+static struct fuse_req *check_interrupt(struct fuse_session *se,
+                                        struct fuse_req *req)
+{
+    struct fuse_req *pending = se->interrupts.next;
+
+    while (pending != &se->interrupts) {
+        if (pending->u.i.unique == req->unique) {
+            req->interrupted = 1;
+            list_del_req(pending);
+            g_free(pending);
+            return NULL;
+        }
+        pending = pending->next;
+    }
+
+    pending = se->interrupts.next;
+    if (pending == &se->interrupts) {
+        return NULL;
+    }
+
+    list_del_req(pending);
+    list_init_req(pending);
+    return pending;
+}
+
+/*
+ * Handle a FUSE_BMAP request.
+ *
+ * Parses struct fuse_bmap_in (EINVAL if it cannot be consumed) and passes
+ * blocksize and block to the filesystem's bmap callback, or replies
+ * ENOSYS when none is registered.
+ */
+static void do_bmap(fuse_req_t req, fuse_ino_t nodeid,
+                    struct fuse_mbuf_iter *iter)
+{
+    struct fuse_bmap_in *in_arg;
+
+    in_arg = fuse_mbuf_iter_advance(iter, sizeof(*in_arg));
+    if (in_arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    if (!req->se->op.bmap) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.bmap(req, nodeid, in_arg->blocksize, in_arg->block);
+}
+
+/*
+ * Handle a FUSE_IOCTL request.
+ *
+ * Parses struct fuse_ioctl_in (EINVAL if it cannot be consumed).  A
+ * directory ioctl (FUSE_IOCTL_DIR) is refused with ENOTTY unless
+ * FUSE_CAP_IOCTL_DIR is set in se->conn.want.  When in_size is non-zero
+ * that many input bytes must follow the header (EINVAL otherwise).  On a
+ * build with 4-byte pointers, a request without FUSE_IOCTL_32BIT sets
+ * req->ioctl_64bit.  Replies ENOSYS when there is no ioctl callback.
+ */
+static void do_ioctl(fuse_req_t req, fuse_ino_t nodeid,
+                     struct fuse_mbuf_iter *iter)
+{
+    struct fuse_ioctl_in *arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+    struct fuse_file_info fi;
+    void *in_buf = NULL;
+    unsigned int flags;
+
+    if (arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    flags = arg->flags;
+    if ((flags & FUSE_IOCTL_DIR) &&
+        !(req->se->conn.want & FUSE_CAP_IOCTL_DIR)) {
+        fuse_reply_err(req, ENOTTY);
+        return;
+    }
+
+    if (arg->in_size) {
+        in_buf = fuse_mbuf_iter_advance(iter, arg->in_size);
+        if (in_buf == NULL) {
+            fuse_reply_err(req, EINVAL);
+            return;
+        }
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.fh = arg->fh;
+
+    if (sizeof(void *) == 4 && !(flags & FUSE_IOCTL_32BIT)) {
+        req->ioctl_64bit = 1;
+    }
+
+    if (!req->se->op.ioctl) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.ioctl(req, nodeid, arg->cmd, (void *)(uintptr_t)arg->arg, &fi,
+                      flags, in_buf, arg->in_size, arg->out_size);
+}
+
+/*
+ * Release a poll handle.  Handles are obtained with malloc() in do_poll(),
+ * so plain free() is the matching release.
+ */
+void fuse_pollhandle_destroy(struct fuse_pollhandle *ph)
+{
+    free(ph);
+}
+
+/*
+ * Handle a FUSE_POLL request.
+ *
+ * Parses struct fuse_poll_in (EINVAL if it cannot be consumed).  When the
+ * request has FUSE_POLL_SCHEDULE_NOTIFY set, a poll handle carrying the
+ * request's kh value and the session is allocated and passed to the
+ * callback; otherwise the handle argument is NULL.  Replies ENOSYS when
+ * there is no poll callback and ENOMEM if the handle cannot be allocated.
+ */
+static void do_poll(fuse_req_t req, fuse_ino_t nodeid,
+                    struct fuse_mbuf_iter *iter)
+{
+    struct fuse_poll_in *arg = fuse_mbuf_iter_advance(iter, sizeof(*arg));
+    struct fuse_pollhandle *ph = NULL;
+    struct fuse_file_info fi;
+
+    if (arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.poll_events = arg->events;
+    fi.fh = arg->fh;
+
+    if (!req->se->op.poll) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    if (arg->flags & FUSE_POLL_SCHEDULE_NOTIFY) {
+        ph = malloc(sizeof(*ph));
+        if (!ph) {
+            fuse_reply_err(req, ENOMEM);
+            return;
+        }
+        ph->se = req->se;
+        ph->kh = arg->kh;
+    }
+
+    req->se->op.poll(req, nodeid, &fi, ph);
+}
+
+/*
+ * Handle a FUSE_FALLOCATE request.
+ *
+ * Parses struct fuse_fallocate_in (EINVAL if it cannot be consumed) and
+ * passes mode, offset and length to the filesystem's fallocate callback,
+ * or replies ENOSYS when none is registered.
+ */
+static void do_fallocate(fuse_req_t req, fuse_ino_t nodeid,
+                         struct fuse_mbuf_iter *iter)
+{
+    struct fuse_fallocate_in *in_arg =
+        fuse_mbuf_iter_advance(iter, sizeof(*in_arg));
+    struct fuse_file_info fi;
+
+    if (in_arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.fh = in_arg->fh;
+
+    if (!req->se->op.fallocate) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.fallocate(req, nodeid, in_arg->mode, in_arg->offset,
+                          in_arg->length, &fi);
+}
+
+/*
+ * Handle a FUSE_COPY_FILE_RANGE request.
+ *
+ * Parses struct fuse_copy_file_range_in (EINVAL if it cannot be consumed),
+ * builds one file info for the source handle and one for the destination
+ * handle, and calls the filesystem's copy_file_range callback.  Replies
+ * ENOSYS when none is registered.
+ */
+static void do_copy_file_range(fuse_req_t req, fuse_ino_t nodeid_in,
+                               struct fuse_mbuf_iter *iter)
+{
+    struct fuse_copy_file_range_in *in_arg =
+        fuse_mbuf_iter_advance(iter, sizeof(*in_arg));
+    struct fuse_file_info src;
+    struct fuse_file_info dst;
+
+    if (in_arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&src, 0, sizeof(src));
+    memset(&dst, 0, sizeof(dst));
+    src.fh = in_arg->fh_in;
+    dst.fh = in_arg->fh_out;
+
+    if (!req->se->op.copy_file_range) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.copy_file_range(req, nodeid_in, in_arg->off_in, &src,
+                                in_arg->nodeid_out, in_arg->off_out, &dst,
+                                in_arg->len, in_arg->flags);
+}
+
+/*
+ * Handle a FUSE_LSEEK request.
+ *
+ * Parses struct fuse_lseek_in (EINVAL if it cannot be consumed) and passes
+ * offset and whence to the filesystem's lseek callback, or replies ENOSYS
+ * when none is registered.
+ */
+static void do_lseek(fuse_req_t req, fuse_ino_t nodeid,
+                     struct fuse_mbuf_iter *iter)
+{
+    struct fuse_lseek_in *in_arg =
+        fuse_mbuf_iter_advance(iter, sizeof(*in_arg));
+    struct fuse_file_info fi;
+
+    if (in_arg == NULL) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    memset(&fi, 0, sizeof(fi));
+    fi.fh = in_arg->fh;
+
+    if (!req->se->op.lseek) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    req->se->op.lseek(req, nodeid, in_arg->offset, in_arg->whence, &fi);
+}
+/*
+ * Handle a FUSE_INIT request.
+ *
+ * Reads struct fuse_init_in from @iter and records the client's protocol
+ * version in se->conn.  Versions older than 7.31 are rejected with EPROTO;
+ * for a major version above 7 only our own version is sent back.
+ * Otherwise the client's flag bits are translated into se->conn.capable,
+ * default se->conn.want bits are chosen, the filesystem's init callback
+ * (if any) is run, want is validated against capable, and the negotiated
+ * struct fuse_init_out is sent.
+ *
+ * @nodeid is unused.
+ */
+static void do_init(fuse_req_t req, fuse_ino_t nodeid,
+ struct fuse_mbuf_iter *iter)
+{
+ size_t compat_size = offsetof(struct fuse_init_in, max_readahead);
+ struct fuse_init_in *arg;
+ struct fuse_init_out outarg;
+ struct fuse_session *se = req->se;
+ size_t bufsize = se->bufsize;
+ size_t outargsize = sizeof(outarg);
+
+ (void)nodeid;
+
+ /* First consume the old fields (everything before max_readahead)... */
+ arg = fuse_mbuf_iter_advance(iter, compat_size);
+ if (!arg) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ /* ...and now consume the new fields, which 7.6 and later clients send. */
+ if (arg->major == 7 && arg->minor >= 6) {
+ if (!fuse_mbuf_iter_advance(iter, sizeof(*arg) - compat_size)) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor);
+ if (arg->major == 7 && arg->minor >= 6) {
+ fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags);
+ fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", arg->max_readahead);
+ }
+ se->conn.proto_major = arg->major;
+ se->conn.proto_minor = arg->minor;
+ se->conn.capable = 0;
+ se->conn.want = 0;
+
+ memset(&outarg, 0, sizeof(outarg));
+ outarg.major = FUSE_KERNEL_VERSION;
+ outarg.minor = FUSE_KERNEL_MINOR_VERSION;
+
+ if (arg->major < 7 || (arg->major == 7 && arg->minor < 31)) {
+ fuse_log(FUSE_LOG_ERR, "fuse: unsupported protocol version: %u.%u\n",
+ arg->major, arg->minor);
+ fuse_reply_err(req, EPROTO);
+ return;
+ }
+
+ if (arg->major > 7) {
+ /*
+ * Wait for a second INIT request with a 7.X version; only our own
+ * major/minor are filled in for this reply.
+ */
+ send_reply_ok(req, &outarg, sizeof(outarg));
+ return;
+ }
+
+ if (arg->max_readahead < se->conn.max_readahead) {
+ se->conn.max_readahead = arg->max_readahead;
+ }
+ if (arg->flags & FUSE_ASYNC_READ) {
+ se->conn.capable |= FUSE_CAP_ASYNC_READ;
+ }
+ if (arg->flags & FUSE_POSIX_LOCKS) {
+ se->conn.capable |= FUSE_CAP_POSIX_LOCKS;
+ }
+ if (arg->flags & FUSE_ATOMIC_O_TRUNC) {
+ se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC;
+ }
+ if (arg->flags & FUSE_EXPORT_SUPPORT) {
+ se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT;
+ }
+ if (arg->flags & FUSE_DONT_MASK) {
+ se->conn.capable |= FUSE_CAP_DONT_MASK;
+ }
+ if (arg->flags & FUSE_FLOCK_LOCKS) {
+ se->conn.capable |= FUSE_CAP_FLOCK_LOCKS;
+ }
+ if (arg->flags & FUSE_AUTO_INVAL_DATA) {
+ se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA;
+ }
+ if (arg->flags & FUSE_DO_READDIRPLUS) {
+ se->conn.capable |= FUSE_CAP_READDIRPLUS;
+ }
+ if (arg->flags & FUSE_READDIRPLUS_AUTO) {
+ se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO;
+ }
+ if (arg->flags & FUSE_ASYNC_DIO) {
+ se->conn.capable |= FUSE_CAP_ASYNC_DIO;
+ }
+ if (arg->flags & FUSE_WRITEBACK_CACHE) {
+ se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE;
+ }
+ if (arg->flags & FUSE_NO_OPEN_SUPPORT) {
+ se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT;
+ }
+ if (arg->flags & FUSE_PARALLEL_DIROPS) {
+ se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS;
+ }
+ if (arg->flags & FUSE_POSIX_ACL) {
+ se->conn.capable |= FUSE_CAP_POSIX_ACL;
+ }
+ if (arg->flags & FUSE_HANDLE_KILLPRIV) {
+ se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV;
+ }
+ if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) {
+ se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT;
+ }
+ if (!(arg->flags & FUSE_MAX_PAGES)) {
+ size_t max_bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() +
+ FUSE_BUFFER_HEADER_SIZE;
+ if (bufsize > max_bufsize) {
+ bufsize = max_bufsize;
+ }
+ }
+ if (arg->flags & FUSE_SUBMOUNTS) {
+ se->conn.capable |= FUSE_CAP_SUBMOUNTS;
+ }
+ if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) {
+ se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV_V2;
+ }
+ if (arg->flags & FUSE_SETXATTR_EXT) {
+ se->conn.capable |= FUSE_CAP_SETXATTR_EXT;
+ }
+#ifdef HAVE_SPLICE
+#ifdef HAVE_VMSPLICE
+ se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE;
+#endif
+ se->conn.capable |= FUSE_CAP_SPLICE_READ;
+#endif
+ se->conn.capable |= FUSE_CAP_IOCTL_DIR;
+
+ /*
+ * Default settings for modern filesystems.
+ *
+ * Most of these capabilities were disabled by default in
+ * libfuse2 for backwards compatibility reasons. In libfuse3,
+ * we can finally enable them by default (as long as they're
+ * supported by the kernel).
+ *
+ * LL_SET_DEFAULT(cond, cap) adds cap to se->conn.want only when cond
+ * holds and cap is present in se->conn.capable.
+ * NOTE(review): the macro expands to a bare `if` without braces and is
+ * not #undef'd in the visible code - keep each use a full statement.
+ */
+#define LL_SET_DEFAULT(cond, cap) \
+ if ((cond) && (se->conn.capable & (cap))) \
+ se->conn.want |= (cap)
+ LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_READ);
+ LL_SET_DEFAULT(1, FUSE_CAP_PARALLEL_DIROPS);
+ LL_SET_DEFAULT(1, FUSE_CAP_AUTO_INVAL_DATA);
+ LL_SET_DEFAULT(1, FUSE_CAP_HANDLE_KILLPRIV);
+ LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_DIO);
+ LL_SET_DEFAULT(1, FUSE_CAP_IOCTL_DIR);
+ LL_SET_DEFAULT(1, FUSE_CAP_ATOMIC_O_TRUNC);
+ LL_SET_DEFAULT(se->op.write_buf, FUSE_CAP_SPLICE_READ);
+ LL_SET_DEFAULT(se->op.getlk && se->op.setlk, FUSE_CAP_POSIX_LOCKS);
+ LL_SET_DEFAULT(se->op.flock, FUSE_CAP_FLOCK_LOCKS);
+ LL_SET_DEFAULT(se->op.readdirplus, FUSE_CAP_READDIRPLUS);
+ LL_SET_DEFAULT(se->op.readdirplus && se->op.readdir,
+ FUSE_CAP_READDIRPLUS_AUTO);
+ se->conn.time_gran = 1;
+
+ if (bufsize < FUSE_MIN_READ_BUFFER) {
+ fuse_log(FUSE_LOG_ERR, "fuse: warning: buffer size too small: %zu\n",
+ bufsize);
+ bufsize = FUSE_MIN_READ_BUFFER;
+ }
+ se->bufsize = bufsize;
+
+ if (se->conn.max_write > bufsize - FUSE_BUFFER_HEADER_SIZE) {
+ se->conn.max_write = bufsize - FUSE_BUFFER_HEADER_SIZE;
+ }
+
+ se->got_init = 1; /* set before the init callback and the want check */
+ se->got_destroy = 0;
+ if (se->op.init) {
+ se->op.init(se->userdata, &se->conn);
+ }
+
+ if (se->conn.want & (~se->conn.capable)) {
+ fuse_log(FUSE_LOG_ERR,
+ "fuse: error: filesystem requested capabilities "
+ "0x%x that are not supported by kernel, aborting.\n",
+ se->conn.want & (~se->conn.capable));
+ fuse_reply_err(req, EPROTO);
+ se->error = -EPROTO;
+ fuse_session_exit(se);
+ return;
+ }
+
+ if (se->conn.max_write < bufsize - FUSE_BUFFER_HEADER_SIZE) {
+ se->bufsize = se->conn.max_write + FUSE_BUFFER_HEADER_SIZE;
+ }
+ if (arg->flags & FUSE_MAX_PAGES) {
+ outarg.flags |= FUSE_MAX_PAGES;
+ outarg.max_pages = (se->conn.max_write - 1) / getpagesize() + 1;
+ }
+
+ /*
+ * Always enable big writes, this is superseded
+ * by the max_write option
+ *
+ * The blocks below mirror se->conn.want bits into the matching flag
+ * of the reply.
+ * NOTE(review): FUSE_CAP_HANDLE_KILLPRIV gets a default want bit above
+ * but no FUSE_HANDLE_KILLPRIV assignment to outarg.flags is visible in
+ * this function - confirm that is intentional.
+ */
+ outarg.flags |= FUSE_BIG_WRITES;
+
+ if (se->conn.want & FUSE_CAP_ASYNC_READ) {
+ outarg.flags |= FUSE_ASYNC_READ;
+ }
+ if (se->conn.want & FUSE_CAP_PARALLEL_DIROPS) {
+ outarg.flags |= FUSE_PARALLEL_DIROPS;
+ }
+ if (se->conn.want & FUSE_CAP_POSIX_LOCKS) {
+ outarg.flags |= FUSE_POSIX_LOCKS;
+ }
+ if (se->conn.want & FUSE_CAP_ATOMIC_O_TRUNC) {
+ outarg.flags |= FUSE_ATOMIC_O_TRUNC;
+ }
+ if (se->conn.want & FUSE_CAP_EXPORT_SUPPORT) {
+ outarg.flags |= FUSE_EXPORT_SUPPORT;
+ }
+ if (se->conn.want & FUSE_CAP_DONT_MASK) {
+ outarg.flags |= FUSE_DONT_MASK;
+ }
+ if (se->conn.want & FUSE_CAP_FLOCK_LOCKS) {
+ outarg.flags |= FUSE_FLOCK_LOCKS;
+ }
+ if (se->conn.want & FUSE_CAP_AUTO_INVAL_DATA) {
+ outarg.flags |= FUSE_AUTO_INVAL_DATA;
+ }
+ if (se->conn.want & FUSE_CAP_READDIRPLUS) {
+ outarg.flags |= FUSE_DO_READDIRPLUS;
+ }
+ if (se->conn.want & FUSE_CAP_READDIRPLUS_AUTO) {
+ outarg.flags |= FUSE_READDIRPLUS_AUTO;
+ }
+ if (se->conn.want & FUSE_CAP_ASYNC_DIO) {
+ outarg.flags |= FUSE_ASYNC_DIO;
+ }
+ if (se->conn.want & FUSE_CAP_WRITEBACK_CACHE) {
+ outarg.flags |= FUSE_WRITEBACK_CACHE;
+ }
+ if (se->conn.want & FUSE_CAP_POSIX_ACL) {
+ outarg.flags |= FUSE_POSIX_ACL;
+ }
+ outarg.max_readahead = se->conn.max_readahead;
+ outarg.max_write = se->conn.max_write;
+ if (se->conn.max_background >= (1 << 16)) {
+ se->conn.max_background = (1 << 16) - 1;
+ }
+ if (se->conn.congestion_threshold > se->conn.max_background) {
+ se->conn.congestion_threshold = se->conn.max_background;
+ }
+ if (!se->conn.congestion_threshold) {
+ se->conn.congestion_threshold = se->conn.max_background * 3 / 4;
+ }
+
+ outarg.max_background = se->conn.max_background;
+ outarg.congestion_threshold = se->conn.congestion_threshold;
+ outarg.time_gran = se->conn.time_gran;
+
+ if (se->conn.want & FUSE_CAP_HANDLE_KILLPRIV_V2) {
+ outarg.flags |= FUSE_HANDLE_KILLPRIV_V2;
+ }
+
+ if (se->conn.want & FUSE_CAP_SETXATTR_EXT) {
+ outarg.flags |= FUSE_SETXATTR_EXT;
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor);
+ fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags);
+ fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", outarg.max_readahead);
+ fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write);
+ fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", outarg.max_background);
+ fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n",
+ outarg.congestion_threshold);
+ fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", outarg.time_gran);
+
+ send_reply_ok(req, &outarg, outargsize);
+}
+
+/*
+ * Handle a FUSE_DESTROY request.
+ *
+ * Marks the session as destroyed and no longer initialised, runs the
+ * filesystem's destroy callback if there is one, and sends an empty
+ * success reply.  @nodeid and @iter are unused.
+ */
+static void do_destroy(fuse_req_t req, fuse_ino_t nodeid,
+                       struct fuse_mbuf_iter *iter)
+{
+    struct fuse_session *session = req->se;
+
+    (void)iter;
+    (void)nodeid;
+
+    session->got_destroy = 1;
+    session->got_init = 0;
+
+    if (session->op.destroy != NULL) {
+        session->op.destroy(session->userdata);
+    }
+
+    send_reply_ok(req, NULL, 0);
+}
+
+/*
+ * Send a FUSE_NOTIFY_STORE notification carrying the data in @bufv for
+ * inode @ino at @offset.
+ *
+ * Returns -EINVAL when @se is NULL.  Otherwise returns the result of
+ * fuse_send_data_iov(), with a positive value negated.
+ *
+ * Only two iovec entries are filled in here; the array has a third slot.
+ * NOTE(review): presumably the spare slot is there for the callee to
+ * append the data buffer - confirm before shrinking the array.
+ */
+int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino,
+                               off_t offset, struct fuse_bufvec *bufv)
+{
+    struct fuse_out_header hdr = {
+        .error = FUSE_NOTIFY_STORE,
+    };
+    struct fuse_notify_store_out store = {
+        .nodeid = ino,
+        .offset = offset,
+        .size = fuse_buf_size(bufv),
+    };
+    struct iovec iov[3];
+    int rc;
+
+    if (se == NULL) {
+        return -EINVAL;
+    }
+
+    iov[0].iov_len = sizeof(hdr);
+    iov[0].iov_base = &hdr;
+    iov[1].iov_len = sizeof(store);
+    iov[1].iov_base = &store;
+
+    rc = fuse_send_data_iov(se, NULL, iov, 2, bufv);
+
+    return rc > 0 ? -rc : rc;
+}
+
+/* Return the userdata pointer stored in the session that owns @req. */
+void *fuse_req_userdata(fuse_req_t req)
+{
+    struct fuse_session *session = req->se;
+
+    return session->userdata;
+}
+
+/* Return a pointer to the context (ctx member) embedded in @req. */
+const struct fuse_ctx *fuse_req_ctx(fuse_req_t req)
+{
+    const struct fuse_ctx *ctx = &req->ctx;
+
+    return ctx;
+}
+
+/*
+ * Register (or clear, with @func == NULL) the interrupt callback of @req.
+ *
+ * req->lock is held for the whole call; the callback and its data are
+ * stored under the session lock.  If the request has already been marked
+ * interrupted and @func is non-NULL, @func is called immediately, still
+ * under req->lock.
+ */
+void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func,
+                             void *data)
+{
+    struct fuse_session *session = req->se;
+
+    pthread_mutex_lock(&req->lock);
+
+    pthread_mutex_lock(&session->lock);
+    req->u.ni.data = data;
+    req->u.ni.func = func;
+    pthread_mutex_unlock(&session->lock);
+
+    if (req->interrupted && func) {
+        func(req, data);
+    }
+
+    pthread_mutex_unlock(&req->lock);
+}
+
+/* Return the interrupted flag of @req, read under the session lock. */
+int fuse_req_interrupted(fuse_req_t req)
+{
+    struct fuse_session *session = req->se;
+    int flag;
+
+    pthread_mutex_lock(&session->lock);
+    flag = req->interrupted;
+    pthread_mutex_unlock(&session->lock);
+
+    return flag;
+}
+
+/*
+ * Opcode dispatch table, indexed by FUSE opcode.
+ *
+ * .func is the handler invoked for the opcode (NULL means the opcode is
+ * known by name only and is answered with ENOSYS by the dispatcher);
+ * .name is the label used in debug output.  Opcodes without an entry are
+ * zero-initialised.
+ */
+static struct {
+    void (*func)(fuse_req_t, fuse_ino_t, struct fuse_mbuf_iter *);
+    const char *name;
+} fuse_ll_ops[] = {
+    [FUSE_LOOKUP] = { .func = do_lookup, .name = "LOOKUP" },
+    [FUSE_FORGET] = { .func = do_forget, .name = "FORGET" },
+    [FUSE_GETATTR] = { .func = do_getattr, .name = "GETATTR" },
+    [FUSE_SETATTR] = { .func = do_setattr, .name = "SETATTR" },
+    [FUSE_READLINK] = { .func = do_readlink, .name = "READLINK" },
+    [FUSE_SYMLINK] = { .func = do_symlink, .name = "SYMLINK" },
+    [FUSE_MKNOD] = { .func = do_mknod, .name = "MKNOD" },
+    [FUSE_MKDIR] = { .func = do_mkdir, .name = "MKDIR" },
+    [FUSE_UNLINK] = { .func = do_unlink, .name = "UNLINK" },
+    [FUSE_RMDIR] = { .func = do_rmdir, .name = "RMDIR" },
+    [FUSE_RENAME] = { .func = do_rename, .name = "RENAME" },
+    [FUSE_LINK] = { .func = do_link, .name = "LINK" },
+    [FUSE_OPEN] = { .func = do_open, .name = "OPEN" },
+    [FUSE_READ] = { .func = do_read, .name = "READ" },
+    [FUSE_WRITE] = { .func = do_write, .name = "WRITE" },
+    [FUSE_STATFS] = { .func = do_statfs, .name = "STATFS" },
+    [FUSE_RELEASE] = { .func = do_release, .name = "RELEASE" },
+    [FUSE_FSYNC] = { .func = do_fsync, .name = "FSYNC" },
+    [FUSE_SETXATTR] = { .func = do_setxattr, .name = "SETXATTR" },
+    [FUSE_GETXATTR] = { .func = do_getxattr, .name = "GETXATTR" },
+    [FUSE_LISTXATTR] = { .func = do_listxattr, .name = "LISTXATTR" },
+    [FUSE_REMOVEXATTR] = { .func = do_removexattr, .name = "REMOVEXATTR" },
+    [FUSE_FLUSH] = { .func = do_flush, .name = "FLUSH" },
+    [FUSE_INIT] = { .func = do_init, .name = "INIT" },
+    [FUSE_OPENDIR] = { .func = do_opendir, .name = "OPENDIR" },
+    [FUSE_READDIR] = { .func = do_readdir, .name = "READDIR" },
+    [FUSE_RELEASEDIR] = { .func = do_releasedir, .name = "RELEASEDIR" },
+    [FUSE_FSYNCDIR] = { .func = do_fsyncdir, .name = "FSYNCDIR" },
+    [FUSE_GETLK] = { .func = do_getlk, .name = "GETLK" },
+    [FUSE_SETLK] = { .func = do_setlk, .name = "SETLK" },
+    [FUSE_SETLKW] = { .func = do_setlkw, .name = "SETLKW" },
+    [FUSE_ACCESS] = { .func = do_access, .name = "ACCESS" },
+    [FUSE_CREATE] = { .func = do_create, .name = "CREATE" },
+    [FUSE_INTERRUPT] = { .func = do_interrupt, .name = "INTERRUPT" },
+    [FUSE_BMAP] = { .func = do_bmap, .name = "BMAP" },
+    [FUSE_IOCTL] = { .func = do_ioctl, .name = "IOCTL" },
+    [FUSE_POLL] = { .func = do_poll, .name = "POLL" },
+    [FUSE_FALLOCATE] = { .func = do_fallocate, .name = "FALLOCATE" },
+    [FUSE_DESTROY] = { .func = do_destroy, .name = "DESTROY" },
+    [FUSE_NOTIFY_REPLY] = { .func = NULL, .name = "NOTIFY_REPLY" },
+    [FUSE_BATCH_FORGET] = { .func = do_batch_forget, .name = "BATCH_FORGET" },
+    [FUSE_READDIRPLUS] = { .func = do_readdirplus, .name = "READDIRPLUS" },
+    [FUSE_RENAME2] = { .func = do_rename2, .name = "RENAME2" },
+    [FUSE_COPY_FILE_RANGE] = { .func = do_copy_file_range,
+                               .name = "COPY_FILE_RANGE" },
+    [FUSE_LSEEK] = { .func = do_lseek, .name = "LSEEK" },
+};
+
+/* Number of slots in fuse_ll_ops, i.e. one past the highest listed opcode */
+#define FUSE_MAXOP (sizeof(fuse_ll_ops) / sizeof(fuse_ll_ops[0]))
+
+/*
+ * Return the printable name of @opcode from fuse_ll_ops, or "???" when
+ * the opcode is outside the table or has no name recorded.
+ */
+static const char *opname(enum fuse_opcode opcode)
+{
+    const char *label = NULL;
+
+    if (opcode < FUSE_MAXOP) {
+        label = fuse_ll_ops[opcode].name;
+    }
+
+    return label ? label : "???";
+}
+
+/*
+ * Process one request held in the single buffer @buf: the buffer is
+ * copied into a one-element buffer vector and dispatched through
+ * fuse_session_process_buf_int() with no channel.
+ */
+void fuse_session_process_buf(struct fuse_session *se,
+                              const struct fuse_buf *buf)
+{
+    struct fuse_bufvec single = {
+        .buf[0] = *buf,
+        .count = 1,
+    };
+
+    fuse_session_process_buf_int(se, &single, NULL);
+}
+
+/*
+ * Decode one request from bufv and dispatch it to its handler.
+ *
+ * The fuse_in_header is read from the first buffer, a request object is
+ * allocated (an -ENOMEM reply is sent directly if that fails), and the
+ * opcode is looked up in fuse_ll_ops.  Errors found before dispatch are
+ * answered here: EIO for a request that does not fit the session's init
+ * state, EACCES for the deny_others check, ENOSYS for an opcode without
+ * a handler.
+ *
+ * Restriction:
+ * bufv is normally a single entry buffer, except for a write
+ * where (if it's in memory) then the bufv may be multiple entries,
+ * where the first entry contains all headers and subsequent entries
+ * contain data
+ * bufv shall not use any offsets etc to make the data anything
+ * other than contiguous starting from 0.
+ */
+void fuse_session_process_buf_int(struct fuse_session *se,
+ struct fuse_bufvec *bufv,
+ struct fuse_chan *ch)
+{
+ const struct fuse_buf *buf = bufv->buf;
+ struct fuse_mbuf_iter iter = FUSE_MBUF_ITER_INIT(buf);
+ struct fuse_in_header *in;
+ struct fuse_req *req;
+ int err;
+
+ /* The first buffer must be a memory buffer */
+ assert(!(buf->flags & FUSE_BUF_IS_FD));
+
+ in = fuse_mbuf_iter_advance(&iter, sizeof(*in));
+ assert(in); /* caller guarantees the input buffer is large enough */
+
+ fuse_log(
+ FUSE_LOG_DEBUG,
+ "unique: %llu, opcode: %s (%i), nodeid: %llu, insize: %zu, pid: %u\n",
+ (unsigned long long)in->unique, opname((enum fuse_opcode)in->opcode),
+ in->opcode, (unsigned long long)in->nodeid, buf->size, in->pid);
+
+ req = fuse_ll_alloc_req(se);
+ if (req == NULL) {
+ struct fuse_out_header out = {
+ .unique = in->unique,
+ .error = -ENOMEM,
+ };
+ struct iovec iov = {
+ .iov_base = &out,
+ .iov_len = sizeof(struct fuse_out_header),
+ };
+
+ fuse_send_msg(se, ch, &iov, 1);
+ return;
+ }
+
+ req->unique = in->unique;
+ req->ctx.uid = in->uid;
+ req->ctx.gid = in->gid;
+ req->ctx.pid = in->pid;
+ req->ch = ch;
+
+ /*
+ * INIT and DESTROY requests are serialized, all other request types
+ * run in parallel. This prevents races between FUSE_INIT and ordinary
+ * requests, FUSE_INIT and FUSE_INIT, FUSE_INIT and FUSE_DESTROY, and
+ * FUSE_DESTROY and FUSE_DESTROY.
+ *
+ * The lock taken here is released on both exits below (after the
+ * handler returns, and on the reply_err path).
+ */
+ if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT ||
+ in->opcode == FUSE_DESTROY) {
+ pthread_rwlock_wrlock(&se->init_rwlock);
+ } else {
+ pthread_rwlock_rdlock(&se->init_rwlock);
+ }
+
+ err = EIO; /* error used by the init-state checks that follow */
+ if (!se->got_init) {
+ enum fuse_opcode expected;
+
+ expected = se->cuse_data ? CUSE_INIT : FUSE_INIT;
+ if (in->opcode != expected) {
+ goto reply_err;
+ }
+ } else if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT) {
+ if (fuse_lowlevel_is_virtio(se)) {
+ /*
+ * TODO: This is after a hard reboot typically, we need to do
+ * a destroy, but we can't reply to this request yet so
+ * we can't use do_destroy
+ */
+ fuse_log(FUSE_LOG_DEBUG, "%s: reinit\n", __func__);
+ se->got_destroy = 1;
+ se->got_init = 0;
+ if (se->op.destroy) {
+ se->op.destroy(se->userdata);
+ }
+ } else {
+ goto reply_err;
+ }
+ }
+
+ err = EACCES;
+ /*
+ * Implement -o allow_root: with deny_others set, a caller that is
+ * neither the session owner nor uid 0 may only issue the opcodes
+ * listed below.
+ */
+ if (se->deny_others && in->uid != se->owner && in->uid != 0 &&
+ in->opcode != FUSE_INIT && in->opcode != FUSE_READ &&
+ in->opcode != FUSE_WRITE && in->opcode != FUSE_FSYNC &&
+ in->opcode != FUSE_RELEASE && in->opcode != FUSE_READDIR &&
+ in->opcode != FUSE_FSYNCDIR && in->opcode != FUSE_RELEASEDIR &&
+ in->opcode != FUSE_NOTIFY_REPLY && in->opcode != FUSE_READDIRPLUS) {
+ goto reply_err;
+ }
+
+ err = ENOSYS;
+ if (in->opcode >= FUSE_MAXOP || !fuse_ll_ops[in->opcode].func) {
+ goto reply_err;
+ }
+ if (in->opcode != FUSE_INTERRUPT) {
+ struct fuse_req *intr;
+ pthread_mutex_lock(&se->lock);
+ intr = check_interrupt(se, req);
+ list_add_req(req, &se->list);
+ pthread_mutex_unlock(&se->lock);
+ if (intr) {
+ fuse_reply_err(intr, EAGAIN);
+ }
+ }
+
+ if (in->opcode == FUSE_WRITE && se->op.write_buf) {
+ do_write_buf(req, in->nodeid, &iter, bufv);
+ } else {
+ fuse_ll_ops[in->opcode].func(req, in->nodeid, &iter);
+ }
+
+ pthread_rwlock_unlock(&se->init_rwlock);
+ return;
+
+reply_err:
+ fuse_reply_err(req, err);
+ pthread_rwlock_unlock(&se->init_rwlock);
+}
+
+/*
+ * Build one option-table initializer: the template string n, the byte
+ * offset of member o inside struct fuse_session, and the value v.
+ */
+#define LL_OPTION(n, o, v) { n, offsetof(struct fuse_session, o), v }
+
+/*
+ * Option templates passed to fuse_opt_parse() by fuse_session_new().
+ * Each entry names the struct fuse_session member that receives the
+ * option (see LL_OPTION); the list is terminated by FUSE_OPT_END.
+ */
+static const struct fuse_opt fuse_ll_opts[] = {
+ LL_OPTION("debug", debug, 1),
+ LL_OPTION("-d", debug, 1),
+ LL_OPTION("--debug", debug, 1),
+ /* "allow_root" is stored in the deny_others member */
+ LL_OPTION("allow_root", deny_others, 1),
+ LL_OPTION("--socket-path=%s", vu_socket_path, 0),
+ LL_OPTION("--socket-group=%s", vu_socket_group, 0),
+ LL_OPTION("--fd=%d", vu_listen_fd, 0),
+ LL_OPTION("--thread-pool-size=%d", thread_pool_size, 0),
+ FUSE_OPT_END
+};
+
+/* Print the FUSE kernel interface version (major.minor) to stdout. */
+void fuse_lowlevel_version(void)
+{
+    const int major = FUSE_KERNEL_VERSION;
+    const int minor = FUSE_KERNEL_MINOR_VERSION;
+
+    printf("using FUSE kernel interface version %i.%i\n", major, minor);
+}
+
+/*
+ * Print a short usage summary of the session options to stdout.
+ * THREAD_POOL_SIZE fills the %d placeholder as the reported default.
+ */
+void fuse_lowlevel_help(void)
+{
+ /*
+ * These are not all options, but the ones that are
+ * potentially of interest to an end-user
+ */
+ printf(
+ " -o allow_root allow access by root\n"
+ " --socket-path=PATH path for the vhost-user socket\n"
+ " --socket-group=GRNAME name of group for the vhost-user socket\n"
+ " --fd=FDNUM fd number of vhost-user socket\n"
+ " --thread-pool-size=NUM thread pool size limit (default %d)\n",
+ THREAD_POOL_SIZE);
+}
+
+/*
+ * Tear down a session and release the resources it holds.
+ *
+ * Calls the filesystem's destroy callback if the session was initialised
+ * and has not been destroyed yet, destroys the session locks, closes
+ * se->fd when it is open, closes the virtio side when one is attached,
+ * and finally frees the option strings and the session itself.
+ *
+ * @param se session to destroy; must not be used after this call
+ */
+void fuse_session_destroy(struct fuse_session *se)
+{
+    if (se->got_init && !se->got_destroy) {
+        if (se->op.destroy) {
+            se->op.destroy(se->userdata);
+        }
+    }
+    pthread_rwlock_destroy(&se->init_rwlock);
+    pthread_mutex_destroy(&se->lock);
+    free(se->cuse_data);
+    if (se->fd != -1) {
+        close(se->fd);
+    }
+
+    if (fuse_lowlevel_is_virtio(se)) {
+        virtio_session_close(se);
+    }
+
+    free(se->vu_socket_path);
+    se->vu_socket_path = NULL;
+    /*
+     * --socket-group is a "%s" option just like --socket-path, so its
+     * string must be released the same way or it is leaked.
+     */
+    free(se->vu_socket_group);
+    se->vu_socket_group = NULL;
+
+    g_free(se);
+}
+
+
+/*
+ * Allocate a session and configure it from command-line style arguments.
+ *
+ * @param args     argument vector; parsed with fuse_opt_parse() against
+ *                 fuse_ll_opts. Anything left besides argv[0] is reported
+ *                 as an unknown option.
+ * @param op       operations table; at most op_size bytes are copied
+ * @param op_size  size of the caller's struct fuse_lowlevel_ops; clamped
+ *                 (with a warning) if larger than this build's structure
+ * @param userdata opaque pointer stored in the session
+ *
+ * @return the new session, or NULL on failure: empty argv, allocation
+ *         failure, option parse error, unknown options, or an invalid
+ *         combination of --socket-path / --fd / --socket-group.
+ */
+struct fuse_session *fuse_session_new(struct fuse_args *args,
+                                      const struct fuse_lowlevel_ops *op,
+                                      size_t op_size, void *userdata)
+{
+    struct fuse_session *se;
+
+    if (sizeof(struct fuse_lowlevel_ops) < op_size) {
+        fuse_log(
+            FUSE_LOG_ERR,
+            "fuse: warning: library too old, some operations may not work\n");
+        op_size = sizeof(struct fuse_lowlevel_ops);
+    }
+
+    if (args->argc == 0) {
+        fuse_log(FUSE_LOG_ERR,
+                 "fuse: empty argv passed to fuse_session_new().\n");
+        return NULL;
+    }
+
+    /* Zero-initialised, so unset pointer members start out NULL. */
+    se = g_try_new0(struct fuse_session, 1);
+    if (se == NULL) {
+        fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate fuse object\n");
+        goto out1;
+    }
+    se->fd = -1;
+    se->vu_listen_fd = -1;
+    se->thread_pool_size = THREAD_POOL_SIZE;
+    se->conn.max_write = UINT_MAX;
+    se->conn.max_readahead = UINT_MAX;
+
+    /* Parse options */
+    if (fuse_opt_parse(args, se, fuse_ll_opts, NULL) == -1) {
+        goto out2;
+    }
+    if (args->argc == 1 && args->argv[0][0] == '-') {
+        fuse_log(FUSE_LOG_ERR,
+                 "fuse: warning: argv[0] looks like an option, but "
+                 "will be ignored\n");
+    } else if (args->argc != 1) {
+        int i;
+        fuse_log(FUSE_LOG_ERR, "fuse: unknown option(s): `");
+        for (i = 1; i < args->argc - 1; i++) {
+            fuse_log(FUSE_LOG_ERR, "%s ", args->argv[i]);
+        }
+        /* i now indexes the last leftover argument */
+        fuse_log(FUSE_LOG_ERR, "%s'\n", args->argv[i]);
+        goto out4;
+    }
+
+    if (!se->vu_socket_path && se->vu_listen_fd < 0) {
+        fuse_log(FUSE_LOG_ERR, "fuse: missing --socket-path or --fd option\n");
+        goto out4;
+    }
+    if (se->vu_socket_path && se->vu_listen_fd >= 0) {
+        fuse_log(FUSE_LOG_ERR,
+                 "fuse: --socket-path and --fd cannot be given together\n");
+        goto out4;
+    }
+    if (se->vu_socket_group && !se->vu_socket_path) {
+        fuse_log(FUSE_LOG_ERR,
+                 "fuse: --socket-group can only be used with --socket-path\n");
+        goto out4;
+    }
+
+    se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + FUSE_BUFFER_HEADER_SIZE;
+
+    list_init_req(&se->list);
+    list_init_req(&se->interrupts);
+    fuse_mutex_init(&se->lock);
+    pthread_rwlock_init(&se->init_rwlock, NULL);
+
+    memcpy(&se->op, op, op_size);
+    se->owner = getuid();
+    se->userdata = userdata;
+
+    return se;
+
+out4:
+    fuse_opt_free_args(args);
+out2:
+    /*
+     * Option parsing may already have stored strings for the "%s"
+     * options; release them (free(NULL) is a no-op) so a rejected
+     * command line does not leak them along with the session.
+     */
+    free(se->vu_socket_path);
+    free(se->vu_socket_group);
+    g_free(se);
+out1:
+    return NULL;
+}
+
+/* Mount the session by delegating to virtio_session_mount(). */
+int fuse_session_mount(struct fuse_session *se)
+{
+    int ret = virtio_session_mount(se);
+
+    return ret;
+}
+
+/*
+ * Return the file descriptor stored in the session.
+ * fuse_session_new() initialises it to -1.
+ */
+int fuse_session_fd(struct fuse_session *se)
+{
+ return se->fd;
+}
+
+/* Intentionally empty: this implementation does nothing on unmount. */
+void fuse_session_unmount(struct fuse_session *se)
+{
+}
+
+/* Return 1 when the session has a virtio device attached, 0 otherwise. */
+int fuse_lowlevel_is_virtio(struct fuse_session *se)
+{
+    return se->virtio_dev ? 1 : 0;
+}
+
+/* Mark the session as exited; fuse_session_exited() reports this flag. */
+void fuse_session_exit(struct fuse_session *se)
+{
+ se->exited = 1;
+}
+
+/* Clear the session's exited flag and its stored error value. */
+void fuse_session_reset(struct fuse_session *se)
+{
+ se->exited = 0;
+ se->error = 0;
+}
+
+/* Return the session's exited flag (set to 1 by fuse_session_exit()). */
+int fuse_session_exited(struct fuse_session *se)
+{
+ return se->exited;
+}
diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h
new file mode 100644
index 000000000..c55c0ca2f
--- /dev/null
+++ b/tools/virtiofsd/fuse_lowlevel.h
@@ -0,0 +1,1975 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB.
+ */
+
+#ifndef FUSE_LOWLEVEL_H_
+#define FUSE_LOWLEVEL_H_
+
+/**
+ * @file
+ *
+ * Low level API
+ *
+ * IMPORTANT: you should define FUSE_USE_VERSION before including this
+ * header. To use the newest API define it to 31 (recommended for any
+ * new application).
+ */
+
+#ifndef FUSE_USE_VERSION
+#error FUSE_USE_VERSION not defined
+#endif
+
+#include "fuse_common.h"
+
+#include <sys/statvfs.h>
+#include <sys/uio.h>
+#include <utime.h>
+
+/*
+ * Miscellaneous definitions
+ */
+
+/** The node ID of the root inode */
+#define FUSE_ROOT_ID 1
+
+/** Inode number type (64-bit unsigned) */
+typedef uint64_t fuse_ino_t;
+
+/** Request handle: a pointer to struct fuse_req, which is not defined here */
+typedef struct fuse_req *fuse_req_t;
+
+/**
+ * Session
+ *
+ * This provides hooks for processing requests, and exiting.
+ * Only forward-declared at this point, so users of this declaration
+ * handle sessions through pointers.
+ */
+struct fuse_session;
+
+/** Directory entry parameters supplied to fuse_reply_entry() */
+struct fuse_entry_param {
+ /**
+ * Unique inode number
+ *
+ * In lookup, zero means negative entry (from version 2.5).
+ * Returning ENOENT also means negative entry, but by setting zero
+ * ino the kernel may cache negative entries for entry_timeout
+ * seconds.
+ */
+ fuse_ino_t ino;
+
+ /**
+ * Generation number for this entry.
+ *
+ * If the file system will be exported over NFS, the
+ * ino/generation pairs need to be unique over the file
+ * system's lifetime (rather than just the mount time). So if
+ * the file system reuses an inode after it has been deleted,
+ * it must assign a new, previously unused generation number
+ * to the inode at the same time.
+ */
+ uint64_t generation;
+
+ /**
+ * Inode attributes.
+ *
+ * Even if attr_timeout == 0, attr must be correct. For example,
+ * for open(), FUSE uses attr.st_size from lookup() to determine
+ * how many bytes to request. If this value is not correct,
+ * incorrect data will be returned.
+ */
+ struct stat attr;
+
+ /**
+ * Validity timeout (in seconds) for inode attributes. If
+ * attributes only change as a result of requests that come
+ * through the kernel, this should be set to a very large
+ * value.
+ */
+ double attr_timeout;
+
+ /**
+ * Validity timeout (in seconds) for the name. If directory
+ * entries are changed/deleted only as a result of requests
+ * that come through the kernel, this should be set to a very
+ * large value.
+ */
+ double entry_timeout;
+
+ /**
+ * Flags for fuse_attr.flags that do not fit into attr
+ * (the struct stat member above).
+ */
+ uint32_t attr_flags;
+};
+
+/**
+ * Additional context associated with requests.
+ *
+ * Note that the reported client uid, gid and pid may be zero in some
+ * situations. For example, if the FUSE file system is running in a
+ * PID or user namespace but then accessed from outside the namespace,
+ * there is no valid uid/pid/gid that could be reported.
+ */
+struct fuse_ctx {
+ /** User ID of the calling process (may be zero, see above) */
+ uid_t uid;
+
+ /** Group ID of the calling process (may be zero, see above) */
+ gid_t gid;
+
+ /** Thread ID of the calling process (may be zero, see above) */
+ pid_t pid;
+
+ /** Umask of the calling process */
+ mode_t umask;
+};
+
+/**
+ * One inode / lookup-count pair.
+ *
+ * NOTE(review): presumably an element of a batched forget request,
+ * mirroring the (ino, nlookup) arguments of the forget operation;
+ * confirm against the users of this struct.
+ */
+struct fuse_forget_data {
+ /** Inode number */
+ fuse_ino_t ino;
+ /** Lookup count associated with ino */
+ uint64_t nlookup;
+};
+
+/*
+ * 'to_set' flags in setattr
+ *
+ * Each flag is a distinct bit, so flags can be OR-ed into one mask.
+ * Bits 6 and 9 have no flag defined in this list.
+ */
+#define FUSE_SET_ATTR_MODE (1 << 0)
+#define FUSE_SET_ATTR_UID (1 << 1)
+#define FUSE_SET_ATTR_GID (1 << 2)
+#define FUSE_SET_ATTR_SIZE (1 << 3)
+#define FUSE_SET_ATTR_ATIME (1 << 4)
+#define FUSE_SET_ATTR_MTIME (1 << 5)
+#define FUSE_SET_ATTR_ATIME_NOW (1 << 7)
+#define FUSE_SET_ATTR_MTIME_NOW (1 << 8)
+#define FUSE_SET_ATTR_CTIME (1 << 10)
+#define FUSE_SET_ATTR_KILL_SUIDGID (1 << 11)
+
+/*
+ * Request methods and replies
+ */
+
+/**
+ * Low level filesystem operations
+ *
+ * Most of the methods (with the exception of init and destroy)
+ * receive a request handle (fuse_req_t) as their first argument.
+ * This handle must be passed to one of the specified reply functions.
+ *
+ * This may be done inside the method invocation, or after the call
+ * has returned. The request handle is valid until one of the reply
+ * functions is called.
+ *
+ * Other pointer arguments (name, fuse_file_info, etc) are not valid
+ * after the call has returned, so if they are needed later, their
+ * contents have to be copied.
+ *
+ * In general, all methods are expected to perform any necessary
+ * permission checking. However, a filesystem may delegate this task
+ * to the kernel by passing the `default_permissions` mount option to
+ * `fuse_session_new()`. In this case, methods will only be called if
+ * the kernel's permission check has succeeded.
+ *
+ * The filesystem sometimes needs to handle a return value of -ENOENT
+ * from the reply function, which means that the request was
+ * interrupted and the reply discarded. For example, if
+ * fuse_reply_open() returns -ENOENT, the release method for
+ * this file will not be called.
+ */
+struct fuse_lowlevel_ops {
+ /**
+ * Initialize filesystem
+ *
+ * This function is called when libfuse establishes
+ * communication with the FUSE kernel module. The file system
+ * should use this function to inspect and/or modify the
+ * connection parameters provided in the `conn` structure.
+ *
+ * Note that some parameters may be overwritten by options
+ * passed to fuse_session_new() which take precedence over the
+ * values set in this handler.
+ *
+ * There's no reply to this function
+ *
+ * @param userdata the user data passed to fuse_session_new()
+ */
+ void (*init)(void *userdata, struct fuse_conn_info *conn);
+
+ /**
+ * Clean up filesystem.
+ *
+ * Called on filesystem exit. When this method is called, the
+ * connection to the kernel may be gone already, so that eg. calls
+ * to fuse_lowlevel_notify_* will fail.
+ *
+ * There's no reply to this function
+ *
+ * @param userdata the user data passed to fuse_session_new()
+ */
+ void (*destroy)(void *userdata);
+
+ /**
+ * Look up a directory entry by name and get its attributes.
+ *
+ * Valid replies:
+ * fuse_reply_entry
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param parent inode number of the parent directory
+ * @param name the name to look up
+ */
+ void (*lookup)(fuse_req_t req, fuse_ino_t parent, const char *name);
+
+ /**
+ * Forget about an inode
+ *
+ * This function is called when the kernel removes an inode
+ * from its internal caches.
+ *
+ * The inode's lookup count increases by one for every call to
+ * fuse_reply_entry and fuse_reply_create. The nlookup parameter
+ * indicates by how much the lookup count should be decreased.
+ *
+ * Inodes with a non-zero lookup count may receive requests from
+ * the kernel even after calls to unlink, rmdir or (when
+ * overwriting an existing file) rename. Filesystems must handle
+ * such requests properly and it is recommended to defer removal
+ * of the inode until the lookup count reaches zero. Calls to
+ * unlink, rmdir or rename will be followed closely by forget
+ * unless the file or directory is open, in which case the
+ * kernel issues forget only after the release or releasedir
+ * calls.
+ *
+ * Note that if a file system will be exported over NFS the
+ * inode's lifetime must extend even beyond forget. See the
+ * generation field in struct fuse_entry_param above.
+ *
+ * On unmount the lookup count for all inodes implicitly drops
+ * to zero. It is not guaranteed that the file system will
+ * receive corresponding forget messages for the affected
+ * inodes.
+ *
+ * Valid replies:
+ * fuse_reply_none
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param nlookup the number of lookups to forget
+ */
+ void (*forget)(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup);
+
+ /**
+ * Get file attributes.
+ *
+ * If writeback caching is enabled, the kernel may have a
+ * better idea of a file's length than the FUSE file system
+ * (eg if there has been a write that extended the file size,
+ * but that has not yet been passed to the filesystem).
+ *
+ * In this case, the st_size value provided by the file system
+ * will be ignored.
+ *
+ * Valid replies:
+ * fuse_reply_attr
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi for future use, currently always NULL
+ */
+ void (*getattr)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi);
+
+ /**
+ * Set file attributes
+ *
+ * In the 'attr' argument only members indicated by the 'to_set'
+ * bitmask contain valid values. Other members contain undefined
+ * values.
+ *
+ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is
+ * expected to reset the setuid and setgid bits if the file
+ * size or owner is being changed.
+ *
+ * If the setattr was invoked from the ftruncate() system call
+ * under Linux kernel versions 2.6.15 or later, the fi->fh will
+ * contain the value set by the open method or will be undefined
+ * if the open method didn't set any value. Otherwise (not
+ * ftruncate call, or kernel version earlier than 2.6.15) the fi
+ * parameter will be NULL.
+ *
+ * Valid replies:
+ * fuse_reply_attr
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param attr the attributes
+ * @param to_set bit mask of attributes which should be set
+ * @param fi file information, or NULL
+ */
+ void (*setattr)(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
+ int to_set, struct fuse_file_info *fi);
+
+ /**
+ * Read symbolic link
+ *
+ * Valid replies:
+ * fuse_reply_readlink
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ */
+ void (*readlink)(fuse_req_t req, fuse_ino_t ino);
+
+ /**
+ * Create file node
+ *
+ * Create a regular file, character device, block device, fifo or
+ * socket node.
+ *
+ * Valid replies:
+ * fuse_reply_entry
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param parent inode number of the parent directory
+ * @param name to create
+ * @param mode file type and mode with which to create the new file
+ * @param rdev the device number (only valid if created file is a device)
+ */
+ void (*mknod)(fuse_req_t req, fuse_ino_t parent, const char *name,
+ mode_t mode, dev_t rdev);
+
+ /**
+ * Create a directory
+ *
+ * Valid replies:
+ * fuse_reply_entry
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param parent inode number of the parent directory
+ * @param name to create
+ * @param mode with which to create the new file
+ */
+ void (*mkdir)(fuse_req_t req, fuse_ino_t parent, const char *name,
+ mode_t mode);
+
+ /**
+ * Remove a file
+ *
+ * If the file's inode's lookup count is non-zero, the file
+ * system is expected to postpone any removal of the inode
+ * until the lookup count reaches zero (see description of the
+ * forget function).
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param parent inode number of the parent directory
+ * @param name to remove
+ */
+ void (*unlink)(fuse_req_t req, fuse_ino_t parent, const char *name);
+
+ /**
+ * Remove a directory
+ *
+ * If the directory's inode's lookup count is non-zero, the
+ * file system is expected to postpone any removal of the
+ * inode until the lookup count reaches zero (see description
+ * of the forget function).
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param parent inode number of the parent directory
+ * @param name to remove
+ */
+ void (*rmdir)(fuse_req_t req, fuse_ino_t parent, const char *name);
+
+ /**
+ * Create a symbolic link
+ *
+ * Valid replies:
+ * fuse_reply_entry
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param link the contents of the symbolic link
+ * @param parent inode number of the parent directory
+ * @param name to create
+ */
+ void (*symlink)(fuse_req_t req, const char *link, fuse_ino_t parent,
+ const char *name);
+
+ /**
+ * Rename a file
+ *
+ * If the target exists it should be atomically replaced. If
+ * the target's inode's lookup count is non-zero, the file
+ * system is expected to postpone any removal of the inode
+ * until the lookup count reaches zero (see description of the
+ * forget function).
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure with error code EINVAL, i.e. all
+ * future rename requests will fail with EINVAL without being
+ * sent to the filesystem process.
+ *
+ * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If
+ * RENAME_NOREPLACE is specified, the filesystem must not
+ * overwrite *newname* if it exists and return an error
+ * instead. If `RENAME_EXCHANGE` is specified, the filesystem
+ * must atomically exchange the two files, i.e. both must
+ * exist and neither may be deleted.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param parent inode number of the old parent directory
+ * @param name old name
+ * @param newparent inode number of the new parent directory
+ * @param newname new name
+ */
+ void (*rename)(fuse_req_t req, fuse_ino_t parent, const char *name,
+ fuse_ino_t newparent, const char *newname,
+ unsigned int flags);
+
+ /**
+ * Create a hard link
+ *
+ * Valid replies:
+ * fuse_reply_entry
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the old inode number
+ * @param newparent inode number of the new parent directory
+ * @param newname new name to create
+ */
+ void (*link)(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent,
+ const char *newname);
+
+ /**
+ * Open a file
+ *
+ * Open flags are available in fi->flags. The following rules
+ * apply.
+ *
+ * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be
+ * filtered out / handled by the kernel.
+ *
+ * - Access modes (O_RDONLY, O_WRONLY, O_RDWR) should be used
+ * by the filesystem to check if the operation is
+ * permitted. If the ``-o default_permissions`` mount
+ * option is given, this check is already done by the
+ * kernel before calling open() and may thus be omitted by
+ * the filesystem.
+ *
+ * - When writeback caching is enabled, the kernel may send
+ * read requests even for files opened with O_WRONLY. The
+ * filesystem should be prepared to handle this.
+ *
+ * - When writeback caching is disabled, the filesystem is
+ * expected to properly handle the O_APPEND flag and ensure
+ * that each write is appending to the end of the file.
+ *
+ * - When writeback caching is enabled, the kernel will
+ * handle O_APPEND. However, unless all changes to the file
+ * come through the kernel this will not work reliably. The
+ * filesystem should thus either ignore the O_APPEND flag
+ * (and let the kernel handle it), or return an error
+ * (indicating that reliable O_APPEND is not available).
+ *
+ * Filesystem may store an arbitrary file handle (pointer,
+ * index, etc) in fi->fh, and use this in all other file
+ * operations (read, write, flush, release, fsync).
+ *
+ * Filesystem may also implement stateless file I/O and not store
+ * anything in fi->fh.
+ *
+ * There are also some flags (direct_io, keep_cache) which the
+ * filesystem may set in fi, to change the way the file is opened.
+ * See fuse_file_info structure in <fuse_common.h> for more details.
+ *
+ * If this request is answered with an error code of ENOSYS
+ * and FUSE_CAP_NO_OPEN_SUPPORT is set in
+ * `fuse_conn_info.capable`, this is treated as success and
+ * future calls to open and release will also succeed without being
+ * sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_open
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ */
+ void (*open)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi);
+
+ /**
+ * Read data
+ *
+ * Read should send exactly the number of bytes requested except
+ * on EOF or error, otherwise the rest of the data will be
+ * substituted with zeroes. An exception to this is when the file
+ * has been opened in 'direct_io' mode, in which case the return
+ * value of the read system call will reflect the return value of
+ * this operation.
+ *
+ * fi->fh will contain the value set by the open method, or will
+ * be undefined if the open method didn't set any value.
+ *
+ * Valid replies:
+ * fuse_reply_buf
+ * fuse_reply_iov
+ * fuse_reply_data
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param size number of bytes to read
+ * @param off offset to read from
+ * @param fi file information
+ */
+ void (*read)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
+ struct fuse_file_info *fi);
+
+ /**
+ * Write data
+ *
+ * Write should return exactly the number of bytes requested
+ * except on error. An exception to this is when the file has
+ * been opened in 'direct_io' mode, in which case the return value
+ * of the write system call will reflect the return value of this
+ * operation.
+ *
+ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is
+ * expected to reset the setuid and setgid bits.
+ *
+ * fi->fh will contain the value set by the open method, or will
+ * be undefined if the open method didn't set any value.
+ *
+ * Valid replies:
+ * fuse_reply_write
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param buf data to write
+ * @param size number of bytes to write
+ * @param off offset to write to
+ * @param fi file information
+ */
+ void (*write)(fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size,
+ off_t off, struct fuse_file_info *fi);
+
+ /**
+ * Flush method
+ *
+ * This is called on each close() of the opened file.
+ *
+ * Since file descriptors can be duplicated (dup, dup2, fork), for
+ * one open call there may be many flush calls.
+ *
+ * Filesystems shouldn't assume that flush will always be called
+ * after some writes, or that it will be called at all.
+ *
+ * fi->fh will contain the value set by the open method, or will
+ * be undefined if the open method didn't set any value.
+ *
+ * NOTE: the name of the method is misleading, since (unlike
+ * fsync) the filesystem is not forced to flush pending writes.
+ * One reason to flush data is if the filesystem wants to return
+ * write errors during close. However, such use is non-portable
+ * because POSIX does not require [close] to wait for delayed I/O to
+ * complete.
+ *
+ * If the filesystem supports file locking operations (setlk,
+ * getlk) it should remove all locks belonging to 'fi->owner'.
+ *
+ * If this request is answered with an error code of ENOSYS,
+ * this is treated as success and future calls to flush() will
+ * succeed automatically without being sent to the filesystem
+ * process.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ *
+ * [close]:
+ * http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html
+ */
+ void (*flush)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi);
+
+ /**
+ * Release an open file
+ *
+ * Release is called when there are no more references to an open
+ * file: all file descriptors are closed and all memory mappings
+ * are unmapped.
+ *
+ * For every open call there will be exactly one release call (unless
+ * the filesystem is force-unmounted).
+ *
+ * The filesystem may reply with an error, but error values are
+ * not returned to close() or munmap() which triggered the
+ * release.
+ *
+ * fi->fh will contain the value set by the open method, or will
+ * be undefined if the open method didn't set any value.
+ * fi->flags will contain the same flags as for open.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ */
+ void (*release)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi);
+
+ /**
+ * Synchronize file contents
+ *
+ * If the datasync parameter is non-zero, then only the user data
+ * should be flushed, not the meta data.
+ *
+ * If this request is answered with an error code of ENOSYS,
+ * this is treated as success and future calls to fsync() will
+ * succeed automatically without being sent to the filesystem
+ * process.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param datasync flag indicating if only data should be flushed
+ * @param fi file information
+ */
+ void (*fsync)(fuse_req_t req, fuse_ino_t ino, int datasync,
+ struct fuse_file_info *fi);
+
+ /**
+ * Open a directory
+ *
+ * Filesystem may store an arbitrary file handle (pointer, index,
+ * etc) in fi->fh, and use this in all other directory
+ * stream operations (readdir, releasedir, fsyncdir).
+ *
+ * If this request is answered with an error code of ENOSYS and
+ * FUSE_CAP_NO_OPENDIR_SUPPORT is set in `fuse_conn_info.capable`,
+ * this is treated as success and future calls to opendir and
+ * releasedir will also succeed without being sent to the filesystem
+ * process. In addition, the kernel will cache readdir results
+ * as if opendir returned FOPEN_KEEP_CACHE | FOPEN_CACHE_DIR.
+ *
+ * Valid replies:
+ * fuse_reply_open
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ */
+ void (*opendir)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi);
+
+ /**
+ * Read directory
+ *
+ * Send a buffer filled using fuse_add_direntry(), with size not
+ * exceeding the requested size. Send an empty buffer on end of
+ * stream.
+ *
+ * fi->fh will contain the value set by the opendir method, or
+ * will be undefined if the opendir method didn't set any value.
+ *
+ * Returning a directory entry from readdir() does not affect
+ * its lookup count.
+ *
+ * If off_t is non-zero, then it will correspond to one of the off_t
+ * values that was previously returned by readdir() for the same
+ * directory handle. In this case, readdir() should skip over entries
+ * coming before the position defined by the off_t value. If entries
+ * are added or removed while the directory handle is open, the filesystem
+ * may still include the entries that have been removed, and may not
+ * report the entries that have been created. However, addition or
+ * removal of entries must never cause readdir() to skip over unrelated
+ * entries or to report them more than once. This means
+ * that off_t can not be a simple index that enumerates the entries
+ * that have been returned but must contain sufficient information to
+ * uniquely determine the next directory entry to return even when the
+ * set of entries is changing.
+ *
+ * The function does not have to report the '.' and '..'
+ * entries, but is allowed to do so. Note that, if readdir does
+ * not return '.' or '..', they will not be implicitly returned,
+ * and this behavior is observable by the caller.
+ *
+ * Valid replies:
+ * fuse_reply_buf
+ * fuse_reply_data
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param size maximum number of bytes to send
+ * @param off offset to continue reading the directory stream
+ * @param fi file information
+ */
+ void (*readdir)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
+ struct fuse_file_info *fi);
+
+ /**
+ * Release an open directory
+ *
+ * For every opendir call there will be exactly one releasedir
+ * call (unless the filesystem is force-unmounted).
+ *
+ * fi->fh will contain the value set by the opendir method, or
+ * will be undefined if the opendir method didn't set any value.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ */
+ void (*releasedir)(fuse_req_t req, fuse_ino_t ino,
+ struct fuse_file_info *fi);
+
+ /**
+ * Synchronize directory contents
+ *
+ * If the datasync parameter is non-zero, then only the directory
+ * contents should be flushed, not the meta data.
+ *
+ * fi->fh will contain the value set by the opendir method, or
+ * will be undefined if the opendir method didn't set any value.
+ *
+ * If this request is answered with an error code of ENOSYS,
+ * this is treated as success and future calls to fsyncdir() will
+ * succeed automatically without being sent to the filesystem
+ * process.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param datasync flag indicating if only data should be flushed
+ * @param fi file information
+ */
+ void (*fsyncdir)(fuse_req_t req, fuse_ino_t ino, int datasync,
+ struct fuse_file_info *fi);
+
+ /**
+ * Get file system statistics
+ *
+ * Valid replies:
+ * fuse_reply_statfs
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number, zero means "undefined"
+ */
+ void (*statfs)(fuse_req_t req, fuse_ino_t ino);
+
+ /**
+ * Set an extended attribute
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all
+ * future setxattr() requests will fail with EOPNOTSUPP without being
+ * sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ */
+ void (*setxattr)(fuse_req_t req, fuse_ino_t ino, const char *name,
+ const char *value, size_t size, int flags,
+ uint32_t setxattr_flags);
+
+ /**
+ * Get an extended attribute
+ *
+ * If size is zero, the size of the value should be sent with
+ * fuse_reply_xattr.
+ *
+ * If the size is non-zero, and the value fits in the buffer, the
+ * value should be sent with fuse_reply_buf.
+ *
+ * If the size is too small for the value, the ERANGE error should
+ * be sent.
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all
+ * future getxattr() requests will fail with EOPNOTSUPP without being
+ * sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_buf
+ * fuse_reply_data
+ * fuse_reply_xattr
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param name of the extended attribute
+ * @param size maximum size of the value to send
+ */
+ void (*getxattr)(fuse_req_t req, fuse_ino_t ino, const char *name,
+ size_t size);
+
+ /**
+ * List extended attribute names
+ *
+ * If size is zero, the total size of the attribute list should be
+ * sent with fuse_reply_xattr.
+ *
+ * If the size is non-zero, and the null character separated
+ * attribute list fits in the buffer, the list should be sent with
+ * fuse_reply_buf.
+ *
+ * If the size is too small for the list, the ERANGE error should
+ * be sent.
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all
+ * future listxattr() requests will fail with EOPNOTSUPP without being
+ * sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_buf
+ * fuse_reply_data
+ * fuse_reply_xattr
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param size maximum size of the list to send
+ */
+ void (*listxattr)(fuse_req_t req, fuse_ino_t ino, size_t size);
+
+ /**
+ * Remove an extended attribute
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all
+ * future removexattr() requests will fail with EOPNOTSUPP without being
+ * sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param name of the extended attribute
+ */
+ void (*removexattr)(fuse_req_t req, fuse_ino_t ino, const char *name);
+
+ /**
+ * Check file access permissions
+ *
+ * This will be called for the access() and chdir() system
+ * calls. If the 'default_permissions' mount option is given,
+ * this method is not called.
+ *
+ * This method is not called under Linux kernel versions 2.4.x
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent success, i.e. this and all future access()
+ * requests will succeed without being sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param mask requested access mode
+ */
+ void (*access)(fuse_req_t req, fuse_ino_t ino, int mask);
+
+ /**
+ * Create and open a file
+ *
+ * If the file does not exist, first create it with the specified
+ * mode, and then open it.
+ *
+ * See the description of the open handler for more
+ * information.
+ *
+ * If this method is not implemented or under Linux kernel
+ * versions earlier than 2.6.15, the mknod() and open() methods
+ * will be called instead.
+ *
+ * If this request is answered with an error code of ENOSYS, the handler
+ * is treated as not implemented (i.e., for this and future requests the
+ * mknod() and open() handlers will be called instead).
+ *
+ * Valid replies:
+ * fuse_reply_create
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param parent inode number of the parent directory
+ * @param name to create
+ * @param mode file type and mode with which to create the new file
+ * @param fi file information
+ */
+ void (*create)(fuse_req_t req, fuse_ino_t parent, const char *name,
+ mode_t mode, struct fuse_file_info *fi);
+
+ /**
+ * Test for a POSIX file lock
+ *
+ * Valid replies:
+ * fuse_reply_lock
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ * @param lock the region/type to test
+ */
+ void (*getlk)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
+ struct flock *lock);
+
+ /**
+ * Acquire, modify or release a POSIX file lock
+ *
+ * For POSIX threads (NPTL) there's a 1-1 relation between pid and
+ * owner, but otherwise this is not always the case. For checking
+ * lock ownership, 'fi->owner' must be used. The l_pid field in
+ * 'struct flock' should only be used to fill in this field in
+ * getlk().
+ *
+ * Note: if the locking methods are not implemented, the kernel
+ * will still allow file locking to work locally. Hence these are
+ * only interesting for network filesystems and similar.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ * @param lock the region/type to set
+ * @param sleep locking operation may sleep
+ */
+ void (*setlk)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
+ struct flock *lock, int sleep);
+
+ /**
+ * Map block index within file to block index within device
+ *
+ * Note: This makes sense only for block device backed filesystems
+ * mounted with the 'blkdev' option
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure, i.e. all future bmap() requests will
+ * fail with the same error code without being sent to the filesystem
+ * process.
+ *
+ * Valid replies:
+ * fuse_reply_bmap
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param blocksize unit of block index
+ * @param idx block index within file
+ */
+ void (*bmap)(fuse_req_t req, fuse_ino_t ino, size_t blocksize,
+ uint64_t idx);
+
+ /**
+ * Ioctl
+ *
+ * Note: For unrestricted ioctls (not allowed for FUSE
+ * servers), data in and out areas can be discovered by giving
+ * iovs and setting FUSE_IOCTL_RETRY in *flags*. For
+ * restricted ioctls, kernel prepares in/out data area
+ * according to the information encoded in cmd.
+ *
+ * Valid replies:
+ * fuse_reply_ioctl_retry
+ * fuse_reply_ioctl
+ * fuse_reply_ioctl_iov
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param cmd ioctl command
+ * @param arg ioctl argument
+ * @param fi file information
+ * @param flags for FUSE_IOCTL_* flags
+ * @param in_buf data fetched from the caller
+ * @param in_bufsz number of fetched bytes
+ * @param out_bufsz maximum size of output data
+ *
+ * Note : the unsigned long request submitted by the application
+ * is truncated to 32 bits.
+ */
+ void (*ioctl)(fuse_req_t req, fuse_ino_t ino, unsigned int cmd, void *arg,
+ struct fuse_file_info *fi, unsigned flags, const void *in_buf,
+ size_t in_bufsz, size_t out_bufsz);
+
+ /**
+ * Poll for IO readiness
+ *
+ * Note: If ph is non-NULL, the client should notify
+ * when IO readiness events occur by calling
+ * fuse_lowlevel_notify_poll() with the specified ph.
+ *
+ * Regardless of the number of times poll with a non-NULL ph
+ * is received, single notification is enough to clear all.
+ * Notifying more times incurs overhead but doesn't harm
+ * correctness.
+ *
+ * The callee is responsible for destroying ph with
+ * fuse_pollhandle_destroy() when no longer in use.
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as success (with a kernel-defined default poll-mask) and
+ * future calls to poll() will succeed the same way without being sent
+ * to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_poll
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ * @param ph poll handle to be used for notification
+ */
+ void (*poll)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
+ struct fuse_pollhandle *ph);
+
+ /**
+ * Write data made available in a buffer
+ *
+ * This is a more generic version of the ->write() method. If
+ * FUSE_CAP_SPLICE_READ is set in fuse_conn_info.want and the
+ * kernel supports splicing from the fuse device, then the
+ * data will be made available in pipe for supporting zero
+ * copy data transfer.
+ *
+ * bufv->count is guaranteed to be one (and thus bufv->idx is
+ * always zero). The write_buf handler must ensure that
+ * bufv->off is correctly updated (reflecting the number of
+ * bytes read from bufv->buf[0]).
+ *
+ * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is
+ * expected to reset the setuid and setgid bits.
+ *
+ * Valid replies:
+ * fuse_reply_write
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param bufv buffer containing the data
+ * @param off offset to write to
+ * @param fi file information
+ */
+ void (*write_buf)(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv,
+ off_t off, struct fuse_file_info *fi);
+
+ /**
+ * Forget about multiple inodes
+ *
+ * See description of the forget function for more
+ * information.
+ *
+ * Valid replies:
+ * fuse_reply_none
+ *
+ * @param req request handle
+ */
+ void (*forget_multi)(fuse_req_t req, size_t count,
+ struct fuse_forget_data *forgets);
+
+ /**
+ * Acquire, modify or release a BSD file lock
+ *
+ * Note: if the locking methods are not implemented, the kernel
+ * will still allow file locking to work locally. Hence these are
+ * only interesting for network filesystems and similar.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param fi file information
+ * @param op the locking operation, see flock(2)
+ */
+ void (*flock)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
+ int op);
+
+ /**
+ * Allocate requested space. If this function returns success then
+ * subsequent writes to the specified range shall not fail due to the lack
+ * of free space on the file system storage media.
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all
+ * future fallocate() requests will fail with EOPNOTSUPP without being
+ * sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param offset starting point for allocated region
+ * @param length size of allocated region
+ * @param mode determines the operation to be performed on the given range,
+ * see fallocate(2)
+ */
+ void (*fallocate)(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
+ off_t length, struct fuse_file_info *fi);
+
+ /**
+ * Read directory with attributes
+ *
+ * Send a buffer filled using fuse_add_direntry_plus(), with size not
+ * exceeding the requested size. Send an empty buffer on end of
+ * stream.
+ *
+ * fi->fh will contain the value set by the opendir method, or
+ * will be undefined if the opendir method didn't set any value.
+ *
+ * In contrast to readdir() (which does not affect the lookup counts),
+ * the lookup count of every entry returned by readdirplus(), except "."
+ * and "..", is incremented by one.
+ *
+ * Valid replies:
+ * fuse_reply_buf
+ * fuse_reply_data
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param size maximum number of bytes to send
+ * @param off offset to continue reading the directory stream
+ * @param fi file information
+ */
+ void (*readdirplus)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off,
+ struct fuse_file_info *fi);
+
+ /**
+ * Copy a range of data from one file to another
+ *
+ * Performs an optimized copy between two file descriptors without the
+ * additional cost of transferring data through the FUSE kernel module
+ * to user space (glibc) and then back into the FUSE filesystem again.
+ *
+ * In case this method is not implemented, glibc falls back to reading
+ * data from the source and writing to the destination. Effectively
+ * doing an inefficient copy of the data.
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure with error code EOPNOTSUPP, i.e. all
+ * future copy_file_range() requests will fail with EOPNOTSUPP without
+ * being sent to the filesystem process.
+ *
+ * Valid replies:
+ * fuse_reply_write
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino_in the inode number of the source file
+ * @param off_in starting point from where the data should be read
+ * @param fi_in file information of the source file
+ * @param ino_out the inode number of the destination file
+ * @param off_out starting point where the data should be written
+ * @param fi_out file information of the destination file
+ * @param len maximum size of the data to copy
+ * @param flags passed along with the copy_file_range() syscall
+ */
+ void (*copy_file_range)(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
+ struct fuse_file_info *fi_in, fuse_ino_t ino_out,
+ off_t off_out, struct fuse_file_info *fi_out,
+ size_t len, int flags);
+
+ /**
+ * Find next data or hole after the specified offset
+ *
+ * If this request is answered with an error code of ENOSYS, this is
+ * treated as a permanent failure, i.e. all future lseek() requests will
+ * fail with the same error code without being sent to the filesystem
+ * process.
+ *
+ * Valid replies:
+ * fuse_reply_lseek
+ * fuse_reply_err
+ *
+ * @param req request handle
+ * @param ino the inode number
+ * @param off offset to start search from
+ * @param whence either SEEK_DATA or SEEK_HOLE
+ * @param fi file information
+ */
+ void (*lseek)(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
+ struct fuse_file_info *fi);
+};
+
+/**
+ * Reply with an error code or success.
+ *
+ * Possible requests:
+ * all except forget
+ *
+ * Wherever possible, error codes should be chosen from the list of
+ * documented error conditions in the corresponding system call's
+ * manpage.
+ *
+ * An error code of ENOSYS is sometimes treated specially. This is
+ * indicated in the documentation of the affected handler functions.
+ *
+ * The following requests may be answered with a zero error code:
+ * unlink, rmdir, rename, flush, release, fsync, fsyncdir, setxattr,
+ * removexattr, setlk.
+ *
+ * @param req request handle
+ * @param err the positive error value, or zero for success
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_err(fuse_req_t req, int err);
+
+/**
+ * Don't send reply
+ *
+ * Possible requests:
+ * forget
+ * forget_multi
+ * retrieve_reply
+ *
+ * @param req request handle
+ */
+void fuse_reply_none(fuse_req_t req);
+
+/**
+ * Reply with a directory entry
+ *
+ * Possible requests:
+ * lookup, mknod, mkdir, symlink, link
+ *
+ * Side effects:
+ * increments the lookup count on success
+ *
+ * @param req request handle
+ * @param e the entry parameters
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e);
+
+/**
+ * Reply with a directory entry and open parameters
+ *
+ * currently the following members of 'fi' are used:
+ * fh, direct_io, keep_cache
+ *
+ * Possible requests:
+ * create
+ *
+ * Side effects:
+ * increments the lookup count on success
+ *
+ * @param req request handle
+ * @param e the entry parameters
+ * @param fi file information
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e,
+ const struct fuse_file_info *fi);
+
+/**
+ * Reply with attributes
+ *
+ * Possible requests:
+ * getattr, setattr
+ *
+ * @param req request handle
+ * @param attr the attributes
+ * @param attr_timeout validity timeout (in seconds) for the attributes
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_attr(fuse_req_t req, const struct stat *attr,
+ double attr_timeout);
+
+/**
+ * Reply with the contents of a symbolic link
+ *
+ * Possible requests:
+ * readlink
+ *
+ * @param req request handle
+ * @param link symbolic link contents
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_readlink(fuse_req_t req, const char *link);
+
+/**
+ * Reply with open parameters
+ *
+ * currently the following members of 'fi' are used:
+ * fh, direct_io, keep_cache
+ *
+ * Possible requests:
+ * open, opendir
+ *
+ * @param req request handle
+ * @param fi file information
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *fi);
+
+/**
+ * Reply with number of bytes written
+ *
+ * Possible requests:
+ * write
+ *
+ * @param req request handle
+ * @param count the number of bytes written
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_write(fuse_req_t req, size_t count);
+
+/**
+ * Reply with data
+ *
+ * Possible requests:
+ * read, readdir, getxattr, listxattr
+ *
+ * @param req request handle
+ * @param buf buffer containing data
+ * @param size the size of data in bytes
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size);
+
+/**
+ * Reply with data copied/moved from buffer(s)
+ *
+ * Possible requests:
+ * read, readdir, getxattr, listxattr
+ *
+ * Side effects:
+ * when used to return data from a readdirplus() (but not readdir())
+ * call, increments the lookup count of each returned entry by one
+ * on success.
+ *
+ * @param req request handle
+ * @param bufv buffer vector
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv);
+
+/**
+ * Reply with data vector
+ *
+ * Possible requests:
+ * read, readdir, getxattr, listxattr
+ *
+ * @param req request handle
+ * @param iov the vector containing the data
+ * @param count the size of vector
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_iov(fuse_req_t req, const struct iovec *iov, int count);
+
+/**
+ * Reply with filesystem statistics
+ *
+ * Possible requests:
+ * statfs
+ *
+ * @param req request handle
+ * @param stbuf filesystem statistics
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf);
+
+/**
+ * Reply with needed buffer size
+ *
+ * Possible requests:
+ * getxattr, listxattr
+ *
+ * @param req request handle
+ * @param count the buffer size needed in bytes
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_xattr(fuse_req_t req, size_t count);
+
+/**
+ * Reply with file lock information
+ *
+ * Possible requests:
+ * getlk
+ *
+ * @param req request handle
+ * @param lock the lock information
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_lock(fuse_req_t req, const struct flock *lock);
+
+/**
+ * Reply with block index
+ *
+ * Possible requests:
+ * bmap
+ *
+ * @param req request handle
+ * @param idx block index within device
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_bmap(fuse_req_t req, uint64_t idx);
+
+/*
+ * Filling a buffer in readdir
+ */
+
+/**
+ * Add a directory entry to the buffer
+ *
+ * Buffer needs to be large enough to hold the entry. If it's not,
+ * then the entry is not filled in but the size of the entry is still
+ * returned. The caller can check this by comparing the bufsize
+ * parameter with the returned entry size. If the entry size is
+ * larger than the buffer size, the operation failed.
+ *
+ * From the 'stbuf' argument the st_ino field and bits 12-15 of the
+ * st_mode field are used. The other fields are ignored.
+ *
+ * *off* should be any non-zero value that the filesystem can use to
+ * identify the current point in the directory stream. It does not
+ * need to be the actual physical position. A value of zero is
+ * reserved to mean "from the beginning", and should therefore never
+ * be used (the first call to fuse_add_direntry should be passed the
+ * offset of the second directory entry).
+ *
+ * @param req request handle
+ * @param buf the point where the new entry will be added to the buffer
+ * @param bufsize remaining size of the buffer
+ * @param name the name of the entry
+ * @param stbuf the file attributes
+ * @param off the offset of the next entry
+ * @return the space needed for the entry
+ */
+size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize,
+ const char *name, const struct stat *stbuf, off_t off);
+
+/**
+ * Add a directory entry to the buffer with the attributes
+ *
+ * See documentation of `fuse_add_direntry()` for more details.
+ *
+ * @param req request handle
+ * @param buf the point where the new entry will be added to the buffer
+ * @param bufsize remaining size of the buffer
+ * @param name the name of the entry
+ * @param e the directory entry
+ * @param off the offset of the next entry
+ * @return the space needed for the entry
+ */
+size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize,
+ const char *name,
+ const struct fuse_entry_param *e, off_t off);
+
+/**
+ * Reply to ask for data fetch and output buffer preparation. ioctl
+ * will be retried with the specified input data fetched and output
+ * buffer prepared.
+ *
+ * Possible requests:
+ * ioctl
+ *
+ * @param req request handle
+ * @param in_iov iovec specifying data to fetch from the caller
+ * @param in_count number of entries in in_iov
+ * @param out_iov iovec specifying addresses to write output to
+ * @param out_count number of entries in out_iov
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_ioctl_retry(fuse_req_t req, const struct iovec *in_iov,
+ size_t in_count, const struct iovec *out_iov,
+ size_t out_count);
+
+/**
+ * Reply to finish ioctl
+ *
+ * Possible requests:
+ * ioctl
+ *
+ * @param req request handle
+ * @param result result to be passed to the caller
+ * @param buf buffer containing output data
+ * @param size length of output data
+ */
+int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size);
+
+/**
+ * Reply to finish ioctl with iov buffer
+ *
+ * Possible requests:
+ * ioctl
+ *
+ * @param req request handle
+ * @param result result to be passed to the caller
+ * @param iov the vector containing the data
+ * @param count the size of vector
+ */
+int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov,
+ int count);
+
+/**
+ * Reply with poll result event mask
+ *
+ * @param req request handle
+ * @param revents poll result event mask
+ */
+int fuse_reply_poll(fuse_req_t req, unsigned revents);
+
+/**
+ * Reply with offset
+ *
+ * Possible requests:
+ * lseek
+ *
+ * @param req request handle
+ * @param off offset of next data or hole
+ * @return zero for success, -errno for failure to send reply
+ */
+int fuse_reply_lseek(fuse_req_t req, off_t off);
+
+/*
+ * Notification
+ */
+
+/**
+ * Notify IO readiness event
+ *
+ * For more information, please read comment for poll operation.
+ *
+ * @param ph poll handle to notify IO readiness event for
+ */
+int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph);
+
+/**
+ * Notify to invalidate cache for an inode.
+ *
+ * Added in FUSE protocol version 7.12. If the kernel does not support
+ * this (or a newer) version, the function will return -ENOSYS and do
+ * nothing.
+ *
+ * If the filesystem has writeback caching enabled, invalidating an
+ * inode will first trigger a writeback of all dirty pages. The call
+ * will block until all writeback requests have completed and the
+ * inode has been invalidated. It will, however, not wait for
+ * completion of pending writeback requests that have been issued
+ * before.
+ *
+ * If there are no dirty pages, this function will never block.
+ *
+ * @param se the session object
+ * @param ino the inode number
+ * @param off the offset in the inode where to start invalidating
+ * or negative to invalidate attributes only
+ * @param len the amount of cache to invalidate or 0 for all
+ * @return zero for success, -errno for failure
+ */
+int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino,
+ off_t off, off_t len);
+
+/**
+ * Notify to invalidate parent attributes and the dentry matching
+ * parent/name
+ *
+ * To avoid a deadlock this function must not be called in the
+ * execution path of a related filesystem operation or within any code
+ * that could hold a lock that could be needed to execute such an
+ * operation. As of kernel 4.18, a "related operation" is a lookup(),
+ * symlink(), mknod(), mkdir(), unlink(), rename(), link() or create()
+ * request for the parent, and a setattr(), unlink(), rmdir(),
+ * rename(), setxattr(), removexattr(), readdir() or readdirplus()
+ * request for the inode itself.
+ *
+ * When called correctly, this function will never block.
+ *
+ * Added in FUSE protocol version 7.12. If the kernel does not support
+ * this (or a newer) version, the function will return -ENOSYS and do
+ * nothing.
+ *
+ * @param se the session object
+ * @param parent inode number
+ * @param name file name
+ * @param namelen strlen() of file name
+ * @return zero for success, -errno for failure
+ */
+int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent,
+ const char *name, size_t namelen);
+
+/**
+ * This function behaves like fuse_lowlevel_notify_inval_entry() with
+ * the following additional effect (at least as of Linux kernel 4.8):
+ *
+ * If the provided *child* inode matches the inode that is currently
+ * associated with the cached dentry, and if there are any inotify
+ * watches registered for the dentry, then the watchers are informed
+ * that the dentry has been deleted.
+ *
+ * To avoid a deadlock this function must not be called while
+ * executing a related filesystem operation or while holding a lock
+ * that could be needed to execute such an operation (see the
+ * description of fuse_lowlevel_notify_inval_entry() for more
+ * details).
+ *
+ * When called correctly, this function will never block.
+ *
+ * Added in FUSE protocol version 7.18. If the kernel does not support
+ * this (or a newer) version, the function will return -ENOSYS and do
+ * nothing.
+ *
+ * @param se the session object
+ * @param parent inode number
+ * @param child inode number
+ * @param name file name
+ * @param namelen strlen() of file name
+ * @return zero for success, -errno for failure
+ */
+int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent,
+ fuse_ino_t child, const char *name,
+ size_t namelen);
+
+/**
+ * Store data to the kernel buffers
+ *
+ * Synchronously store data in the kernel buffers belonging to the
+ * given inode. The stored data is marked up-to-date (no read will be
+ * performed against it, unless it's invalidated or evicted from the
+ * cache).
+ *
+ * If the stored data overflows the current file size, then the size
+ * is extended, similarly to a write(2) on the filesystem.
+ *
+ * If this function returns an error, then the store wasn't fully
+ * completed, but it may have been partially completed.
+ *
+ * Added in FUSE protocol version 7.15. If the kernel does not support
+ * this (or a newer) version, the function will return -ENOSYS and do
+ * nothing.
+ *
+ * @param se the session object
+ * @param ino the inode number
+ * @param offset the starting offset into the file to store to
+ * @param bufv buffer vector
+ * @return zero for success, -errno for failure
+ */
+int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino,
+ off_t offset, struct fuse_bufvec *bufv);
+
+/*
+ * Utility functions
+ */
+
+/**
+ * Get the userdata from the request
+ *
+ * @param req request handle
+ * @return the user data passed to fuse_session_new()
+ */
+void *fuse_req_userdata(fuse_req_t req);
+
+/**
+ * Get the context from the request
+ *
+ * The pointer returned by this function will only be valid for the
+ * request's lifetime
+ *
+ * @param req request handle
+ * @return the context structure
+ */
+const struct fuse_ctx *fuse_req_ctx(fuse_req_t req);
+
+/**
+ * Callback function for an interrupt
+ *
+ * @param req interrupted request
+ * @param data user data
+ */
+typedef void (*fuse_interrupt_func_t)(fuse_req_t req, void *data);
+
+/**
+ * Register/unregister callback for an interrupt
+ *
+ * If an interrupt has already happened, then the callback function is
+ * called from within this function, hence it's not possible for
+ * interrupts to be lost.
+ *
+ * @param req request handle
+ * @param func the callback function or NULL for unregister
+ * @param data user data passed to the callback function
+ */
+void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func,
+ void *data);
+
+/**
+ * Check if a request has already been interrupted
+ *
+ * @param req request handle
+ * @return 1 if the request has been interrupted, 0 otherwise
+ */
+int fuse_req_interrupted(fuse_req_t req);
+
+/**
+ * Check if the session is connected via virtio
+ *
+ * @param se session object
+ * @return 1 if the session is a virtio session
+ */
+int fuse_lowlevel_is_virtio(struct fuse_session *se);
+
+/*
+ * Inquiry functions
+ */
+
+/**
+ * Print low-level version information to stdout.
+ */
+void fuse_lowlevel_version(void);
+
+/**
+ * Print available low-level options to stdout. This is not an
+ * exhaustive list, but includes only those options that may be of
+ * interest to an end-user of a file system.
+ */
+void fuse_lowlevel_help(void);
+
+/**
+ * Print available options for `fuse_parse_cmdline()`.
+ */
+void fuse_cmdline_help(void);
+
+/*
+ * Filesystem setup & teardown
+ */
+
+struct fuse_cmdline_opts { /* parsed options: the "opts" output of fuse_parse_cmdline() */
+ int foreground;
+ int debug;
+ int nodefault_subtype;
+ int show_version; /* presumably set when version output was requested -- TODO confirm */
+ int show_help; /* presumably set when help output was requested -- TODO confirm */
+ int print_capabilities;
+ int syslog;
+ int log_level;
+ unsigned int max_idle_threads;
+ unsigned long rlimit_nofile; /* NOTE(review): looks like an RLIMIT_NOFILE value -- confirm */
+};
+
+/**
+ * Utility function to parse common options for simple file systems
+ * using the low-level API. A help text that describes the available
+ * options can be printed with `fuse_cmdline_help`. A single
+ * non-option argument is treated as the mountpoint. Multiple
+ * non-option arguments will result in an error.
+ *
+ * If neither -o subtype= or -o fsname= options are given, a new
+ * subtype option will be added and set to the basename of the program
+ * (the fsname will remain unset, and then defaults to "fuse").
+ *
+ * Known options will be removed from *args*, unknown options will
+ * remain.
+ *
+ * @param args argument vector (input+output)
+ * @param opts output argument for parsed options
+ * @return 0 on success, -1 on failure
+ */
+int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts);
+
+/**
+ * Create a low level session.
+ *
+ * Returns a session structure suitable for passing to
+ * fuse_session_mount() and fuse_session_loop().
+ *
+ * This function accepts most file-system independent mount options
+ * (like context, nodev, ro - see mount(8)), as well as the general
+ * fuse mount options listed in mount.fuse(8) (e.g. -o allow_root and
+ * -o default_permissions, but not ``-o use_ino``). Instead of `-o
+ * debug`, debugging may also be enabled with `-d` or `--debug`.
+ *
+ * If not all options are known, an error message is written to stderr
+ * and the function returns NULL.
+ *
+ * Option parsing skips argv[0], which is assumed to contain the
+ * program name. To prevent accidentally passing an option in
+ * argv[0], this element must always be present (even if no options
+ * are specified). It may be set to the empty string ('\0') if no
+ * reasonable value can be provided.
+ *
+ * @param args argument vector
+ * @param op the (low-level) filesystem operations
+ * @param op_size sizeof(struct fuse_lowlevel_ops)
+ * @param userdata user data
+ *
+ * @return the fuse session on success, NULL on failure
+ **/
+struct fuse_session *fuse_session_new(struct fuse_args *args,
+ const struct fuse_lowlevel_ops *op,
+ size_t op_size, void *userdata);
+
+/**
+ * Mount a FUSE file system.
+ *
+ * @param se session object
+ *
+ * @return 0 on success, -1 on failure.
+ **/
+int fuse_session_mount(struct fuse_session *se);
+
+/**
+ * Enter a single threaded, blocking event loop.
+ *
+ * When the event loop terminates because the connection to the FUSE
+ * kernel module has been closed, this function returns zero. This
+ * happens when the filesystem is unmounted regularly (by the
+ * filesystem owner or root running the umount(8) or fusermount(1)
+ * command), or if connection is explicitly severed by writing ``1``
+ * to the ``abort`` file in ``/sys/fs/fuse/connections/NNN``. The only
+ * way to distinguish between these two conditions is to check if the
+ * filesystem is still mounted after the session loop returns.
+ *
+ * When some error occurs during request processing, the function
+ * returns a negated errno(3) value.
+ *
+ * If the loop has been terminated because of a signal handler
+ * installed by fuse_set_signal_handlers(), this function returns the
+ * (positive) signal value that triggered the exit.
+ *
+ * @param se the session
+ * @return 0, -errno, or a signal value
+ */
+int fuse_session_loop(struct fuse_session *se);
+
+/**
+ * Flag a session as terminated.
+ *
+ * This function is invoked by the POSIX signal handlers, when
+ * registered using fuse_set_signal_handlers(). It will cause any
+ * running event loops to terminate on the next opportunity.
+ *
+ * @param se the session
+ */
+void fuse_session_exit(struct fuse_session *se);
+
+/**
+ * Reset the terminated flag of a session
+ *
+ * @param se the session
+ */
+void fuse_session_reset(struct fuse_session *se);
+
+/**
+ * Query the terminated flag of a session
+ *
+ * @param se the session
+ * @return 1 if exited, 0 if not exited
+ */
+int fuse_session_exited(struct fuse_session *se);
+
+/**
+ * Ensure that file system is unmounted.
+ *
+ * In regular operation, the file system is typically unmounted by the
+ * user calling umount(8) or fusermount(1), which then terminates the
+ * FUSE session loop. However, the session loop may also terminate as
+ * a result of an explicit call to fuse_session_exit() (e.g. by a
+ * signal handler installed by fuse_set_signal_handlers()). In this
+ * case the filesystem remains mounted, but any attempt to access it
+ * will block (while the filesystem process is still running) or give
+ * an ESHUTDOWN error (after the filesystem process has terminated).
+ *
+ * If the communication channel with the FUSE kernel module is still
+ * open (i.e., if the session loop was terminated by an explicit call
+ * to fuse_session_exit()), this function will close it and unmount
+ * the filesystem. If the communication channel has been closed by the
+ * kernel, this method will do (almost) nothing.
+ *
+ * NOTE: The above semantics mean that if the connection to the kernel
+ * is terminated via the ``/sys/fs/fuse/connections/NNN/abort`` file,
+ * this method will *not* unmount the filesystem.
+ *
+ * @param se the session
+ */
+void fuse_session_unmount(struct fuse_session *se);
+
+/**
+ * Destroy a session
+ *
+ * @param se the session
+ */
+void fuse_session_destroy(struct fuse_session *se);
+
+/*
+ * Custom event loop support
+ */
+
+/**
+ * Return file descriptor for communication with kernel.
+ *
+ * The file descriptor can be used to integrate FUSE with a custom event
+ * loop. Whenever data is available for reading on the provided fd,
+ * the event loop should call `fuse_session_receive_buf` followed by
+ * `fuse_session_process_buf` to process the request.
+ *
+ * The returned file descriptor is valid until `fuse_session_unmount`
+ * is called.
+ *
+ * @param se the session
+ * @return a file descriptor
+ */
+int fuse_session_fd(struct fuse_session *se);
+
+/**
+ * Process a raw request supplied in a generic buffer
+ *
+ * The fuse_buf may contain a memory buffer or a pipe file descriptor.
+ *
+ * @param se the session
+ * @param buf the fuse_buf containing the request
+ */
+void fuse_session_process_buf(struct fuse_session *se,
+ const struct fuse_buf *buf);
+
+/**
+ * Read a raw request from the kernel into the supplied buffer.
+ *
+ * Depending on file system options, system capabilities, and request
+ * size the request is either read into a memory buffer or spliced
+ * into a temporary pipe.
+ *
+ * @param se the session
+ * @param buf the fuse_buf to store the request in
+ * @return the actual size of the raw request, or -errno on error
+ */
+int fuse_session_receive_buf(struct fuse_session *se, struct fuse_buf *buf);
+
+#endif /* FUSE_LOWLEVEL_H_ */
diff --git a/tools/virtiofsd/fuse_misc.h b/tools/virtiofsd/fuse_misc.h
new file mode 100644
index 000000000..f252baa75
--- /dev/null
+++ b/tools/virtiofsd/fuse_misc.h
@@ -0,0 +1,59 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#include <pthread.h>
+
+/*
+ * Versioned symbols cannot be used in some cases because they
+ * - confuse the dynamic linker in uClibc
+ * - are not supported on MacOSX (in MachO binary format)
+ */
+#if (!defined(__UCLIBC__) && !defined(__APPLE__))
+#define FUSE_SYMVER(x) __asm__(x)
+#else
+#define FUSE_SYMVER(x)
+#endif
+
+/*
+ * fuse_mutex_init(mut): initialise the pthread mutex 'mut'.
+ *
+ * Without USE_UCLIBC this is a plain pthread_mutex_init() with default
+ * attributes (and the expression yields its return value); with
+ * USE_UCLIBC it is a void inline function that creates the mutex with
+ * the PTHREAD_MUTEX_ADAPTIVE_NP type.
+ */
+#ifndef USE_UCLIBC
+#define fuse_mutex_init(mut) pthread_mutex_init(mut, NULL)
+#else
+/* Is this hack still needed? */
+static inline void fuse_mutex_init(pthread_mutex_t *mut)
+{
+ pthread_mutexattr_t attr;
+ pthread_mutexattr_init(&attr);
+ pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP);
+ pthread_mutex_init(mut, &attr);
+ /* the attribute object is no longer needed once the mutex exists */
+ pthread_mutexattr_destroy(&attr);
+}
+#endif
+
+/*
+ * Accessors for the nanosecond part of the access/change/modification
+ * timestamps of a 'struct stat *'.  The field names depend on which
+ * HAVE_STRUCT_STAT_* macro is defined; when neither is defined the getters
+ * evaluate to 0 and the setters are no-ops.
+ */
+#ifdef HAVE_STRUCT_STAT_ST_ATIM
+/* Linux */
+#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atim.tv_nsec)
+#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctim.tv_nsec)
+#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtim.tv_nsec)
+#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atim.tv_nsec = (val)
+#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctim.tv_nsec = (val)
+#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtim.tv_nsec = (val)
+#elif defined(HAVE_STRUCT_STAT_ST_ATIMESPEC)
+/* FreeBSD */
+#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atimespec.tv_nsec)
+#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctimespec.tv_nsec)
+#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtimespec.tv_nsec)
+#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atimespec.tv_nsec = (val)
+#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctimespec.tv_nsec = (val)
+#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtimespec.tv_nsec = (val)
+#else
+/* No sub-second timestamp fields known: read as 0, ignore writes */
+#define ST_ATIM_NSEC(stbuf) 0
+#define ST_CTIM_NSEC(stbuf) 0
+#define ST_MTIM_NSEC(stbuf) 0
+#define ST_ATIM_NSEC_SET(stbuf, val) do { } while (0)
+#define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0)
+#define ST_MTIM_NSEC_SET(stbuf, val) do { } while (0)
+#endif
diff --git a/tools/virtiofsd/fuse_opt.c b/tools/virtiofsd/fuse_opt.c
new file mode 100644
index 000000000..9d371448e
--- /dev/null
+++ b/tools/virtiofsd/fuse_opt.c
@@ -0,0 +1,446 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * Implementation of option parsing routines (dealing with `struct
+ * fuse_args`).
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#include "qemu/osdep.h"
+#include "fuse_opt.h"
+#include "fuse_i.h"
+#include "fuse_misc.h"
+
+
+/* Parser state shared by the helpers behind fuse_opt_parse(). */
+struct fuse_opt_context {
+ void *data; /* caller data: base for fuse_opt.offset, passed to proc */
+ const struct fuse_opt *opt; /* option template array to match against */
+ fuse_opt_proc_t proc; /* processing callback, may be NULL */
+ int argctr; /* index into argv of the argument being processed */
+ int argc; /* number of input arguments */
+ char **argv; /* input argument vector */
+ struct fuse_args outargs; /* output argument list being built */
+ char *opts; /* accumulated comma separated option string, or NULL */
+ int nonopt; /* set to outargs.argc once "--" was seen, otherwise 0 */
+};
+
+/*
+ * Release the contents of 'args' and reset it to an empty list.
+ * The strings and the vector are only freed when 'allocated' is set;
+ * a NULL 'args' is accepted and ignored.
+ */
+void fuse_opt_free_args(struct fuse_args *args)
+{
+    if (!args) {
+        return;
+    }
+
+    if (args->argv && args->allocated) {
+        int idx = 0;
+
+        while (idx < args->argc) {
+            free(args->argv[idx]);
+            idx++;
+        }
+        free(args->argv);
+    }
+
+    args->allocated = 0;
+    args->argv = NULL;
+    args->argc = 0;
+}
+
+/* Log an out-of-memory error; always returns -1 so callers can return it. */
+static int alloc_failed(void)
+{
+    const int rc = -1;
+
+    fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n");
+    return rc;
+}
+
+/*
+ * Append a copy of 'arg' to 'args', keeping the vector NULL terminated.
+ * 'args' must be empty or already heap-allocated.
+ * Returns 0 on success, -1 if memory could not be allocated.
+ */
+int fuse_opt_add_arg(struct fuse_args *args, const char *arg)
+{
+    char *dup;
+    char **grown;
+
+    assert(!args->argv || args->allocated);
+
+    dup = strdup(arg);
+    if (dup == NULL) {
+        return alloc_failed();
+    }
+
+    /* one slot for the new entry, one for the terminating NULL */
+    grown = realloc(args->argv, (args->argc + 2) * sizeof(char *));
+    if (grown == NULL) {
+        free(dup);
+        return alloc_failed();
+    }
+
+    grown[args->argc] = dup;
+    grown[args->argc + 1] = NULL;
+
+    args->argv = grown;
+    args->argc += 1;
+    args->allocated = 1;
+    return 0;
+}
+
+/*
+ * Insert a copy of 'arg' at index 'pos' of 'args' (pos <= argc).
+ * The argument is first appended and then rotated into place.
+ * Returns 0 on success, -1 on allocation failure.
+ */
+static int fuse_opt_insert_arg_common(struct fuse_args *args, int pos,
+                                      const char *arg)
+{
+    int last;
+    char *appended;
+
+    assert(pos <= args->argc);
+    if (fuse_opt_add_arg(args, arg) == -1) {
+        return -1;
+    }
+
+    last = args->argc - 1;
+    if (pos == last) {
+        /* already where it belongs */
+        return 0;
+    }
+
+    appended = args->argv[last];
+    memmove(&args->argv[pos + 1], &args->argv[pos],
+            sizeof(char *) * (last - pos));
+    args->argv[pos] = appended;
+    return 0;
+}
+
+/* Public entry point: insert a copy of 'arg' at position 'pos' of 'args'. */
+int fuse_opt_insert_arg(struct fuse_args *args, int pos, const char *arg)
+{
+    int rc = fuse_opt_insert_arg_common(args, pos, arg);
+
+    return rc;
+}
+
+/*
+ * Advance to the next input argument, which is needed as the parameter of
+ * 'opt'.  Returns 0 on success, -1 (after logging) if there is none left.
+ */
+static int next_arg(struct fuse_opt_context *ctx, const char *opt)
+{
+    int following = ctx->argctr + 1;
+
+    if (following < ctx->argc) {
+        ctx->argctr = following;
+        return 0;
+    }
+
+    fuse_log(FUSE_LOG_ERR, "fuse: missing argument after `%s'\n", opt);
+    return -1;
+}
+
+/* Append 'arg' to the output argument list of the parse context. */
+static int add_arg(struct fuse_opt_context *ctx, const char *arg)
+{
+    struct fuse_args *out = &ctx->outargs;
+
+    return fuse_opt_add_arg(out, arg);
+}
+
+/*
+ * Append 'opt' to the comma separated list '*opts' (which may be NULL),
+ * growing the buffer with realloc().  When 'esc' is non-zero every ','
+ * and '\' in 'opt' is prefixed with a backslash.
+ * Returns 0 on success, -1 on allocation failure ('*opts' is untouched).
+ */
+static int add_opt_common(char **opts, const char *opt, int esc)
+{
+    unsigned oldlen = *opts ? strlen(*opts) : 0;
+    /* worst case: separator + every character escaped + NUL */
+    char *buf = realloc(*opts, oldlen + 1 + strlen(opt) * 2 + 1);
+    char *out;
+    const char *in = opt;
+
+    if (!buf) {
+        return alloc_failed();
+    }
+    *opts = buf;
+
+    out = buf;
+    if (oldlen) {
+        out = buf + oldlen;
+        *out++ = ',';
+    }
+
+    while (*in != '\0') {
+        int needs_escape = esc && (*in == ',' || *in == '\\');
+
+        if (needs_escape) {
+            *out++ = '\\';
+        }
+        *out++ = *in;
+        in++;
+    }
+    *out = '\0';
+
+    return 0;
+}
+
+/* Append 'opt' verbatim (no escaping) to the option list '*opts'. */
+int fuse_opt_add_opt(char **opts, const char *opt)
+{
+    const int escape = 0;
+
+    return add_opt_common(opts, opt, escape);
+}
+
+/* Append 'opt' to the option list '*opts', escaping ',' and '\'. */
+int fuse_opt_add_opt_escaped(char **opts, const char *opt)
+{
+    const int escape = 1;
+
+    return add_opt_common(opts, opt, escape);
+}
+
+/* Append 'opt' (escaped) to the option string collected in the context. */
+static int add_opt(struct fuse_opt_context *ctx, const char *opt)
+{
+    char **collected = &ctx->opts;
+
+    return add_opt_common(collected, opt, 1);
+}
+
+/*
+ * Decide what happens to 'arg' for the given 'key'.
+ *
+ * FUSE_OPT_KEY_DISCARD drops the argument.  Unless the key is
+ * FUSE_OPT_KEY_KEEP, the processing function (if any) is consulted first;
+ * a result of -1 or 0 is returned as is.  Otherwise the argument is kept:
+ * added to the option string when 'iso' is set, else to the output args.
+ */
+static int call_proc(struct fuse_opt_context *ctx, const char *arg, int key,
+                     int iso)
+{
+    if (key == FUSE_OPT_KEY_DISCARD) {
+        return 0;
+    }
+
+    if (key != FUSE_OPT_KEY_KEEP && ctx->proc) {
+        int verdict = ctx->proc(ctx->data, arg, key, &ctx->outargs);
+
+        if (verdict == -1 || verdict == 0) {
+            return verdict;
+        }
+    }
+
+    return iso ? add_opt(ctx, arg) : add_arg(ctx, arg);
+}
+
+/*
+ * Check whether 'arg' matches template 't'.
+ *
+ * A template whose first '=' (or, failing that, first ' ') is followed by
+ * nothing or by a '%' format matches by prefix; '*sepp' then receives the
+ * offset of that separator.  Any template also matches itself exactly,
+ * with '*sepp' set to 0.  Returns 1 on a match, 0 otherwise.
+ */
+static int match_template(const char *t, const char *arg, unsigned *sepp)
+{
+    int arglen = strlen(arg);
+    const char *sep = strchr(t, '=');
+
+    if (!sep) {
+        sep = strchr(t, ' ');
+    }
+
+    if (sep && (sep[1] == '\0' || sep[1] == '%')) {
+        /* an '=' is part of the prefix to compare, a ' ' is not */
+        int tlen = (sep - t) + (sep[0] == '=' ? 1 : 0);
+
+        if (arglen >= tlen && strncmp(arg, t, tlen) == 0) {
+            *sepp = sep - t;
+            return 1;
+        }
+    }
+
+    if (strcmp(t, arg) != 0) {
+        return 0;
+    }
+
+    *sepp = 0;
+    return 1;
+}
+
+/*
+ * Return the first entry at or after 'opt' whose template matches 'arg'
+ * (filling '*sepp' as match_template() does), or NULL when the end marker
+ * is reached or 'opt' is NULL.
+ */
+static const struct fuse_opt *find_opt(const struct fuse_opt *opt,
+                                       const char *arg, unsigned *sepp)
+{
+    if (!opt) {
+        return NULL;
+    }
+
+    while (opt->templ) {
+        if (match_template(opt->templ, arg, sepp)) {
+            return opt;
+        }
+        opt++;
+    }
+
+    return NULL;
+}
+
+/* Return 1 if 'opt' matches any template in 'opts', 0 otherwise. */
+int fuse_opt_match(const struct fuse_opt *opts, const char *opt)
+{
+    unsigned unused_sep;
+
+    return find_opt(opts, opt, &unused_sep) != NULL;
+}
+
+/*
+ * Store the option parameter 'param' in '*var' according to 'format'.
+ *
+ * "%s" duplicates the string and frees the previous value; every other
+ * format is handed to sscanf().  'arg' is only used for the error message.
+ * Returns 0 on success, -1 on allocation or conversion failure.
+ */
+static int process_opt_param(void *var, const char *format, const char *param,
+                             const char *arg)
+{
+    char **slot;
+    char *copy;
+
+    assert(format[0] == '%');
+
+    if (format[1] != 's') {
+        if (sscanf(param, format, var) == 1) {
+            return 0;
+        }
+        fuse_log(FUSE_LOG_ERR, "fuse: invalid parameter in option `%s'\n",
+                 arg);
+        return -1;
+    }
+
+    slot = var;
+    copy = strdup(param);
+    if (!copy) {
+        return alloc_failed();
+    }
+
+    free(*slot);
+    *slot = copy;
+    return 0;
+}
+
+/*
+ * Apply one matched option.
+ *
+ * An offset of -1U routes the argument through call_proc() with the
+ * option's value as key.  Otherwise the variable at data + offset is
+ * either filled from the parameter (when the template carries a format
+ * after the separator) or set to the option's value.
+ * Returns 0 on success, -1 on error.
+ */
+static int process_opt(struct fuse_opt_context *ctx, const struct fuse_opt *opt,
+                       unsigned sep, const char *arg, int iso)
+{
+    void *var;
+    const char *param;
+
+    if (opt->offset == -1U) {
+        return call_proc(ctx, arg, opt->value, iso) == -1 ? -1 : 0;
+    }
+
+    var = (char *)ctx->data + opt->offset;
+    if (!sep || !opt->templ[sep + 1]) {
+        /* no format in the template: plain flag */
+        *(int *)var = opt->value;
+        return 0;
+    }
+
+    param = arg + sep;
+    if (opt->templ[sep] == '=') {
+        param++;
+    }
+
+    return process_opt_param(var, opt->templ + sep + 1, param, arg) == -1
+               ? -1
+               : 0;
+}
+
+/*
+ * Handle an option whose parameter is the following argument ("-x foo"):
+ * consume the next argument, glue it onto the first 'sep' bytes of 'arg'
+ * and process the combined string as a single option.
+ * Returns the result of process_opt(), or -1 on a missing argument or
+ * allocation failure.
+ */
+static int process_opt_sep_arg(struct fuse_opt_context *ctx,
+                               const struct fuse_opt *opt, unsigned sep,
+                               const char *arg, int iso)
+{
+    const char *value;
+    size_t value_len;
+    char *joined;
+    int res;
+
+    if (next_arg(ctx, arg) == -1) {
+        return -1;
+    }
+
+    value = ctx->argv[ctx->argctr];
+    value_len = strlen(value);
+
+    joined = g_try_malloc(sep + value_len + 1);
+    if (!joined) {
+        return alloc_failed();
+    }
+
+    memcpy(joined, arg, sep);
+    /* copies the terminating NUL as well */
+    memcpy(joined + sep, value, value_len + 1);
+
+    res = process_opt(ctx, opt, sep, joined, iso);
+    g_free(joined);
+
+    return res;
+}
+
+/*
+ * Process one option: run the action of every matching template, or pass
+ * the argument to call_proc() with FUSE_OPT_KEY_OPT when nothing matches.
+ * Returns 0 on success, -1 on the first error.
+ */
+static int process_gopt(struct fuse_opt_context *ctx, const char *arg, int iso)
+{
+    unsigned sep;
+    const struct fuse_opt *opt = find_opt(ctx->opt, arg, &sep);
+
+    if (!opt) {
+        return call_proc(ctx, arg, FUSE_OPT_KEY_OPT, iso);
+    }
+
+    do {
+        int res;
+
+        /* "-x " template given without an inline parameter */
+        if (sep && opt->templ[sep] == ' ' && !arg[sep]) {
+            res = process_opt_sep_arg(ctx, opt, sep, arg, iso);
+        } else {
+            res = process_opt(ctx, opt, sep, arg, iso);
+        }
+        if (res == -1) {
+            return -1;
+        }
+
+        opt = find_opt(opt + 1, arg, &sep);
+    } while (opt);
+
+    return 0;
+}
+
+/*
+ * Split the comma separated option string 'opts' (modified in place) and
+ * feed every option to process_gopt() with iso=1.
+ *
+ * Each option is unescaped into the start of the 'opts' buffer before it
+ * is processed: a backslash followed by three octal digits (the first one
+ * in 0-3) becomes the byte with that value, any other backslash-escaped
+ * character is copied literally, so an escaped ',' does not split.
+ *
+ * Returns 0 on success, -1 as soon as process_gopt() fails.
+ */
+static int process_real_option_group(struct fuse_opt_context *ctx, char *opts)
+{
+ char *s = opts; /* read cursor */
+ char *d = s; /* write cursor for the unescaped option */
+ int end = 0;
+
+ while (!end) {
+ if (*s == '\0') {
+ end = 1;
+ }
+ if (*s == ',' || end) {
+ int res;
+
+ /* terminate the option collected at the start of the buffer */
+ *d = '\0';
+ res = process_gopt(ctx, opts, 1);
+ if (res == -1) {
+ return -1;
+ }
+ /* the next option is again assembled at the buffer start */
+ d = opts;
+ } else {
+ if (s[0] == '\\' && s[1] != '\0') {
+ s++;
+ if (s[0] >= '0' && s[0] <= '3' && s[1] >= '0' && s[1] <= '7' &&
+ s[2] >= '0' && s[2] <= '7') {
+ /* three-digit octal escape */
+ *d++ = (s[0] - '0') * 0100 + (s[1] - '0') * 0010 +
+ (s[2] - '0');
+ s += 2;
+ } else {
+ *d++ = *s;
+ }
+ } else {
+ *d++ = *s;
+ }
+ }
+ s++;
+ }
+
+ return 0;
+}
+
+/*
+ * Process a "-o" option group.  The string is duplicated first because
+ * process_real_option_group() rewrites its input in place.
+ * Returns the result of the processing, or -1 on allocation failure.
+ */
+static int process_option_group(struct fuse_opt_context *ctx, const char *opts)
+{
+    char *scratch = strdup(opts);
+    int res;
+
+    if (scratch == NULL) {
+        return alloc_failed();
+    }
+
+    res = process_real_option_group(ctx, scratch);
+    free(scratch);
+    return res;
+}
+
+/*
+ * Classify and process a single command line argument:
+ *  - non-options (anything after "--" or not starting with '-'),
+ *  - "-o<group>" / "-o <group>" option groups,
+ *  - the "--" separator itself, which is kept and recorded in ctx->nonopt,
+ *  - any other option.
+ * Returns 0 on success, -1 on error.
+ */
+static int process_one(struct fuse_opt_context *ctx, const char *arg)
+{
+    if (ctx->nonopt || arg[0] != '-') {
+        return call_proc(ctx, arg, FUSE_OPT_KEY_NONOPT, 0);
+    }
+
+    if (arg[1] == 'o') {
+        const char *group = arg + 2;
+
+        if (!arg[2]) {
+            /* the group is the following argument */
+            if (next_arg(ctx, arg) == -1) {
+                return -1;
+            }
+            group = ctx->argv[ctx->argctr];
+        }
+        return process_option_group(ctx, group);
+    }
+
+    if (arg[1] == '-' && !arg[2]) {
+        if (add_arg(ctx, arg) == -1) {
+            return -1;
+        }
+        ctx->nonopt = ctx->outargs.argc;
+        return 0;
+    }
+
+    return process_gopt(ctx, arg, 0);
+}
+
+/*
+ * Run the parser over ctx->argv and build ctx->outargs:
+ * argv[0] is copied, every further argument goes through process_one(),
+ * the collected option string (if any) is inserted as "-o <opts>" right
+ * after argv[0], and a trailing "--" is dropped.
+ * Returns 0 on success, -1 on error.
+ */
+static int opt_parse(struct fuse_opt_context *ctx)
+{
+    struct fuse_args *out = &ctx->outargs;
+
+    if (ctx->argc && add_arg(ctx, ctx->argv[0]) == -1) {
+        return -1;
+    }
+
+    /* process_one() may itself advance argctr to consume a parameter */
+    ctx->argctr = 1;
+    while (ctx->argctr < ctx->argc) {
+        if (process_one(ctx, ctx->argv[ctx->argctr]) == -1) {
+            return -1;
+        }
+        ctx->argctr++;
+    }
+
+    if (ctx->opts) {
+        if (fuse_opt_insert_arg(out, 1, "-o") == -1 ||
+            fuse_opt_insert_arg(out, 2, ctx->opts) == -1) {
+            return -1;
+        }
+    }
+
+    /* If option separator ("--") is the last argument, remove it */
+    if (ctx->nonopt && ctx->nonopt == out->argc) {
+        int last = out->argc - 1;
+
+        if (strcmp(out->argv[last], "--") == 0) {
+            free(out->argv[last]);
+            out->argv[last] = NULL;
+            out->argc = last;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Parse 'args' against the templates in 'opts'.
+ *
+ * On success 'args' is replaced by the newly built argument list and the
+ * previous contents are released; on failure 'args' is left untouched and
+ * the partial output is released.  A NULL or empty 'args' is a no-op.
+ * Returns 0 on success, -1 on error.
+ */
+int fuse_opt_parse(struct fuse_args *args, void *data,
+                   const struct fuse_opt opts[], fuse_opt_proc_t proc)
+{
+    struct fuse_opt_context ctx = {
+        .data = data,
+        .opt = opts,
+        .proc = proc,
+    };
+    int res;
+
+    if (!args || !args->argv || !args->argc) {
+        return 0;
+    }
+
+    ctx.argv = args->argv;
+    ctx.argc = args->argc;
+
+    res = opt_parse(&ctx);
+    if (res != -1) {
+        /* swap, so that the old list is the one freed below */
+        struct fuse_args previous = *args;
+
+        *args = ctx.outargs;
+        ctx.outargs = previous;
+    }
+
+    free(ctx.opts);
+    fuse_opt_free_args(&ctx.outargs);
+    return res;
+}
diff --git a/tools/virtiofsd/fuse_opt.h b/tools/virtiofsd/fuse_opt.h
new file mode 100644
index 000000000..8f59b4d30
--- /dev/null
+++ b/tools/virtiofsd/fuse_opt.h
@@ -0,0 +1,272 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB.
+ */
+
+#ifndef FUSE_OPT_H_
+#define FUSE_OPT_H_
+
+/** @file
+ *
+ * This file defines the option parsing interface of FUSE
+ */
+
+/**
+ * Option description
+ *
+ * This structure describes a single option, and action associated
+ * with it, in case it matches.
+ *
+ * More than one such match may occur, in which case the action for
+ * each match is executed.
+ *
+ * There are three possible actions in case of a match:
+ *
+ * i) An integer (int or unsigned) variable determined by 'offset' is
+ * set to 'value'
+ *
+ * ii) The processing function is called, with 'value' as the key
+ *
+ * iii) An integer (any) or string (char *) variable determined by
+ * 'offset' is set to the value of an option parameter
+ *
+ * 'offset' should normally be either set to
+ *
+ * - 'offsetof(struct foo, member)' actions i) and iii)
+ *
+ * - -1 action ii)
+ *
+ * The 'offsetof()' macro is defined in the <stddef.h> header.
+ *
+ * The template determines which options match, and also have an
+ * effect on the action. Normally the action is either i) or ii), but
+ * if a format is present in the template, then action iii) is
+ * performed.
+ *
+ * The types of templates are:
+ *
+ * 1) "-x", "-foo", "--foo", "--foo-bar", etc. These match only
+ * themselves. Invalid values are "--" and anything beginning
+ * with "-o"
+ *
+ * 2) "foo", "foo-bar", etc. These match "-ofoo", "-ofoo-bar" or
+ * the relevant option in a comma separated option list
+ *
+ * 3) "bar=", "--foo=", etc. These are variations of 1) and 2)
+ * which have a parameter
+ *
+ * 4) "bar=%s", "--foo=%lu", etc. Same matching as above but perform
+ * action iii).
+ *
+ * 5) "-x ", etc. Matches either "-xparam" or "-x param" as
+ * two separate arguments
+ *
+ * 6) "-x %s", etc. Combination of 4) and 5)
+ *
+ * If the format is "%s", memory is allocated for the string unlike with
+ * scanf(). The previous value (if non-NULL) stored at this location is
+ * freed.
+ */
+struct fuse_opt {
+ /** Matching template and optional parameter formatting */
+ const char *templ;
+
+ /**
+ * Offset of variable within 'data' parameter of fuse_opt_parse()
+ * or -1 (FUSE_OPT_KEY() initialises this field with -1U)
+ */
+ unsigned long offset;
+
+ /**
+ * Value to set the variable to, or to be passed as 'key' to the
+ * processing function. Ignored if template has a format
+ */
+ int value;
+};
+
+/**
+ * Key option. In case of a match, the processing function will be
+ * called with the specified key.
+ */
+#define FUSE_OPT_KEY(templ, key) \
+ { \
+ templ, -1U, key \
+ }
+
+/**
+ * Last option. An array of 'struct fuse_opt' must end with a NULL
+ * template value
+ */
+#define FUSE_OPT_END \
+ { \
+ NULL, 0, 0 \
+ }
+
+/**
+ * Argument list
+ */
+struct fuse_args {
+ /** Argument count */
+ int argc;
+
+ /** Argument vector. NULL terminated */
+ char **argv;
+
+ /**
+ * Is 'argv' allocated? Set by fuse_opt_add_arg(); fuse_opt_free_args()
+ * only frees the vector and its strings when this is non-zero
+ */
+ int allocated;
+};
+
+/**
+ * Initializer for 'struct fuse_args'
+ */
+#define FUSE_ARGS_INIT(argc, argv) \
+ { \
+ argc, argv, 0 \
+ }
+
+/**
+ * Key value passed to the processing function if an option did not
+ * match any template
+ */
+#define FUSE_OPT_KEY_OPT -1
+
+/**
+ * Key value passed to the processing function for all non-options
+ *
+ * Non-options are the arguments beginning with a character other than
+ * '-' or all arguments after the special '--' option
+ */
+#define FUSE_OPT_KEY_NONOPT -2
+
+/**
+ * Special key value for options to keep
+ *
+ * Argument is not passed to processing function, but behave as if the
+ * processing function returned 1
+ */
+#define FUSE_OPT_KEY_KEEP -3
+
+/**
+ * Special key value for options to discard
+ *
+ * Argument is not passed to processing function, but behave as if the
+ * processing function returned zero
+ */
+#define FUSE_OPT_KEY_DISCARD -4
+
+/**
+ * Processing function
+ *
+ * This function is called if
+ * - option did not match any 'struct fuse_opt'
+ * - argument is a non-option
+ * - option did match and offset was set to -1
+ *
+ * The 'arg' parameter will always contain the whole argument or
+ * option including the parameter if exists. A two-argument option
+ * ("-x foo") is always converted to single argument option of the
+ * form "-xfoo" before this function is called.
+ *
+ * Options of the form '-ofoo' are passed to this function without the
+ * '-o' prefix.
+ *
+ * The return value of this function determines whether this argument
+ * is to be inserted into the output argument vector, or discarded.
+ *
+ * @param data is the user data passed to the fuse_opt_parse() function
+ * @param arg is the whole argument or option
+ * @param key determines why the processing function was called
+ * @param outargs the current output argument list
+ * @return -1 on error, 0 if arg is to be discarded, 1 if arg should be kept
+ */
+typedef int (*fuse_opt_proc_t)(void *data, const char *arg, int key,
+ struct fuse_args *outargs);
+
+/**
+ * Option parsing function
+ *
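+ * If 'args' was returned from a previous call to fuse_opt_parse() or it
+ * was constructed with fuse_opt_add_arg()/fuse_opt_insert_arg(), its
+ * argument vector is heap-allocated; on success that old vector is freed
+ * and 'args' is replaced by the newly built output argument list.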
+ *
+ * A NULL 'args' is equivalent to an empty argument vector
+ *
+ * A NULL 'opts' is equivalent to an 'opts' array containing a single
+ * end marker
+ *
+ * A NULL 'proc' is equivalent to a processing function always
+ * returning '1'
+ *
+ * @param args is the input and output argument list
+ * @param data is the user data
+ * @param opts is the option description array
+ * @param proc is the processing function
+ * @return -1 on error, 0 on success
+ */
+int fuse_opt_parse(struct fuse_args *args, void *data,
+ const struct fuse_opt opts[], fuse_opt_proc_t proc);
+
+/**
+ * Add an option to a comma separated option list
+ *
+ * @param opts is a pointer to an option list, may point to a NULL value
+ * @param opt is the option to add
+ * @return -1 on allocation error, 0 on success
+ */
+int fuse_opt_add_opt(char **opts, const char *opt);
+
+/**
+ * Add an option, escaping commas, to a comma separated option list
+ *
+ * @param opts is a pointer to an option list, may point to a NULL value
+ * @param opt is the option to add
+ * @return -1 on allocation error, 0 on success
+ */
+int fuse_opt_add_opt_escaped(char **opts, const char *opt);
+
+/**
+ * Add an argument to a NULL terminated argument vector
+ *
+ * @param args is the structure containing the current argument list
+ * @param arg is the new argument to add
+ * @return -1 on allocation error, 0 on success
+ */
+int fuse_opt_add_arg(struct fuse_args *args, const char *arg);
+
+/**
+ * Add an argument at the specified position in a NULL terminated
+ * argument vector
+ *
+ * Adds the argument to the N-th position. This is useful for adding
+ * options at the beginning of the array which must not come after the
+ * special '--' option.
+ *
+ * @param args is the structure containing the current argument list
+ * @param pos is the position at which to add the argument
+ * @param arg is the new argument to add
+ * @return -1 on allocation error, 0 on success
+ */
+int fuse_opt_insert_arg(struct fuse_args *args, int pos, const char *arg);
+
+/**
+ * Free the contents of argument list
+ *
+ * The structure itself is not freed
+ *
+ * @param args is the structure containing the argument list
+ */
+void fuse_opt_free_args(struct fuse_args *args);
+
+
+/**
+ * Check if an option matches
+ *
+ * @param opts is the option description array
+ * @param opt is the option to match
+ * @return 1 if a match is found, 0 if not
+ */
+int fuse_opt_match(const struct fuse_opt opts[], const char *opt);
+
+#endif /* FUSE_OPT_H_ */
diff --git a/tools/virtiofsd/fuse_signals.c b/tools/virtiofsd/fuse_signals.c
new file mode 100644
index 000000000..1de46de1c
--- /dev/null
+++ b/tools/virtiofsd/fuse_signals.c
@@ -0,0 +1,93 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * Utility functions for setting signal handlers.
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#include "qemu/osdep.h"
+#include "fuse_i.h"
+#include "fuse_lowlevel.h"
+
+
+static struct fuse_session *fuse_instance;
+
+/*
+ * Signal handler for the terminating signals: ask the registered session
+ * to exit and record the signal number in its 'error' field.  Does nothing
+ * when no session is registered.
+ */
+static void exit_handler(int sig)
+{
+    if (!fuse_instance) {
+        return;
+    }
+
+    fuse_session_exit(fuse_instance);
+    if (sig <= 0) {
+        fuse_log(FUSE_LOG_ERR, "assertion error: signal value <= 0\n");
+        abort();
+    }
+    fuse_instance->error = sig;
+}
+
+/* Signal handler that deliberately ignores the signal it receives. */
+static void do_nothing(int sig)
+{
+    /* parameter intentionally unused */
+    (void)sig;
+}
+
+/*
+ * Install ('remove' == 0) or uninstall ('remove' != 0) 'handler' for 'sig'.
+ *
+ * The disposition is only changed when it currently is what we expect:
+ * SIG_DFL when installing, 'handler' when removing.  Anything else is
+ * left alone.  Returns 0 on success, -1 if sigaction() fails.
+ */
+static int set_one_signal_handler(int sig, void (*handler)(int), int remove)
+{
+    struct sigaction wanted;
+    struct sigaction current;
+    void (*expected)(int) = remove ? handler : SIG_DFL;
+
+    memset(&wanted, 0, sizeof(struct sigaction));
+    wanted.sa_handler = remove ? SIG_DFL : handler;
+    sigemptyset(&(wanted.sa_mask));
+    wanted.sa_flags = 0;
+
+    if (sigaction(sig, NULL, &current) == -1) {
+        fuse_log(FUSE_LOG_ERR, "fuse: cannot get old signal handler: %s\n",
+                 strerror(errno));
+        return -1;
+    }
+
+    if (current.sa_handler != expected) {
+        /* somebody else owns this signal; do not touch it */
+        return 0;
+    }
+
+    if (sigaction(sig, &wanted, NULL) == -1) {
+        fuse_log(FUSE_LOG_ERR, "fuse: cannot set signal handler: %s\n",
+                 strerror(errno));
+        return -1;
+    }
+    return 0;
+}
+
+/*
+ * Install the exit handler for SIGHUP, SIGINT and SIGTERM and a no-op
+ * handler for SIGPIPE, then register 'se' as the session to stop.
+ * Returns 0 on success, -1 as soon as one handler cannot be installed.
+ */
+int fuse_set_signal_handlers(struct fuse_session *se)
+{
+    /*
+     * If we used SIG_IGN instead of the do_nothing function,
+     * then we would be unable to tell if we set SIG_IGN (and
+     * thus should reset to SIG_DFL in fuse_remove_signal_handlers)
+     * or if it was already set to SIG_IGN (and should be left
+     * untouched).
+     */
+    static const struct {
+        int signo;
+        void (*handler)(int);
+    } wanted[] = {
+        { SIGHUP, exit_handler },
+        { SIGINT, exit_handler },
+        { SIGTERM, exit_handler },
+        { SIGPIPE, do_nothing },
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(wanted) / sizeof(wanted[0]); i++) {
+        if (set_one_signal_handler(wanted[i].signo, wanted[i].handler, 0) ==
+            -1) {
+            return -1;
+        }
+    }
+
+    fuse_instance = se;
+    return 0;
+}
+
+/*
+ * Undo fuse_set_signal_handlers(): unregister 'se' (logging an error if it
+ * is not the registered session) and restore the default disposition of
+ * the four signals where our handlers are still installed.
+ */
+void fuse_remove_signal_handlers(struct fuse_session *se)
+{
+    if (fuse_instance == se) {
+        fuse_instance = NULL;
+    } else {
+        fuse_log(FUSE_LOG_ERR,
+                 "fuse: fuse_remove_signal_handlers: unknown session\n");
+    }
+
+    /* failures are already logged by the helper; nothing more to do here */
+    set_one_signal_handler(SIGHUP, exit_handler, 1);
+    set_one_signal_handler(SIGINT, exit_handler, 1);
+    set_one_signal_handler(SIGTERM, exit_handler, 1);
+    set_one_signal_handler(SIGPIPE, do_nothing, 1);
+}
diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
new file mode 100644
index 000000000..60b96470c
--- /dev/null
+++ b/tools/virtiofsd/fuse_virtio.c
@@ -0,0 +1,1079 @@
+/*
+ * virtio-fs glue for FUSE
+ * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Dave Gilbert <dgilbert@redhat.com>
+ *
+ * Implements the glue between libfuse and libvhost-user
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/iov.h"
+#include "qapi/error.h"
+#include "fuse_i.h"
+#include "standard-headers/linux/fuse.h"
+#include "fuse_misc.h"
+#include "fuse_opt.h"
+#include "fuse_virtio.h"
+
+#include <sys/eventfd.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <grp.h>
+
+#include "libvhost-user.h"
+
+struct fv_VuDev;
+/* Per-virtqueue state. */
+struct fv_QueueInfo {
+ /* presumably the thread running fv_queue_thread() for this queue - confirm */
+ pthread_t thread;
+ /*
+ * This lock protects the VuVirtq preventing races between
+ * fv_queue_thread() and fv_queue_worker().
+ */
+ pthread_mutex_t vq_lock;
+
+ /* Back pointer to the device this queue belongs to */
+ struct fv_VuDev *virtio_dev;
+
+ /* Our queue index, corresponds to array position */
+ int qidx;
+ /* NOTE(review): looks like the fd signalled on a queue kick - confirm */
+ int kick_fd;
+ int kill_fd; /* For killing the thread */
+};
+
+/*
+ * A FUSE request
+ *
+ * The reply functions receive only the embedded 'ch' and use
+ * container_of() to get back to the request and its virtqueue element.
+ */
+typedef struct {
+ VuVirtqElement elem;
+ struct fuse_chan ch;
+
+ /* Used to complete requests that involve no reply */
+ bool reply_sent;
+} FVRequest;
+
+/*
+ * We pass the dev element into libvhost-user
+ * and then use it to get back to the outer
+ * container for other data.
+ */
+struct fv_VuDev {
+ VuDev dev;
+ /* Session this device serves; se->virtio_dev leads back here */
+ struct fuse_session *se;
+
+ /*
+ * Either handle virtqueues or vhost-user protocol messages. Don't do
+ * both at the same time since that could lead to race conditions if
+ * virtqueues or memory tables change while another thread is accessing
+ * them.
+ *
+ * The assumptions are:
+ * 1. fv_queue_thread() reads/writes to virtqueues and only reads VuDev.
+ * 2. virtio_loop() reads/writes virtqueues and VuDev.
+ */
+ pthread_rwlock_t vu_dispatch_rwlock;
+
+ /*
+ * The following pair of fields are only accessed in the main
+ * virtio_loop
+ */
+ size_t nqueues;
+ struct fv_QueueInfo **qi;
+};
+
+/* libvhost-user callback: report the feature bits this device offers. */
+static uint64_t fv_get_features(VuDev *dev)
+{
+    uint64_t features = 0;
+
+    features |= 1ULL << VIRTIO_F_VERSION_1;
+    return features;
+}
+
+/* libvhost-user callback: negotiated features; nothing is recorded here. */
+static void fv_set_features(VuDev *dev, uint64_t features)
+{
+    /* intentionally empty */
+}
+
+/*
+ * libvhost-user callback asking us to start listening on 'fd' (typically
+ * a queue kick?).  Not implemented: the request is only logged.
+ */
+static void fv_set_watch(VuDev *dev, int fd, int condition, vu_watch_cb cb,
+                         void *data)
+{
+    /* TODO: actually watch the fd */
+    fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
+}
+
+/*
+ * libvhost-user callback asking us to stop listening on 'fd'.
+ * Not implemented: the request is only logged.
+ */
+static void fv_remove_watch(VuDev *dev, int fd)
+{
+    /* TODO: actually drop the watch */
+    fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd);
+}
+
+/* libvhost-user panic callback: log the error and terminate the process. */
+static void fv_panic(VuDev *dev, const char *err)
+{
+    fuse_log(FUSE_LOG_ERR, "%s: libvhost-user: %s\n", __func__, err);
+
+    /* TODO: Allow reconnects?? */
+    exit(EXIT_FAILURE);
+}
+
+/*
+ * Gather up to 'max' bytes from the 'out_num' entries of 'out_sg' into the
+ * memory buffer of 'buf'.  The caller must ensure buf->mem is big enough.
+ * Returns the number of bytes copied.
+ */
+static size_t copy_from_iov(struct fuse_buf *buf, size_t out_num,
+                            const struct iovec *out_sg,
+                            size_t max)
+{
+    char *dest = buf->mem;
+    size_t copied = 0;
+
+    for (; out_num && max; out_sg++, out_num--) {
+        size_t chunk = MIN(out_sg->iov_len, max);
+
+        memcpy(dest, out_sg->iov_base, chunk);
+        dest += chunk;
+        copied += chunk;
+        max -= chunk;
+    }
+
+    return copied;
+}
+
+/*
+ * Locate the position 'skip' bytes into the iovec array 'sg'.
+ *
+ * '*sg_1stindex' receives the index of the first entry to read from and
+ * '*sg_1stskip' the number of bytes to skip inside that entry.  When the
+ * array holds no byte beyond the skipped ones, the index is 'sg_size' and
+ * the in-entry skip is 0.
+ *
+ * Returns true if there are at least 'skip' bytes in the iovec.
+ */
+static bool skip_iov(const struct iovec *sg, size_t sg_size,
+                     size_t skip,
+                     size_t *sg_1stindex, size_t *sg_1stskip)
+{
+    size_t idx = 0;
+
+    /* consume whole entries while they fit entirely into 'skip' */
+    while (idx < sg_size && sg[idx].iov_len <= skip) {
+        skip -= sg[idx].iov_len;
+        idx++;
+    }
+
+    *sg_1stindex = idx;
+    if (idx < sg_size) {
+        *sg_1stskip = skip;
+        return true;
+    }
+
+    *sg_1stskip = 0;
+    return skip == 0;
+}
+
+/*
+ * Copy 'to_copy' bytes from the 'src_iov' scatter-gather list to the
+ * 'dst_iov' list; entry boundaries of the two lists need not line up.
+ * The caller must have checked sizes: running out of source or
+ * destination entries trips an assert.
+ */
+static void copy_iov(struct iovec *src_iov, int src_count,
+ struct iovec *dst_iov, int dst_count, size_t to_copy)
+{
+ size_t dst_offset = 0; /* bytes already filled in the current dst entry */
+ /* Outer loop copies 'src' elements */
+ while (to_copy) {
+ assert(src_count);
+ size_t src_len = src_iov[0].iov_len;
+ size_t src_offset = 0;
+
+ /* never take more from this entry than is left to copy overall */
+ if (src_len > to_copy) {
+ src_len = to_copy;
+ }
+ /* Inner loop copies contents of one 'src' to maybe multiple dst. */
+ while (src_len) {
+ assert(dst_count);
+ /* room left in the current dst entry, capped to what src has */
+ size_t dst_len = dst_iov[0].iov_len - dst_offset;
+ if (dst_len > src_len) {
+ dst_len = src_len;
+ }
+
+ memcpy(dst_iov[0].iov_base + dst_offset,
+ src_iov[0].iov_base + src_offset, dst_len);
+ src_len -= dst_len;
+ to_copy -= dst_len;
+ src_offset += dst_len;
+ dst_offset += dst_len;
+
+ assert(dst_offset <= dst_iov[0].iov_len);
+ /* current dst entry is full: move on to the next one */
+ if (dst_offset == dst_iov[0].iov_len) {
+ dst_offset = 0;
+ dst_iov++;
+ dst_count--;
+ }
+ }
+ src_iov++;
+ src_count--;
+ }
+}
+
+/*
+ * pthread_rwlock_rdlock() and pthread_rwlock_wrlock() can fail if
+ * a deadlock condition is detected or the current thread already
+ * owns the lock. They can also fail, like pthread_rwlock_unlock(),
+ * if the lock wasn't properly initialized. None of these are ever
+ * expected to happen.
+ */
+static void vu_dispatch_rdlock(struct fv_VuDev *vud)
+{
+    pthread_rwlock_t *lock = &vud->vu_dispatch_rwlock;
+    int rc = pthread_rwlock_rdlock(lock);
+
+    assert(rc == 0);
+}
+
+/* Take the dispatch lock for writing; failure is not expected (asserts). */
+static void vu_dispatch_wrlock(struct fv_VuDev *vud)
+{
+    pthread_rwlock_t *lock = &vud->vu_dispatch_rwlock;
+    int rc = pthread_rwlock_wrlock(lock);
+
+    assert(rc == 0);
+}
+
+/* Release the dispatch lock; failure is not expected (asserts). */
+static void vu_dispatch_unlock(struct fv_VuDev *vud)
+{
+    pthread_rwlock_t *lock = &vud->vu_dispatch_rwlock;
+    int rc = pthread_rwlock_unlock(lock);
+
+    assert(rc == 0);
+}
+
+/*
+ * Push the completed element 'elem' ('len' bytes written) onto the queue
+ * described by 'qi' and notify the other side.  The dispatch lock is held
+ * for reading and the queue lock is held around the push and notify.
+ */
+static void vq_send_element(struct fv_QueueInfo *qi, VuVirtqElement *elem,
+                            ssize_t len)
+{
+    struct fv_VuDev *vud = qi->virtio_dev;
+    VuDev *dev = &vud->se->virtio_dev->dev;
+    VuVirtq *q = vu_get_queue(dev, qi->qidx);
+
+    vu_dispatch_rdlock(vud);
+    pthread_mutex_lock(&qi->vq_lock);
+
+    vu_queue_push(dev, q, elem, len);
+    vu_queue_notify(dev, q);
+
+    pthread_mutex_unlock(&qi->vq_lock);
+    vu_dispatch_unlock(vud);
+}
+
+/*
+ * Called back by ll whenever it wants to send a reply/message back.
+ *
+ * The 1st element of 'iov' starts with the fuse_out_header.  The reply is
+ * copied into the 'in' scatter-gather list of the request's virtqueue
+ * element (the part going back to QEMU) and the element is pushed.
+ * 'unique' == 0 would mean a notify message, which is not supported and
+ * trips an assert.
+ *
+ * Returns 0 on success, -E2BIG if the element cannot hold the reply.
+ */
+int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
+                    struct iovec *iov, int count)
+{
+    FVRequest *req = container_of(ch, FVRequest, ch);
+    struct fv_QueueInfo *qi = ch->qi;
+    VuVirtqElement *elem = &req->elem;
+    struct fuse_out_header *out;
+    struct iovec *in_sg;
+    unsigned int in_num;
+    size_t reply_len;
+    size_t room;
+
+    assert(count >= 1);
+    assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
+
+    /* TODO: Endianness! */
+    out = iov[0].iov_base;
+    reply_len = iov_size(iov, count);
+
+    /* unique == 0 is notification, which we don't support */
+    assert(out->unique);
+    assert(!req->reply_sent);
+
+    /* The 'in' part of the elem is to qemu */
+    in_num = elem->in_num;
+    in_sg = elem->in_sg;
+    room = iov_size(in_sg, in_num);
+    fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
+             __func__, elem->index, in_num, room);
+
+    /*
+     * The elem should have room for a 'fuse_out_header' (out from fuse)
+     * plus the data based on the len in the header.
+     */
+    if (room < sizeof(struct fuse_out_header)) {
+        fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
+                 __func__, elem->index);
+        return -E2BIG;
+    }
+    if (room < reply_len) {
+        fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
+                 __func__, elem->index, reply_len);
+        return -E2BIG;
+    }
+
+    copy_iov(iov, count, in_sg, in_num, reply_len);
+
+    vq_send_element(qi, elem, reply_len);
+    req->reply_sent = true;
+
+    return 0;
+}
+
+/*
+ * Callback from fuse_send_data_iov_* when it's virtio and the buffer
+ * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK.
+ * We need to send the iov and then the buffer.
+ *
+ * The reply header in 'iov' is copied into the start of the element's
+ * 'in' scatter-gather list; up to 'len' bytes of payload are then read
+ * with preadv() from buf->buf[0].fd at buf->buf[0].pos straight into the
+ * rest of that list.  If the file ends early, the length recorded in the
+ * already-copied fuse_out_header is reduced to what was actually read.
+ *
+ * @param se    the session (unused here)
+ * @param ch    channel embedded in the FVRequest being answered
+ * @param iov   reply header; iov[0] starts with the fuse_out_header
+ * @param count number of entries in 'iov'
+ * @param buf   buffer vector whose first entry names the fd and offset
+ * @param len   number of payload bytes to read from the fd
+ * @return 0 on success, otherwise a positive errno value (E2BIG when the
+ *         element is too small, or the errno reported by preadv())
+ */
+int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
+                         struct iovec *iov, int count, struct fuse_bufvec *buf,
+                         size_t len)
+{
+    FVRequest *req = container_of(ch, FVRequest, ch);
+    struct fv_QueueInfo *qi = ch->qi;
+    VuVirtqElement *elem = &req->elem;
+    int ret = 0;
+    g_autofree struct iovec *in_sg_cpy = NULL;
+
+    assert(count >= 1);
+    assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
+
+    struct fuse_out_header *out = iov[0].iov_base;
+    /* TODO: Endianness! */
+
+    size_t iov_len = iov_size(iov, count);
+    size_t tosend_len = iov_len + len;
+
+    out->len = tosend_len;
+
+    fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__,
+             count, len, iov_len);
+
+    /* unique == 0 is notification which we don't support */
+    assert(out->unique);
+
+    assert(!req->reply_sent);
+
+    /* The 'in' part of the elem is to qemu */
+    unsigned int in_num = elem->in_num;
+    struct iovec *in_sg = elem->in_sg;
+    size_t in_len = iov_size(in_sg, in_num);
+    fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n",
+             __func__, elem->index, in_num, in_len);
+
+    /*
+     * The elem should have room for a 'fuse_out_header' (out from fuse)
+     * plus the data based on the len in the header.
+     */
+    if (in_len < sizeof(struct fuse_out_header)) {
+        fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n",
+                 __func__, elem->index);
+        return E2BIG;
+    }
+    if (in_len < tosend_len) {
+        fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n",
+                 __func__, elem->index, tosend_len);
+        return E2BIG;
+    }
+
+    /* First copy the header data from iov->in_sg */
+    copy_iov(iov, count, in_sg, in_num, iov_len);
+
+    /*
+     * Build a copy of the in_sg iov so we can skip bits in it,
+     * including changing the offsets
+     */
+    in_sg_cpy = g_new(struct iovec, in_num);
+    memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num);
+    /* These get updated as we skip */
+    struct iovec *in_sg_ptr = in_sg_cpy;
+    unsigned int in_sg_cpy_count = in_num;
+
+    /* skip over parts of in_sg that contained the header iov */
+    iov_discard_front(&in_sg_ptr, &in_sg_cpy_count, iov_len);
+
+    /*
+     * Limit the destination to 'len' bytes.  The element may offer more
+     * room than the reply needs; without this preadv() could read more
+     * than 'len', making 'len -= ret' below wrap around and placing data
+     * in the guest buffers beyond the length reported in the header.
+     */
+    size_t in_sg_left = iov_size(in_sg_ptr, in_sg_cpy_count);
+    if (in_sg_left > len) {
+        iov_discard_back(in_sg_ptr, &in_sg_cpy_count, in_sg_left - len);
+    }
+
+    do {
+        fuse_log(FUSE_LOG_DEBUG, "%s: in_sg_cpy_count=%d len remaining=%zd\n",
+                 __func__, in_sg_cpy_count, len);
+
+        ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count,
+                     buf->buf[0].pos);
+
+        if (ret == -1) {
+            ret = errno;
+            if (ret == EINTR) {
+                continue;
+            }
+            fuse_log(FUSE_LOG_DEBUG, "%s: preadv failed (%m) len=%zd\n",
+                     __func__, len);
+            return ret;
+        }
+
+        if (!ret) {
+            /* EOF case? */
+            fuse_log(FUSE_LOG_DEBUG, "%s: !ret len remaining=%zd\n", __func__,
+                     len);
+            break;
+        }
+        fuse_log(FUSE_LOG_DEBUG, "%s: preadv ret=%d len=%zd\n", __func__,
+                 ret, len);
+
+        len -= ret;
+        /* Short read. Retry reading remaining bytes */
+        if (len) {
+            fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__);
+            /* Skip over this much next time around */
+            iov_discard_front(&in_sg_ptr, &in_sg_cpy_count, ret);
+            buf->buf[0].pos += ret;
+        }
+    } while (len);
+
+    /* Need to fix out->len on EOF */
+    if (len) {
+        /* the header was already copied into the first 'in' descriptor */
+        struct fuse_out_header *out_sg = in_sg[0].iov_base;
+
+        tosend_len -= len;
+        out_sg->len = tosend_len;
+    }
+
+    vq_send_element(qi, elem, tosend_len);
+    req->reply_sent = true;
+    return 0;
+}
+
+/*
+ * Per-thread flag: set once this thread has called unshare(CLONE_FS) in
+ * fv_queue_worker(), so the call is made only once per thread.
+ */
+static __thread bool clone_fs_called;
+
+/*
+ * Process one FVRequest in a thread pool
+ *
+ * data: the FVRequest popped from the virtqueue; it is freed here
+ * user_data: the fv_QueueInfo of the queue the request arrived on
+ *
+ * The request headers are always copied out of guest memory before they
+ * are looked at.  The payload of a FUSE_WRITE is referenced in place (no
+ * copy); every other request is copied in full.
+ *
+ * An element that is too short to hold a fuse_in_header, or larger than
+ * se->bufsize, is logged and recycled without a reply instead of aborting
+ * the daemon, in the same way as the other failure paths of this function.
+ */
+static void fv_queue_worker(gpointer data, gpointer user_data)
+{
+    struct fv_QueueInfo *qi = user_data;
+    struct fuse_session *se = qi->virtio_dev->se;
+    FVRequest *req = data;
+    VuVirtqElement *elem = &req->elem;
+    struct fuse_buf fbuf = {};
+    bool allocated_bufv = false;
+    struct fuse_bufvec bufv;
+    /* Only freed at 'out' when allocated_bufv is set */
+    struct fuse_bufvec *pbufv = NULL;
+    struct fuse_in_header inh;
+
+    assert(se->bufsize > sizeof(struct fuse_in_header));
+
+    if (!clone_fs_called) {
+        int ret;
+
+        /* unshare FS for xattr operation */
+        ret = unshare(CLONE_FS);
+        /* should not fail */
+        assert(ret == 0);
+
+        clone_fs_called = true;
+    }
+
+    /*
+     * An element contains one request and the space to send our response
+     * They're spread over multiple descriptors in a scatter/gather set
+     * and we can't trust the guest to keep them still; so copy in/out.
+     */
+    fbuf.mem = g_malloc(se->bufsize);
+
+    /* Everything torn down at 'out' must be set up before the first goto */
+    fuse_mutex_init(&req->ch.lock);
+    req->ch.fd = -1;
+    req->ch.qi = qi;
+
+    /* The 'out' part of the elem is from qemu */
+    unsigned int out_num = elem->out_num;
+    struct iovec *out_sg = elem->out_sg;
+    size_t out_len = iov_size(out_sg, out_num);
+    fuse_log(FUSE_LOG_DEBUG,
+             "%s: elem %d: with %d out desc of length %zd\n",
+             __func__, elem->index, out_num, out_len);
+
+    /*
+     * The elem should contain a 'fuse_in_header' (in to fuse)
+     * plus the data based on the len in the header.
+     * The sizes come from the guest, so a bad one is rejected rather than
+     * asserted on.
+     */
+    if (out_len < sizeof(struct fuse_in_header)) {
+        fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n",
+                 __func__, elem->index);
+        goto out;
+    }
+    if (out_len > se->bufsize) {
+        fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", __func__,
+                 elem->index);
+        goto out;
+    }
+    /* Copy just the fuse_in_header and look at it */
+    copy_from_iov(&fbuf, out_num, out_sg,
+                  sizeof(struct fuse_in_header));
+    memcpy(&inh, fbuf.mem, sizeof(struct fuse_in_header));
+
+    if (inh.opcode == FUSE_WRITE &&
+        out_len >= (sizeof(struct fuse_in_header) +
+                    sizeof(struct fuse_write_in))) {
+        /*
+         * For a write we don't actually need to copy the
+         * data, we can just do it straight out of guest memory
+         * but we must still copy the headers in case the guest
+         * was nasty and changed them while we were using them.
+         */
+        fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__);
+
+        fbuf.size = copy_from_iov(&fbuf, out_num, out_sg,
+                                  sizeof(struct fuse_in_header) +
+                                  sizeof(struct fuse_write_in));
+        /* That copy reread the in_header, make sure we use the original */
+        memcpy(fbuf.mem, &inh, sizeof(struct fuse_in_header));
+
+        /* Allocate the bufv, with space for the rest of the iov */
+        pbufv = g_try_malloc(sizeof(struct fuse_bufvec) +
+                             sizeof(struct fuse_buf) * out_num);
+        if (!pbufv) {
+            fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n",
+                     __func__);
+            goto out;
+        }
+
+        allocated_bufv = true;
+        pbufv->count = 1;
+        pbufv->buf[0] = fbuf;
+
+        size_t iovindex, pbufvindex, iov_bytes_skip;
+        pbufvindex = 1; /* 2 headers, 1 fusebuf */
+
+        if (!skip_iov(out_sg, out_num,
+                      sizeof(struct fuse_in_header) +
+                      sizeof(struct fuse_write_in),
+                      &iovindex, &iov_bytes_skip)) {
+            fuse_log(FUSE_LOG_ERR, "%s: skip failed\n",
+                     __func__);
+            goto out;
+        }
+
+        /* One fuse_buf per remaining descriptor, pointing at guest memory */
+        for (; iovindex < out_num; iovindex++, pbufvindex++) {
+            pbufv->count++;
+            pbufv->buf[pbufvindex].pos = ~0; /* Dummy */
+            pbufv->buf[pbufvindex].flags = 0;
+            pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base;
+            pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len;
+
+            /* Only the first descriptor can start part-way through */
+            if (iov_bytes_skip) {
+                pbufv->buf[pbufvindex].mem += iov_bytes_skip;
+                pbufv->buf[pbufvindex].size -= iov_bytes_skip;
+                iov_bytes_skip = 0;
+            }
+        }
+    } else {
+        /* Normal (non fast write) path */
+
+        copy_from_iov(&fbuf, out_num, out_sg, se->bufsize);
+        /* That copy reread the in_header, make sure we use the original */
+        memcpy(fbuf.mem, &inh, sizeof(struct fuse_in_header));
+        fbuf.size = out_len;
+
+        /* TODO! Endianness of header */
+
+        /* TODO: Add checks for fuse_session_exited */
+        bufv.buf[0] = fbuf;
+        bufv.count = 1;
+        pbufv = &bufv;
+    }
+    pbufv->idx = 0;
+    pbufv->off = 0;
+    fuse_session_process_buf_int(se, pbufv, &req->ch);
+
+out:
+    if (allocated_bufv) {
+        g_free(pbufv);
+    }
+
+    /* If the request has no reply, still recycle the virtqueue element */
+    if (!req->reply_sent) {
+        fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__,
+                 elem->index);
+        vq_send_element(qi, elem, 0);
+    }
+
+    pthread_mutex_destroy(&req->ch.lock);
+    g_free(fbuf.mem);
+    free(req);
+}
+
+/*
+ * Thread function for individual queues, created when a queue is 'started'
+ *
+ * opaque: the fv_QueueInfo of the queue to service
+ *
+ * Waits on the queue's kick_fd and kill_fd. On a kick, every available
+ * element is popped (holding the dispatch read lock and the queue's
+ * vq_lock) and handed to fv_queue_worker(): through a GThreadPool when
+ * se->thread_pool_size is non-zero, otherwise inline on this thread in
+ * the order the elements were popped. The loop ends on a kill event, a
+ * poll error or an eventfd read failure. Always returns NULL.
+ */
+static void *fv_queue_thread(void *opaque)
+{
+ struct fv_QueueInfo *qi = opaque;
+ struct VuDev *dev = &qi->virtio_dev->dev;
+ struct VuVirtq *q = vu_get_queue(dev, qi->qidx);
+ struct fuse_session *se = qi->virtio_dev->se;
+ GThreadPool *pool = NULL;
+ GList *req_list = NULL;
+
+ if (se->thread_pool_size) {
+ fuse_log(FUSE_LOG_DEBUG, "%s: Creating thread pool for Queue %d\n",
+ __func__, qi->qidx);
+ pool = g_thread_pool_new(fv_queue_worker, qi, se->thread_pool_size,
+ FALSE, NULL);
+ if (!pool) {
+ fuse_log(FUSE_LOG_ERR, "%s: g_thread_pool_new failed\n", __func__);
+ return NULL;
+ }
+ }
+
+ fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__,
+ qi->qidx, qi->kick_fd);
+ while (1) {
+ struct pollfd pf[2];
+
+ /* pf[0]: guest kick for this queue; pf[1]: request to terminate */
+ pf[0].fd = qi->kick_fd;
+ pf[0].events = POLLIN;
+ pf[0].revents = 0;
+ pf[1].fd = qi->kill_fd;
+ pf[1].events = POLLIN;
+ pf[1].revents = 0;
+
+ fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for Queue %d event\n", __func__,
+ qi->qidx);
+ int poll_res = ppoll(pf, 2, NULL, NULL);
+
+ if (poll_res == -1) {
+ if (errno == EINTR) {
+ fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
+ __func__);
+ continue;
+ }
+ fuse_log(FUSE_LOG_ERR, "fv_queue_thread ppoll: %m\n");
+ break;
+ }
+ assert(poll_res >= 1);
+ if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) {
+ fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x Queue %d\n",
+ __func__, pf[0].revents, qi->qidx);
+ break;
+ }
+ if (pf[1].revents & (POLLERR | POLLHUP | POLLNVAL)) {
+ fuse_log(FUSE_LOG_ERR,
+ "%s: Unexpected poll revents %x Queue %d killfd\n",
+ __func__, pf[1].revents, qi->qidx);
+ break;
+ }
+ /* The kill event is checked before the kick, so it takes priority */
+ if (pf[1].revents) {
+ fuse_log(FUSE_LOG_INFO, "%s: kill event on queue %d - quitting\n",
+ __func__, qi->qidx);
+ break;
+ }
+ assert(pf[0].revents & POLLIN);
+ fuse_log(FUSE_LOG_DEBUG, "%s: Got queue event on Queue %d\n", __func__,
+ qi->qidx);
+
+ /* Consume the kick before draining the queue */
+ eventfd_t evalue;
+ if (eventfd_read(qi->kick_fd, &evalue)) {
+ fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n");
+ break;
+ }
+ /* Mutual exclusion with virtio_loop() */
+ vu_dispatch_rdlock(qi->virtio_dev);
+ pthread_mutex_lock(&qi->vq_lock);
+ /* out is from guest, in is to guest */
+ unsigned int in_bytes, out_bytes;
+ vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0);
+
+ fuse_log(FUSE_LOG_DEBUG,
+ "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n",
+ __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes);
+
+ /* Drain every element that is currently available */
+ while (1) {
+ FVRequest *req = vu_queue_pop(dev, q, sizeof(FVRequest));
+ if (!req) {
+ break;
+ }
+
+ req->reply_sent = false;
+
+ if (!se->thread_pool_size) {
+ /* Prepended here, the list is reversed before processing */
+ req_list = g_list_prepend(req_list, req);
+ } else {
+ g_thread_pool_push(pool, req, NULL);
+ }
+ }
+
+ pthread_mutex_unlock(&qi->vq_lock);
+ vu_dispatch_unlock(qi->virtio_dev);
+
+ /* Process all the requests. */
+ if (!se->thread_pool_size && req_list != NULL) {
+ req_list = g_list_reverse(req_list);
+ g_list_foreach(req_list, fv_queue_worker, qi);
+ g_list_free(req_list);
+ req_list = NULL;
+ }
+ }
+
+ if (pool) {
+ /* immediate=FALSE, wait=TRUE */
+ g_thread_pool_free(pool, FALSE, TRUE);
+ }
+
+ return NULL;
+}
+
+/*
+ * Stop the thread that services queue @qidx and release the queue's
+ * fv_QueueInfo.  The thread is woken through its kill eventfd and joined
+ * before any of its resources are torn down; vud->qi[qidx] is NULL on
+ * return.
+ */
+static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx)
+{
+    struct fv_QueueInfo *qi;
+    int join_err;
+
+    assert(qidx < vud->nqueues);
+    qi = vud->qi[qidx];
+
+    /* Ask the queue thread to leave its poll loop */
+    if (eventfd_write(qi->kill_fd, 1) != 0) {
+        fuse_log(FUSE_LOG_ERR, "Eventfd_write for queue %d: %s\n",
+                 qidx, strerror(errno));
+    }
+
+    join_err = pthread_join(qi->thread, NULL);
+    if (join_err != 0) {
+        fuse_log(FUSE_LOG_ERR, "%s: Failed to join thread idx %d err %d\n",
+                 __func__, qidx, join_err);
+    }
+
+    pthread_mutex_destroy(&qi->vq_lock);
+    close(qi->kill_fd);
+    /* kick_fd is only forgotten here, it is not closed */
+    qi->kick_fd = -1;
+
+    vud->qi[qidx] = NULL;
+    g_free(qi);
+}
+
+/* Stop and clean up the thread of every queue that still has a QueueInfo */
+static void stop_all_queues(struct fv_VuDev *vud)
+{
+    int idx = 0;
+
+    while (idx < vud->nqueues) {
+        if (vud->qi[idx]) {
+            fuse_log(FUSE_LOG_INFO, "%s: Stopping queue %d thread\n",
+                     __func__, idx);
+            fv_queue_cleanup_thread(vud, idx);
+        }
+        idx++;
+    }
+}
+
+/*
+ * Callback from libvhost-user on start or stop of a queue
+ *
+ * dev: the VuDev embedded in our fv_VuDev
+ * qidx: index of the queue; indices above 1 make the process exit
+ * started: true to create the queue's fv_QueueInfo (if needed) and start
+ * its fv_queue_thread(); false to stop and free it
+ */
+static void fv_queue_set_started(VuDev *dev, int qidx, bool started)
+{
+ struct fv_VuDev *vud = container_of(dev, struct fv_VuDev, dev);
+ struct fv_QueueInfo *ourqi;
+
+ fuse_log(FUSE_LOG_INFO, "%s: qidx=%d started=%d\n", __func__, qidx,
+ started);
+ assert(qidx >= 0);
+
+ /*
+ * Ignore additional request queues for now. passthrough_ll.c must be
+ * audited for thread-safety issues first. It was written with a
+ * well-behaved client in mind and may not protect against all types of
+ * races yet.
+ */
+ if (qidx > 1) {
+ fuse_log(FUSE_LOG_ERR,
+ "%s: multiple request queues not yet implemented, please only "
+ "configure 1 request queue\n",
+ __func__);
+ exit(EXIT_FAILURE);
+ }
+
+ if (started) {
+ /* Fire up a thread to watch this queue */
+ if (qidx >= vud->nqueues) {
+ /* Grow the array to qidx + 1 slots and zero the new ones */
+ vud->qi = g_realloc_n(vud->qi, qidx + 1, sizeof(vud->qi[0]));
+ memset(vud->qi + vud->nqueues, 0,
+ sizeof(vud->qi[0]) * (1 + (qidx - vud->nqueues)));
+ vud->nqueues = qidx + 1;
+ }
+ if (!vud->qi[qidx]) {
+ vud->qi[qidx] = g_new0(struct fv_QueueInfo, 1);
+ vud->qi[qidx]->virtio_dev = vud;
+ vud->qi[qidx]->qidx = qidx;
+ } else {
+ /* Shouldn't have been started */
+ assert(vud->qi[qidx]->kick_fd == -1);
+ }
+ ourqi = vud->qi[qidx];
+ ourqi->kick_fd = dev->vq[qidx].kick_fd;
+
+ /* kill_fd is how fv_queue_cleanup_thread() wakes the queue thread */
+ ourqi->kill_fd = eventfd(0, EFD_CLOEXEC | EFD_SEMAPHORE);
+ assert(ourqi->kill_fd != -1);
+ pthread_mutex_init(&ourqi->vq_lock, NULL);
+
+ if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) {
+ fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n",
+ __func__, qidx);
+ assert(0);
+ }
+ } else {
+ /*
+ * Temporarily drop write-lock taken in virtio_loop() so that
+ * the queue thread doesn't block in virtio_send_msg().
+ */
+ vu_dispatch_unlock(vud);
+ fv_queue_cleanup_thread(vud, qidx);
+ vu_dispatch_wrlock(vud);
+ }
+}
+
+/*
+ * queue_is_processed_in_order callback: reports false for every queue,
+ * whatever the arguments are.
+ */
+static bool fv_queue_order(VuDev *dev, int qidx)
+{
+    (void)dev;
+    (void)qidx;
+
+    return false;
+}
+
+/* Callback table handed to vu_init() in virtio_session_mount() */
+static const VuDevIface fv_iface = {
+ .get_features = fv_get_features,
+ .set_features = fv_set_features,
+
+ /* Don't need process message, we've not got any at vhost-user level */
+ .queue_set_started = fv_queue_set_started,
+
+ /* fv_queue_order() always answers false */
+ .queue_is_processed_in_order = fv_queue_order,
+};
+
+/*
+ * Main loop; this services the vhost-user socket itself, not the actual
+ * fuse data.  It runs until the session is flagged as exited, polling
+ * fails, or vu_dispatch() reports an error.  Every queue thread is
+ * stopped before returning.  Always returns 0.
+ */
+int virtio_loop(struct fuse_session *se)
+{
+    fuse_log(FUSE_LOG_INFO, "%s: Entry\n", __func__);
+
+    while (!fuse_session_exited(se)) {
+        struct pollfd vu_pfd = {
+            .fd = se->vu_socketfd,
+            .events = POLLIN,
+            .revents = 0,
+        };
+        bool dispatched;
+        int nready;
+
+        fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for VU event\n", __func__);
+        nready = ppoll(&vu_pfd, 1, NULL, NULL);
+
+        if (nready == -1 && errno == EINTR) {
+            fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n",
+                     __func__);
+            continue;
+        }
+        if (nready == -1) {
+            fuse_log(FUSE_LOG_ERR, "virtio_loop ppoll: %m\n");
+            break;
+        }
+        assert(nready == 1);
+
+        if (vu_pfd.revents & (POLLERR | POLLHUP | POLLNVAL)) {
+            fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x\n", __func__,
+                     vu_pfd.revents);
+            break;
+        }
+        assert(vu_pfd.revents & POLLIN);
+        fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__);
+
+        /* Mutual exclusion with fv_queue_thread() */
+        vu_dispatch_wrlock(se->virtio_dev);
+        dispatched = vu_dispatch(&se->virtio_dev->dev);
+        vu_dispatch_unlock(se->virtio_dev);
+
+        if (!dispatched) {
+            fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__);
+            break;
+        }
+    }
+
+    /*
+     * Make sure all fv_queue_thread()s quit on exit, as we're about to
+     * free virtio dev and fuse session, no one should access them anymore.
+     */
+    stop_all_queues(se->virtio_dev);
+    fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__);
+
+    return 0;
+}
+
+/*
+ * Replace, in place, every occurrence of @old in the NUL-terminated
+ * string @s with @new.  The terminating NUL itself is never touched.
+ */
+static void strreplace(char *s, char old, char new)
+{
+    char *cursor = s;
+
+    while (*cursor != '\0') {
+        if (*cursor == old) {
+            *cursor = new;
+        }
+        cursor++;
+    }
+}
+
+/*
+ * Write a pidfile for se->vu_socket_path.
+ *
+ * The pidfile lives in the "run/virtiofsd" local state directory (created
+ * on demand, owner-only permissions) and is named after the socket path
+ * with every '/' replaced by '.', plus a ".pid" suffix.
+ *
+ * Returns true on success; false if the directory cannot be created or
+ * qemu_write_pidfile() fails.
+ */
+static bool fv_socket_lock(struct fuse_session *se)
+{
+    Error *err = NULL;
+    g_autofree gchar *state_dir =
+        qemu_get_local_state_pathname("run/virtiofsd");
+    g_autofree gchar *flat_name = NULL;
+    g_autofree gchar *pid_path = NULL;
+
+    if (g_mkdir_with_parents(state_dir, S_IRWXU) < 0) {
+        fuse_log(FUSE_LOG_ERR, "%s: Failed to create directory %s: %s\n",
+                 __func__, state_dir, strerror(errno));
+        return false;
+    }
+
+    /* Flatten the socket path into a single file name component */
+    flat_name = g_strdup(se->vu_socket_path);
+    strreplace(flat_name, '/', '.');
+    pid_path = g_strdup_printf("%s/%s.pid", state_dir, flat_name);
+
+    if (qemu_write_pidfile(pid_path, &err)) {
+        return true;
+    }
+
+    error_report_err(err);
+    return false;
+}
+
+/*
+ * Create, bind and listen on the vhost-user Unix socket at
+ * se->vu_socket_path and store the descriptor in se->vu_listen_fd.
+ *
+ * Does nothing when se->vu_listen_fd is already >= 0. The socket file is
+ * created with "other" access masked off, and group access as well unless
+ * se->vu_socket_group is set, in which case the file's group is changed
+ * to that group.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int fv_create_listen_socket(struct fuse_session *se)
+{
+ struct sockaddr_un un;
+ mode_t old_umask;
+
+ /* Nothing to do if fd is already initialized */
+ if (se->vu_listen_fd >= 0) {
+ return 0;
+ }
+
+ /* '>=' leaves room for the terminating NUL copied by strcpy() below */
+ if (strlen(se->vu_socket_path) >= sizeof(un.sun_path)) {
+ fuse_log(FUSE_LOG_ERR, "Socket path too long\n");
+ return -1;
+ }
+
+ if (!strlen(se->vu_socket_path)) {
+ fuse_log(FUSE_LOG_ERR, "Socket path is empty\n");
+ return -1;
+ }
+
+ /* Check the vu_socket_path is already used */
+ if (!fv_socket_lock(se)) {
+ return -1;
+ }
+
+ /*
+ * Create the Unix socket to communicate with qemu
+ * based on QEMU's vhost-user-bridge
+ */
+ unlink(se->vu_socket_path);
+ strcpy(un.sun_path, se->vu_socket_path);
+ size_t addr_len = sizeof(un);
+
+ int listen_sock = socket(AF_UNIX, SOCK_STREAM, 0);
+ if (listen_sock == -1) {
+ fuse_log(FUSE_LOG_ERR, "vhost socket creation: %m\n");
+ return -1;
+ }
+ un.sun_family = AF_UNIX;
+
+ /*
+ * Unfortunately bind doesn't let you set the mask on the socket,
+ * so set umask appropriately and restore it later.
+ */
+ if (se->vu_socket_group) {
+ old_umask = umask(S_IROTH | S_IWOTH | S_IXOTH);
+ } else {
+ old_umask = umask(S_IRGRP | S_IWGRP | S_IXGRP |
+ S_IROTH | S_IWOTH | S_IXOTH);
+ }
+ if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) {
+ fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n");
+ close(listen_sock);
+ umask(old_umask);
+ return -1;
+ }
+ if (se->vu_socket_group) {
+ struct group *g = getgrnam(se->vu_socket_group);
+ if (g) {
+ /* A chown failure is only a warning; an unknown group is fatal */
+ if (chown(se->vu_socket_path, -1, g->gr_gid) == -1) {
+ fuse_log(FUSE_LOG_WARNING,
+ "vhost socket failed to set group to %s (%d): %m\n",
+ se->vu_socket_group, g->gr_gid);
+ }
+ } else {
+ fuse_log(FUSE_LOG_ERR,
+ "vhost socket: unable to find group '%s'\n",
+ se->vu_socket_group);
+ close(listen_sock);
+ umask(old_umask);
+ return -1;
+ }
+ }
+ umask(old_umask);
+
+ if (listen(listen_sock, 1) == -1) {
+ fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n");
+ close(listen_sock);
+ return -1;
+ }
+
+ se->vu_listen_fd = listen_sock;
+ return 0;
+}
+
+/*
+ * Set up the vhost-user side of a session.
+ *
+ * Checks that unshare(CLONE_FS) is permitted, makes sure there is a
+ * listening socket, blocks until one vhost-user connection is accepted and
+ * then initialises the libvhost-user device on it.  The listening socket
+ * is closed once accept() has returned, whether it succeeded or not, and
+ * se->vu_listen_fd is reset to -1 in both cases so that no stale
+ * descriptor number is left behind.
+ *
+ * Returns 0 on success and a negative value on failure.
+ */
+int virtio_session_mount(struct fuse_session *se)
+{
+    int ret;
+
+    /*
+     * Test that unshare(CLONE_FS) works. fv_queue_worker() will need it. It's
+     * an unprivileged system call but some Docker/Moby versions are known to
+     * reject it via seccomp when CAP_SYS_ADMIN is not given.
+     *
+     * Note that the program is single-threaded here so this syscall has no
+     * visible effect and is safe to make.
+     */
+    ret = unshare(CLONE_FS);
+    if (ret == -1 && errno == EPERM) {
+        fuse_log(FUSE_LOG_ERR, "unshare(CLONE_FS) failed with EPERM. If "
+                 "running in a container please check that the container "
+                 "runtime seccomp policy allows unshare.\n");
+        return -1;
+    }
+
+    ret = fv_create_listen_socket(se);
+    if (ret < 0) {
+        return ret;
+    }
+
+    se->fd = -1;
+
+    fuse_log(FUSE_LOG_INFO, "%s: Waiting for vhost-user socket connection...\n",
+             __func__);
+    int data_sock = accept(se->vu_listen_fd, NULL, NULL);
+    if (data_sock == -1) {
+        /* Log first: the message uses %m and close() may change errno */
+        fuse_log(FUSE_LOG_ERR, "vhost socket accept: %m\n");
+        close(se->vu_listen_fd);
+        /* fv_create_listen_socket() treats any value >= 0 as a live fd */
+        se->vu_listen_fd = -1;
+        return -1;
+    }
+    close(se->vu_listen_fd);
+    se->vu_listen_fd = -1;
+    fuse_log(FUSE_LOG_INFO, "%s: Received vhost-user socket connection\n",
+             __func__);
+
+    /* TODO: Some cleanup/deallocation! */
+    se->virtio_dev = g_new0(struct fv_VuDev, 1);
+
+    se->vu_socketfd = data_sock;
+    se->virtio_dev->se = se;
+    pthread_rwlock_init(&se->virtio_dev->vu_dispatch_rwlock, NULL);
+    if (!vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, NULL,
+                 fv_set_watch, fv_remove_watch, &fv_iface)) {
+        fuse_log(FUSE_LOG_ERR, "%s: vu_init failed\n", __func__);
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Close the vhost-user connection and, if a virtio device was allocated,
+ * release its queue array, its dispatch rwlock and the device itself.
+ * se->virtio_dev is NULL afterwards.
+ */
+void virtio_session_close(struct fuse_session *se)
+{
+    struct fv_VuDev *vud;
+
+    close(se->vu_socketfd);
+
+    vud = se->virtio_dev;
+    if (vud) {
+        g_free(vud->qi);
+        pthread_rwlock_destroy(&vud->vu_dispatch_rwlock);
+        g_free(vud);
+        se->virtio_dev = NULL;
+    }
+}
diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h
new file mode 100644
index 000000000..111684032
--- /dev/null
+++ b/tools/virtiofsd/fuse_virtio.h
@@ -0,0 +1,33 @@
+/*
+ * virtio-fs glue for FUSE
+ * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ * Dave Gilbert <dgilbert@redhat.com>
+ *
+ * Implements the glue between libfuse and libvhost-user
+ *
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB
+ */
+
+#ifndef FUSE_VIRTIO_H
+#define FUSE_VIRTIO_H
+
+#include "fuse_i.h"
+
+struct fuse_session;
+
+/*
+ * Create the listening socket if needed, wait for one vhost-user
+ * connection and initialise the vhost-user device on it.
+ * Returns 0 on success, a negative value on failure.
+ */
+int virtio_session_mount(struct fuse_session *se);
+/* Close the vhost-user socket and free the session's virtio device */
+void virtio_session_close(struct fuse_session *se);
+/*
+ * Service the vhost-user socket until the session exits or an error
+ * occurs, then stop all queue threads. Always returns 0.
+ */
+int virtio_loop(struct fuse_session *se);
+
+
+/* Send the reply described by iov/count on the channel ch */
+int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch,
+ struct iovec *iov, int count);
+
+/*
+ * Send the header in iov/count followed by len bytes read from the file
+ * descriptor in buf->buf[0]. Returns 0 on success, a positive errno value
+ * on failure.
+ */
+int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch,
+ struct iovec *iov, int count,
+ struct fuse_bufvec *buf, size_t len);
+
+#endif
diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c
new file mode 100644
index 000000000..a8295d975
--- /dev/null
+++ b/tools/virtiofsd/helper.c
@@ -0,0 +1,405 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * Helper functions to create (simple) standalone programs. With the
+ * aid of these functions it should be possible to create full FUSE
+ * file system by implementing nothing but the request handlers.
+
+ * This program can be distributed under the terms of the GNU LGPLv2.
+ * See the file COPYING.LIB.
+ */
+
+#include "qemu/osdep.h"
+#include "fuse_i.h"
+#include "fuse_lowlevel.h"
+#include "fuse_misc.h"
+#include "fuse_opt.h"
+
+#include <sys/param.h>
+#include <sys/resource.h>
+
+/*
+ * Initialisers for entries of fuse_helper_opts[]: the option template t,
+ * the offset of member p inside struct fuse_cmdline_opts, and a third
+ * field that is 1 for FUSE_HELPER_OPT and v for FUSE_HELPER_OPT_VALUE.
+ */
+#define FUSE_HELPER_OPT(t, p) \
+ { \
+ t, offsetof(struct fuse_cmdline_opts, p), 1 \
+ }
+#define FUSE_HELPER_OPT_VALUE(t, p, v) \
+ { \
+ t, offsetof(struct fuse_cmdline_opts, p), v \
+ }
+
+/*
+ * Option table passed to fuse_opt_parse() by fuse_parse_cmdline(); each
+ * entry names a member of struct fuse_cmdline_opts. Some templates are
+ * listed more than once: "-d"/"debug" name both 'debug' and 'foreground',
+ * and several are repeated with FUSE_OPT_KEY_KEEP.
+ */
+static const struct fuse_opt fuse_helper_opts[] = {
+ FUSE_HELPER_OPT("-h", show_help),
+ FUSE_HELPER_OPT("--help", show_help),
+ FUSE_HELPER_OPT("-V", show_version),
+ FUSE_HELPER_OPT("--version", show_version),
+ FUSE_HELPER_OPT("--print-capabilities", print_capabilities),
+ FUSE_HELPER_OPT("-d", debug),
+ FUSE_HELPER_OPT("debug", debug),
+ FUSE_HELPER_OPT("-d", foreground),
+ FUSE_HELPER_OPT("debug", foreground),
+ FUSE_OPT_KEY("-d", FUSE_OPT_KEY_KEEP),
+ FUSE_OPT_KEY("debug", FUSE_OPT_KEY_KEEP),
+ FUSE_HELPER_OPT("-f", foreground),
+ FUSE_HELPER_OPT_VALUE("--daemonize", foreground, 0),
+ FUSE_HELPER_OPT("fsname=", nodefault_subtype),
+ FUSE_OPT_KEY("fsname=", FUSE_OPT_KEY_KEEP),
+ FUSE_HELPER_OPT("subtype=", nodefault_subtype),
+ FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP),
+ FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads),
+ FUSE_HELPER_OPT("--rlimit-nofile=%lu", rlimit_nofile),
+ FUSE_HELPER_OPT("--syslog", syslog),
+ FUSE_HELPER_OPT_VALUE("log_level=debug", log_level, FUSE_LOG_DEBUG),
+ FUSE_HELPER_OPT_VALUE("log_level=info", log_level, FUSE_LOG_INFO),
+ FUSE_HELPER_OPT_VALUE("log_level=warn", log_level, FUSE_LOG_WARNING),
+ FUSE_HELPER_OPT_VALUE("log_level=err", log_level, FUSE_LOG_ERR),
+ FUSE_OPT_END
+};
+
+/*
+ * Connection options as parsed by fuse_parse_conn_info_opts() and applied
+ * by fuse_apply_conn_info_opts().
+ *
+ * The int flags request that a capability be added to (x) or removed from
+ * (no_x) conn->want. Each unsigned value is copied into the
+ * fuse_conn_info only when its set_* companion is non-zero.
+ */
+struct fuse_conn_info_opts {
+ int atomic_o_trunc;
+ int no_remote_posix_lock;
+ int no_remote_flock;
+ int splice_write;
+ int splice_move;
+ int splice_read;
+ int no_splice_write;
+ int no_splice_move;
+ int no_splice_read;
+ int auto_inval_data;
+ int no_auto_inval_data;
+ int no_readdirplus;
+ int no_readdirplus_auto;
+ int async_dio;
+ int no_async_dio;
+ int writeback_cache;
+ int no_writeback_cache;
+ int async_read;
+ int sync_read;
+ unsigned max_write;
+ unsigned max_readahead;
+ unsigned max_background;
+ unsigned congestion_threshold;
+ unsigned time_gran;
+ /* set_<x> != 0 means <x> above was given and should be applied */
+ int set_max_write;
+ int set_max_readahead;
+ int set_max_background;
+ int set_congestion_threshold;
+ int set_time_gran;
+};
+
+/*
+ * Initialiser for entries of conn_info_opt_spec[]: the option template t,
+ * the offset of member p inside struct fuse_conn_info_opts, and the
+ * value v.
+ */
+#define CONN_OPTION(t, p, v) \
+ { \
+ t, offsetof(struct fuse_conn_info_opts, p), v \
+ }
+/*
+ * Option table used by fuse_parse_conn_info_opts(). Numeric options are
+ * listed twice: once with a %u template naming the value member and once
+ * without, naming the matching set_* member with value 1.
+ */
+static const struct fuse_opt conn_info_opt_spec[] = {
+ CONN_OPTION("max_write=%u", max_write, 0),
+ CONN_OPTION("max_write=", set_max_write, 1),
+ CONN_OPTION("max_readahead=%u", max_readahead, 0),
+ CONN_OPTION("max_readahead=", set_max_readahead, 1),
+ CONN_OPTION("max_background=%u", max_background, 0),
+ CONN_OPTION("max_background=", set_max_background, 1),
+ CONN_OPTION("congestion_threshold=%u", congestion_threshold, 0),
+ CONN_OPTION("congestion_threshold=", set_congestion_threshold, 1),
+ CONN_OPTION("sync_read", sync_read, 1),
+ CONN_OPTION("async_read", async_read, 1),
+ CONN_OPTION("atomic_o_trunc", atomic_o_trunc, 1),
+ /* "no_remote_lock" names both the posix and the flock member */
+ CONN_OPTION("no_remote_lock", no_remote_posix_lock, 1),
+ CONN_OPTION("no_remote_lock", no_remote_flock, 1),
+ CONN_OPTION("no_remote_flock", no_remote_flock, 1),
+ CONN_OPTION("no_remote_posix_lock", no_remote_posix_lock, 1),
+ CONN_OPTION("splice_write", splice_write, 1),
+ CONN_OPTION("no_splice_write", no_splice_write, 1),
+ CONN_OPTION("splice_move", splice_move, 1),
+ CONN_OPTION("no_splice_move", no_splice_move, 1),
+ CONN_OPTION("splice_read", splice_read, 1),
+ CONN_OPTION("no_splice_read", no_splice_read, 1),
+ CONN_OPTION("auto_inval_data", auto_inval_data, 1),
+ CONN_OPTION("no_auto_inval_data", no_auto_inval_data, 1),
+ /* readdirplus=no|yes|auto map onto the two no_readdirplus* members */
+ CONN_OPTION("readdirplus=no", no_readdirplus, 1),
+ CONN_OPTION("readdirplus=yes", no_readdirplus, 0),
+ CONN_OPTION("readdirplus=yes", no_readdirplus_auto, 1),
+ CONN_OPTION("readdirplus=auto", no_readdirplus, 0),
+ CONN_OPTION("readdirplus=auto", no_readdirplus_auto, 0),
+ CONN_OPTION("async_dio", async_dio, 1),
+ CONN_OPTION("no_async_dio", no_async_dio, 1),
+ CONN_OPTION("writeback_cache", writeback_cache, 1),
+ CONN_OPTION("no_writeback_cache", no_writeback_cache, 1),
+ CONN_OPTION("time_gran=%u", time_gran, 0),
+ CONN_OPTION("time_gran=", set_time_gran, 1),
+ FUSE_OPT_END
+};
+
+
+/*
+ * Print the command line help text to stdout.
+ *
+ * NOTE(review): the readdirplus entry below spells the option name
+ * "readirplus" in its description - looks like a typo, confirm and fix
+ * in the string.
+ */
+void fuse_cmdline_help(void)
+{
+ printf(" -h --help print help\n"
+ " -V --version print version\n"
+ " --print-capabilities print vhost-user.json\n"
+ " -d -o debug enable debug output (implies -f)\n"
+ " --syslog log to syslog (default stderr)\n"
+ " -f foreground operation\n"
+ " --daemonize run in background\n"
+ " -o cache=<mode> cache mode. could be one of \"auto, "
+ "always, none\"\n"
+ " default: auto\n"
+ " -o flock|no_flock enable/disable flock\n"
+ " default: no_flock\n"
+ " -o log_level=<level> log level, default to \"info\"\n"
+ " level could be one of \"debug, "
+ "info, warn, err\"\n"
+ " -o max_idle_threads the maximum number of idle worker "
+ "threads\n"
+ " allowed (default: 10)\n"
+ " -o posix_lock|no_posix_lock\n"
+ " enable/disable remote posix lock\n"
+ " default: no_posix_lock\n"
+ " -o readdirplus|no_readdirplus\n"
+ " enable/disable readirplus\n"
+ " default: readdirplus except with "
+ "cache=none\n"
+ " -o sandbox=namespace|chroot\n"
+ " sandboxing mode:\n"
+ " - namespace: mount, pid, and net\n"
+ " namespaces with pivot_root(2)\n"
+ " into shared directory\n"
+ " - chroot: chroot(2) into shared\n"
+ " directory (use in containers)\n"
+ " default: namespace\n"
+ " -o timeout=<number> I/O timeout (seconds)\n"
+ " default: depends on cache= option.\n"
+ " -o writeback|no_writeback enable/disable writeback cache\n"
+ " default: no_writeback\n"
+ " -o xattr|no_xattr enable/disable xattr\n"
+ " default: no_xattr\n"
+ " -o xattrmap=<mapping> Enable xattr mapping (enables xattr)\n"
+ " <mapping> is a string consists of a series of rules\n"
+ " e.g. -o xattrmap=:map::user.virtiofs.:\n"
+ " -o modcaps=CAPLIST Modify the list of capabilities\n"
+ " e.g. -o modcaps=+sys_admin:-chown\n"
+ " --rlimit-nofile=<num> set maximum number of file descriptors\n"
+ " (0 leaves rlimit unchanged)\n"
+ " default: min(1000000, fs.file-max - 16384)\n"
+ " if the current rlimit is lower\n"
+ " -o allow_direct_io|no_allow_direct_io\n"
+ " retain/discard O_DIRECT flags passed down\n"
+ " to virtiofsd from guest applications.\n"
+ " default: no_allow_direct_io\n"
+ " -o announce_submounts Announce sub-mount points to the guest\n"
+ " -o posix_acl/no_posix_acl Enable/Disable posix_acl. (default: disabled)\n"
+ );
+}
+
+/*
+ * Option callback handed to fuse_opt_parse() by fuse_parse_cmdline().
+ *
+ * A non-option argument is reported as an error and yields -1; every other
+ * key yields 1, which passes unknown options through.  @data and @outargs
+ * are not used.
+ */
+static int fuse_helper_opt_proc(void *data, const char *arg, int key,
+                                struct fuse_args *outargs)
+{
+    (void)data;
+    (void)outargs;
+
+    if (key == FUSE_OPT_KEY_NONOPT) {
+        fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg);
+        return -1;
+    }
+
+    /* Pass through unknown options */
+    return 1;
+}
+
+/*
+ * Work out the default RLIMIT_NOFILE value.
+ *
+ * The target is 1000000 descriptors, lowered to fs.file-max minus 16384
+ * when that is smaller so that other processes keep some descriptors.
+ * Returns 0 when the current soft limit already reaches the target, the
+ * target otherwise.  Exits the process if fs.file-max cannot be read, is
+ * below twice the reserve, or getrlimit() fails.
+ */
+static unsigned long get_default_rlimit_nofile(void)
+{
+    const rlim_t keep_free = 16384; /* leave at least this many fds free */
+    rlim_t target = 1000000; /* our default RLIMIT_NOFILE target */
+    rlim_t sys_max;
+    struct rlimit cur;
+    g_autofree gchar *sys_max_text = NULL;
+
+    if (!g_file_get_contents("/proc/sys/fs/file-max", &sys_max_text,
+                             NULL, NULL)) {
+        fuse_log(FUSE_LOG_ERR, "can't read /proc/sys/fs/file-max\n");
+        exit(1);
+    }
+
+    sys_max = g_ascii_strtoull(sys_max_text, NULL, 10);
+    if (sys_max < 2 * keep_free) {
+        fuse_log(FUSE_LOG_ERR,
+                 "The fs.file-max sysctl is too low (%lu) to allow a "
+                 "reasonable number of open files.\n",
+                 (unsigned long)sys_max);
+        exit(1);
+    }
+
+    /* Stay below the system-wide maximum, minus the reserve */
+    if (sys_max - keep_free < target) {
+        target = sys_max - keep_free;
+    }
+
+    if (getrlimit(RLIMIT_NOFILE, &cur) < 0) {
+        fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n");
+        exit(1);
+    }
+
+    /* 0 means: we have more fds available than required, change nothing */
+    return (cur.rlim_cur >= target) ? 0 : target;
+}
+
+/*
+ * Fill @opts from the command line in @args.
+ *
+ * @opts is zeroed first and then given its defaults (10 idle threads, the
+ * computed rlimit, foreground operation) before fuse_helper_opts[] is
+ * applied.  Returns 0 on success, -1 if option parsing fails.
+ */
+int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts)
+{
+    int rc;
+
+    memset(opts, 0, sizeof(*opts));
+
+    opts->max_idle_threads = 10;
+    opts->rlimit_nofile = get_default_rlimit_nofile();
+    opts->foreground = 1;
+
+    rc = fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc);
+
+    return (rc == -1) ? -1 : 0;
+}
+
+
+/*
+ * Optionally detach the process from its terminal.
+ *
+ * @foreground: non-zero to stay in the foreground, zero to daemonize
+ *
+ * In both modes the working directory is changed to "/".  When
+ * daemonizing, the process forks; the child starts a new session, points
+ * stdin, stdout and stderr at /dev/null and then writes one status byte to
+ * a pipe.  The parent waits for that byte and exits with status 0 only if
+ * it arrived, with a non-zero status otherwise (for instance when the
+ * child went away without writing it).
+ *
+ * Returns 0 on success and -1 on failure; when daemonizing, the return
+ * happens in the child.
+ */
+int fuse_daemonize(int foreground)
+{
+    int waiter[2];
+    int nullfd;
+    int ret;
+    char completed;
+    ssize_t nbytes;
+
+    if (foreground) {
+        return chdir("/");
+    }
+
+    if (pipe(waiter)) {
+        fuse_log(FUSE_LOG_ERR, "fuse_daemonize: pipe: %s\n",
+                 strerror(errno));
+        return -1;
+    }
+
+    /*
+     * demonize current process by forking it and killing the
+     * parent.  This makes current process as a child of 'init'.
+     */
+    switch (fork()) {
+    case -1:
+        fuse_log(FUSE_LOG_ERR, "fuse_daemonize: fork: %s\n",
+                 strerror(errno));
+        close(waiter[0]);
+        close(waiter[1]);
+        return -1;
+    case 0:
+        break;
+    default:
+        /*
+         * Parent: drop our copy of the write end so that read() sees EOF
+         * if the child dies early, then wait for the status byte.
+         */
+        close(waiter[1]);
+        do {
+            nbytes = read(waiter[0], &completed, sizeof(completed));
+        } while (nbytes == -1 && errno == EINTR);
+        _exit(nbytes != (ssize_t)sizeof(completed));
+    }
+
+    if (setsid() == -1) {
+        fuse_log(FUSE_LOG_ERR, "fuse_daemonize: setsid: %s\n",
+                 strerror(errno));
+        /* Closing the pipe lets the waiting parent exit with a failure */
+        close(waiter[0]);
+        close(waiter[1]);
+        return -1;
+    }
+
+    ret = chdir("/");
+
+    nullfd = open("/dev/null", O_RDWR, 0);
+    if (nullfd != -1) {
+        /* dup2() returns the new fd on success, so only -1 is an error */
+        for (int fd = 0; fd <= 2; fd++) {
+            if (dup2(nullfd, fd) == -1 && !ret) {
+                ret = -1;
+            }
+        }
+        if (nullfd > 2) {
+            close(nullfd);
+        }
+    }
+
+    /* Propagate completion of daemon initialization */
+    completed = 1;
+    nbytes = write(waiter[1], &completed, sizeof(completed));
+    if (nbytes != (ssize_t)sizeof(completed) && !ret) {
+        ret = -1;
+    }
+    close(waiter[0]);
+    close(waiter[1]);
+
+    return ret;
+}
+
+/*
+ * Apply parsed connection options to @conn.
+ *
+ * Numeric limits are copied only when the matching set_* flag is on.
+ * Capability flags are then added to or removed from conn->want; for a
+ * capability with both an enabling and a disabling flag the disabling one
+ * is applied last.
+ */
+void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts,
+                               struct fuse_conn_info *conn)
+{
+    if (opts->set_max_write) {
+        conn->max_write = opts->max_write;
+    }
+    if (opts->set_max_readahead) {
+        conn->max_readahead = opts->max_readahead;
+    }
+    if (opts->set_max_background) {
+        conn->max_background = opts->max_background;
+    }
+    if (opts->set_congestion_threshold) {
+        conn->congestion_threshold = opts->congestion_threshold;
+    }
+    if (opts->set_time_gran) {
+        conn->time_gran = opts->time_gran;
+    }
+
+#define LL_ENABLE(cond, cap)        \
+    do {                            \
+        if (cond) {                 \
+            conn->want |= (cap);    \
+        }                           \
+    } while (0)
+#define LL_DISABLE(cond, cap)       \
+    do {                            \
+        if (cond) {                 \
+            conn->want &= ~(cap);   \
+        }                           \
+    } while (0)
+
+    /* Capabilities that can be switched either way */
+    LL_ENABLE(opts->splice_read, FUSE_CAP_SPLICE_READ);
+    LL_DISABLE(opts->no_splice_read, FUSE_CAP_SPLICE_READ);
+
+    LL_ENABLE(opts->splice_write, FUSE_CAP_SPLICE_WRITE);
+    LL_DISABLE(opts->no_splice_write, FUSE_CAP_SPLICE_WRITE);
+
+    LL_ENABLE(opts->splice_move, FUSE_CAP_SPLICE_MOVE);
+    LL_DISABLE(opts->no_splice_move, FUSE_CAP_SPLICE_MOVE);
+
+    LL_ENABLE(opts->auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA);
+    LL_DISABLE(opts->no_auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA);
+
+    /* readdirplus can only be switched off here */
+    LL_DISABLE(opts->no_readdirplus, FUSE_CAP_READDIRPLUS);
+    LL_DISABLE(opts->no_readdirplus_auto, FUSE_CAP_READDIRPLUS_AUTO);
+
+    LL_ENABLE(opts->async_dio, FUSE_CAP_ASYNC_DIO);
+    LL_DISABLE(opts->no_async_dio, FUSE_CAP_ASYNC_DIO);
+
+    LL_ENABLE(opts->writeback_cache, FUSE_CAP_WRITEBACK_CACHE);
+    LL_DISABLE(opts->no_writeback_cache, FUSE_CAP_WRITEBACK_CACHE);
+
+    LL_ENABLE(opts->async_read, FUSE_CAP_ASYNC_READ);
+    LL_DISABLE(opts->sync_read, FUSE_CAP_ASYNC_READ);
+
+    /* Remote locking can only be switched off here */
+    LL_DISABLE(opts->no_remote_posix_lock, FUSE_CAP_POSIX_LOCKS);
+    LL_DISABLE(opts->no_remote_flock, FUSE_CAP_FLOCK_LOCKS);
+}
+
+/*
+ * Parse the connection options found in @args.
+ *
+ * Returns a zero-initialised, heap-allocated fuse_conn_info_opts filled in
+ * from conn_info_opt_spec[]; the caller releases it with free().  Returns
+ * NULL if the allocation or the option parsing fails.
+ */
+struct fuse_conn_info_opts *fuse_parse_conn_info_opts(struct fuse_args *args)
+{
+    struct fuse_conn_info_opts *parsed = calloc(1, sizeof(*parsed));
+
+    if (!parsed) {
+        fuse_log(FUSE_LOG_ERR, "calloc failed\n");
+        return NULL;
+    }
+
+    if (fuse_opt_parse(args, parsed, conn_info_opt_spec, NULL) != -1) {
+        return parsed;
+    }
+
+    free(parsed);
+    return NULL;
+}
diff --git a/tools/virtiofsd/meson.build b/tools/virtiofsd/meson.build
new file mode 100644
index 000000000..c134ba633
--- /dev/null
+++ b/tools/virtiofsd/meson.build
@@ -0,0 +1,18 @@
+# Build the virtiofsd executable from the sources below and install it
+# into libexecdir.
+executable('virtiofsd', files(
+ 'buffer.c',
+ 'fuse_opt.c',
+ 'fuse_log.c',
+ 'fuse_lowlevel.c',
+ 'fuse_signals.c',
+ 'fuse_virtio.c',
+ 'helper.c',
+ 'passthrough_ll.c',
+ 'passthrough_seccomp.c'),
+ dependencies: [seccomp, qemuutil, libcap_ng, vhost_user],
+ install: true,
+ install_dir: get_option('libexecdir'))
+
+# Generate 50-qemu-virtiofsd.json from its .in template, substituting
+# 'libexecdir' with prefix/libexecdir, and install it under
+# qemu_datadir/vhost-user.
+configure_file(input: '50-qemu-virtiofsd.json.in',
+ output: '50-qemu-virtiofsd.json',
+ configuration: { 'libexecdir' : get_option('prefix') / get_option('libexecdir') },
+ install_dir: qemu_datadir / 'vhost-user')
diff --git a/tools/virtiofsd/passthrough_helpers.h b/tools/virtiofsd/passthrough_helpers.h
new file mode 100644
index 000000000..0b98275ed
--- /dev/null
+++ b/tools/virtiofsd/passthrough_helpers.h
@@ -0,0 +1,51 @@
+/*
+ * FUSE: Filesystem in Userspace
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE
+ */
+
+/*
+ * Create the file system object requested by a FUSE_MKNOD operation at
+ * @path, relative to @dirfd.  The creation call is picked from the file
+ * type bits in @mode; @link is the target used when a symlink is asked for.
+ *
+ * Returns the result of the underlying call.  For a regular file that is
+ * the result of closing the freshly created file, or the failed openat()
+ * result.
+ */
+static int mknod_wrapper(int dirfd, const char *path, const char *link,
+                         int mode, dev_t rdev)
+{
+    int fd;
+
+    if (S_ISREG(mode)) {
+        fd = openat(dirfd, path, O_CREAT | O_EXCL | O_WRONLY, mode);
+        return fd < 0 ? fd : close(fd);
+    }
+
+    if (S_ISDIR(mode)) {
+        return mkdirat(dirfd, path, mode);
+    }
+
+    if (S_ISLNK(mode) && link != NULL) {
+        return symlinkat(link, dirfd, path);
+    }
+
+    if (S_ISFIFO(mode)) {
+        return mkfifoat(dirfd, path, mode);
+    }
+
+    /* Everything else, including a symlink request without a target */
+    return mknodat(dirfd, path, mode, rdev);
+}
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
new file mode 100644
index 000000000..64b5b4fbb
--- /dev/null
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -0,0 +1,4090 @@
+/*
+ * FUSE: Filesystem in Userspace
+ * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu>
+ *
+ * This program can be distributed under the terms of the GNU GPLv2.
+ * See the file COPYING.
+ */
+
+/*
+ *
+ * This file system mirrors the existing file system hierarchy of the
+ * system, starting at the root file system. This is implemented by
+ * just "passing through" all requests to the corresponding user-space
+ * libc functions. In contrast to passthrough.c and passthrough_fh.c,
+ * this implementation uses the low-level API. Its performance should
+ * be the least bad among the three, but many operations are not
+ * implemented. In particular, it is not possible to remove files (or
+ * directories) because the code necessary to defer actual removal
+ * until the file is not opened anymore would make the example much
+ * more complicated.
+ *
+ * When writeback caching is enabled (-o writeback mount option), it
+ * is only possible to write to files for which the mounting user has
+ * read permissions. This is because the writeback cache requires the
+ * kernel to be able to issue read requests for all files (which the
+ * passthrough filesystem cannot satisfy if it can't read the file in
+ * the underlying filesystem).
+ *
+ * Compile with:
+ *
+ * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o
+ * passthrough_ll
+ *
+ * ## Source code ##
+ * \include passthrough_ll.c
+ */
+
+#include "qemu/osdep.h"
+#include "qemu/timer.h"
+#include "qemu-version.h"
+#include "qemu-common.h"
+#include "fuse_virtio.h"
+#include "fuse_log.h"
+#include "fuse_lowlevel.h"
+#include "standard-headers/linux/fuse.h"
+#include <cap-ng.h>
+#include <dirent.h>
+#include <pthread.h>
+#include <sys/file.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+#include <sys/resource.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <sys/xattr.h>
+#include <syslog.h>
+
+#include "qemu/cutils.h"
+#include "passthrough_helpers.h"
+#include "passthrough_seccomp.h"
+
+/*
+ * Keep track of inode posix locks for each owner.
+ *
+ * Objects of this type are the values stored in lo_inode->posix_locks;
+ * posix_locks_value_destroy() closes @fd and frees the object.
+ */
+struct lo_inode_plock {
+ uint64_t lock_owner; /* presumably the FUSE lock owner id - TODO confirm */
+ int fd; /* fd for OFD locks */
+};
+
+/*
+ * One slot of a struct lo_map.
+ *
+ * While in_use is true the union holds the mapped object; which member is
+ * meaningful depends on the map the slot belongs to (ino_map, dirp_map or
+ * fd_map).  While in_use is false, freelist holds the index of the next
+ * free slot, or -1 at the end of the free list.
+ */
+struct lo_map_elem {
+ union {
+ struct lo_inode *inode;
+ struct lo_dirp *dirp;
+ int fd;
+ ssize_t freelist;
+ };
+ bool in_use;
+};
+
+/*
+ * Maps FUSE fh or ino values to internal objects.
+ *
+ * The key is simply the index of a slot in @elems.
+ */
+struct lo_map {
+ struct lo_map_elem *elems; /* array of nelems slots, see lo_map_grow() */
+ size_t nelems; /* number of slots in elems */
+ ssize_t freelist; /* index of the first free slot, -1 if none */
+};
+
+/*
+ * Identity of an inode, used as the key of the lo_data->inodes hash table
+ * (see lo_find()).
+ */
+struct lo_key {
+ ino_t ino; /* st_ino of the file */
+ dev_t dev; /* st_dev of the file */
+ uint64_t mnt_id; /* mount id from do_statx(), 0 when not available */
+};
+
+/*
+ * In-memory state for one file known to the FUSE client.
+ *
+ * Lifetime is governed by two counters: refcount (atomic, object lifetime)
+ * and nlookup (references handed out to the client).
+ */
+struct lo_inode {
+ int fd; /* opened with O_PATH | O_NOFOLLOW in lo_do_lookup() */
+
+ /*
+ * Atomic reference count for this object. The nlookup field holds a
+ * reference and releases it when nlookup reaches 0.
+ */
+ gint refcount;
+
+ struct lo_key key;
+
+ /*
+ * This counter keeps the inode alive during the FUSE session.
+ * Incremented when the FUSE inode number is sent in a reply
+ * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is
+ * released by a FUSE_FORGET request.
+ *
+ * Note that this value is untrusted because the client can manipulate
+ * it arbitrarily using FUSE_FORGET requests.
+ *
+ * Protected by lo->mutex.
+ */
+ uint64_t nlookup;
+
+ fuse_ino_t fuse_ino; /* slot index returned by lo_add_inode_mapping() */
+ pthread_mutex_t plock_mutex; /* only initialised when lo->posix_lock is set */
+ GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */
+
+ mode_t filetype; /* S_IFMT bits of st_mode, cached at lookup time */
+};
+
+/*
+ * Credentials saved by lo_change_cred() so that lo_restore_cred() can
+ * switch back to them.
+ */
+struct lo_cred {
+ uid_t euid; /* effective uid before the switch */
+ gid_t egid; /* effective gid before the switch */
+ mode_t umask; /* previous umask; only filled in when the umask was changed */
+};
+
+/* Values of lo_data->cache, selected by the cache=none|auto|always options */
+enum {
+ CACHE_NONE,
+ CACHE_AUTO,
+ CACHE_ALWAYS,
+};
+
+/* Values of lo_data->sandbox, selected by the sandbox=namespace|chroot options */
+enum {
+ SANDBOX_NAMESPACE,
+ SANDBOX_CHROOT,
+};
+
+/*
+ * One entry of lo_data->xattr_map_list.
+ * NOTE(review): presumably one parsed rule of the xattrmap= option; the
+ * exact meaning of the fields is defined by the xattr mapping code, which
+ * is not visible here - confirm before relying on it.
+ */
+typedef struct xattr_map_entry {
+ char *key;
+ char *prepend;
+ unsigned int flags;
+} XattrMapEntry;
+
+/*
+ * State of the passthrough file system, reached from a request through
+ * lo_data(req).  It holds the option values filled in through lo_opts,
+ * the table of known inodes and the maps that translate FUSE ino/fh
+ * values into internal objects.
+ */
+struct lo_data {
+ pthread_mutex_t mutex;
+ int sandbox; /* SANDBOX_NAMESPACE or SANDBOX_CHROOT */
+ int debug;
+ int writeback;
+ int flock;
+ int posix_lock;
+ int xattr;
+ char *xattrmap;
+ char *xattr_security_capability;
+ char *source;
+ char *modcaps;
+ double timeout; /* used as attr/entry timeout in replies */
+ int cache; /* CACHE_NONE, CACHE_AUTO or CACHE_ALWAYS */
+ int timeout_set;
+ int readdirplus_set;
+ int readdirplus_clear;
+ int allow_direct_io;
+ int announce_submounts;
+ bool use_statx; /* cleared by do_statx() once statx() returns ENOSYS */
+ struct lo_inode root;
+ GHashTable *inodes; /* protected by lo->mutex */
+ struct lo_map ino_map; /* protected by lo->mutex */
+ struct lo_map dirp_map; /* protected by lo->mutex */
+ struct lo_map fd_map; /* protected by lo->mutex */
+ XattrMapEntry *xattr_map_list;
+ size_t xattr_map_nentries;
+
+ /* An O_PATH file descriptor to /proc/self/fd/ */
+ int proc_self_fd;
+ /*
+ * user_killpriv_v2 is what the user asked for (1 on, 0 off, -1 not
+ * specified); killpriv_v2 is the outcome decided in lo_init().
+ */
+ int user_killpriv_v2, killpriv_v2;
+ /* If set, virtiofsd is responsible for setting umask during creation */
+ bool change_umask;
+ /* user_posix_acl is the user's request; posix_acl is set in lo_init() */
+ int user_posix_acl, posix_acl;
+};
+
+/*
+ * Option table for struct lo_data.  Each entry names an option template,
+ * the offset of the lo_data field it controls and the value associated
+ * with it; templates containing a conversion ("%s", "%lf") take their
+ * value from the option text instead.
+ *
+ * "timeout" appears twice on purpose: the first entry stores the parsed
+ * number, the second records in timeout_set that the option was given.
+ */
+static const struct fuse_opt lo_opts[] = {
+ { "sandbox=namespace",
+ offsetof(struct lo_data, sandbox),
+ SANDBOX_NAMESPACE },
+ { "sandbox=chroot",
+ offsetof(struct lo_data, sandbox),
+ SANDBOX_CHROOT },
+ { "writeback", offsetof(struct lo_data, writeback), 1 },
+ { "no_writeback", offsetof(struct lo_data, writeback), 0 },
+ { "source=%s", offsetof(struct lo_data, source), 0 },
+ { "flock", offsetof(struct lo_data, flock), 1 },
+ { "no_flock", offsetof(struct lo_data, flock), 0 },
+ { "posix_lock", offsetof(struct lo_data, posix_lock), 1 },
+ { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 },
+ { "xattr", offsetof(struct lo_data, xattr), 1 },
+ { "no_xattr", offsetof(struct lo_data, xattr), 0 },
+ { "xattrmap=%s", offsetof(struct lo_data, xattrmap), 0 },
+ { "modcaps=%s", offsetof(struct lo_data, modcaps), 0 },
+ { "timeout=%lf", offsetof(struct lo_data, timeout), 0 },
+ { "timeout=", offsetof(struct lo_data, timeout_set), 1 },
+ { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE },
+ { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO },
+ { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS },
+ { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 },
+ { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 },
+ { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 },
+ { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 },
+ { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 },
+ { "killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 1 },
+ { "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 },
+ { "posix_acl", offsetof(struct lo_data, user_posix_acl), 1 },
+ { "no_posix_acl", offsetof(struct lo_data, user_posix_acl), 0 },
+ FUSE_OPT_END
+};
+static bool use_syslog = false;
+static int current_log_level;
+static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
+ uint64_t n);
+
+/*
+ * Saved libcap-ng state shared between threads.  load_capng() restores
+ * from @saved and then takes a new snapshot into it, all under @mutex.
+ */
+static struct {
+ pthread_mutex_t mutex;
+ void *saved;
+} cap;
+/* Set once the current thread has loaded cap-ng state from the saved copy */
+static __thread bool cap_loaded = 0;
+
+static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
+ uint64_t mnt_id);
+static int xattr_map_client(const struct lo_data *lo, const char *client_name,
+ char **out_name);
+
+/* Is `name` exactly "." or ".."? */
+static bool is_dot_or_dotdot(const char *name)
+{
+    if (name[0] != '.') {
+        return false;
+    }
+    if (name[1] == '\0') {
+        return true;
+    }
+    return name[1] == '.' && name[2] == '\0';
+}
+
+/*
+ * Is `path` a single path component (no '/' anywhere in it) that is
+ * neither "." nor ".."?
+ */
+static bool is_safe_path_component(const char *path)
+{
+    bool has_separator = strchr(path, '/') != NULL;
+
+    return !has_separator && !is_dot_or_dotdot(path);
+}
+
+/* Is `name` the empty string? */
+static bool is_empty(const char *name)
+{
+    return *name == '\0';
+}
+
+/* Return the userdata attached to @req, viewed as a struct lo_data. */
+static struct lo_data *lo_data(fuse_req_t req)
+{
+    struct lo_data *lo = (struct lo_data *)fuse_req_userdata(req);
+
+    return lo;
+}
+
+/*
+ * Make sure the calling thread has loaded libcap-ng's state from the
+ * copy kept in cap.saved.  Does nothing if this thread already did so.
+ *
+ * Returns 0 on success and -EINVAL if a new snapshot of the state could
+ * not be saved.
+ */
+static int load_capng(void)
+{
+    void *snapshot;
+
+    if (cap_loaded) {
+        return 0;
+    }
+
+    pthread_mutex_lock(&cap.mutex);
+    capng_restore_state(&cap.saved);
+    /* restore_state frees the saved copy, so take another one right away */
+    snapshot = capng_save_state();
+    cap.saved = snapshot;
+    pthread_mutex_unlock(&cap.mutex);
+
+    if (!snapshot) {
+        fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n");
+        return -EINVAL;
+    }
+
+    /* The loaded state must refer to this thread, not to the original one */
+    capng_setpid(syscall(SYS_gettid));
+    cap_loaded = true;
+    return 0;
+}
+
+/*
+ * Helpers for dropping and regaining effective capabilities.
+ *
+ * drop_effective_cap() removes capability @cap_name from the calling
+ * thread's effective set.
+ *
+ * @cap_dropped, if non-NULL, is set to true only when the capability was
+ * present and has actually been dropped.  It is left untouched in every
+ * other case, so callers must initialise it themselves.
+ *
+ * Returns 0 on success (including when the capability was not in the
+ * effective set to begin with) and a non-zero, positive errno-style value
+ * on error.
+ */
+static int drop_effective_cap(const char *cap_name, bool *cap_dropped)
+{
+    /* Not called "cap": that would shadow the file-scope cap state */
+    int cap_id, ret;
+
+    cap_id = capng_name_to_capability(cap_name);
+    if (cap_id < 0) {
+        /*
+         * errno may well be 0 here (libcap-ng is not documented to set
+         * it), and returning 0 would look like success to the caller.
+         */
+        ret = errno ? errno : EINVAL;
+        fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
+                 cap_name, strerror(errno));
+        goto out;
+    }
+
+    ret = load_capng();
+    if (ret) {
+        /* load_capng() reports failure as a negative errno value */
+        ret = -ret;
+        fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
+        goto out;
+    }
+
+    /* We dont have this capability in effective set already. */
+    if (!capng_have_capability(CAPNG_EFFECTIVE, cap_id)) {
+        ret = 0;
+        goto out;
+    }
+
+    if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap_id)) {
+        ret = errno ? errno : EINVAL;
+        fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n");
+        goto out;
+    }
+
+    if (capng_apply(CAPNG_SELECT_CAPS)) {
+        ret = errno ? errno : EPERM;
+        fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n");
+        goto out;
+    }
+
+    ret = 0;
+    if (cap_dropped) {
+        *cap_dropped = true;
+    }
+
+out:
+    return ret;
+}
+
+/*
+ * Add capability @cap_name back to the calling thread's effective set.
+ *
+ * Returns 0 on success and a non-zero, positive errno-style value on
+ * error.
+ */
+static int gain_effective_cap(const char *cap_name)
+{
+    /* Not called "cap": that would shadow the file-scope cap state */
+    int cap_id;
+    int ret = 0;
+
+    cap_id = capng_name_to_capability(cap_name);
+    if (cap_id < 0) {
+        /*
+         * errno may well be 0 here (libcap-ng is not documented to set
+         * it), and returning 0 would look like success to the caller.
+         */
+        ret = errno ? errno : EINVAL;
+        fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n",
+                 cap_name, strerror(errno));
+        goto out;
+    }
+
+    ret = load_capng();
+    if (ret) {
+        /* load_capng() reports failure as a negative errno value */
+        ret = -ret;
+        fuse_log(FUSE_LOG_ERR, "load_capng() failed\n");
+        goto out;
+    }
+
+    if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap_id)) {
+        ret = errno ? errno : EINVAL;
+        fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n");
+        goto out;
+    }
+
+    if (capng_apply(CAPNG_SELECT_CAPS)) {
+        ret = errno ? errno : EPERM;
+        fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n");
+        goto out;
+    }
+    ret = 0;
+
+out:
+    return ret;
+}
+
+/*
+ * The host kernel normally drops security.capability xattrs on any
+ * write.  When xattr names are being remapped, the client's
+ * security.capability lives under a different name on the host, so it has
+ * to be removed explicitly from the file behind @fd.
+ *
+ * Returns 0 on success, also when the attribute did not exist or is not
+ * supported by the file system, and the errno value otherwise.
+ */
+static int drop_security_capability(const struct lo_data *lo, int fd)
+{
+    int err;
+
+    /* We didn't remap the name, let the host kernel do it */
+    if (!lo->xattr_security_capability) {
+        return 0;
+    }
+
+    if (fremovexattr(fd, lo->xattr_security_capability) == 0) {
+        return 0;
+    }
+
+    err = errno;
+    if (err == ENODATA || err == ENOTSUP) {
+        /* No such attribute, or no support for it at all: both are fine */
+        return 0;
+    }
+
+    return err;
+}
+
+/* Put @map into the empty state: no slots and an empty free list. */
+static void lo_map_init(struct lo_map *map)
+{
+    map->freelist = -1;
+    map->nelems = 0;
+    map->elems = NULL;
+}
+
+/* Free the slot array of @map; the mapped objects are not touched. */
+static void lo_map_destroy(struct lo_map *map)
+{
+    struct lo_map_elem *slots = map->elems;
+
+    g_free(slots);
+}
+
+/*
+ * Make sure @map has at least @new_nelems slots.
+ *
+ * Newly added slots are marked unused and pushed onto the front of the
+ * free list.  Slots that were already on the free list stay reachable
+ * behind them.
+ *
+ * Returns 1 on success, also when the map is already large enough, and 0
+ * if the array could not be reallocated, in which case @map is unchanged.
+ */
+static int lo_map_grow(struct lo_map *map, size_t new_nelems)
+{
+    struct lo_map_elem *new_elems;
+    size_t i;
+
+    if (new_nelems <= map->nelems) {
+        return 1;
+    }
+
+    new_elems = g_try_realloc_n(map->elems, new_nelems, sizeof(map->elems[0]));
+    if (!new_elems) {
+        return 0;
+    }
+
+    for (i = map->nelems; i < new_nelems; i++) {
+        new_elems[i].freelist = i + 1;
+        new_elems[i].in_use = false;
+    }
+    /*
+     * Chain the last new slot to the previous head of the free list (-1
+     * when the list was empty) rather than terminating the list here,
+     * otherwise slots that were already free would be lost.
+     */
+    new_elems[new_nelems - 1].freelist = map->freelist;
+
+    map->elems = new_elems;
+    map->freelist = map->nelems;
+    map->nelems = new_nelems;
+    return 1;
+}
+
+/*
+ * Take a free slot out of @map, growing the map by 256 slots first if the
+ * free list is empty.  Returns the slot marked as in use, or NULL if the
+ * map could not be grown.
+ */
+static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map)
+{
+    struct lo_map_elem *slot;
+
+    if (map->freelist == -1) {
+        if (!lo_map_grow(map, map->nelems + 256)) {
+            return NULL;
+        }
+    }
+
+    /* Pop the head of the free list */
+    slot = &map->elems[map->freelist];
+    map->freelist = slot->freelist;
+    slot->in_use = true;
+
+    return slot;
+}
+
+/*
+ * Claim the slot with index @key, growing @map as needed so that the
+ * index exists.  Returns the slot marked as in use, or NULL if the map
+ * could not be grown or the slot is not on the free list.
+ */
+static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key)
+{
+    struct lo_map_elem *slot;
+    ssize_t *link;
+
+    if (!lo_map_grow(map, key + 1)) {
+        return NULL;
+    }
+
+    /* Walk the free list, remembering the link that points at each slot */
+    link = &map->freelist;
+    while (*link != -1) {
+        if (*link == key) {
+            slot = &map->elems[key];
+            *link = slot->freelist;
+            slot->in_use = true;
+            return slot;
+        }
+        link = &map->elems[*link].freelist;
+    }
+
+    return NULL;
+}
+
+/*
+ * Look up slot @key in @map.  Returns NULL when the index is out of range
+ * or the slot is not in use.
+ */
+static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key)
+{
+    if (key < map->nelems && map->elems[key].in_use) {
+        return &map->elems[key];
+    }
+    return NULL;
+}
+
+/*
+ * Release slot @key of @map and put it at the head of the free list.
+ * Out-of-range indexes and slots that are not in use are ignored.
+ */
+static void lo_map_remove(struct lo_map *map, size_t key)
+{
+    struct lo_map_elem *slot;
+
+    if (key >= map->nelems) {
+        return;
+    }
+
+    slot = &map->elems[key];
+    if (slot->in_use) {
+        slot->in_use = false;
+        slot->freelist = map->freelist;
+        map->freelist = key;
+    }
+}
+
+/*
+ * Store @fd in a new slot of lo->fd_map and return the slot index, or -1
+ * if no slot could be allocated.
+ *
+ * Assumes lo->mutex is held
+ */
+static ssize_t lo_add_fd_mapping(struct lo_data *lo, int fd)
+{
+    struct lo_map *map = &lo->fd_map;
+    struct lo_map_elem *slot = lo_map_alloc_elem(map);
+
+    if (slot == NULL) {
+        return -1;
+    }
+
+    slot->fd = fd;
+    return slot - map->elems;
+}
+
+/*
+ * Store @dirp in a new slot of the dirp_map and return the slot index, or
+ * -1 if no slot could be allocated.
+ *
+ * Assumes lo->mutex is held
+ */
+static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp)
+{
+    struct lo_map_elem *slot = lo_map_alloc_elem(&lo_data(req)->dirp_map);
+
+    if (slot == NULL) {
+        return -1;
+    }
+
+    slot->dirp = dirp;
+    return slot - lo_data(req)->dirp_map.elems;
+}
+
+/*
+ * Store @inode in a new slot of the ino_map and return the slot index, or
+ * -1 if no slot could be allocated.
+ *
+ * Assumes lo->mutex is held
+ */
+static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode)
+{
+    struct lo_map_elem *slot = lo_map_alloc_elem(&lo_data(req)->ino_map);
+
+    if (slot == NULL) {
+        return -1;
+    }
+
+    slot->inode = inode;
+    return slot - lo_data(req)->ino_map.elems;
+}
+
+/*
+ * Drop one reference to *inodep and set *inodep to NULL.  When the last
+ * reference goes away the inode's fd is closed and the object is freed.
+ * A NULL *inodep is accepted and ignored.
+ */
+static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep)
+{
+    struct lo_inode *victim = *inodep;
+
+    if (victim == NULL) {
+        return;
+    }
+
+    *inodep = NULL;
+
+    if (!g_atomic_int_dec_and_test(&victim->refcount)) {
+        return;
+    }
+
+    close(victim->fd);
+    free(victim);
+}
+
+/*
+ * Look up the inode registered under FUSE inode number @ino and take a
+ * reference to it.  Returns NULL if @ino is not a slot in use in the
+ * ino_map.
+ *
+ * Caller must release refcount using lo_inode_put()
+ */
+static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_map_elem *elem;
+    struct lo_inode *inode = NULL;
+
+    pthread_mutex_lock(&lo->mutex);
+    elem = lo_map_get(&lo->ino_map, ino);
+    if (elem) {
+        /*
+         * Fetch the pointer while the lock is held: elem points into the
+         * ino_map array, which lo_map_grow() may reallocate as soon as
+         * the mutex is released.
+         */
+        inode = elem->inode;
+        g_atomic_int_inc(&inode->refcount);
+    }
+    pthread_mutex_unlock(&lo->mutex);
+
+    return inode;
+}
+
+/*
+ * Return the fd of the inode registered under @ino, or -1 if there is no
+ * such inode.  The inode reference is dropped again before returning.
+ *
+ * TODO Remove this helper and force callers to hold an inode refcount until
+ * they are done with the fd. This will be done in a later patch to make
+ * review easier.
+ */
+static int lo_fd(fuse_req_t req, fuse_ino_t ino)
+{
+    struct lo_inode *inode = lo_inode(req, ino);
+    int fd = -1;
+
+    if (inode) {
+        fd = inode->fd;
+        lo_inode_put(lo_data(req), &inode);
+    }
+
+    return fd;
+}
+
+/*
+ * Open a file descriptor for an inode. Returns the new fd on success,
+ * -EBADF if the inode is not a regular file or a directory, and -errno if
+ * the open itself fails.
+ *
+ * Use this helper function instead of raw openat(2) to prevent security issues
+ * when a malicious client opens special files such as block device nodes.
+ * Symlink inodes are also rejected since symlinks must already have been
+ * traversed on the client side.
+ */
+static int lo_inode_open(struct lo_data *lo, struct lo_inode *inode,
+                         int open_flags)
+{
+    g_autofree char *proc_name = g_strdup_printf("%d", inode->fd);
+    bool openable = S_ISREG(inode->filetype) || S_ISDIR(inode->filetype);
+    int new_fd;
+
+    if (!openable) {
+        return -EBADF;
+    }
+
+    /*
+     * The entry we open under proc_self_fd is a symlink, so O_NOFOLLOW
+     * has to be masked out. The file type was checked above, but if an
+     * external process races with us then symlinks are traversed here. It
+     * is not possible to escape the shared directory since it is mounted
+     * as "/" though.
+     */
+    new_fd = openat(lo->proc_self_fd, proc_name, open_flags & ~O_NOFOLLOW);
+
+    return new_fd < 0 ? -errno : new_fd;
+}
+
+/*
+ * Decide which connection capabilities to request.
+ *
+ * @userdata is the struct lo_data; @conn->capable holds what the other
+ * side offers and @conn->want is adjusted here according to the options
+ * stored in lo_data: export support, writeback cache, flock and posix
+ * locks, readdirplus, submount announcement, killpriv_v2 and posix ACLs.
+ * The outcome of the killpriv_v2 and posix ACL decisions is recorded in
+ * lo->killpriv_v2, lo->posix_acl and lo->change_umask.
+ */
+static void lo_init(void *userdata, struct fuse_conn_info *conn)
+{
+ struct lo_data *lo = (struct lo_data *)userdata;
+
+ if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) {
+ conn->want |= FUSE_CAP_EXPORT_SUPPORT;
+ }
+
+ if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n");
+ conn->want |= FUSE_CAP_WRITEBACK_CACHE;
+ }
+ if (conn->capable & FUSE_CAP_FLOCK_LOCKS) {
+ if (lo->flock) {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n");
+ conn->want |= FUSE_CAP_FLOCK_LOCKS;
+ } else {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n");
+ conn->want &= ~FUSE_CAP_FLOCK_LOCKS;
+ }
+ }
+
+ if (conn->capable & FUSE_CAP_POSIX_LOCKS) {
+ if (lo->posix_lock) {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n");
+ conn->want |= FUSE_CAP_POSIX_LOCKS;
+ } else {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n");
+ conn->want &= ~FUSE_CAP_POSIX_LOCKS;
+ }
+ }
+
+ /*
+ * readdirplus is turned off when explicitly disabled, and also with
+ * cache=none unless the user explicitly asked for readdirplus.
+ */
+ if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) ||
+ lo->readdirplus_clear) {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n");
+ conn->want &= ~FUSE_CAP_READDIRPLUS;
+ }
+
+ if (!(conn->capable & FUSE_CAP_SUBMOUNTS) && lo->announce_submounts) {
+ fuse_log(FUSE_LOG_WARNING, "lo_init: Cannot announce submounts, client "
+ "does not support it\n");
+ lo->announce_submounts = false;
+ }
+
+ if (lo->user_killpriv_v2 == 1) {
+ /*
+ * User explicitly asked for this option. Enable it unconditionally.
+ * If connection does not have this capability, it should fail
+ * in fuse_lowlevel.c
+ */
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
+ conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
+ lo->killpriv_v2 = 1;
+ } else if (lo->user_killpriv_v2 == -1 &&
+ conn->capable & FUSE_CAP_HANDLE_KILLPRIV_V2) {
+ /*
+ * User did not specify a value for killpriv_v2. By default enable it
+ * if connection offers this capability
+ */
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n");
+ conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2;
+ lo->killpriv_v2 = 1;
+ } else {
+ /*
+ * Either user specified to disable killpriv_v2, or connection does
+ * not offer this capability. Disable killpriv_v2 in both the cases
+ */
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling killpriv_v2\n");
+ conn->want &= ~FUSE_CAP_HANDLE_KILLPRIV_V2;
+ lo->killpriv_v2 = 0;
+ }
+
+ if (lo->user_posix_acl == 1) {
+ /*
+ * User explicitly asked for this option. Enable it unconditionally.
+ * If connection does not have this capability, print error message
+ * now. It will fail later in fuse_lowlevel.c
+ */
+ if (!(conn->capable & FUSE_CAP_POSIX_ACL) ||
+ !(conn->capable & FUSE_CAP_DONT_MASK) ||
+ !(conn->capable & FUSE_CAP_SETXATTR_EXT)) {
+ fuse_log(FUSE_LOG_ERR, "lo_init: Can not enable posix acl."
+ " kernel does not support FUSE_POSIX_ACL, FUSE_DONT_MASK"
+ " or FUSE_SETXATTR_EXT capability.\n");
+ } else {
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling posix acl\n");
+ }
+
+ conn->want |= FUSE_CAP_POSIX_ACL | FUSE_CAP_DONT_MASK |
+ FUSE_CAP_SETXATTR_EXT;
+ lo->change_umask = true;
+ lo->posix_acl = true;
+ } else {
+ /* User either did not specify anything or wants it disabled */
+ fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix_acl\n");
+ conn->want &= ~FUSE_CAP_POSIX_ACL;
+ }
+}
+
+/*
+ * Reply to @req with the attributes of inode @ino, read from the inode's
+ * fd without following symlinks, or with errno if they cannot be read.
+ * @fi is not used.
+ */
+static void lo_getattr(fuse_req_t req, fuse_ino_t ino,
+                       struct fuse_file_info *fi)
+{
+    struct lo_data *lo = lo_data(req);
+    struct stat st;
+    int inode_fd;
+
+    (void)fi;
+
+    inode_fd = lo_fd(req, ino);
+    if (fstatat(inode_fd, "", &st,
+                AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW) == -1) {
+        fuse_reply_err(req, errno);
+        return;
+    }
+
+    fuse_reply_attr(req, &st, lo->timeout);
+}
+
+/*
+ * Translate the file handle in @fi->fh into the file descriptor stored in
+ * the fd_map.  Returns -1 if the handle does not name a slot in use.
+ */
+static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_map_elem *elem;
+    int fd = -1;
+
+    pthread_mutex_lock(&lo->mutex);
+    elem = lo_map_get(&lo->fd_map, fi->fh);
+    if (elem) {
+        /*
+         * Copy the fd while the lock is held: elem points into the fd_map
+         * array, which lo_map_grow() may reallocate as soon as the mutex
+         * is released.
+         */
+        fd = elem->fd;
+    }
+    pthread_mutex_unlock(&lo->mutex);
+
+    return fd;
+}
+
+/*
+ * Apply the attribute changes selected by the FUSE_SET_ATTR_* bits in
+ * @valid to inode @ino, taking the new values from @attr.
+ *
+ * When @fi is given, the fd behind fi->fh is used for mode, size and time
+ * changes; otherwise the inode is reached through its own fd (via
+ * lo->proc_self_fd, or a temporary fd from lo_inode_open() for truncation).
+ * Ownership changes always go through the inode's fd.
+ *
+ * On success the reply is produced by lo_getattr(); on the first failure
+ * an error reply is sent and the remaining changes are not attempted.
+ */
+static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr,
+ int valid, struct fuse_file_info *fi)
+{
+ int saverr;
+ char procname[64];
+ struct lo_data *lo = lo_data(req);
+ struct lo_inode *inode;
+ int ifd;
+ int res;
+ int fd = -1;
+
+ inode = lo_inode(req, ino);
+ if (!inode) {
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ ifd = inode->fd;
+
+ /* If fi->fh is invalid we'll report EBADF later */
+ if (fi) {
+ fd = lo_fi_fd(req, fi);
+ }
+
+ if (valid & FUSE_SET_ATTR_MODE) {
+ if (fi) {
+ res = fchmod(fd, attr->st_mode);
+ } else {
+ sprintf(procname, "%i", ifd);
+ res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0);
+ }
+ if (res == -1) {
+ saverr = errno;
+ goto out_err;
+ }
+ }
+ if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) {
+ /* -1 leaves the corresponding id unchanged */
+ uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1;
+ gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1;
+
+ saverr = drop_security_capability(lo, ifd);
+ if (saverr) {
+ goto out_err;
+ }
+
+ res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
+ if (res == -1) {
+ saverr = errno;
+ goto out_err;
+ }
+ }
+ if (valid & FUSE_SET_ATTR_SIZE) {
+ int truncfd;
+ bool kill_suidgid;
+ bool cap_fsetid_dropped = false;
+
+ kill_suidgid = lo->killpriv_v2 && (valid & FUSE_SET_ATTR_KILL_SUIDGID);
+ if (fi) {
+ truncfd = fd;
+ } else {
+ /* truncfd is ours in this case and is closed on every path below */
+ truncfd = lo_inode_open(lo, inode, O_RDWR);
+ if (truncfd < 0) {
+ saverr = -truncfd;
+ goto out_err;
+ }
+ }
+
+ saverr = drop_security_capability(lo, truncfd);
+ if (saverr) {
+ if (!fi) {
+ close(truncfd);
+ }
+ goto out_err;
+ }
+
+ if (kill_suidgid) {
+ res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
+ if (res != 0) {
+ saverr = res;
+ if (!fi) {
+ close(truncfd);
+ }
+ goto out_err;
+ }
+ }
+
+ res = ftruncate(truncfd, attr->st_size);
+ /* capture errno before the calls below can overwrite it */
+ saverr = res == -1 ? errno : 0;
+
+ if (cap_fsetid_dropped) {
+ if (gain_effective_cap("FSETID")) {
+ fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
+ }
+ }
+ if (!fi) {
+ close(truncfd);
+ }
+ if (res == -1) {
+ goto out_err;
+ }
+ }
+ if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) {
+ struct timespec tv[2];
+
+ /* start with both timestamps left unchanged */
+ tv[0].tv_sec = 0;
+ tv[1].tv_sec = 0;
+ tv[0].tv_nsec = UTIME_OMIT;
+ tv[1].tv_nsec = UTIME_OMIT;
+
+ if (valid & FUSE_SET_ATTR_ATIME_NOW) {
+ tv[0].tv_nsec = UTIME_NOW;
+ } else if (valid & FUSE_SET_ATTR_ATIME) {
+ tv[0] = attr->st_atim;
+ }
+
+ if (valid & FUSE_SET_ATTR_MTIME_NOW) {
+ tv[1].tv_nsec = UTIME_NOW;
+ } else if (valid & FUSE_SET_ATTR_MTIME) {
+ tv[1] = attr->st_mtim;
+ }
+
+ if (fi) {
+ res = futimens(fd, tv);
+ } else {
+ sprintf(procname, "%i", inode->fd);
+ res = utimensat(lo->proc_self_fd, procname, tv, 0);
+ }
+ if (res == -1) {
+ saverr = errno;
+ goto out_err;
+ }
+ }
+ lo_inode_put(lo, &inode);
+
+ return lo_getattr(req, ino, fi);
+
+out_err:
+ lo_inode_put(lo, &inode);
+ fuse_reply_err(req, saverr);
+}
+
+/*
+ * Look up the inode identified by @st (st_ino and st_dev) and @mnt_id in
+ * lo->inodes.  When found, both its nlookup count and its refcount are
+ * incremented before it is returned; NULL is returned otherwise.
+ */
+static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st,
+                                uint64_t mnt_id)
+{
+    struct lo_inode *found;
+    struct lo_key wanted = {
+        .ino = st->st_ino,
+        .dev = st->st_dev,
+        .mnt_id = mnt_id,
+    };
+
+    pthread_mutex_lock(&lo->mutex);
+    found = g_hash_table_lookup(lo->inodes, &wanted);
+    if (found != NULL) {
+        assert(found->nlookup > 0);
+        found->nlookup += 1;
+        g_atomic_int_inc(&found->refcount);
+    }
+    pthread_mutex_unlock(&lo->mutex);
+
+    return found;
+}
+
+/*
+ * value_destroy_func for posix_locks GHashTable: closes the lock's fd and
+ * frees the struct lo_inode_plock passed in @data.
+ */
+static void posix_locks_value_destroy(gpointer data)
+{
+    struct lo_inode_plock *entry = data;
+    int lock_fd = entry->fd;
+
+    /*
+     * We had used open() for locks and had only one fd. So
+     * closing this fd should release all OFD locks.
+     */
+    close(lock_fd);
+    free(entry);
+}
+
+/*
+ * Stat @pathname relative to @dirfd with @flags, filling in @statbuf and
+ * the mount id of the file in @mnt_id.
+ *
+ * When statx() with STATX_MNT_ID is compiled in and lo->use_statx is set,
+ * statx() is used and its result is converted to a struct stat.  If
+ * statx() fails with ENOSYS, lo->use_statx is cleared and fstatat() is
+ * used from then on.  On the fstatat() path, and when statx() does not
+ * report a mount id, *mnt_id is set to 0.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+static int do_statx(struct lo_data *lo, int dirfd, const char *pathname,
+ struct stat *statbuf, int flags, uint64_t *mnt_id)
+{
+ int res;
+
+#if defined(CONFIG_STATX) && defined(STATX_MNT_ID)
+ if (lo->use_statx) {
+ struct statx statxbuf;
+
+ res = statx(dirfd, pathname, flags, STATX_BASIC_STATS | STATX_MNT_ID,
+ &statxbuf);
+ if (!res) {
+ memset(statbuf, 0, sizeof(*statbuf));
+ statbuf->st_dev = makedev(statxbuf.stx_dev_major,
+ statxbuf.stx_dev_minor);
+ statbuf->st_ino = statxbuf.stx_ino;
+ statbuf->st_mode = statxbuf.stx_mode;
+ statbuf->st_nlink = statxbuf.stx_nlink;
+ statbuf->st_uid = statxbuf.stx_uid;
+ statbuf->st_gid = statxbuf.stx_gid;
+ statbuf->st_rdev = makedev(statxbuf.stx_rdev_major,
+ statxbuf.stx_rdev_minor);
+ statbuf->st_size = statxbuf.stx_size;
+ statbuf->st_blksize = statxbuf.stx_blksize;
+ statbuf->st_blocks = statxbuf.stx_blocks;
+ statbuf->st_atim.tv_sec = statxbuf.stx_atime.tv_sec;
+ statbuf->st_atim.tv_nsec = statxbuf.stx_atime.tv_nsec;
+ statbuf->st_mtim.tv_sec = statxbuf.stx_mtime.tv_sec;
+ statbuf->st_mtim.tv_nsec = statxbuf.stx_mtime.tv_nsec;
+ statbuf->st_ctim.tv_sec = statxbuf.stx_ctime.tv_sec;
+ statbuf->st_ctim.tv_nsec = statxbuf.stx_ctime.tv_nsec;
+
+ /* the mount id is only valid if statx() says it filled it in */
+ if (statxbuf.stx_mask & STATX_MNT_ID) {
+ *mnt_id = statxbuf.stx_mnt_id;
+ } else {
+ *mnt_id = 0;
+ }
+ return 0;
+ } else if (errno != ENOSYS) {
+ return -1;
+ }
+ /* statx() is not available: stop trying it */
+ lo->use_statx = false;
+ /* fallback */
+ }
+#endif
+ res = fstatat(dirfd, pathname, statbuf, flags);
+ if (res == -1) {
+ return -1;
+ }
+ *mnt_id = 0;
+
+ return 0;
+}
+
+/*
+ * Look up @name in directory @parent and fill in @e for the reply.
+ *
+ * Increments nlookup on the inode on success. unref_inode_lolocked() must be
+ * called eventually to decrement nlookup again. If inodep is non-NULL, the
+ * inode pointer is stored and the caller must call lo_inode_put().
+ *
+ * Returns 0 on success and an errno value on failure (ENOENT when @parent
+ * is not a known inode).
+ */
+static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name,
+ struct fuse_entry_param *e,
+ struct lo_inode **inodep)
+{
+ int newfd;
+ int res;
+ int saverr;
+ uint64_t mnt_id;
+ struct lo_data *lo = lo_data(req);
+ struct lo_inode *inode = NULL;
+ struct lo_inode *dir = lo_inode(req, parent);
+
+ if (inodep) {
+ *inodep = NULL; /* in case there is an error */
+ }
+
+ /*
+ * name_to_handle_at() and open_by_handle_at() can reach here with fuse
+ * mount point in guest, but we don't have its inode info in the
+ * ino_map.
+ */
+ if (!dir) {
+ return ENOENT;
+ }
+
+ memset(e, 0, sizeof(*e));
+ e->attr_timeout = lo->timeout;
+ e->entry_timeout = lo->timeout;
+
+ /* Do not allow escaping root directory */
+ if (dir == &lo->root && strcmp(name, "..") == 0) {
+ name = ".";
+ }
+
+ newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW);
+ if (newfd == -1) {
+ goto out_err;
+ }
+
+ res = do_statx(lo, newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
+ &mnt_id);
+ if (res == -1) {
+ goto out_err;
+ }
+
+ /* a directory on another device or mount than its parent is a submount */
+ if (S_ISDIR(e->attr.st_mode) && lo->announce_submounts &&
+ (e->attr.st_dev != dir->key.dev || mnt_id != dir->key.mnt_id)) {
+ e->attr_flags |= FUSE_ATTR_SUBMOUNT;
+ }
+
+ inode = lo_find(lo, &e->attr, mnt_id);
+ if (inode) {
+ /* already known: the existing inode keeps its own fd */
+ close(newfd);
+ } else {
+ inode = calloc(1, sizeof(struct lo_inode));
+ if (!inode) {
+ goto out_err;
+ }
+
+ /* cache only filetype */
+ inode->filetype = (e->attr.st_mode & S_IFMT);
+
+ /*
+ * One for the caller and one for nlookup (released in
+ * unref_inode_lolocked())
+ */
+ g_atomic_int_set(&inode->refcount, 2);
+
+ inode->nlookup = 1;
+ inode->fd = newfd;
+ inode->key.ino = e->attr.st_ino;
+ inode->key.dev = e->attr.st_dev;
+ inode->key.mnt_id = mnt_id;
+ if (lo->posix_lock) {
+ pthread_mutex_init(&inode->plock_mutex, NULL);
+ inode->posix_locks = g_hash_table_new_full(
+ g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
+ }
+ pthread_mutex_lock(&lo->mutex);
+ /*
+ * NOTE(review): lo_add_inode_mapping() returns -1 when no slot can be
+ * allocated; that result is stored in fuse_ino unchecked here.
+ */
+ inode->fuse_ino = lo_add_inode_mapping(req, inode);
+ g_hash_table_insert(lo->inodes, &inode->key, inode);
+ pthread_mutex_unlock(&lo->mutex);
+ }
+ e->ino = inode->fuse_ino;
+
+ /* Transfer ownership of inode pointer to caller or drop it */
+ if (inodep) {
+ *inodep = inode;
+ } else {
+ lo_inode_put(lo, &inode);
+ }
+
+ lo_inode_put(lo, &dir);
+
+ fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
+ name, (unsigned long long)e->ino);
+
+ return 0;
+
+out_err:
+ /* save errno first: the cleanup calls below may overwrite it */
+ saverr = errno;
+ if (newfd != -1) {
+ close(newfd);
+ }
+ lo_inode_put(lo, &inode);
+ lo_inode_put(lo, &dir);
+ return saverr;
+}
+
+/*
+ * Handle a lookup of @name in directory @parent: reply with the entry on
+ * success and with an error otherwise.  An empty name yields ENOENT and a
+ * name containing '/' yields EINVAL.
+ */
+static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name)
+{
+    struct fuse_entry_param entry;
+    int rc;
+
+    fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent,
+             name);
+
+    if (is_empty(name)) {
+        fuse_reply_err(req, ENOENT);
+        return;
+    }
+
+    /*
+     * Don't use is_safe_path_component(), allow "." and ".." for NFS export
+     * support.
+     */
+    if (strchr(name, '/')) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    rc = lo_do_lookup(req, parent, name, &entry, NULL);
+    if (rc == 0) {
+        fuse_reply_entry(req, &entry);
+        return;
+    }
+
+    fuse_reply_err(req, rc);
+}
+
+/*
+ * On some archs, setres*id is limited to 2^16 but they
+ * provide setres*id32 variants that allow 2^32.
+ * Others just let setres*id do 2^32 anyway.
+ */
+#ifdef SYS_setresgid32
+#define OURSYS_setresgid SYS_setresgid32
+#else
+#define OURSYS_setresgid SYS_setresgid
+#endif
+
+#ifdef SYS_setresuid32
+#define OURSYS_setresuid SYS_setresuid32
+#else
+#define OURSYS_setresuid SYS_setresuid
+#endif
+
+/*
+ * Change to uid/gid of caller so that file is created with
+ * ownership of caller.  The previous effective ids are saved in @old and,
+ * when @change_umask is set, the umask is switched to the caller's as
+ * well, with the previous one saved in @old too.
+ *
+ * Returns 0 on success and an errno value on failure.  If switching the
+ * uid fails, the gid is switched back first.
+ * TODO: What about selinux context?
+ */
+static int lo_change_cred(fuse_req_t req, struct lo_cred *old,
+                          bool change_umask)
+{
+    int saved_errno;
+
+    old->euid = geteuid();
+    old->egid = getegid();
+
+    if (syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1) == -1) {
+        return errno;
+    }
+
+    if (syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1) == -1) {
+        saved_errno = errno;
+
+        /* Undo the group switch; errno was captured above */
+        syscall(OURSYS_setresgid, -1, old->egid, -1);
+        return saved_errno;
+    }
+
+    if (change_umask) {
+        old->umask = umask(req->ctx.umask);
+    }
+
+    return 0;
+}
+
+/*
+ * Regain privileges: restore the effective uid and gid saved in @old and,
+ * when @restore_umask is set, the saved umask as well.  Failing to
+ * restore either id is logged and terminates the process.
+ */
+static void lo_restore_cred(struct lo_cred *old, bool restore_umask)
+{
+    if (syscall(OURSYS_setresuid, -1, old->euid, -1) == -1) {
+        fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid);
+        exit(1);
+    }
+
+    if (syscall(OURSYS_setresgid, -1, old->egid, -1) == -1) {
+        fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid);
+        exit(1);
+    }
+
+    if (restore_umask) {
+        umask(old->umask);
+    }
+}
+
+/*
+ * Drop the effective capability @cap_name, then switch credentials to
+ * those of the request (see lo_change_cred()).  If the credential switch
+ * fails, an attempt is made to regain a capability that was dropped.
+ *
+ * When @cap_dropped is non-NULL it receives whether the capability was
+ * dropped.  Returns 0 on success and errno on error.
+ */
+static int lo_drop_cap_change_cred(fuse_req_t req, struct lo_cred *old,
+                                   bool change_umask, const char *cap_name,
+                                   bool *cap_dropped)
+{
+    bool dropped;
+    int ret;
+
+    assert(cap_name);
+
+    ret = drop_effective_cap(cap_name, &dropped);
+    if (ret) {
+        return ret;
+    }
+
+    ret = lo_change_cred(req, old, change_umask);
+    if (ret && dropped && gain_effective_cap(cap_name)) {
+        fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name);
+    }
+
+    if (cap_dropped) {
+        *cap_dropped = dropped;
+    }
+    return ret;
+}
+
+/*
+ * Restore the credentials saved in @old (see lo_restore_cred()) and then
+ * try to regain the effective capability @cap_name; a failure to regain
+ * it is only logged.
+ */
+static void lo_restore_cred_gain_cap(struct lo_cred *old, bool restore_umask,
+                                     const char *cap_name)
+{
+    assert(cap_name);
+
+    lo_restore_cred(old, restore_umask);
+
+    if (gain_effective_cap(cap_name) != 0) {
+        fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name);
+    }
+}
+
+/*
+ * Common implementation of mknod, mkdir and symlink: create @name inside
+ * directory @parent with the requester's credentials, then look the new
+ * entry up and reply with it.  @link is handed to mknod_wrapper() as is.
+ */
+static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent,
+                             const char *name, mode_t mode, dev_t rdev,
+                             const char *link)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_cred old = {};
+    struct fuse_entry_param e;
+    struct lo_inode *dir;
+    bool switch_umask;
+    int saverr;
+    int res;
+
+    if (is_empty(name)) {
+        fuse_reply_err(req, ENOENT);
+        return;
+    }
+
+    if (!is_safe_path_component(name)) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    dir = lo_inode(req, parent);
+    if (dir == NULL) {
+        fuse_reply_err(req, EBADF);
+        return;
+    }
+
+    /* The umask is only switched for nodes that are not symlinks */
+    switch_umask = lo->change_umask && !S_ISLNK(mode);
+
+    saverr = lo_change_cred(req, &old, switch_umask);
+    if (saverr) {
+        goto out;
+    }
+
+    res = mknod_wrapper(dir->fd, name, link, mode, rdev);
+    /* Capture errno before restoring credentials can overwrite it */
+    saverr = errno;
+
+    lo_restore_cred(&old, switch_umask);
+
+    if (res == -1) {
+        goto out;
+    }
+
+    saverr = lo_do_lookup(req, parent, name, &e, NULL);
+    if (saverr) {
+        goto out;
+    }
+
+    fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
+             name, (unsigned long long)e.ino);
+
+    fuse_reply_entry(req, &e);
+    lo_inode_put(lo, &dir);
+    return;
+
+out:
+    lo_inode_put(lo, &dir);
+    fuse_reply_err(req, saverr);
+}
+
+/* FUSE mknod handler: forwards to lo_mknod_symlink() with no link target. */
+static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name,
+ mode_t mode, dev_t rdev)
+{
+ lo_mknod_symlink(req, parent, name, mode, rdev, NULL);
+}
+
+/*
+ * FUSE mkdir handler: forwards to lo_mknod_symlink() with S_IFDIR or'ed
+ * into @mode, a zero device number and no link target.
+ */
+static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name,
+ mode_t mode)
+{
+ lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL);
+}
+
+/*
+ * FUSE symlink handler: forwards to lo_mknod_symlink() with mode S_IFLNK,
+ * a zero device number and @link as the link target.
+ */
+static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent,
+ const char *name)
+{
+ lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link);
+}
+
+/*
+ * FUSE link handler: create a hard link called @name in directory @parent
+ * that refers to inode @ino, and reply with the entry for that inode.
+ *
+ * The link source is addressed by the decimal number of the inode's fd,
+ * resolved relative to lo->proc_self_fd with AT_SYMLINK_FOLLOW.  On
+ * success the inode's nlookup count is incremented under lo->mutex
+ * because an entry is handed back to the client.
+ */
+static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent,
+                    const char *name)
+{
+    int res;
+    struct lo_data *lo = lo_data(req);
+    struct lo_inode *parent_inode;
+    struct lo_inode *inode;
+    struct fuse_entry_param e;
+    char procname[64];
+    int saverr;
+
+    if (is_empty(name)) {
+        fuse_reply_err(req, ENOENT);
+        return;
+    }
+
+    if (!is_safe_path_component(name)) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    parent_inode = lo_inode(req, parent);
+    inode = lo_inode(req, ino);
+    if (!parent_inode || !inode) {
+        errno = EBADF;
+        goto out_err;
+    }
+
+    memset(&e, 0, sizeof(struct fuse_entry_param));
+    e.attr_timeout = lo->timeout;
+    e.entry_timeout = lo->timeout;
+
+    /* Bounded formatting so the write can never run past procname */
+    snprintf(procname, sizeof(procname), "%i", inode->fd);
+    res = linkat(lo->proc_self_fd, procname, parent_inode->fd, name,
+                 AT_SYMLINK_FOLLOW);
+    if (res == -1) {
+        goto out_err;
+    }
+
+    res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
+    if (res == -1) {
+        goto out_err;
+    }
+
+    pthread_mutex_lock(&lo->mutex);
+    inode->nlookup++;
+    pthread_mutex_unlock(&lo->mutex);
+    e.ino = inode->fuse_ino;
+
+    fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent,
+             name, (unsigned long long)e.ino);
+
+    fuse_reply_entry(req, &e);
+    lo_inode_put(lo, &parent_inode);
+    lo_inode_put(lo, &inode);
+    return;
+
+out_err:
+    /* Save errno first: the puts below are not known to preserve it */
+    saverr = errno;
+    lo_inode_put(lo, &parent_inode);
+    lo_inode_put(lo, &inode);
+    fuse_reply_err(req, saverr);
+}
+
+/*
+ * Stat @name inside directory @parent and return the result of lo_find()
+ * for it, or NULL if the parent is unknown or the stat fails.
+ *
+ * Increments nlookup and caller must release refcount using lo_inode_put()
+ */
+static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent,
+                                    const char *name)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_inode *dir = lo_inode(req, parent);
+    struct stat st;
+    uint64_t mnt_id;
+    int rc;
+
+    if (dir == NULL) {
+        return NULL;
+    }
+
+    rc = do_statx(lo, dir->fd, name, &st, AT_SYMLINK_NOFOLLOW, &mnt_id);
+    lo_inode_put(lo, &dir);
+
+    return rc == -1 ? NULL : lo_find(lo, &st, mnt_id);
+}
+
+/*
+ * FUSE rmdir handler: remove directory @name from @parent.  The inode is
+ * looked up first; one lookup count is dropped on it after the reply has
+ * been sent.
+ */
+static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_inode *victim;
+    int rc;
+
+    if (is_empty(name)) {
+        fuse_reply_err(req, ENOENT);
+        return;
+    }
+
+    if (!is_safe_path_component(name)) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    victim = lookup_name(req, parent, name);
+    if (victim == NULL) {
+        fuse_reply_err(req, EIO);
+        return;
+    }
+
+    rc = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
+    if (rc == -1) {
+        fuse_reply_err(req, errno);
+    } else {
+        fuse_reply_err(req, 0);
+    }
+
+    unref_inode_lolocked(lo, victim, 1);
+    lo_inode_put(lo, &victim);
+}
+
+/*
+ * FUSE rename handler: rename @name in @parent to @newname in @newparent.
+ *
+ * A non-zero @flags goes through the renameat2 syscall when SYS_renameat2
+ * is defined at build time; without it, or when the kernel reports ENOSYS,
+ * the reply is EINVAL.  With @flags == 0 plain renameat() is used.
+ *
+ * Every path, including the error paths, ends at the "out" label, which
+ * drops one lookup count on the old and new inodes and releases all four
+ * inode references taken here.
+ */
+static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name,
+ fuse_ino_t newparent, const char *newname,
+ unsigned int flags)
+{
+ int res;
+ struct lo_inode *parent_inode;
+ struct lo_inode *newparent_inode;
+ struct lo_inode *oldinode = NULL;
+ struct lo_inode *newinode = NULL;
+ struct lo_data *lo = lo_data(req);
+
+ if (is_empty(name) || is_empty(newname)) {
+ fuse_reply_err(req, ENOENT);
+ return;
+ }
+
+ if (!is_safe_path_component(name) || !is_safe_path_component(newname)) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ parent_inode = lo_inode(req, parent);
+ newparent_inode = lo_inode(req, newparent);
+ if (!parent_inode || !newparent_inode) {
+ fuse_reply_err(req, EBADF);
+ goto out;
+ }
+
+ oldinode = lookup_name(req, parent, name);
+ newinode = lookup_name(req, newparent, newname);
+
+ /* Only the source must resolve; newinode may be NULL */
+ if (!oldinode) {
+ fuse_reply_err(req, EIO);
+ goto out;
+ }
+
+ if (flags) {
+#ifndef SYS_renameat2
+ fuse_reply_err(req, EINVAL);
+#else
+ res = syscall(SYS_renameat2, parent_inode->fd, name,
+ newparent_inode->fd, newname, flags);
+ if (res == -1 && errno == ENOSYS) {
+ fuse_reply_err(req, EINVAL);
+ } else {
+ fuse_reply_err(req, res == -1 ? errno : 0);
+ }
+#endif
+ goto out;
+ }
+
+ res = renameat(parent_inode->fd, name, newparent_inode->fd, newname);
+
+ fuse_reply_err(req, res == -1 ? errno : 0);
+out:
+ unref_inode_lolocked(lo, oldinode, 1);
+ unref_inode_lolocked(lo, newinode, 1);
+ lo_inode_put(lo, &oldinode);
+ lo_inode_put(lo, &newinode);
+ lo_inode_put(lo, &parent_inode);
+ lo_inode_put(lo, &newparent_inode);
+}
+
+/*
+ * FUSE unlink handler: remove the non-directory entry @name from @parent.
+ * The inode is looked up first; one lookup count is dropped on it after
+ * the reply has been sent.
+ */
+static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_inode *victim;
+    int rc;
+
+    if (is_empty(name)) {
+        fuse_reply_err(req, ENOENT);
+        return;
+    }
+
+    if (!is_safe_path_component(name)) {
+        fuse_reply_err(req, EINVAL);
+        return;
+    }
+
+    victim = lookup_name(req, parent, name);
+    if (victim == NULL) {
+        fuse_reply_err(req, EIO);
+        return;
+    }
+
+    rc = unlinkat(lo_fd(req, parent), name, 0);
+    if (rc == -1) {
+        fuse_reply_err(req, errno);
+    } else {
+        fuse_reply_err(req, 0);
+    }
+
+    unref_inode_lolocked(lo, victim, 1);
+    lo_inode_put(lo, &victim);
+}
+
+/*
+ * Subtract @n from the inode's lookup count.  When the count reaches zero
+ * the inode is removed from the ino map and the inode hash table, its
+ * posix-lock state is torn down (if posix locks are enabled) and one
+ * reference is released.  A NULL @inode is ignored.
+ *
+ * To be called with lo->mutex held
+ */
+static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n)
+{
+    if (inode == NULL) {
+        return;
+    }
+
+    assert(inode->nlookup >= n);
+    inode->nlookup -= n;
+    if (inode->nlookup != 0) {
+        return;
+    }
+
+    lo_map_remove(&lo->ino_map, inode->fuse_ino);
+    g_hash_table_remove(lo->inodes, &inode->key);
+    if (lo->posix_lock) {
+        if (g_hash_table_size(inode->posix_locks) != 0) {
+            fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n");
+        }
+        g_hash_table_destroy(inode->posix_locks);
+        pthread_mutex_destroy(&inode->plock_mutex);
+    }
+    /* Drop our refcount from lo_do_lookup() */
+    lo_inode_put(lo, &inode);
+}
+
+/* Same as unref_inode(), but takes and releases lo->mutex around the call */
+static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode,
+                                 uint64_t n)
+{
+    if (inode != NULL) {
+        pthread_mutex_lock(&lo->mutex);
+        unref_inode(lo, inode, n);
+        pthread_mutex_unlock(&lo->mutex);
+    }
+}
+
+/*
+ * Drop @nlookup lookup counts from inode @ino.  Unknown inode numbers are
+ * ignored.  No reply is sent; that is left to the caller.
+ */
+static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_inode *target = lo_inode(req, ino);
+
+    if (target == NULL) {
+        return;
+    }
+
+    fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n",
+             (unsigned long long)ino, (unsigned long long)target->nlookup,
+             (unsigned long long)nlookup);
+
+    unref_inode_lolocked(lo, target, nlookup);
+    lo_inode_put(lo, &target);
+}
+
+/*
+ * FUSE forget handler: drop @nlookup lookup counts from @ino via
+ * lo_forget_one(), then complete the request with fuse_reply_none().
+ */
+static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup)
+{
+ lo_forget_one(req, ino, nlookup);
+ fuse_reply_none(req);
+}
+
+/*
+ * FUSE batch-forget handler: apply lo_forget_one() to each of the @count
+ * entries in @forgets, then complete the request with fuse_reply_none().
+ */
+static void lo_forget_multi(fuse_req_t req, size_t count,
+                            struct fuse_forget_data *forgets)
+{
+    /* Same type as @count: avoids a signed/unsigned comparison */
+    size_t i;
+
+    for (i = 0; i < count; i++) {
+        lo_forget_one(req, forgets[i].ino, forgets[i].nlookup);
+    }
+    fuse_reply_none(req);
+}
+
+/*
+ * FUSE readlink handler: read the target of the symlink behind @ino and
+ * reply with it.  A result that fills the whole buffer is treated as
+ * truncated and answered with ENAMETOOLONG.
+ */
+static void lo_readlink(fuse_req_t req, fuse_ino_t ino)
+{
+    char target[PATH_MAX + 1];
+    int len;
+
+    len = readlinkat(lo_fd(req, ino), "", target, sizeof(target));
+    if (len == -1) {
+        fuse_reply_err(req, errno);
+        return;
+    }
+
+    if (len == sizeof(target)) {
+        fuse_reply_err(req, ENAMETOOLONG);
+        return;
+    }
+
+    /* readlinkat() does not NUL-terminate */
+    target[len] = '\0';
+
+    fuse_reply_readlink(req, target);
+}
+
+struct lo_dirp {
+ gint refcount; /* set to 1 by lo_opendir(); closed and freed at zero */
+ DIR *dp; /* directory stream obtained with fdopendir() */
+ struct dirent *entry; /* entry read but not yet emitted, or NULL */
+ off_t offset; /* d_off of the last emitted entry (0 after open) */
+};
+
+/*
+ * Release one reference on *dp and set the caller's pointer to NULL.
+ * When the last reference goes away the directory stream is closed and
+ * the structure is freed.  A NULL *dp is ignored.
+ */
+static void lo_dirp_put(struct lo_dirp **dp)
+{
+    struct lo_dirp *dirp = *dp;
+
+    if (dirp == NULL) {
+        return;
+    }
+    *dp = NULL;
+
+    if (!g_atomic_int_dec_and_test(&dirp->refcount)) {
+        return;
+    }
+
+    closedir(dirp->dp);
+    free(dirp);
+}
+
+/*
+ * Look up the open directory for the file handle in @fi and return it
+ * with its reference count incremented, or NULL if the handle is unknown.
+ *
+ * Call lo_dirp_put() on the return value when no longer needed
+ */
+static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_map_elem *elem;
+    struct lo_dirp *d = NULL;
+
+    pthread_mutex_lock(&lo->mutex);
+    elem = lo_map_get(&lo->dirp_map, fi->fh);
+    if (elem) {
+        /*
+         * Fetch the pointer while the mutex is held: the map is changed
+         * under this mutex by lo_opendir() and lo_releasedir(), so elem
+         * must not be dereferenced after unlocking.
+         */
+        d = elem->dirp;
+        g_atomic_int_inc(&d->refcount);
+    }
+    pthread_mutex_unlock(&lo->mutex);
+
+    return d;
+}
+
+/*
+ * FUSE opendir handler: open the directory behind @ino, wrap it in a
+ * struct lo_dirp holding one reference, register it in the dirp map and
+ * reply with the resulting file handle.
+ */
+static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
+                       struct fuse_file_info *fi)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_dirp *dirp;
+    ssize_t handle;
+    int error = ENOMEM;
+    int fd = -1;
+
+    dirp = calloc(1, sizeof(struct lo_dirp));
+    if (!dirp) {
+        goto out_err;
+    }
+
+    fd = openat(lo_fd(req, ino), ".", O_RDONLY);
+    if (fd == -1) {
+        goto out_errno;
+    }
+
+    dirp->dp = fdopendir(fd);
+    if (!dirp->dp) {
+        goto out_errno;
+    }
+
+    dirp->offset = 0;
+    dirp->entry = NULL;
+
+    g_atomic_int_set(&dirp->refcount, 1); /* paired with lo_releasedir() */
+    pthread_mutex_lock(&lo->mutex);
+    handle = lo_add_dirp_mapping(req, dirp);
+    pthread_mutex_unlock(&lo->mutex);
+    if (handle == -1) {
+        /* error is still ENOMEM here */
+        goto out_err;
+    }
+
+    fi->fh = handle;
+    if (lo->cache == CACHE_ALWAYS) {
+        fi->cache_readdir = 1;
+    }
+    fuse_reply_open(req, fi);
+    return;
+
+out_errno:
+    error = errno;
+out_err:
+    if (dirp) {
+        if (dirp->dp) {
+            /* the stream owns fd once fdopendir() has succeeded */
+            closedir(dirp->dp);
+        } else if (fd != -1) {
+            close(fd);
+        }
+        free(dirp);
+    }
+    fuse_reply_err(req, error);
+}
+
+/*
+ * Shared implementation of readdir and readdirplus.
+ *
+ * Fills a buffer of at most @size bytes with directory entries starting
+ * at @offset, read from the directory stream behind the handle in @fi.
+ * With @plus set, each entry other than "." and ".." is also looked up
+ * via lo_do_lookup() and emitted with fuse_add_direntry_plus().
+ *
+ * An entry that does not fit is kept in d->entry so it is not re-read
+ * from the stream on the next call with a matching offset.
+ */
+static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
+ off_t offset, struct fuse_file_info *fi, int plus)
+{
+ struct lo_data *lo = lo_data(req);
+ struct lo_dirp *d = NULL;
+ struct lo_inode *dinode;
+ g_autofree char *buf = NULL;
+ char *p;
+ size_t rem = size;
+ int err = EBADF;
+
+ dinode = lo_inode(req, ino);
+ if (!dinode) {
+ goto error;
+ }
+
+ d = lo_dirp(req, fi);
+ if (!d) {
+ goto error;
+ }
+
+ err = ENOMEM;
+ buf = g_try_malloc0(size);
+ if (!buf) {
+ goto error;
+ }
+ p = buf;
+
+ /* Reposition the stream if the client asks for a different offset */
+ if (offset != d->offset) {
+ seekdir(d->dp, offset);
+ d->entry = NULL;
+ d->offset = offset;
+ }
+ while (1) {
+ size_t entsize;
+ off_t nextoff;
+ const char *name;
+
+ if (!d->entry) {
+ /* errno is cleared to tell end-of-stream from a readdir error */
+ errno = 0;
+ d->entry = readdir(d->dp);
+ if (!d->entry) {
+ if (errno) { /* Error */
+ err = errno;
+ goto error;
+ } else { /* End of stream */
+ break;
+ }
+ }
+ }
+ nextoff = d->entry->d_off;
+ name = d->entry->d_name;
+
+ fuse_ino_t entry_ino = 0;
+ struct fuse_entry_param e = (struct fuse_entry_param){
+ .attr.st_ino = d->entry->d_ino,
+ .attr.st_mode = d->entry->d_type << 12,
+ };
+
+ /* Hide root's parent directory */
+ if (dinode == &lo->root && strcmp(name, "..") == 0) {
+ e.attr.st_ino = lo->root.key.ino;
+ e.attr.st_mode = DT_DIR << 12;
+ }
+
+ if (plus) {
+ if (!is_dot_or_dotdot(name)) {
+ err = lo_do_lookup(req, ino, name, &e, NULL);
+ if (err) {
+ goto error;
+ }
+ entry_ino = e.ino;
+ }
+
+ entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff);
+ } else {
+ entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff);
+ }
+ if (entsize > rem) {
+ /* Entry did not fit: undo the lookup taken for it above */
+ if (entry_ino != 0) {
+ lo_forget_one(req, entry_ino, 1);
+ }
+ break;
+ }
+
+ p += entsize;
+ rem -= entsize;
+
+ d->entry = NULL;
+ d->offset = nextoff;
+ }
+
+ err = 0;
+error:
+ lo_dirp_put(&d);
+ lo_inode_put(lo, &dinode);
+
+ /*
+ * If there's an error, we can only signal it if we haven't stored
+ * any entries yet - otherwise we'd end up with wrong lookup
+ * counts for the entries that are already in the buffer. So we
+ * return what we've collected until that point.
+ */
+ if (err && rem == size) {
+ fuse_reply_err(req, err);
+ } else {
+ fuse_reply_buf(req, buf, size - rem);
+ }
+}
+
+/* FUSE readdir handler: lo_do_readdir() with plus == 0. */
+static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size,
+ off_t offset, struct fuse_file_info *fi)
+{
+ lo_do_readdir(req, ino, size, offset, fi, 0);
+}
+
+/* FUSE readdirplus handler: lo_do_readdir() with plus == 1. */
+static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size,
+ off_t offset, struct fuse_file_info *fi)
+{
+ lo_do_readdir(req, ino, size, offset, fi, 1);
+}
+
+/*
+ * FUSE releasedir handler: remove the directory handle in @fi from the
+ * dirp map and drop the reference that lo_opendir() took.  Replies EBADF
+ * for an unknown handle.
+ */
+static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
+                          struct fuse_file_info *fi)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_map_elem *slot;
+    struct lo_dirp *dirp;
+
+    (void)ino;
+
+    pthread_mutex_lock(&lo->mutex);
+    slot = lo_map_get(&lo->dirp_map, fi->fh);
+    if (slot == NULL) {
+        pthread_mutex_unlock(&lo->mutex);
+        fuse_reply_err(req, EBADF);
+        return;
+    }
+
+    /* Take the pointer out before the slot is removed from the map */
+    dirp = slot->dirp;
+    lo_map_remove(&lo->dirp_map, fi->fh);
+    pthread_mutex_unlock(&lo->mutex);
+
+    lo_dirp_put(&dirp); /* paired with lo_opendir() */
+
+    fuse_reply_err(req, 0);
+}
+
+/*
+ * Adjust the open flags in @fi according to the writeback-cache and
+ * direct-I/O settings before they are passed to the host.
+ */
+static void update_open_flags(int writeback, int allow_direct_io,
+                              struct fuse_file_info *fi)
+{
+    if (writeback) {
+        /*
+         * With writeback cache, kernel may send read requests even
+         * when userspace opened write-only
+         */
+        if ((fi->flags & O_ACCMODE) == O_WRONLY) {
+            fi->flags &= ~O_ACCMODE;
+            fi->flags |= O_RDWR;
+        }
+
+        /*
+         * With writeback cache, O_APPEND is handled by the kernel.
+         * This breaks atomicity (since the file may change in the
+         * underlying filesystem, so that the kernel's idea of the
+         * end of the file isn't accurate anymore). In this example,
+         * we just accept that. A more rigorous filesystem may want
+         * to return an error here
+         */
+        if (fi->flags & O_APPEND) {
+            fi->flags &= ~O_APPEND;
+        }
+    }
+
+    /*
+     * O_DIRECT in guest should not necessarily mean bypassing page
+     * cache on host as well. Therefore, we discard it by default
+     * ('-o no_allow_direct_io'). If somebody needs that behavior,
+     * the '-o allow_direct_io' option should be set.
+     */
+    if (!allow_direct_io) {
+        fi->flags &= ~O_DIRECT;
+    }
+}
+
+/*
+ * Open a regular file, set up an fd mapping, and fill out the struct
+ * fuse_file_info for it. If existing_fd is not negative, use that fd instead
+ * of opening a new one. Takes ownership of existing_fd.
+ *
+ * When a new fd is opened and both lo->killpriv_v2 and fi->kill_priv are
+ * set, CAP_FSETID is dropped around the open.  If the open flags contain
+ * O_TRUNC, drop_security_capability() is applied to the new fd.
+ *
+ * Returns 0 on success or a positive errno.
+ */
+static int lo_do_open(struct lo_data *lo, struct lo_inode *inode,
+                      int existing_fd, struct fuse_file_info *fi)
+{
+    ssize_t fh;
+    int fd = existing_fd;
+    int err;
+    bool cap_fsetid_dropped = false;
+    bool kill_suidgid = lo->killpriv_v2 && fi->kill_priv;
+
+    update_open_flags(lo->writeback, lo->allow_direct_io, fi);
+
+    if (fd < 0) {
+        if (kill_suidgid) {
+            err = drop_effective_cap("FSETID", &cap_fsetid_dropped);
+            if (err) {
+                return err;
+            }
+        }
+
+        fd = lo_inode_open(lo, inode, fi->flags);
+
+        if (cap_fsetid_dropped) {
+            if (gain_effective_cap("FSETID")) {
+                fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
+            }
+        }
+        if (fd < 0) {
+            /* lo_inode_open() reports failure as a negative errno */
+            return -fd;
+        }
+        if (fi->flags & (O_TRUNC)) {
+            /* Reuse the function-level err rather than shadowing it */
+            err = drop_security_capability(lo, fd);
+            if (err) {
+                close(fd);
+                return err;
+            }
+        }
+    }
+
+    pthread_mutex_lock(&lo->mutex);
+    fh = lo_add_fd_mapping(lo, fd);
+    pthread_mutex_unlock(&lo->mutex);
+    if (fh == -1) {
+        close(fd);
+        return ENOMEM;
+    }
+
+    fi->fh = fh;
+    if (lo->cache == CACHE_NONE) {
+        fi->direct_io = 1;
+    } else if (lo->cache == CACHE_ALWAYS) {
+        fi->keep_cache = 1;
+    }
+    return 0;
+}
+
+/*
+ * FUSE create handler: create and open @name inside @parent.
+ *
+ * The file is first created exclusively (O_CREAT | O_EXCL) with the
+ * requester's credentials.  If it already exists and the client did not
+ * ask for O_EXCL, the EEXIST error is ignored and the existing file is
+ * opened by lo_do_open() instead (fd is still -1 at that point).
+ */
+static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name,
+ mode_t mode, struct fuse_file_info *fi)
+{
+ int fd = -1;
+ struct lo_data *lo = lo_data(req);
+ struct lo_inode *parent_inode;
+ struct lo_inode *inode = NULL;
+ struct fuse_entry_param e;
+ int err;
+ struct lo_cred old = {};
+
+ fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)"
+ " kill_priv=%d\n", parent, name, fi->kill_priv);
+
+ if (!is_safe_path_component(name)) {
+ fuse_reply_err(req, EINVAL);
+ return;
+ }
+
+ parent_inode = lo_inode(req, parent);
+ if (!parent_inode) {
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ err = lo_change_cred(req, &old, lo->change_umask);
+ if (err) {
+ goto out;
+ }
+
+ update_open_flags(lo->writeback, lo->allow_direct_io, fi);
+
+ /* Try to create a new file but don't open existing files */
+ fd = openat(parent_inode->fd, name, fi->flags | O_CREAT | O_EXCL, mode);
+ /* errno is captured before lo_restore_cred() can overwrite it */
+ err = fd == -1 ? errno : 0;
+
+ lo_restore_cred(&old, lo->change_umask);
+
+ /* Ignore the error if file exists and O_EXCL was not given */
+ if (err && (err != EEXIST || (fi->flags & O_EXCL))) {
+ goto out;
+ }
+
+ err = lo_do_lookup(req, parent, name, &e, &inode);
+ if (err) {
+ goto out;
+ }
+
+ err = lo_do_open(lo, inode, fd, fi);
+ fd = -1; /* lo_do_open() takes ownership of fd */
+ if (err) {
+ /* Undo lo_do_lookup() nlookup ref */
+ unref_inode_lolocked(lo, inode, 1);
+ }
+
+out:
+ lo_inode_put(lo, &inode);
+ lo_inode_put(lo, &parent_inode);
+
+ if (err) {
+ /* fd is still owned here only if lo_do_open() was never reached */
+ if (fd >= 0) {
+ close(fd);
+ }
+
+ fuse_reply_err(req, err);
+ } else {
+ fuse_reply_create(req, &e, fi);
+ }
+}
+
+/*
+ * Return the posix-lock context of @inode for @lock_owner, creating it
+ * (together with a separate O_RDWR open of the file for OFD locks) when
+ * none exists yet.  On failure NULL is returned and *err holds a positive
+ * errno.  @pid is not used by this function.
+ *
+ * Should be called with inode->plock_mutex held
+ */
+static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo,
+                                                      struct lo_inode *inode,
+                                                      uint64_t lock_owner,
+                                                      pid_t pid, int *err)
+{
+    struct lo_inode_plock *ctx;
+    int ofd;
+
+    ctx = g_hash_table_lookup(inode->posix_locks,
+                              GUINT_TO_POINTER(lock_owner));
+    if (ctx != NULL) {
+        return ctx;
+    }
+
+    ctx = malloc(sizeof(struct lo_inode_plock));
+    if (ctx == NULL) {
+        *err = ENOMEM;
+        return NULL;
+    }
+
+    /* Open another instance of file which can be used for ofd locks. */
+    /* TODO: What if file is not writable? */
+    ofd = lo_inode_open(lo, inode, O_RDWR);
+    if (ofd < 0) {
+        *err = -ofd;
+        free(ctx);
+        return NULL;
+    }
+
+    ctx->lock_owner = lock_owner;
+    ctx->fd = ofd;
+    g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(ctx->lock_owner),
+                        ctx);
+    return ctx;
+}
+
+/*
+ * FUSE getlk handler: query a conflicting lock for @lock using an OFD
+ * lock on the per-owner fd of inode @ino.  Replies ENOSYS when posix
+ * locks are disabled.
+ */
+static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
+                     struct flock *lock)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_inode_plock *plock;
+    struct lo_inode *inode;
+    int saverr = 0;
+    int ret;
+
+    fuse_log(FUSE_LOG_DEBUG,
+             "lo_getlk(ino=%" PRIu64 ", flags=%d)"
+             " owner=0x%" PRIx64 ", l_type=%d l_start=0x%" PRIx64
+             " l_len=0x%" PRIx64 "\n",
+             ino, fi->flags, fi->lock_owner, lock->l_type,
+             (uint64_t)lock->l_start, (uint64_t)lock->l_len);
+
+    if (!lo->posix_lock) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    inode = lo_inode(req, ino);
+    if (inode == NULL) {
+        fuse_reply_err(req, EBADF);
+        return;
+    }
+
+    pthread_mutex_lock(&inode->plock_mutex);
+    plock = lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid,
+                                    &ret);
+    if (plock == NULL) {
+        saverr = ret;
+    } else if (fcntl(plock->fd, F_OFD_GETLK, lock) == -1) {
+        saverr = errno;
+    }
+    pthread_mutex_unlock(&inode->plock_mutex);
+    lo_inode_put(lo, &inode);
+
+    if (saverr) {
+        fuse_reply_err(req, saverr);
+    } else {
+        fuse_reply_lock(req, lock);
+    }
+}
+
+/*
+ * FUSE setlk handler: apply @lock as an OFD lock on the per-owner fd of
+ * inode @ino.  Replies ENOSYS when posix locks are disabled and
+ * EOPNOTSUPP for blocking requests (@sleep set).
+ */
+static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
+                     struct flock *lock, int sleep)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_inode_plock *plock;
+    struct lo_inode *inode;
+    int saverr = 0;
+    int ret;
+
+    fuse_log(FUSE_LOG_DEBUG,
+             "lo_setlk(ino=%" PRIu64 ", flags=%d)"
+             " cmd=%d pid=%d owner=0x%" PRIx64 " sleep=%d l_whence=%d"
+             " l_start=0x%" PRIx64 " l_len=0x%" PRIx64 "\n",
+             ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep,
+             lock->l_whence, (uint64_t)lock->l_start, (uint64_t)lock->l_len);
+
+    if (!lo->posix_lock) {
+        fuse_reply_err(req, ENOSYS);
+        return;
+    }
+
+    if (sleep) {
+        fuse_reply_err(req, EOPNOTSUPP);
+        return;
+    }
+
+    inode = lo_inode(req, ino);
+    if (inode == NULL) {
+        fuse_reply_err(req, EBADF);
+        return;
+    }
+
+    pthread_mutex_lock(&inode->plock_mutex);
+    plock = lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid,
+                                    &ret);
+    if (plock == NULL) {
+        saverr = ret;
+    } else {
+        /* TODO: Is it alright to modify flock? */
+        lock->l_pid = 0;
+        if (fcntl(plock->fd, F_OFD_SETLK, lock) == -1) {
+            saverr = errno;
+        }
+    }
+    pthread_mutex_unlock(&inode->plock_mutex);
+    lo_inode_put(lo, &inode);
+
+    fuse_reply_err(req, saverr);
+}
+
+/*
+ * FUSE fsyncdir handler: flush the directory behind the handle in @fi,
+ * with fdatasync() when @datasync is set and fsync() otherwise.
+ */
+static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync,
+                        struct fuse_file_info *fi)
+{
+    int res;
+    int saverr;
+    struct lo_dirp *d;
+    int fd;
+
+    (void)ino;
+
+    d = lo_dirp(req, fi);
+    if (!d) {
+        fuse_reply_err(req, EBADF);
+        return;
+    }
+
+    fd = dirfd(d->dp);
+    if (datasync) {
+        res = fdatasync(fd);
+    } else {
+        res = fsync(fd);
+    }
+
+    /*
+     * Save errno now: lo_dirp_put() may call closedir() and free() when
+     * it drops the last reference, either of which can overwrite errno.
+     */
+    saverr = res == -1 ? errno : 0;
+
+    lo_dirp_put(&d);
+
+    fuse_reply_err(req, saverr);
+}
+
+/*
+ * FUSE open handler: open the file behind @ino through lo_do_open() and
+ * reply with the filled-in file info, or with an errno on failure.
+ */
+static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_inode *target = lo_inode(req, ino);
+    int rc;
+
+    fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d, kill_priv=%d)"
+             "\n", ino, fi->flags, fi->kill_priv);
+
+    if (target == NULL) {
+        fuse_reply_err(req, EBADF);
+        return;
+    }
+
+    /* -1: no existing fd, lo_do_open() opens a new one */
+    rc = lo_do_open(lo, target, -1, fi);
+    lo_inode_put(lo, &target);
+
+    if (rc == 0) {
+        fuse_reply_open(req, fi);
+    } else {
+        fuse_reply_err(req, rc);
+    }
+}
+
+/*
+ * FUSE release handler: remove the file handle in @fi from the fd map and
+ * close the fd it referred to.  The reply is always success, also when
+ * the handle is unknown.
+ */
+static void lo_release(fuse_req_t req, fuse_ino_t ino,
+                       struct fuse_file_info *fi)
+{
+    struct lo_data *lo = lo_data(req);
+    struct lo_map_elem *elem;
+    int fd = -1;
+
+    (void)ino;
+
+    pthread_mutex_lock(&lo->mutex);
+    elem = lo_map_get(&lo->fd_map, fi->fh);
+    if (elem) {
+        /* Copy the fd out before the slot is removed from the map */
+        fd = elem->fd;
+        lo_map_remove(&lo->fd_map, fi->fh);
+    }
+    pthread_mutex_unlock(&lo->mutex);
+
+    /* Nothing to close when the handle was not in the map */
+    if (fd >= 0) {
+        close(fd);
+    }
+    fuse_reply_err(req, 0);
+}
+
+/*
+ * FUSE flush handler: drop the posix-lock context of fi->lock_owner on
+ * inode @ino (when posix locks are enabled) and flush the open file by
+ * closing a duplicate of its fd.  Replies EBADF for unknown inodes and
+ * for inodes that are not regular files.
+ */
+static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
+{
+    int res;
+    int saverr;
+    struct lo_inode *inode;
+    struct lo_data *lo = lo_data(req);
+
+    inode = lo_inode(req, ino);
+    if (!inode) {
+        fuse_reply_err(req, EBADF);
+        return;
+    }
+
+    if (!S_ISREG(inode->filetype)) {
+        lo_inode_put(lo, &inode);
+        fuse_reply_err(req, EBADF);
+        return;
+    }
+
+    /* An fd is going away. Cleanup associated posix locks */
+    if (lo->posix_lock) {
+        pthread_mutex_lock(&inode->plock_mutex);
+        g_hash_table_remove(inode->posix_locks,
+                            GUINT_TO_POINTER(fi->lock_owner));
+        pthread_mutex_unlock(&inode->plock_mutex);
+    }
+    res = close(dup(lo_fi_fd(req, fi)));
+    /*
+     * Save errno before lo_inode_put(): that call is not known to
+     * preserve errno, so reading it afterwards could report a stale or
+     * unrelated error.
+     */
+    saverr = res == -1 ? errno : 0;
+    lo_inode_put(lo, &inode);
+    fuse_reply_err(req, saverr);
+}
+
+/*
+ * FUSE fsync handler: flush the file behind @ino.  With @fi the fd of the
+ * open handle is used; without it a temporary O_RDWR fd is opened for the
+ * duration of the call.
+ */
+static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync,
+                     struct fuse_file_info *fi)
+{
+    struct lo_inode *inode = lo_inode(req, ino);
+    struct lo_data *lo = lo_data(req);
+    bool temp_fd = (fi == NULL);
+    int sync_rc;
+    int res;
+    int fd;
+
+    fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino,
+             (void *)fi);
+
+    if (inode == NULL) {
+        fuse_reply_err(req, EBADF);
+        return;
+    }
+
+    if (temp_fd) {
+        fd = lo_inode_open(lo, inode, O_RDWR);
+        if (fd < 0) {
+            res = -fd;
+            goto out;
+        }
+    } else {
+        fd = lo_fi_fd(req, fi);
+    }
+
+    sync_rc = datasync ? fdatasync(fd) : fsync(fd);
+    res = sync_rc == -1 ? errno : 0;
+
+    if (temp_fd) {
+        close(fd);
+    }
+out:
+    lo_inode_put(lo, &inode);
+    fuse_reply_err(req, res);
+}
+
+/*
+ * FUSE read handler: reply with up to @size bytes read from the open file
+ * in @fi starting at @offset.  The data is described as an fd-backed
+ * buffer and handed to fuse_reply_data().
+ */
+static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset,
+                    struct fuse_file_info *fi)
+{
+    struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size);
+
+    /* size is a size_t, so it is printed with %zu rather than %zd */
+    fuse_log(FUSE_LOG_DEBUG,
+             "lo_read(ino=%" PRIu64 ", size=%zu, "
+             "off=%lu)\n",
+             ino, size, (unsigned long)offset);
+
+    buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
+    buf.buf[0].fd = lo_fi_fd(req, fi);
+    buf.buf[0].pos = offset;
+
+    fuse_reply_data(req, &buf);
+}
+
+/*
+ * FUSE write_buf handler: copy the data in @in_buf to the open file in
+ * @fi at offset @off and reply with the number of bytes written.
+ *
+ * drop_security_capability() is applied to the fd before the write, and
+ * CAP_FSETID is dropped around the copy when fi->kill_priv is set.
+ */
+static void lo_write_buf(fuse_req_t req, fuse_ino_t ino,
+ struct fuse_bufvec *in_buf, off_t off,
+ struct fuse_file_info *fi)
+{
+ (void)ino;
+ ssize_t res;
+ struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf));
+ bool cap_fsetid_dropped = false;
+
+ out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK;
+ out_buf.buf[0].fd = lo_fi_fd(req, fi);
+ out_buf.buf[0].pos = off;
+
+ fuse_log(FUSE_LOG_DEBUG,
+ "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu kill_priv=%d)\n",
+ ino, out_buf.buf[0].size, (unsigned long)off, fi->kill_priv);
+
+ res = drop_security_capability(lo_data(req), out_buf.buf[0].fd);
+ if (res) {
+ fuse_reply_err(req, res);
+ return;
+ }
+
+ /*
+ * If kill_priv is set, drop CAP_FSETID which should lead to kernel
+ * clearing setuid/setgid on file. Note, for WRITE, we need to do
+ * this even if killpriv_v2 is not enabled. fuse direct write path
+ * relies on this.
+ */
+ if (fi->kill_priv) {
+ res = drop_effective_cap("FSETID", &cap_fsetid_dropped);
+ if (res != 0) {
+ fuse_reply_err(req, res);
+ return;
+ }
+ }
+
+ /* A negative result from fuse_buf_copy() is a negated errno */
+ res = fuse_buf_copy(&out_buf, in_buf);
+ if (res < 0) {
+ fuse_reply_err(req, -res);
+ } else {
+ fuse_reply_write(req, (size_t)res);
+ }
+
+ /* The reply has already been sent; a failure here is only logged */
+ if (cap_fsetid_dropped) {
+ res = gain_effective_cap("FSETID");
+ if (res) {
+ fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n");
+ }
+ }
+}
+
+/*
+ * FUSE statfs handler: reply with the fstatvfs() result for the fd
+ * returned by lo_fd() for @ino.
+ */
+static void lo_statfs(fuse_req_t req, fuse_ino_t ino)
+{
+    struct statvfs vfs;
+
+    if (fstatvfs(lo_fd(req, ino), &vfs) == -1) {
+        fuse_reply_err(req, errno);
+        return;
+    }
+
+    fuse_reply_statfs(req, &vfs);
+}
+
+/*
+ * FUSE fallocate handler.  Which implementation is used is chosen at
+ * build time: fallocate() with CONFIG_FALLOCATE, otherwise
+ * posix_fallocate() with CONFIG_POSIX_FALLOCATE (only for mode == 0).
+ * With neither defined the reply is always EOPNOTSUPP.
+ */
+static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset,
+ off_t length, struct fuse_file_info *fi)
+{
+ int err = EOPNOTSUPP;
+ (void)ino;
+
+#ifdef CONFIG_FALLOCATE
+ err = fallocate(lo_fi_fd(req, fi), mode, offset, length);
+ if (err < 0) {
+ err = errno;
+ }
+
+#elif defined(CONFIG_POSIX_FALLOCATE)
+ if (mode) {
+ fuse_reply_err(req, EOPNOTSUPP);
+ return;
+ }
+
+ /* posix_fallocate() returns the error number directly, not via errno */
+ err = posix_fallocate(lo_fi_fd(req, fi), offset, length);
+#endif
+
+ fuse_reply_err(req, err);
+}
+
+/* FUSE flock handler: apply flock() operation @op to the open file in @fi */
+static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi,
+                     int op)
+{
+    (void)ino;
+
+    if (flock(lo_fi_fd(req, fi), op) == -1) {
+        fuse_reply_err(req, errno);
+    } else {
+        fuse_reply_err(req, 0);
+    }
+}
+
+/* types */
+/*
+ * Exit; process attribute unmodified if matched.
+ * An empty key applies to all.
+ */
+#define XATTR_MAP_FLAG_OK (1 << 0)
+/*
+ * The attribute is unwanted;
+ * EPERM on write, hidden on read.
+ */
+#define XATTR_MAP_FLAG_BAD (1 << 1)
+/*
+ * For attrs that start with 'key', prepend 'prepend'.
+ * 'key' may be empty to prepend for all attrs
+ * key is defined from set/remove point of view.
+ * Automatically reversed on read
+ */
+#define XATTR_MAP_FLAG_PREFIX (1 << 2)
+/*
+ * The attribute is unsupported;
+ * ENOTSUP on write, hidden on read.
+ */
+#define XATTR_MAP_FLAG_UNSUPPORTED (1 << 3)
+
+/* scopes */
+/* Apply rule to get/set/remove */
+#define XATTR_MAP_FLAG_CLIENT (1 << 16)
+/* Apply rule to list */
+#define XATTR_MAP_FLAG_SERVER (1 << 17)
+/* Apply rule to all */
+#define XATTR_MAP_FLAG_ALL (XATTR_MAP_FLAG_SERVER | XATTR_MAP_FLAG_CLIENT)
+
+/*
+ * Append a copy of *new_entry to lo->xattr_map_list, growing the array by
+ * one element.  The key/prepend pointers are copied as they are.
+ */
+static void add_xattrmap_entry(struct lo_data *lo,
+                               const XattrMapEntry *new_entry)
+{
+    XattrMapEntry *list;
+
+    list = g_realloc_n(lo->xattr_map_list, lo->xattr_map_nentries + 1,
+                       sizeof(XattrMapEntry));
+    list[lo->xattr_map_nentries] = *new_entry;
+    lo->xattr_map_nentries++;
+
+    lo->xattr_map_list = list;
+}
+
+/*
+ * Free every entry of lo->xattr_map_list (key and prepend strings) and
+ * the list itself, leaving an empty map behind.  Does nothing when no
+ * list is allocated.
+ */
+static void free_xattrmap(struct lo_data *lo)
+{
+    XattrMapEntry *map = lo->xattr_map_list;
+    size_t i;
+
+    if (!map) {
+        return;
+    }
+
+    for (i = 0; i < lo->xattr_map_nentries; i++) {
+        g_free(map[i].key);
+        g_free(map[i].prepend);
+    }
+
+    g_free(map);
+    lo->xattr_map_list = NULL;
+    /*
+     * Reset to 0 rather than -1: the counter is formatted with %zu
+     * elsewhere in this file, so -1 would presumably wrap to a huge
+     * count paired with a NULL list.
+     */
+    lo->xattr_map_nentries = 0;
+}
+
+/*
+ * Handle the 'map' type, which is sugar for a set of commands
+ * for the common case of prefixing a subset or everything,
+ * and allowing anything not prefixed through.
+ * It must be the last entry in the stream, although there
+ * can be other entries before it.
+ * The form is:
+ * :map:key:prefix:
+ *
+ * key may be empty, in which case all entries are prefixed.
+ */
+/*
+ * Parse the remainder of a 'map' rule (@rule points just past the 'map'
+ * keyword, @sep is the rule's separator character) and append the rules
+ * it expands to onto lo's xattr map.  Any syntax error is logged and
+ * terminates the process with exit(1).
+ */
+static void parse_xattrmap_map(struct lo_data *lo,
+ const char *rule, char sep)
+{
+ const char *tmp;
+ char *key;
+ char *prefix;
+ XattrMapEntry tmp_entry;
+
+ if (*rule != sep) {
+ fuse_log(FUSE_LOG_ERR,
+ "%s: Expecting '%c' after 'map' keyword, found '%c'\n",
+ __func__, sep, *rule);
+ exit(1);
+ }
+
+ rule++;
+
+ /* At start of 'key' field */
+ tmp = strchr(rule, sep);
+ if (!tmp) {
+ fuse_log(FUSE_LOG_ERR,
+ "%s: Missing '%c' at end of key field in map rule\n",
+ __func__, sep);
+ exit(1);
+ }
+
+ key = g_strndup(rule, tmp - rule);
+ rule = tmp + 1;
+
+ /* At start of prefix field */
+ tmp = strchr(rule, sep);
+ if (!tmp) {
+ fuse_log(FUSE_LOG_ERR,
+ "%s: Missing '%c' at end of prefix field in map rule\n",
+ __func__, sep);
+ exit(1);
+ }
+
+ prefix = g_strndup(rule, tmp - rule);
+ rule = tmp + 1;
+
+ /*
+ * This should be the end of the string, we don't allow
+ * any more commands after 'map'.
+ */
+ if (*rule) {
+ fuse_log(FUSE_LOG_ERR,
+ "%s: Expecting end of command after map, found '%c'\n",
+ __func__, *rule);
+ exit(1);
+ }
+
+ /*
+ * Each entry added below gets its own copies of the strings;
+ * add_xattrmap_entry() stores the pointers by value.
+ */
+ /* 1st: Prefix matches/everything */
+ tmp_entry.flags = XATTR_MAP_FLAG_PREFIX | XATTR_MAP_FLAG_ALL;
+ tmp_entry.key = g_strdup(key);
+ tmp_entry.prepend = g_strdup(prefix);
+ add_xattrmap_entry(lo, &tmp_entry);
+
+ if (!*key) {
+ /* Prefix all case */
+
+ /* 2nd: Hide any non-prefixed entries on the host */
+ tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_ALL;
+ tmp_entry.key = g_strdup("");
+ tmp_entry.prepend = g_strdup("");
+ add_xattrmap_entry(lo, &tmp_entry);
+ } else {
+ /* Prefix matching case */
+
+ /* 2nd: Hide non-prefixed but matching entries on the host */
+ tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_SERVER;
+ tmp_entry.key = g_strdup(""); /* Not used */
+ tmp_entry.prepend = g_strdup(key);
+ add_xattrmap_entry(lo, &tmp_entry);
+
+ /* 3rd: Stop the client accessing prefixed attributes directly */
+ tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_CLIENT;
+ tmp_entry.key = g_strdup(prefix);
+ tmp_entry.prepend = g_strdup(""); /* Not used */
+ add_xattrmap_entry(lo, &tmp_entry);
+
+ /* 4th: Everything else is OK */
+ tmp_entry.flags = XATTR_MAP_FLAG_OK | XATTR_MAP_FLAG_ALL;
+ tmp_entry.key = g_strdup("");
+ tmp_entry.prepend = g_strdup("");
+ add_xattrmap_entry(lo, &tmp_entry);
+ }
+
+ /* The parsed originals were only used as sources for the copies above */
+ g_free(key);
+ g_free(prefix);
+}
+
+ /*
+  * Parse the xattr mapping string held in lo->xattrmap.
+  *
+  * The string is a sequence of rules; whitespace between rules is skipped.
+  * The first non-space character of a rule is its separator, and a rule has
+  * the form
+  *     <sep>type<sep>scope<sep>key<sep>prepend<sep>
+  * where type is 'prefix', 'ok', 'bad' or 'unsupported' and scope is
+  * 'client', 'server' or 'all'.  The type 'map' is handed to
+  * parse_xattrmap_map() and ends parsing.
+  *
+  * Each parsed rule is passed to add_xattrmap_entry().  A syntax error or an
+  * empty map is logged and terminates the process with exit(1).
+  *
+  * Finally "security.capability" is run through the new map and the result is
+  * stored in lo->xattr_security_capability (NULL when it maps to itself).
+  */
+ static void parse_xattrmap(struct lo_data *lo)
+ {
+     const char *map = lo->xattrmap;
+     const char *tmp;
+     int ret;
+
+     lo->xattr_map_nentries = 0;
+     while (*map) {
+         XattrMapEntry tmp_entry;
+         char sep;
+
+         /* isspace() is undefined for negative values, so go via unsigned */
+         if (isspace((unsigned char)*map)) {
+             map++;
+             continue;
+         }
+         /* The separator is the first non-space of the rule */
+         sep = *map++;
+         if (!sep) {
+             break;
+         }
+
+         tmp_entry.flags = 0;
+         /* Start of 'type' */
+         if (strstart(map, "prefix", &map)) {
+             tmp_entry.flags |= XATTR_MAP_FLAG_PREFIX;
+         } else if (strstart(map, "ok", &map)) {
+             tmp_entry.flags |= XATTR_MAP_FLAG_OK;
+         } else if (strstart(map, "bad", &map)) {
+             tmp_entry.flags |= XATTR_MAP_FLAG_BAD;
+         } else if (strstart(map, "unsupported", &map)) {
+             tmp_entry.flags |= XATTR_MAP_FLAG_UNSUPPORTED;
+         } else if (strstart(map, "map", &map)) {
+             /*
+              * map is sugar that adds a number of rules, and must be
+              * the last entry.
+              */
+             parse_xattrmap_map(lo, map, sep);
+             break;
+         } else {
+             fuse_log(FUSE_LOG_ERR,
+                      "%s: Unexpected type;"
+                      "Expecting 'prefix', 'ok', 'bad', 'unsupported' or 'map'"
+                      " in rule %zu\n", __func__, lo->xattr_map_nentries);
+             exit(1);
+         }
+
+         if (*map != sep) {
+             fuse_log(FUSE_LOG_ERR,
+                      "%s: Missing '%c' at end of type field of rule %zu\n",
+                      __func__, sep, lo->xattr_map_nentries);
+             exit(1);
+         }
+         map++;
+
+         /* Start of 'scope' */
+         if (strstart(map, "client", &map)) {
+             tmp_entry.flags |= XATTR_MAP_FLAG_CLIENT;
+         } else if (strstart(map, "server", &map)) {
+             tmp_entry.flags |= XATTR_MAP_FLAG_SERVER;
+         } else if (strstart(map, "all", &map)) {
+             tmp_entry.flags |= XATTR_MAP_FLAG_ALL;
+         } else {
+             fuse_log(FUSE_LOG_ERR,
+                      "%s: Unexpected scope;"
+                      " Expecting 'client', 'server', or 'all', in rule %zu\n",
+                      __func__, lo->xattr_map_nentries);
+             exit(1);
+         }
+
+         /*
+          * Test before advancing so the message reports the offending
+          * character itself and we never read past the string terminator.
+          */
+         if (*map != sep) {
+             fuse_log(FUSE_LOG_ERR,
+                      "%s: Expecting '%c' found '%c'"
+                      " after scope in rule %zu\n",
+                      __func__, sep, *map, lo->xattr_map_nentries);
+             exit(1);
+         }
+         map++;
+
+         /* At start of 'key' field */
+         tmp = strchr(map, sep);
+         if (!tmp) {
+             fuse_log(FUSE_LOG_ERR,
+                      "%s: Missing '%c' at end of key field of rule %zu\n",
+                      __func__, sep, lo->xattr_map_nentries);
+             exit(1);
+         }
+         tmp_entry.key = g_strndup(map, tmp - map);
+         map = tmp + 1;
+
+         /* At start of 'prepend' field */
+         tmp = strchr(map, sep);
+         if (!tmp) {
+             fuse_log(FUSE_LOG_ERR,
+                      "%s: Missing '%c' at end of prepend field of rule %zu\n",
+                      __func__, sep, lo->xattr_map_nentries);
+             exit(1);
+         }
+         tmp_entry.prepend = g_strndup(map, tmp - map);
+         map = tmp + 1;
+
+         add_xattrmap_entry(lo, &tmp_entry);
+         /* End of rule - go around again for another rule */
+     }
+
+     if (!lo->xattr_map_nentries) {
+         fuse_log(FUSE_LOG_ERR, "Empty xattr map\n");
+         exit(1);
+     }
+
+     ret = xattr_map_client(lo, "security.capability",
+                            &lo->xattr_security_capability);
+     if (ret) {
+         /* xattr_map_client() returns a negated errno value */
+         fuse_log(FUSE_LOG_ERR, "Failed to map security.capability: %s\n",
+                  strerror(-ret));
+         exit(1);
+     }
+     if (!lo->xattr_security_capability ||
+         !strcmp(lo->xattr_security_capability, "security.capability")) {
+         /* 1-1 mapping, don't need to do anything */
+         /* Allocated with g_try_malloc() in xattr_map_client() */
+         g_free(lo->xattr_security_capability);
+         lo->xattr_security_capability = NULL;
+     }
+ }
+
+/*
+ * For use with getxattr/setxattr/removexattr, where the client
+ * gives us a name and we may need to choose a different one.
+ * Allocates a buffer for the result placing it in *out_name.
+ * If there's no change then *out_name is not set.
+ * Returns 0 on success
+ * Can return -EPERM to indicate we block a given attribute
+ * (in which case out_name is not allocated)
+ * Can return -ENOMEM to indicate out_name couldn't be allocated.
+ */
+static int xattr_map_client(const struct lo_data *lo, const char *client_name,
+ char **out_name)
+{
+ size_t i;
+ for (i = 0; i < lo->xattr_map_nentries; i++) {
+ const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
+
+ if ((cur_entry->flags & XATTR_MAP_FLAG_CLIENT) &&
+ (strstart(client_name, cur_entry->key, NULL))) {
+ if (cur_entry->flags & XATTR_MAP_FLAG_BAD) {
+ return -EPERM;
+ }
+ if (cur_entry->flags & XATTR_MAP_FLAG_UNSUPPORTED) {
+ return -ENOTSUP;
+ }
+ if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
+ /* Unmodified name */
+ return 0;
+ }
+ if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
+ *out_name = g_try_malloc(strlen(client_name) +
+ strlen(cur_entry->prepend) + 1);
+ if (!*out_name) {
+ return -ENOMEM;
+ }
+ sprintf(*out_name, "%s%s", cur_entry->prepend, client_name);
+ return 0;
+ }
+ }
+ }
+
+ return -EPERM;
+}
+
+/*
+ * For use with listxattr where the server fs gives us a name and we may need
+ * to sanitize this for the client.
+ * Returns a pointer to the result in *out_name
+ * This is always the original string or the current string with some prefix
+ * removed; no reallocation is done.
+ * Returns 0 on success
+ * Can return -ENODATA to indicate the name should be dropped from the list.
+ */
+static int xattr_map_server(const struct lo_data *lo, const char *server_name,
+ const char **out_name)
+{
+ size_t i;
+ const char *end;
+
+ for (i = 0; i < lo->xattr_map_nentries; i++) {
+ const XattrMapEntry *cur_entry = lo->xattr_map_list + i;
+
+ if ((cur_entry->flags & XATTR_MAP_FLAG_SERVER) &&
+ (strstart(server_name, cur_entry->prepend, &end))) {
+ if (cur_entry->flags & XATTR_MAP_FLAG_BAD ||
+ cur_entry->flags & XATTR_MAP_FLAG_UNSUPPORTED) {
+ return -ENODATA;
+ }
+ if (cur_entry->flags & XATTR_MAP_FLAG_OK) {
+ *out_name = server_name;
+ return 0;
+ }
+ if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) {
+ /* Remove prefix */
+ *out_name = end;
+ return 0;
+ }
+ }
+ }
+
+ return -ENODATA;
+}
+
+ /*
+  * Change the current working directory to the directory referred to by fd
+  * and assert that fchdir() succeeded.  Wrapped in do { } while (0) so that
+  * it behaves as a single statement at the call site.
+  */
+ #define FCHDIR_NOFAIL(fd) do { \
+ int fchdir_res = fchdir(fd); \
+ assert(fchdir_res == 0); \
+ } while (0)
+
+ /*
+  * Decide whether access to the xattr called name must be refused.
+  *
+  * Nothing is blocked while lo->user_posix_acl is non-zero.  Otherwise the
+  * two POSIX ACL attributes, system.posix_acl_access and
+  * system.posix_acl_default, are blocked and every other name is allowed.
+  */
+ static bool block_xattr(struct lo_data *lo, const char *name)
+ {
+     bool is_acl_xattr;
+
+     if (lo->user_posix_acl) {
+         return false;
+     }
+
+     is_acl_xattr = strcmp(name, "system.posix_acl_access") == 0 ||
+                    strcmp(name, "system.posix_acl_default") == 0;
+     return is_acl_xattr;
+ }
+
+/*
+ * Returns number of bytes in xattr_list after filtering on success. This
+ * could be zero as well if nothing is left after filtering.
+ *
+ * Returns negative error code on failure.
+ * xattr_list is modified in place.
+ */
+static int remove_blocked_xattrs(struct lo_data *lo, char *xattr_list,
+ unsigned in_size)
+{
+ size_t out_index, in_index;
+
+ /*
+ * As of now we only filter out acl xattrs. If acls are enabled or
+ * they have not been explicitly disabled, there is nothing to
+ * filter.
+ */
+ if (lo->user_posix_acl) {
+ return in_size;
+ }
+
+ out_index = 0;
+ in_index = 0;
+ while (in_index < in_size) {
+ char *in_ptr = xattr_list + in_index;
+
+ /* Length of current attribute name */
+ size_t in_len = strlen(xattr_list + in_index) + 1;
+
+ if (!block_xattr(lo, in_ptr)) {
+ if (in_index != out_index) {
+ memmove(xattr_list + out_index, xattr_list + in_index, in_len);
+ }
+ out_index += in_len;
+ }
+ in_index += in_len;
+ }
+ return out_index;
+}
+
+ /*
+  * FUSE getxattr handler.
+  *
+  * Replies EOPNOTSUPP for names rejected by block_xattr().  When an xattr map
+  * is configured the client name is translated with xattr_map_client(); a
+  * mapping result of -EPERM is reported to the client as ENODATA.  The value
+  * is read either through a freshly opened fd (regular files and
+  * directories) or via getxattr() on the procname path after fchdir() (all
+  * other file types).
+  *
+  * With size == 0 the reply carries only the value length; otherwise it
+  * carries up to size bytes of the value.
+  *
+  * mapped_name is g_autofree so that it is released on every return path,
+  * including the successful ones.
+  */
+ static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
+                         size_t size)
+ {
+     struct lo_data *lo = lo_data(req);
+     g_autofree char *value = NULL;
+     g_autofree char *mapped_name = NULL;
+     char procname[64];
+     const char *name;
+     struct lo_inode *inode;
+     ssize_t ret;
+     int saverr;
+     int fd = -1;
+
+     if (block_xattr(lo, in_name)) {
+         fuse_reply_err(req, EOPNOTSUPP);
+         return;
+     }
+
+     name = in_name;
+     if (lo->xattrmap) {
+         ret = xattr_map_client(lo, in_name, &mapped_name);
+         if (ret < 0) {
+             if (ret == -EPERM) {
+                 ret = -ENODATA;
+             }
+             fuse_reply_err(req, -ret);
+             return;
+         }
+         if (mapped_name) {
+             name = mapped_name;
+         }
+     }
+
+     inode = lo_inode(req, ino);
+     if (!inode) {
+         fuse_reply_err(req, EBADF);
+         return;
+     }
+
+     saverr = ENOSYS;
+     if (!lo_data(req)->xattr) {
+         goto out;
+     }
+
+     fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n",
+              ino, name, size);
+
+     if (size) {
+         value = g_try_malloc(size);
+         if (!value) {
+             goto out_err;
+         }
+     }
+
+     sprintf(procname, "%i", inode->fd);
+     /*
+      * It is not safe to open() non-regular/non-dir files in file server
+      * unless O_PATH is used, so use that method for regular files/dir
+      * only (as it seems giving less performance overhead).
+      * Otherwise, call fchdir() to avoid open().
+      */
+     if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
+         fd = openat(lo->proc_self_fd, procname, O_RDONLY);
+         if (fd < 0) {
+             goto out_err;
+         }
+         ret = fgetxattr(fd, name, value, size);
+         saverr = ret == -1 ? errno : 0;
+     } else {
+         /* fchdir should not fail here */
+         FCHDIR_NOFAIL(lo->proc_self_fd);
+         ret = getxattr(procname, name, value, size);
+         saverr = ret == -1 ? errno : 0;
+         FCHDIR_NOFAIL(lo->root.fd);
+     }
+
+     if (ret == -1) {
+         goto out;
+     }
+     if (size) {
+         saverr = 0;
+         if (ret == 0) {
+             goto out;
+         }
+         fuse_reply_buf(req, value, ret);
+     } else {
+         fuse_reply_xattr(req, ret);
+     }
+ out_free:
+     if (fd >= 0) {
+         close(fd);
+     }
+
+     lo_inode_put(lo, &inode);
+     return;
+
+ out_err:
+     saverr = errno;
+ out:
+     fuse_reply_err(req, saverr);
+     goto out_free;
+ }
+
+ /*
+  * FUSE listxattr handler.
+  *
+  * Reads the xattr name list of the inode either through a freshly opened fd
+  * (regular files and directories) or via listxattr() on the procname path
+  * after fchdir() (all other file types).
+  *
+  * With size == 0 the reply carries only the unfiltered list length.
+  * Otherwise the list is rewritten in place: first through
+  * xattr_map_server() when an xattr map list exists, then through
+  * remove_blocked_xattrs(), and the surviving bytes are sent to the client.
+  * If nothing survives, the reply is fuse_reply_err() with error 0.
+  */
+ static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size)
+ {
+ struct lo_data *lo = lo_data(req);
+ g_autofree char *value = NULL;
+ char procname[64];
+ struct lo_inode *inode;
+ ssize_t ret;
+ int saverr;
+ int fd = -1;
+
+ inode = lo_inode(req, ino);
+ if (!inode) {
+ fuse_reply_err(req, EBADF);
+ return;
+ }
+
+ /* Reported via the out label when xattr support is switched off */
+ saverr = ENOSYS;
+ if (!lo_data(req)->xattr) {
+ goto out;
+ }
+
+ fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino,
+ size);
+
+ if (size) {
+ value = g_try_malloc(size);
+ if (!value) {
+ goto out_err;
+ }
+ }
+
+ sprintf(procname, "%i", inode->fd);
+ if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
+ fd = openat(lo->proc_self_fd, procname, O_RDONLY);
+ if (fd < 0) {
+ goto out_err;
+ }
+ ret = flistxattr(fd, value, size);
+ saverr = ret == -1 ? errno : 0;
+ } else {
+ /* fchdir should not fail here */
+ FCHDIR_NOFAIL(lo->proc_self_fd);
+ ret = listxattr(procname, value, size);
+ saverr = ret == -1 ? errno : 0;
+ FCHDIR_NOFAIL(lo->root.fd);
+ }
+
+ if (ret == -1) {
+ goto out;
+ }
+ if (size) {
+ saverr = 0;
+ if (ret == 0) {
+ goto out;
+ }
+
+ if (lo->xattr_map_list) {
+ /*
+ * Map the names back, some attributes might be dropped,
+ * some shortened, but not increased, so we shouldn't
+ * run out of room.
+ */
+ size_t out_index, in_index;
+ out_index = 0;
+ in_index = 0;
+ while (in_index < ret) {
+ const char *map_out;
+ char *in_ptr = value + in_index;
+ /* Length of current attribute name */
+ size_t in_len = strlen(value + in_index) + 1;
+
+ int mapret = xattr_map_server(lo, in_ptr, &map_out);
+ if (mapret != -ENODATA && mapret != 0) {
+ /* Shouldn't happen */
+ saverr = -mapret;
+ goto out;
+ }
+ /* -ENODATA means drop this name: nothing is copied for it */
+ if (mapret == 0) {
+ /* Either unchanged, or truncated */
+ size_t out_len;
+ if (map_out != in_ptr) {
+ /* +1 copies the NIL */
+ out_len = strlen(map_out) + 1;
+ } else {
+ /* No change */
+ out_len = in_len;
+ }
+ /*
+ * Move result along, may still be needed for an unchanged
+ * entry if a previous entry was changed.
+ */
+ memmove(value + out_index, map_out, out_len);
+
+ out_index += out_len;
+ }
+ in_index += in_len;
+ }
+ ret = out_index;
+ if (ret == 0) {
+ goto out;
+ }
+ }
+
+ ret = remove_blocked_xattrs(lo, value, ret);
+ if (ret <= 0) {
+ saverr = -ret;
+ goto out;
+ }
+ fuse_reply_buf(req, value, ret);
+ } else {
+ /*
+ * xattrmap only ever shortens the result,
+ * so we don't need to do anything clever with the
+ * allocation length here.
+ */
+ fuse_reply_xattr(req, ret);
+ }
+ out_free:
+ if (fd >= 0) {
+ close(fd);
+ }
+
+ lo_inode_put(lo, &inode);
+ return;
+
+ out_err:
+ saverr = errno;
+ out:
+ fuse_reply_err(req, saverr);
+ goto out_free;
+ }
+
+ /*
+  * FUSE setxattr handler.
+  *
+  * Replies EOPNOTSUPP for names rejected by block_xattr().  When an xattr map
+  * is configured the client name is translated with xattr_map_client().  The
+  * value (size bytes, not necessarily NUL-terminated) is written either
+  * through a freshly opened fd (regular files and directories) or via
+  * setxattr() on the procname path after fchdir() (all other file types).
+  *
+  * When the POSIX access ACL is being set with FUSE_SETXATTR_ACL_KILL_SGID,
+  * credentials are switched and CAP_FSETID is dropped around the call; they
+  * are restored on every path that leaves through the out label.
+  */
+ static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name,
+                         const char *value, size_t size, int flags,
+                         uint32_t extra_flags)
+ {
+     char procname[64];
+     const char *name;
+     char *mapped_name;
+     struct lo_data *lo = lo_data(req);
+     struct lo_inode *inode;
+     ssize_t ret;
+     int saverr;
+     int fd = -1;
+     bool switched_creds = false;
+     bool cap_fsetid_dropped = false;
+     struct lo_cred old = {};
+
+     if (block_xattr(lo, in_name)) {
+         fuse_reply_err(req, EOPNOTSUPP);
+         return;
+     }
+
+     mapped_name = NULL;
+     name = in_name;
+     if (lo->xattrmap) {
+         ret = xattr_map_client(lo, in_name, &mapped_name);
+         if (ret < 0) {
+             fuse_reply_err(req, -ret);
+             return;
+         }
+         if (mapped_name) {
+             name = mapped_name;
+         }
+     }
+
+     inode = lo_inode(req, ino);
+     if (!inode) {
+         fuse_reply_err(req, EBADF);
+         g_free(mapped_name);
+         return;
+     }
+
+     saverr = ENOSYS;
+     if (!lo_data(req)->xattr) {
+         goto out;
+     }
+
+     /*
+      * The value is a sized byte buffer rather than a C string, so it is
+      * deliberately not printed with %s.
+      */
+     fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64
+              ", name=%s size=%zd)\n", ino, name, size);
+
+     sprintf(procname, "%i", inode->fd);
+     /*
+      * If we are setting posix access acl and if SGID needs to be
+      * cleared, then switch to caller's gid and drop CAP_FSETID
+      * and that should make sure host kernel clears SGID.
+      *
+      * This probably will not work when we support idmapped mounts.
+      * In that case we will need to find a non-root gid and switch
+      * to it. (Instead of gid in request). Fix it when we support
+      * idmapped mounts.
+      */
+     if (lo->posix_acl && !strcmp(name, "system.posix_acl_access")
+         && (extra_flags & FUSE_SETXATTR_ACL_KILL_SGID)) {
+         ret = lo_drop_cap_change_cred(req, &old, false, "FSETID",
+                                       &cap_fsetid_dropped);
+         if (ret) {
+             saverr = ret;
+             goto out;
+         }
+         switched_creds = true;
+     }
+     if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) {
+         fd = openat(lo->proc_self_fd, procname, O_RDONLY);
+         if (fd < 0) {
+             saverr = errno;
+             goto out;
+         }
+         ret = fsetxattr(fd, name, value, size, flags);
+         saverr = ret == -1 ? errno : 0;
+     } else {
+         /* fchdir should not fail here */
+         FCHDIR_NOFAIL(lo->proc_self_fd);
+         ret = setxattr(procname, name, value, size, flags);
+         saverr = ret == -1 ? errno : 0;
+         FCHDIR_NOFAIL(lo->root.fd);
+     }
+
+ out:
+     /*
+      * Undo the credential switch here rather than before the label so that
+      * a failed openat() above cannot leave this thread running with the
+      * caller's credentials and without CAP_FSETID.
+      */
+     if (switched_creds) {
+         if (cap_fsetid_dropped) {
+             lo_restore_cred_gain_cap(&old, false, "FSETID");
+         } else {
+             lo_restore_cred(&old, false);
+         }
+     }
+
+     if (fd >= 0) {
+         close(fd);
+     }
+
+     lo_inode_put(lo, &inode);
+     g_free(mapped_name);
+     fuse_reply_err(req, saverr);
+ }
+
+ /*
+  * FUSE removexattr handler.
+  *
+  * Replies EOPNOTSUPP for names rejected by block_xattr().  When an xattr map
+  * is configured the client name is translated with xattr_map_client().  The
+  * attribute is removed either through a freshly opened fd (regular files
+  * and directories) or via removexattr() on the procname path after fchdir()
+  * (all other file types).  The outcome is always reported with
+  * fuse_reply_err().
+  */
+ static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name)
+ {
+     struct lo_data *lo = lo_data(req);
+     struct lo_inode *inode;
+     char *mapped_name = NULL;
+     const char *name = in_name;
+     char procname[64];
+     ssize_t ret;
+     int saverr;
+     int fd = -1;
+
+     if (block_xattr(lo, in_name)) {
+         fuse_reply_err(req, EOPNOTSUPP);
+         return;
+     }
+
+     if (lo->xattrmap) {
+         ret = xattr_map_client(lo, in_name, &mapped_name);
+         if (ret < 0) {
+             fuse_reply_err(req, -ret);
+             return;
+         }
+         if (mapped_name) {
+             name = mapped_name;
+         }
+     }
+
+     inode = lo_inode(req, ino);
+     if (!inode) {
+         fuse_reply_err(req, EBADF);
+         g_free(mapped_name);
+         return;
+     }
+
+     saverr = ENOSYS;
+     if (!lo_data(req)->xattr) {
+         goto out;
+     }
+
+     fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino,
+              name);
+
+     sprintf(procname, "%i", inode->fd);
+     if (!S_ISREG(inode->filetype) && !S_ISDIR(inode->filetype)) {
+         /* fchdir should not fail here */
+         FCHDIR_NOFAIL(lo->proc_self_fd);
+         ret = removexattr(procname, name);
+         saverr = (ret == -1) ? errno : 0;
+         FCHDIR_NOFAIL(lo->root.fd);
+         goto out;
+     }
+
+     fd = openat(lo->proc_self_fd, procname, O_RDONLY);
+     if (fd < 0) {
+         saverr = errno;
+         goto out;
+     }
+     ret = fremovexattr(fd, name);
+     saverr = (ret == -1) ? errno : 0;
+
+ out:
+     if (fd >= 0) {
+         close(fd);
+     }
+
+     lo_inode_put(lo, &inode);
+     g_free(mapped_name);
+     fuse_reply_err(req, saverr);
+ }
+
+ #ifdef HAVE_COPY_FILE_RANGE
+ /*
+  * FUSE copy_file_range handler: copies len bytes between the two open files
+  * identified by fi_in and fi_out with copy_file_range(2).  Replies with the
+  * number of bytes copied, or with errno when the call fails.
+  */
+ static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in,
+                                struct fuse_file_info *fi_in, fuse_ino_t ino_out,
+                                off_t off_out, struct fuse_file_info *fi_out,
+                                size_t len, int flags)
+ {
+     int in_fd = lo_fi_fd(req, fi_in);
+     int out_fd = lo_fi_fd(req, fi_out);
+     ssize_t copied;
+
+     fuse_log(FUSE_LOG_DEBUG,
+              "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, "
+              "off=%ju, ino=%" PRIu64 "/fd=%d, "
+              "off=%ju, size=%zd, flags=0x%x)\n",
+              ino_in, in_fd, (intmax_t)off_in,
+              ino_out, out_fd, (intmax_t)off_out, len, flags);
+
+     copied = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags);
+     if (copied >= 0) {
+         fuse_reply_write(req, copied);
+         return;
+     }
+     fuse_reply_err(req, errno);
+ }
+ #endif
+
+ /*
+  * FUSE lseek handler: repositions the open file behind fi with lseek(2) and
+  * replies with the resulting offset, or with errno on failure.  The inode
+  * number is not needed.
+  */
+ static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence,
+                      struct fuse_file_info *fi)
+ {
+     off_t new_pos;
+
+     (void)ino;
+     new_pos = lseek(lo_fi_fd(req, fi), off, whence);
+     if (new_pos == -1) {
+         fuse_reply_err(req, errno);
+         return;
+     }
+     fuse_reply_lseek(req, new_pos);
+ }
+
+ /*
+  * FUSE destroy handler: with lo->mutex held, repeatedly takes the first
+  * entry of lo->inodes and drops all of its lookup references with
+  * unref_inode(), until the table yields no more entries.
+  *
+  * The iterator is re-initialised on every pass; presumably unref_inode()
+  * removes the inode from the table, which would invalidate a running
+  * iterator - confirm against unref_inode().
+  */
+ static void lo_destroy(void *userdata)
+ {
+     struct lo_data *lo = (struct lo_data *)userdata;
+
+     pthread_mutex_lock(&lo->mutex);
+     for (;;) {
+         GHashTableIter it;
+         gpointer k, v;
+         struct lo_inode *inode;
+
+         g_hash_table_iter_init(&it, lo->inodes);
+         if (!g_hash_table_iter_next(&it, &k, &v)) {
+             break;
+         }
+
+         inode = v;
+         unref_inode(lo, inode, inode->nlookup);
+     }
+     pthread_mutex_unlock(&lo->mutex);
+ }
+
+ /*
+  * Table of low-level FUSE operation handlers implemented by this file.
+  * It is handed to fuse_session_new() in main().  The copy_file_range entry
+  * is only present when HAVE_COPY_FILE_RANGE is defined.
+  */
+ static struct fuse_lowlevel_ops lo_oper = {
+ .init = lo_init,
+ .lookup = lo_lookup,
+ .mkdir = lo_mkdir,
+ .mknod = lo_mknod,
+ .symlink = lo_symlink,
+ .link = lo_link,
+ .unlink = lo_unlink,
+ .rmdir = lo_rmdir,
+ .rename = lo_rename,
+ .forget = lo_forget,
+ .forget_multi = lo_forget_multi,
+ .getattr = lo_getattr,
+ .setattr = lo_setattr,
+ .readlink = lo_readlink,
+ .opendir = lo_opendir,
+ .readdir = lo_readdir,
+ .readdirplus = lo_readdirplus,
+ .releasedir = lo_releasedir,
+ .fsyncdir = lo_fsyncdir,
+ .create = lo_create,
+ .getlk = lo_getlk,
+ .setlk = lo_setlk,
+ .open = lo_open,
+ .release = lo_release,
+ .flush = lo_flush,
+ .fsync = lo_fsync,
+ .read = lo_read,
+ .write_buf = lo_write_buf,
+ .statfs = lo_statfs,
+ .fallocate = lo_fallocate,
+ .flock = lo_flock,
+ .getxattr = lo_getxattr,
+ .listxattr = lo_listxattr,
+ .setxattr = lo_setxattr,
+ .removexattr = lo_removexattr,
+ #ifdef HAVE_COPY_FILE_RANGE
+ .copy_file_range = lo_copy_file_range,
+ #endif
+ .lseek = lo_lseek,
+ .destroy = lo_destroy,
+ };
+
+ /*
+  * Write the vhost-user.json backend capability description to stdout: a
+  * JSON object whose only member is "type": "fs".
+  */
+ static void print_capabilities(void)
+ {
+     printf("{\n"
+            " \"type\": \"fs\"\n"
+            "}\n");
+ }
+
+/*
+ * Drop all Linux capabilities because the wait parent process only needs to
+ * sit in waitpid(2) and terminate.
+ */
+static void setup_wait_parent_capabilities(void)
+{
+ capng_setpid(syscall(SYS_gettid));
+ capng_clear(CAPNG_SELECT_BOTH);
+ capng_apply(CAPNG_SELECT_BOTH);
+}
+
+/*
+ * Move to a new mount, net, and pid namespaces to isolate this process.
+ */
+static void setup_namespaces(struct lo_data *lo, struct fuse_session *se)
+{
+ pid_t child;
+
+ /*
+ * Create a new pid namespace for *child* processes. We'll have to
+ * fork in order to enter the new pid namespace. A new mount namespace
+ * is also needed so that we can remount /proc for the new pid
+ * namespace.
+ *
+ * Our UNIX domain sockets have been created. Now we can move to
+ * an empty network namespace to prevent TCP/IP and other network
+ * activity in case this process is compromised.
+ */
+ if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) {
+ fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n");
+ exit(1);
+ }
+
+ child = fork();
+ if (child < 0) {
+ fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n");
+ exit(1);
+ }
+ if (child > 0) {
+ pid_t waited;
+ int wstatus;
+
+ setup_wait_parent_capabilities();
+
+ /* The parent waits for the child */
+ do {
+ waited = waitpid(child, &wstatus, 0);
+ } while (waited < 0 && errno == EINTR && !se->exited);
+
+ /* We were terminated by a signal, see fuse_signals.c */
+ if (se->exited) {
+ exit(0);
+ }
+
+ if (WIFEXITED(wstatus)) {
+ exit(WEXITSTATUS(wstatus));
+ }
+
+ exit(1);
+ }
+
+ /* Send us SIGTERM when the parent thread terminates, see prctl(2) */
+ prctl(PR_SET_PDEATHSIG, SIGTERM);
+
+ /*
+ * If the mounts have shared propagation then we want to opt out so our
+ * mount changes don't affect the parent mount namespace.
+ */
+ if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) {
+ fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n");
+ exit(1);
+ }
+
+ /* The child must remount /proc to use the new pid namespace */
+ if (mount("proc", "/proc", "proc",
+ MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) {
+ fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n");
+ exit(1);
+ }
+
+ /*
+ * We only need /proc/self/fd. Prevent ".." from accessing parent
+ * directories of /proc/self/fd by bind-mounting it over /proc. Since / was
+ * previously remounted with MS_REC | MS_SLAVE this mount change only
+ * affects our process.
+ */
+ if (mount("/proc/self/fd", "/proc", NULL, MS_BIND, NULL) < 0) {
+ fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, MS_BIND): %m\n");
+ exit(1);
+ }
+
+ /* Get the /proc (actually /proc/self/fd, see above) file descriptor */
+ lo->proc_self_fd = open("/proc", O_PATH);
+ if (lo->proc_self_fd == -1) {
+ fuse_log(FUSE_LOG_ERR, "open(/proc, O_PATH): %m\n");
+ exit(1);
+ }
+}
+
+/*
+ * Capture the capability state, we'll need to restore this for individual
+ * threads later; see load_capng.
+ */
+static void setup_capng(void)
+{
+ /* Note this accesses /proc so has to happen before the sandbox */
+ if (capng_get_caps_process()) {
+ fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n");
+ exit(1);
+ }
+ pthread_mutex_init(&cap.mutex, NULL);
+ pthread_mutex_lock(&cap.mutex);
+ cap.saved = capng_save_state();
+ if (!cap.saved) {
+ fuse_log(FUSE_LOG_ERR, "capng_save_state\n");
+ exit(1);
+ }
+ pthread_mutex_unlock(&cap.mutex);
+}
+
+ /*
+  * Release the capability state saved in cap.saved, leave the pointer NULL,
+  * and destroy cap.mutex.
+  */
+ static void cleanup_capng(void)
+ {
+     void *saved_state = cap.saved;
+
+     cap.saved = NULL;
+     free(saved_state);
+     pthread_mutex_destroy(&cap.mutex);
+ }
+
+
+/*
+ * Make the source directory our root so symlinks cannot escape and no other
+ * files are accessible. Assumes unshare(CLONE_NEWNS) was already called.
+ */
+static void setup_mounts(const char *source)
+{
+ int oldroot;
+ int newroot;
+
+ if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) {
+ fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source);
+ exit(1);
+ }
+
+ /* This magic is based on lxc's lxc_pivot_root() */
+ oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+ if (oldroot < 0) {
+ fuse_log(FUSE_LOG_ERR, "open(/): %m\n");
+ exit(1);
+ }
+
+ newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
+ if (newroot < 0) {
+ fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source);
+ exit(1);
+ }
+
+ if (fchdir(newroot) < 0) {
+ fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
+ exit(1);
+ }
+
+ if (syscall(__NR_pivot_root, ".", ".") < 0) {
+ fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n");
+ exit(1);
+ }
+
+ if (fchdir(oldroot) < 0) {
+ fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n");
+ exit(1);
+ }
+
+ if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) {
+ fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n");
+ exit(1);
+ }
+
+ if (umount2(".", MNT_DETACH) < 0) {
+ fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n");
+ exit(1);
+ }
+
+ if (fchdir(newroot) < 0) {
+ fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n");
+ exit(1);
+ }
+
+ close(newroot);
+ close(oldroot);
+}
+
+/*
+ * Only keep capabilities in allowlist that are needed for file system operation
+ * The (possibly NULL) modcaps_in string passed in is free'd before exit.
+ */
+static void setup_capabilities(char *modcaps_in)
+{
+ char *modcaps = modcaps_in;
+ pthread_mutex_lock(&cap.mutex);
+ capng_restore_state(&cap.saved);
+
+ /*
+ * Add to allowlist file system-related capabilities that are needed for a
+ * file server to act like root. Drop everything else like networking and
+ * sysadmin capabilities.
+ *
+ * Exclusions:
+ * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl
+ * and we don't support that.
+ * 2. CAP_MAC_OVERRIDE is not included because it only seems to be
+ * used by the Smack LSM. Omit it until there is demand for it.
+ */
+ capng_setpid(syscall(SYS_gettid));
+ capng_clear(CAPNG_SELECT_BOTH);
+ if (capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE,
+ CAP_CHOWN,
+ CAP_DAC_OVERRIDE,
+ CAP_FOWNER,
+ CAP_FSETID,
+ CAP_SETGID,
+ CAP_SETUID,
+ CAP_MKNOD,
+ CAP_SETFCAP,
+ -1)) {
+ fuse_log(FUSE_LOG_ERR, "%s: capng_updatev failed\n", __func__);
+ exit(1);
+ }
+
+ /*
+ * The modcaps option is a colon separated list of caps,
+ * each preceded by either + or -.
+ */
+ while (modcaps) {
+ capng_act_t action;
+ int cap;
+
+ char *next = strchr(modcaps, ':');
+ if (next) {
+ *next = '\0';
+ next++;
+ }
+
+ switch (modcaps[0]) {
+ case '+':
+ action = CAPNG_ADD;
+ break;
+
+ case '-':
+ action = CAPNG_DROP;
+ break;
+
+ default:
+ fuse_log(FUSE_LOG_ERR,
+ "%s: Expecting '+'/'-' in modcaps but found '%c'\n",
+ __func__, modcaps[0]);
+ exit(1);
+ }
+ cap = capng_name_to_capability(modcaps + 1);
+ if (cap < 0) {
+ fuse_log(FUSE_LOG_ERR, "%s: Unknown capability '%s'\n", __func__,
+ modcaps);
+ exit(1);
+ }
+ if (capng_update(action, CAPNG_PERMITTED | CAPNG_EFFECTIVE, cap)) {
+ fuse_log(FUSE_LOG_ERR, "%s: capng_update failed for '%s'\n",
+ __func__, modcaps);
+ exit(1);
+ }
+
+ modcaps = next;
+ }
+ g_free(modcaps_in);
+
+ if (capng_apply(CAPNG_SELECT_BOTH)) {
+ fuse_log(FUSE_LOG_ERR, "%s: capng_apply failed\n", __func__);
+ exit(1);
+ }
+
+ cap.saved = capng_save_state();
+ if (!cap.saved) {
+ fuse_log(FUSE_LOG_ERR, "%s: capng_save_state failed\n", __func__);
+ exit(1);
+ }
+ pthread_mutex_unlock(&cap.mutex);
+}
+
+/*
+ * Use chroot as a weaker sandbox for environments where the process is
+ * launched without CAP_SYS_ADMIN.
+ */
+static void setup_chroot(struct lo_data *lo)
+{
+ lo->proc_self_fd = open("/proc/self/fd", O_PATH);
+ if (lo->proc_self_fd == -1) {
+ fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/fd\", O_PATH): %m\n");
+ exit(1);
+ }
+
+ /*
+ * Make the shared directory the file system root so that FUSE_OPEN
+ * (lo_open()) cannot escape the shared directory by opening a symlink.
+ *
+ * The chroot(2) syscall is later disabled by seccomp and the
+ * CAP_SYS_CHROOT capability is dropped so that tampering with the chroot
+ * is not possible.
+ *
+ * However, it's still possible to escape the chroot via lo->proc_self_fd
+ * but that requires first gaining control of the process.
+ */
+ if (chroot(lo->source) != 0) {
+ fuse_log(FUSE_LOG_ERR, "chroot(\"%s\"): %m\n", lo->source);
+ exit(1);
+ }
+
+ /* Move into the chroot */
+ if (chdir("/") != 0) {
+ fuse_log(FUSE_LOG_ERR, "chdir(\"/\"): %m\n");
+ exit(1);
+ }
+}
+
+/*
+ * Lock down this process to prevent access to other processes or files outside
+ * source directory. This reduces the impact of arbitrary code execution bugs.
+ */
+static void setup_sandbox(struct lo_data *lo, struct fuse_session *se,
+ bool enable_syslog)
+{
+ if (lo->sandbox == SANDBOX_NAMESPACE) {
+ setup_namespaces(lo, se);
+ setup_mounts(lo->source);
+ } else {
+ setup_chroot(lo);
+ }
+
+ setup_seccomp(enable_syslog);
+ setup_capabilities(g_strdup(lo->modcaps));
+}
+
+ /*
+  * Set both the soft and the hard limit on open file descriptors to
+  * rlimit_nofile.  A value of 0 means "leave the limit alone".  An EPERM
+  * failure is ignored; any other failure is logged and exits the process.
+  */
+ static void setup_nofile_rlimit(unsigned long rlimit_nofile)
+ {
+     struct rlimit limits;
+
+     if (rlimit_nofile == 0) {
+         return; /* nothing to do */
+     }
+
+     limits.rlim_cur = rlimit_nofile;
+     limits.rlim_max = rlimit_nofile;
+
+     if (setrlimit(RLIMIT_NOFILE, &limits) >= 0) {
+         return;
+     }
+
+     /* Ignore SELinux denials */
+     if (errno == EPERM) {
+         return;
+     }
+
+     fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n");
+     exit(1);
+ }
+
+ /* Map a fuse log level to the syslog priority; unknown levels give LOG_ERR */
+ static int fuse_level_to_syslog_priority(enum fuse_log_level level)
+ {
+     switch (level) {
+     case FUSE_LOG_EMERG:
+         return LOG_EMERG;
+     case FUSE_LOG_ALERT:
+         return LOG_ALERT;
+     case FUSE_LOG_CRIT:
+         return LOG_CRIT;
+     case FUSE_LOG_ERR:
+         return LOG_ERR;
+     case FUSE_LOG_WARNING:
+         return LOG_WARNING;
+     case FUSE_LOG_NOTICE:
+         return LOG_NOTICE;
+     case FUSE_LOG_INFO:
+         return LOG_INFO;
+     case FUSE_LOG_DEBUG:
+         return LOG_DEBUG;
+     }
+     return LOG_ERR;
+ }
+
+ /*
+  * Log callback installed with fuse_set_log_func().
+  *
+  * Messages above current_log_level are discarded.  When the current level is
+  * FUSE_LOG_DEBUG each message is prefixed with the thread id, and also with
+  * a UTC timestamp unless syslog is in use.  Output goes to syslog when
+  * use_syslog is set, otherwise to stderr.
+  */
+ static void log_func(enum fuse_log_level level, const char *fmt, va_list ap)
+ {
+     g_autofree char *localfmt = NULL;
+
+     if (current_log_level < level) {
+         return;
+     }
+
+     if (current_log_level == FUSE_LOG_DEBUG) {
+         if (!use_syslog) {
+             g_autoptr(GDateTime) now = g_date_time_new_now_utc();
+             g_autofree char *nowstr = g_date_time_format(now, "%Y-%m-%d %H:%M:%S.%f%z");
+
+             localfmt = g_strdup_printf("[%s] [ID: %08ld] %s",
+                                        nowstr, syscall(__NR_gettid), fmt);
+         } else {
+             /* no timestamp needed */
+             localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid),
+                                        fmt);
+         }
+         fmt = localfmt;
+     }
+
+     if (!use_syslog) {
+         vfprintf(stderr, fmt, ap);
+         return;
+     }
+
+     vsyslog(fuse_level_to_syslog_priority(level), fmt, ap);
+ }
+
+ /*
+  * Initialise the root inode: open "/" with O_PATH, query it with do_statx()
+  * and record the fd, inode number, device and mount id in root.  The root
+  * starts with nlookup and refcount both set to 2.  When POSIX locks are
+  * enabled the per-inode lock table and its mutex are created too.
+  * A failing open or stat is logged and exits the process.
+  */
+ static void setup_root(struct lo_data *lo, struct lo_inode *root)
+ {
+     struct stat st;
+     uint64_t mnt_id;
+     int rootfd;
+
+     rootfd = open("/", O_PATH);
+     if (rootfd == -1) {
+         fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source);
+         exit(1);
+     }
+
+     if (do_statx(lo, rootfd, "", &st, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW,
+                  &mnt_id) == -1) {
+         fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source);
+         exit(1);
+     }
+
+     root->filetype = S_IFDIR;
+     root->fd = rootfd;
+     root->key.ino = st.st_ino;
+     root->key.dev = st.st_dev;
+     root->key.mnt_id = mnt_id;
+     root->nlookup = 2;
+     g_atomic_int_set(&root->refcount, 2);
+
+     if (!lo->posix_lock) {
+         return;
+     }
+     pthread_mutex_init(&root->plock_mutex, NULL);
+     root->posix_locks = g_hash_table_new_full(
+         g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy);
+ }
+
+ /*
+  * Hash function for struct lo_key: the sum of the inode number, device and
+  * mount id, each truncated to guint.
+  */
+ static guint lo_key_hash(gconstpointer key)
+ {
+     const struct lo_key *k = key;
+     guint hash = (guint)k->ino;
+
+     hash += (guint)k->dev;
+     hash += (guint)k->mnt_id;
+     return hash;
+ }
+
+ /*
+  * Equality function for struct lo_key: two keys are equal when inode number,
+  * device and mount id all match.
+  */
+ static gboolean lo_key_equal(gconstpointer a, gconstpointer b)
+ {
+     const struct lo_key *lhs = a;
+     const struct lo_key *rhs = b;
+
+     if (lhs->ino != rhs->ino) {
+         return 0;
+     }
+     if (lhs->dev != rhs->dev) {
+         return 0;
+     }
+     return lhs->mnt_id == rhs->mnt_id;
+ }
+
+ /*
+  * Release everything owned by lo: the inode hash table, the root inode's
+  * POSIX lock table, the fd/dirp/ino maps, the proc_self_fd and root
+  * descriptors (when open), and the xattr map and source strings.
+  */
+ static void fuse_lo_data_cleanup(struct lo_data *lo)
+ {
+     GHashTable *inode_table = lo->inodes;
+     GHashTable *root_locks = lo->root.posix_locks;
+
+     if (inode_table) {
+         g_hash_table_destroy(inode_table);
+     }
+     if (root_locks) {
+         g_hash_table_destroy(root_locks);
+     }
+
+     lo_map_destroy(&lo->fd_map);
+     lo_map_destroy(&lo->dirp_map);
+     lo_map_destroy(&lo->ino_map);
+
+     /* Descriptors are negative while they have not been opened */
+     if (lo->proc_self_fd >= 0) {
+         close(lo->proc_self_fd);
+     }
+     if (lo->root.fd >= 0) {
+         close(lo->root.fd);
+     }
+
+     free(lo->xattrmap);
+     free_xattrmap(lo);
+     free(lo->xattr_security_capability);
+     free(lo->source);
+ }
+
+ /*
+  * Print the virtiofsd version line followed by the copyright line to
+  * stdout, using the QEMU_FULL_VERSION and QEMU_COPYRIGHT string macros.
+  */
+ static void qemu_version(void)
+ {
+ printf("virtiofsd version " QEMU_FULL_VERSION "\n" QEMU_COPYRIGHT "\n");
+ }
+
+ /*
+  * virtiofsd entry point.
+  *
+  * Initialises the lo_data state and its maps, parses the command line,
+  * validates the source directory and options, creates and mounts the FUSE
+  * session, daemonizes, applies the rlimit and sandbox, sets up the root
+  * inode and then runs virtio_loop() until the session ends.
+  *
+  * Returns 0 on success (including --help/--version/capability output) and
+  * 1 on failure; several validation failures call exit(1) directly instead
+  * of going through the cleanup labels.
+  */
+ int main(int argc, char *argv[])
+ {
+ struct fuse_args args = FUSE_ARGS_INIT(argc, argv);
+ struct fuse_session *se;
+ struct fuse_cmdline_opts opts;
+ struct lo_data lo = {
+ .sandbox = SANDBOX_NAMESPACE,
+ .debug = 0,
+ .writeback = 0,
+ .posix_lock = 0,
+ .allow_direct_io = 0,
+ .proc_self_fd = -1,
+ .user_killpriv_v2 = -1,
+ .user_posix_acl = -1,
+ };
+ struct lo_map_elem *root_elem;
+ struct lo_map_elem *reserve_elem;
+ int ret = -1;
+
+ /* Initialize time conversion information for localtime_r(). */
+ tzset();
+
+ /* Don't mask creation mode, kernel already did that */
+ umask(0);
+
+ qemu_init_exec_dir(argv[0]);
+
+ pthread_mutex_init(&lo.mutex, NULL);
+ lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal);
+ lo.root.fd = -1;
+ lo.root.fuse_ino = FUSE_ROOT_ID;
+ lo.cache = CACHE_AUTO;
+
+ /*
+ * Set up the ino map like this:
+ * [0] Reserved (will not be used)
+ * [1] Root inode
+ */
+ lo_map_init(&lo.ino_map);
+ reserve_elem = lo_map_reserve(&lo.ino_map, 0);
+ if (!reserve_elem) {
+ fuse_log(FUSE_LOG_ERR, "failed to alloc reserve_elem.\n");
+ goto err_out1;
+ }
+ reserve_elem->in_use = false;
+ root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino);
+ if (!root_elem) {
+ fuse_log(FUSE_LOG_ERR, "failed to alloc root_elem.\n");
+ goto err_out1;
+ }
+ root_elem->inode = &lo.root;
+
+ lo_map_init(&lo.dirp_map);
+ lo_map_init(&lo.fd_map);
+
+ if (fuse_parse_cmdline(&args, &opts) != 0) {
+ goto err_out1;
+ }
+ fuse_set_log_func(log_func);
+ use_syslog = opts.syslog;
+ if (use_syslog) {
+ openlog("virtiofsd", LOG_PID, LOG_DAEMON);
+ }
+
+ /*
+ * The informational options below reuse the err_out1 cleanup path but
+ * set ret = 0 first, so the process still exits successfully.
+ */
+ if (opts.show_help) {
+ printf("usage: %s [options]\n\n", argv[0]);
+ fuse_cmdline_help();
+ printf(" -o source=PATH shared directory tree\n");
+ fuse_lowlevel_help();
+ ret = 0;
+ goto err_out1;
+ } else if (opts.show_version) {
+ qemu_version();
+ fuse_lowlevel_version();
+ ret = 0;
+ goto err_out1;
+ } else if (opts.print_capabilities) {
+ print_capabilities();
+ ret = 0;
+ goto err_out1;
+ }
+
+ if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) {
+ goto err_out1;
+ }
+
+ if (opts.log_level != 0) {
+ current_log_level = opts.log_level;
+ } else {
+ /* default log level is INFO */
+ current_log_level = FUSE_LOG_INFO;
+ }
+ /* The debug option overrides whatever log level was chosen above */
+ lo.debug = opts.debug;
+ if (lo.debug) {
+ current_log_level = FUSE_LOG_DEBUG;
+ }
+ /* A given source must be an existing directory; default to "/" */
+ if (lo.source) {
+ struct stat stat;
+ int res;
+
+ res = lstat(lo.source, &stat);
+ if (res == -1) {
+ fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n",
+ lo.source);
+ exit(1);
+ }
+ if (!S_ISDIR(stat.st_mode)) {
+ fuse_log(FUSE_LOG_ERR, "source is not a directory\n");
+ exit(1);
+ }
+ } else {
+ lo.source = strdup("/");
+ if (!lo.source) {
+ fuse_log(FUSE_LOG_ERR, "failed to strdup source\n");
+ goto err_out1;
+ }
+ }
+
+ /* Supplying an xattr map implicitly enables xattr support */
+ if (lo.xattrmap) {
+ lo.xattr = 1;
+ parse_xattrmap(&lo);
+ }
+
+ /* Without an explicit timeout, derive it from the cache mode */
+ if (!lo.timeout_set) {
+ switch (lo.cache) {
+ case CACHE_NONE:
+ lo.timeout = 0.0;
+ break;
+
+ case CACHE_AUTO:
+ lo.timeout = 1.0;
+ break;
+
+ case CACHE_ALWAYS:
+ lo.timeout = 86400.0;
+ break;
+ }
+ } else if (lo.timeout < 0) {
+ fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout);
+ exit(1);
+ }
+
+ if (lo.user_posix_acl == 1 && !lo.xattr) {
+ fuse_log(FUSE_LOG_ERR, "Can't enable posix ACLs. xattrs are disabled."
+ "\n");
+ exit(1);
+ }
+
+ lo.use_statx = true;
+
+ se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo);
+ if (se == NULL) {
+ goto err_out1;
+ }
+
+ if (fuse_set_signal_handlers(se) != 0) {
+ goto err_out2;
+ }
+
+ if (fuse_session_mount(se) != 0) {
+ goto err_out3;
+ }
+
+ fuse_daemonize(opts.foreground);
+
+ setup_nofile_rlimit(opts.rlimit_nofile);
+
+ /* Must be before sandbox since it wants /proc */
+ setup_capng();
+
+ setup_sandbox(&lo, se, opts.syslog);
+
+ setup_root(&lo, &lo.root);
+ /* Block until ctrl+c or fusermount -u */
+ ret = virtio_loop(se);
+
+ fuse_session_unmount(se);
+ cleanup_capng();
+ /* Cleanup labels unwind in reverse order of the setup steps above */
+ err_out3:
+ fuse_remove_signal_handlers(se);
+ err_out2:
+ fuse_session_destroy(se);
+ err_out1:
+ fuse_opt_free_args(&args);
+
+ fuse_lo_data_cleanup(&lo);
+
+ return ret ? 1 : 0;
+ }
diff --git a/tools/virtiofsd/passthrough_seccomp.c b/tools/virtiofsd/passthrough_seccomp.c
new file mode 100644
index 000000000..a3ce9f898
--- /dev/null
+++ b/tools/virtiofsd/passthrough_seccomp.c
@@ -0,0 +1,177 @@
+/*
+ * Seccomp sandboxing for virtiofsd
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "passthrough_seccomp.h"
+#include "fuse_i.h"
+#include "fuse_log.h"
+#include <seccomp.h>
+
+/* libseccomp 2.4.2 broke ppoll: define __SNR_ppoll here when it is missing */
+#if !defined(__SNR_ppoll) && defined(__SNR_brk)
+#ifdef __NR_ppoll
+#define __SNR_ppoll __NR_ppoll
+#else
+#define __SNR_ppoll __PNR_ppoll
+#endif /* __NR_ppoll */
+#endif /* !defined(__SNR_ppoll) && defined(__SNR_brk) */
+
+static const int syscall_allowlist[] = {
+ /* Allowed unconditionally by setup_seccomp(). TODO ireg sem*() syscalls */
+ SCMP_SYS(brk),
+ SCMP_SYS(capget), /* For CAP_FSETID */
+ SCMP_SYS(capset),
+ SCMP_SYS(clock_gettime),
+ SCMP_SYS(clone),
+#ifdef __NR_clone3
+ SCMP_SYS(clone3),
+#endif /* __NR_clone3 */
+ SCMP_SYS(close),
+ SCMP_SYS(copy_file_range),
+ SCMP_SYS(dup),
+ SCMP_SYS(eventfd2),
+ SCMP_SYS(exit),
+ SCMP_SYS(exit_group),
+ SCMP_SYS(fallocate),
+ SCMP_SYS(fchdir),
+ SCMP_SYS(fchmod),
+ SCMP_SYS(fchmodat),
+ SCMP_SYS(fchownat),
+ SCMP_SYS(fcntl),
+ SCMP_SYS(fdatasync),
+ SCMP_SYS(fgetxattr),
+ SCMP_SYS(flistxattr),
+ SCMP_SYS(flock),
+ SCMP_SYS(fremovexattr),
+ SCMP_SYS(fsetxattr),
+ SCMP_SYS(fstat),
+ SCMP_SYS(fstatfs),
+ SCMP_SYS(fstatfs64),
+ SCMP_SYS(fsync),
+ SCMP_SYS(ftruncate),
+ SCMP_SYS(futex),
+ SCMP_SYS(getdents),
+ SCMP_SYS(getdents64),
+ SCMP_SYS(getegid),
+ SCMP_SYS(geteuid),
+ SCMP_SYS(getpid),
+ SCMP_SYS(gettid),
+ SCMP_SYS(gettimeofday),
+ SCMP_SYS(getxattr),
+ SCMP_SYS(linkat),
+ SCMP_SYS(listxattr),
+ SCMP_SYS(lseek),
+ SCMP_SYS(_llseek), /* For POWER */
+ SCMP_SYS(madvise),
+ SCMP_SYS(mkdirat),
+ SCMP_SYS(mknodat),
+ SCMP_SYS(mmap),
+ SCMP_SYS(mprotect),
+ SCMP_SYS(mremap),
+ SCMP_SYS(munmap),
+ SCMP_SYS(newfstatat),
+ SCMP_SYS(statx),
+ SCMP_SYS(open),
+ SCMP_SYS(openat),
+ SCMP_SYS(ppoll),
+ SCMP_SYS(prctl), /* TODO restrict to just PR_SET_NAME? */
+ SCMP_SYS(preadv),
+ SCMP_SYS(pread64),
+ SCMP_SYS(pwritev),
+ SCMP_SYS(pwrite64),
+ SCMP_SYS(read),
+ SCMP_SYS(readlinkat),
+ SCMP_SYS(recvmsg),
+ SCMP_SYS(renameat),
+ SCMP_SYS(renameat2),
+ SCMP_SYS(removexattr),
+ SCMP_SYS(restart_syscall),
+ SCMP_SYS(rt_sigaction),
+ SCMP_SYS(rt_sigprocmask),
+ SCMP_SYS(rt_sigreturn),
+ SCMP_SYS(sched_getattr),
+ SCMP_SYS(sched_setattr),
+ SCMP_SYS(sendmsg),
+ SCMP_SYS(setresgid),
+ SCMP_SYS(setresuid),
+#ifdef __NR_setresgid32
+ SCMP_SYS(setresgid32),
+#endif /* __NR_setresgid32 */
+#ifdef __NR_setresuid32
+ SCMP_SYS(setresuid32),
+#endif /* __NR_setresuid32 */
+ SCMP_SYS(set_robust_list),
+ SCMP_SYS(setxattr),
+ SCMP_SYS(symlinkat),
+ SCMP_SYS(time), /* Rarely needed, except on static builds */
+ SCMP_SYS(tgkill),
+ SCMP_SYS(unlinkat),
+ SCMP_SYS(unshare),
+ SCMP_SYS(utimensat),
+ SCMP_SYS(write),
+ SCMP_SYS(writev),
+ SCMP_SYS(umask),
+};
+
+/* Extra syscalls, allowed only if setup_seccomp() gets enable_syslog (--syslog) */
+static const int syscall_allowlist_syslog[] = {
+ SCMP_SYS(send),
+ SCMP_SYS(sendto),
+};
+
+static void add_allowlist(scmp_filter_ctx ctx, const int syscalls[], size_t len)
+{
+ size_t i;
+
+ for (i = 0; i < len; i++) {
+ if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscalls[i], 0) != 0) {
+ fuse_log(FUSE_LOG_ERR, "seccomp_rule_add syscall %d failed\n",
+ syscalls[i]);
+ exit(1);
+ }
+ }
+}
+
+/* Load the seccomp allowlist (+ send/sendto if enable_syslog) or exit(1). */
+void setup_seccomp(bool enable_syslog)
+{
+    scmp_filter_ctx filter;
+
+#ifndef SCMP_ACT_KILL_PROCESS
+    filter = seccomp_init(SCMP_ACT_TRAP);
+#else
+    filter = seccomp_init(SCMP_ACT_KILL_PROCESS);
+    if (filter == NULL && errno == EOPNOTSUPP) {
+        filter = seccomp_init(SCMP_ACT_TRAP); /* newer lib, older kernel */
+    }
+#endif
+    if (filter == NULL) {
+        fuse_log(FUSE_LOG_ERR, "seccomp_init() failed\n");
+        exit(1);
+    }
+
+    add_allowlist(filter, syscall_allowlist, G_N_ELEMENTS(syscall_allowlist));
+    if (enable_syslog) {
+        add_allowlist(filter, syscall_allowlist_syslog,
+                      G_N_ELEMENTS(syscall_allowlist_syslog));
+    }
+
+    /* libvhost-user calls this for post-copy migration, we don't need it */
+    if (seccomp_rule_add(filter, SCMP_ACT_ERRNO(ENOSYS),
+                         SCMP_SYS(userfaultfd), 0) != 0) {
+        fuse_log(FUSE_LOG_ERR, "seccomp_rule_add userfaultfd failed\n");
+        exit(1);
+    }
+
+    if (seccomp_load(filter) < 0) {
+        fuse_log(FUSE_LOG_ERR, "seccomp_load() failed\n");
+        exit(1);
+    }
+
+    seccomp_release(filter);
+}
diff --git a/tools/virtiofsd/passthrough_seccomp.h b/tools/virtiofsd/passthrough_seccomp.h
new file mode 100644
index 000000000..a3ab073f0
--- /dev/null
+++ b/tools/virtiofsd/passthrough_seccomp.h
@@ -0,0 +1,15 @@
+/*
+ * Seccomp sandboxing for virtiofsd
+ *
+ * Copyright (C) 2019 Red Hat, Inc.
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef VIRTIOFSD_SECCOMP_H
+#define VIRTIOFSD_SECCOMP_H
+
+/* Install the seccomp filter; enable_syslog also permits send/sendto. */
+void setup_seccomp(bool enable_syslog);
+
+#endif /* VIRTIOFSD_SECCOMP_H */