diff options
Diffstat (limited to 'tools')
26 files changed, 13775 insertions, 0 deletions
diff --git a/tools/ebpf/Makefile.ebpf b/tools/ebpf/Makefile.ebpf new file mode 100755 index 000000000..8f327ae3b --- /dev/null +++ b/tools/ebpf/Makefile.ebpf @@ -0,0 +1,21 @@ +OBJS = rss.bpf.o + +LLC ?= llc +CLANG ?= clang +INC_FLAGS = `$(CLANG) -print-file-name=include` +EXTRA_CFLAGS ?= -O2 -emit-llvm -fno-stack-protector + +all: $(OBJS) + +.PHONY: clean + +clean: + rm -f $(OBJS) + +$(OBJS): %.o:%.c + $(CLANG) $(INC_FLAGS) \ + -D__KERNEL__ -D__ASM_SYSREG_H \ + -I../include $(LINUXINCLUDE) \ + $(EXTRA_CFLAGS) -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@ + bpftool gen skeleton rss.bpf.o > rss.bpf.skeleton.h + cp rss.bpf.skeleton.h ../../ebpf/ diff --git a/tools/ebpf/rss.bpf.c b/tools/ebpf/rss.bpf.c new file mode 100644 index 000000000..e85ec55f9 --- /dev/null +++ b/tools/ebpf/rss.bpf.c @@ -0,0 +1,571 @@ +/* + * eBPF RSS program + * + * Developed by Daynix Computing LTD (http://www.daynix.com) + * + * Authors: + * Andrew Melnychenko <andrew@daynix.com> + * Yuri Benditovich <yuri.benditovich@daynix.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Prepare: + * Requires llvm, clang, bpftool, linux kernel tree + * + * Build rss.bpf.skeleton.h: + * make -f Makefile.ebpf clean all + */ + +#include <stddef.h> +#include <stdbool.h> +#include <linux/bpf.h> + +#include <linux/in.h> +#include <linux/if_ether.h> +#include <linux/ip.h> +#include <linux/ipv6.h> + +#include <linux/udp.h> +#include <linux/tcp.h> + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_endian.h> +#include <linux/virtio_net.h> + +#define INDIRECTION_TABLE_SIZE 128 +#define HASH_CALCULATION_BUFFER_SIZE 36 + +struct rss_config_t { + __u8 redirect; + __u8 populate_hash; + __u32 hash_types; + __u16 indirections_len; + __u16 default_queue; +} __attribute__((packed)); + +struct toeplitz_key_data_t { + __u32 leftmost_32_bits; + __u8 next_byte[HASH_CALCULATION_BUFFER_SIZE]; +}; + +struct packet_hash_info_t { + __u8 is_ipv4; + __u8 is_ipv6; + __u8 is_udp; + __u8 is_tcp; + __u8 is_ipv6_ext_src; + __u8 is_ipv6_ext_dst; + __u8 is_fragmented; + + __u16 src_port; + __u16 dst_port; + + union { + struct { + __be32 in_src; + __be32 in_dst; + }; + + struct { + struct in6_addr in6_src; + struct in6_addr in6_dst; + struct in6_addr in6_ext_src; + struct in6_addr in6_ext_dst; + }; + }; +}; + +struct bpf_map_def SEC("maps") +tap_rss_map_configurations = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct rss_config_t), + .max_entries = 1, +}; + +struct bpf_map_def SEC("maps") +tap_rss_map_toeplitz_key = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(struct toeplitz_key_data_t), + .max_entries = 1, +}; + +struct bpf_map_def SEC("maps") +tap_rss_map_indirection_table = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u16), + .max_entries = INDIRECTION_TABLE_SIZE, +}; + +static inline void net_rx_rss_add_chunk(__u8 *rss_input, size_t *bytes_written, + const void *ptr, size_t size) { + __builtin_memcpy(&rss_input[*bytes_written], ptr, size); + *bytes_written += size; +} + +static inline +void net_toeplitz_add(__u32 *result, + __u8 *input, + __u32 len + , struct toeplitz_key_data_t *key) { + + __u32 accumulator = *result; + __u32 leftmost_32_bits = key->leftmost_32_bits; + __u32 byte; + + for (byte = 0; byte < HASH_CALCULATION_BUFFER_SIZE; byte++) { + __u8 input_byte = input[byte]; + __u8 key_byte = key->next_byte[byte]; + __u8 bit; + + for (bit = 0; bit < 8; bit++) { + if (input_byte & (1 << 7)) { + accumulator ^= leftmost_32_bits; + } + + leftmost_32_bits = + (leftmost_32_bits << 1) | ((key_byte & (1 << 7)) >> 7); + + input_byte <<= 1; + key_byte <<= 1; + } + } + + *result = accumulator; +} + + +static inline int ip6_extension_header_type(__u8 hdr_type) +{ + switch (hdr_type) { + case IPPROTO_HOPOPTS: + case IPPROTO_ROUTING: + case IPPROTO_FRAGMENT: + case IPPROTO_ICMPV6: + case IPPROTO_NONE: + case IPPROTO_DSTOPTS: + case IPPROTO_MH: + return 1; + default: + return 0; + } +} +/* + * According to + * https://www.iana.org/assignments/ipv6-parameters/ipv6-parameters.xhtml + * we expect that there are would be no more than 11 extensions in IPv6 header, + * also there is 27 TLV options for Destination and Hop-by-hop extensions. + * Need to choose reasonable amount of maximum extensions/options we may + * check to find ext src/dst. + */ +#define IP6_EXTENSIONS_COUNT 11 +#define IP6_OPTIONS_COUNT 30 + +static inline int parse_ipv6_ext(struct __sk_buff *skb, + struct packet_hash_info_t *info, + __u8 *l4_protocol, size_t *l4_offset) +{ + int err = 0; + + if (!ip6_extension_header_type(*l4_protocol)) { + return 0; + } + + struct ipv6_opt_hdr ext_hdr = {}; + + for (unsigned int i = 0; i < IP6_EXTENSIONS_COUNT; ++i) { + + err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_hdr, + sizeof(ext_hdr), BPF_HDR_START_NET); + if (err) { + goto error; + } + + if (*l4_protocol == IPPROTO_ROUTING) { + struct ipv6_rt_hdr ext_rt = {}; + + err = bpf_skb_load_bytes_relative(skb, *l4_offset, &ext_rt, + sizeof(ext_rt), BPF_HDR_START_NET); + if (err) { + goto error; + } + + if ((ext_rt.type == IPV6_SRCRT_TYPE_2) && + (ext_rt.hdrlen == sizeof(struct in6_addr) / 8) && + (ext_rt.segments_left == 1)) { + + err = bpf_skb_load_bytes_relative(skb, + *l4_offset + offsetof(struct rt2_hdr, addr), + &info->in6_ext_dst, sizeof(info->in6_ext_dst), + BPF_HDR_START_NET); + if (err) { + goto error; + } + + info->is_ipv6_ext_dst = 1; + } + + } else if (*l4_protocol == IPPROTO_DSTOPTS) { + struct ipv6_opt_t { + __u8 type; + __u8 length; + } __attribute__((packed)) opt = {}; + + size_t opt_offset = sizeof(ext_hdr); + + for (unsigned int j = 0; j < IP6_OPTIONS_COUNT; ++j) { + err = bpf_skb_load_bytes_relative(skb, *l4_offset + opt_offset, + &opt, sizeof(opt), BPF_HDR_START_NET); + if (err) { + goto error; + } + + if (opt.type == IPV6_TLV_HAO) { + err = bpf_skb_load_bytes_relative(skb, + *l4_offset + opt_offset + + offsetof(struct ipv6_destopt_hao, addr), + &info->in6_ext_src, sizeof(info->in6_ext_src), + BPF_HDR_START_NET); + if (err) { + goto error; + } + + info->is_ipv6_ext_src = 1; + break; + } + + opt_offset += (opt.type == IPV6_TLV_PAD1) ? + 1 : opt.length + sizeof(opt); + + if (opt_offset + 1 >= ext_hdr.hdrlen * 8) { + break; + } + } + } else if (*l4_protocol == IPPROTO_FRAGMENT) { + info->is_fragmented = true; + } + + *l4_protocol = ext_hdr.nexthdr; + *l4_offset += (ext_hdr.hdrlen + 1) * 8; + + if (!ip6_extension_header_type(ext_hdr.nexthdr)) { + return 0; + } + } + + return 0; +error: + return err; +} + +static __be16 parse_eth_type(struct __sk_buff *skb) +{ + unsigned int offset = 12; + __be16 ret = 0; + int err = 0; + + err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret), + BPF_HDR_START_MAC); + if (err) { + return 0; + } + + switch (bpf_ntohs(ret)) { + case ETH_P_8021AD: + offset += 4; + case ETH_P_8021Q: + offset += 4; + err = bpf_skb_load_bytes_relative(skb, offset, &ret, sizeof(ret), + BPF_HDR_START_MAC); + default: + break; + } + + if (err) { + return 0; + } + + return ret; +} + +static inline int parse_packet(struct __sk_buff *skb, + struct packet_hash_info_t *info) +{ + int err = 0; + + if (!info || !skb) { + return -1; + } + + size_t l4_offset = 0; + __u8 l4_protocol = 0; + __u16 l3_protocol = bpf_ntohs(parse_eth_type(skb)); + if (l3_protocol == 0) { + err = -1; + goto error; + } + + if (l3_protocol == ETH_P_IP) { + info->is_ipv4 = 1; + + struct iphdr ip = {}; + err = bpf_skb_load_bytes_relative(skb, 0, &ip, sizeof(ip), + BPF_HDR_START_NET); + if (err) { + goto error; + } + + info->in_src = ip.saddr; + info->in_dst = ip.daddr; + info->is_fragmented = !!ip.frag_off; + + l4_protocol = ip.protocol; + l4_offset = ip.ihl * 4; + } else if (l3_protocol == ETH_P_IPV6) { + info->is_ipv6 = 1; + + struct ipv6hdr ip6 = {}; + err = bpf_skb_load_bytes_relative(skb, 0, &ip6, sizeof(ip6), + BPF_HDR_START_NET); + if (err) { + goto error; + } + + info->in6_src = ip6.saddr; + info->in6_dst = ip6.daddr; + + l4_protocol = ip6.nexthdr; + l4_offset = sizeof(ip6); + + err = parse_ipv6_ext(skb, info, &l4_protocol, &l4_offset); + if (err) { + goto error; + } + } + + if (l4_protocol != 0 && !info->is_fragmented) { + if (l4_protocol == IPPROTO_TCP) { + info->is_tcp = 1; + + struct tcphdr tcp = {}; + err = bpf_skb_load_bytes_relative(skb, l4_offset, &tcp, sizeof(tcp), + BPF_HDR_START_NET); + if (err) { + goto error; + } + + info->src_port = tcp.source; + info->dst_port = tcp.dest; + } else if (l4_protocol == IPPROTO_UDP) { /* TODO: add udplite? */ + info->is_udp = 1; + + struct udphdr udp = {}; + err = bpf_skb_load_bytes_relative(skb, l4_offset, &udp, sizeof(udp), + BPF_HDR_START_NET); + if (err) { + goto error; + } + + info->src_port = udp.source; + info->dst_port = udp.dest; + } + } + + return 0; + +error: + return err; +} + +static inline __u32 calculate_rss_hash(struct __sk_buff *skb, + struct rss_config_t *config, struct toeplitz_key_data_t *toe) +{ + __u8 rss_input[HASH_CALCULATION_BUFFER_SIZE] = {}; + size_t bytes_written = 0; + __u32 result = 0; + int err = 0; + struct packet_hash_info_t packet_info = {}; + + err = parse_packet(skb, &packet_info); + if (err) { + return 0; + } + + if (packet_info.is_ipv4) { + if (packet_info.is_tcp && + config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv4) { + + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in_src, + sizeof(packet_info.in_src)); + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in_dst, + sizeof(packet_info.in_dst)); + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.src_port, + sizeof(packet_info.src_port)); + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.dst_port, + sizeof(packet_info.dst_port)); + } else if (packet_info.is_udp && + config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv4) { + + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in_src, + sizeof(packet_info.in_src)); + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in_dst, + sizeof(packet_info.in_dst)); + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.src_port, + sizeof(packet_info.src_port)); + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.dst_port, + sizeof(packet_info.dst_port)); + } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv4) { + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in_src, + sizeof(packet_info.in_src)); + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in_dst, + sizeof(packet_info.in_dst)); + } + } else if (packet_info.is_ipv6) { + if (packet_info.is_tcp && + config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCPv6) { + + if (packet_info.is_ipv6_ext_src && + config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) { + + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in6_ext_src, + sizeof(packet_info.in6_ext_src)); + } else { + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in6_src, + sizeof(packet_info.in6_src)); + } + if (packet_info.is_ipv6_ext_dst && + config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_TCP_EX) { + + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in6_ext_dst, + sizeof(packet_info.in6_ext_dst)); + } else { + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in6_dst, + sizeof(packet_info.in6_dst)); + } + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.src_port, + sizeof(packet_info.src_port)); + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.dst_port, + sizeof(packet_info.dst_port)); + } else if (packet_info.is_udp && + config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDPv6) { + + if (packet_info.is_ipv6_ext_src && + config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) { + + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in6_ext_src, + sizeof(packet_info.in6_ext_src)); + } else { + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in6_src, + sizeof(packet_info.in6_src)); + } + if (packet_info.is_ipv6_ext_dst && + config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_UDP_EX) { + + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in6_ext_dst, + sizeof(packet_info.in6_ext_dst)); + } else { + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in6_dst, + sizeof(packet_info.in6_dst)); + } + + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.src_port, + sizeof(packet_info.src_port)); + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.dst_port, + sizeof(packet_info.dst_port)); + + } else if (config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IPv6) { + if (packet_info.is_ipv6_ext_src && + config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) { + + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in6_ext_src, + sizeof(packet_info.in6_ext_src)); + } else { + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in6_src, + sizeof(packet_info.in6_src)); + } + if (packet_info.is_ipv6_ext_dst && + config->hash_types & VIRTIO_NET_RSS_HASH_TYPE_IP_EX) { + + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in6_ext_dst, + sizeof(packet_info.in6_ext_dst)); + } else { + net_rx_rss_add_chunk(rss_input, &bytes_written, + &packet_info.in6_dst, + sizeof(packet_info.in6_dst)); + } + } + } + + if (bytes_written) { + net_toeplitz_add(&result, rss_input, bytes_written, toe); + } + + return result; +} + +SEC("tun_rss_steering") +int tun_rss_steering_prog(struct __sk_buff *skb) +{ + + struct rss_config_t *config; + struct toeplitz_key_data_t *toe; + + __u32 key = 0; + __u32 hash = 0; + + config = bpf_map_lookup_elem(&tap_rss_map_configurations, &key); + toe = bpf_map_lookup_elem(&tap_rss_map_toeplitz_key, &key); + + if (config && toe) { + if (!config->redirect) { + return config->default_queue; + } + + hash = calculate_rss_hash(skb, config, toe); + if (hash) { + __u32 table_idx = hash % config->indirections_len; + __u16 *queue = 0; + + queue = bpf_map_lookup_elem(&tap_rss_map_indirection_table, + &table_idx); + + if (queue) { + return *queue; + } + } + + return config->default_queue; + } + + return -1; +} + +char _license[] SEC("license") = "GPL v2"; diff --git a/tools/meson.build b/tools/meson.build new file mode 100644 index 000000000..1dd3e204d --- /dev/null +++ b/tools/meson.build @@ -0,0 +1,35 @@ +have_virtiofsd = (targetos == 'linux' and + have_tools and + seccomp.found() and + libcap_ng.found() and + 'CONFIG_VHOST_USER' in config_host) + +if get_option('virtiofsd').enabled() + if not have_virtiofsd + if targetos != 'linux' + error('virtiofsd requires Linux') + elif not seccomp.found() or not libcap_ng.found() + error('virtiofsd requires libcap-ng-devel and seccomp-devel') + elif 'CONFIG_VHOST_USER' not in config_host + error('virtiofsd needs vhost-user support') + else + # Disabled all the tools but virtiofsd. + have_virtiofsd = true + endif + endif +elif get_option('virtiofsd').disabled() or not have_system + have_virtiofsd = false +endif + +if have_virtiofsd + subdir('virtiofsd') +endif + +have_virtiorng = (have_system and + have_tools and + 'CONFIG_LINUX' in config_host) + +if have_virtiorng + subdir('vhost-user-rng') +endif + diff --git a/tools/vhost-user-rng/50-qemu-rng.json.in b/tools/vhost-user-rng/50-qemu-rng.json.in new file mode 100644 index 000000000..64198f163 --- /dev/null +++ b/tools/vhost-user-rng/50-qemu-rng.json.in @@ -0,0 +1,5 @@ +{
+ "description": "QEMU vhost-user-rng",
+ "type": "bridge",
+ "binary": "@libexecdir@/vhost-user-rng"
+}
diff --git a/tools/vhost-user-rng/main.c b/tools/vhost-user-rng/main.c new file mode 100644 index 000000000..4823bce18 --- /dev/null +++ b/tools/vhost-user-rng/main.c @@ -0,0 +1,407 @@ +/*
+ * VIRTIO RNG Emulation via vhost-user
+ *
+ * Copyright (c) 2021 Mathieu Poirier <mathieu.poirier@linaro.org>
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#define G_LOG_DOMAIN "vhost-user-rng"
+#define G_LOG_USE_STRUCTURED 1
+
+#include <glib.h>
+#include <gio/gio.h>
+#include <gio/gunixsocketaddress.h>
+#include <glib-unix.h>
+#include <glib/gstdio.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <string.h>
+#include <inttypes.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <unistd.h>
+#include <endian.h>
+#include <assert.h>
+
+#include "qemu/cutils.h"
+#include "subprojects/libvhost-user/libvhost-user-glib.h"
+#include "subprojects/libvhost-user/libvhost-user.h"
+
+#ifndef container_of
+#define container_of(ptr, type, member) ({ \
+ const typeof(((type *) 0)->member) * __mptr = (ptr); \
+ (type *) ((char *) __mptr - offsetof(type, member)); })
+#endif
+
+typedef struct {
+ VugDev dev;
+ struct itimerspec ts;
+ timer_t rate_limit_timer;
+ pthread_mutex_t rng_mutex;
+ pthread_cond_t rng_cond;
+ int64_t quota_remaining;
+ bool activate_timer;
+ GMainLoop *loop;
+} VuRNG;
+
+static gboolean print_cap, verbose;
+static gchar *source_path, *socket_path;
+static gint source_fd, socket_fd = -1;
+
+/* Defaults tailored on virtio-rng.c */
+static uint32_t period_ms = 1 << 16;
+static uint64_t max_bytes = INT64_MAX;
+
+static void check_rate_limit(union sigval sv)
+{
+ VuRNG *rng = sv.sival_ptr;
+ bool wakeup = false;
+
+ pthread_mutex_lock(&rng->rng_mutex);
+ /*
+ * The timer has expired and the guest has used all available
+ * entropy, which means function vu_rng_handle_request() is waiting
+ * on us. As such wake it up once we're done here.
+ */
+ if (rng->quota_remaining == 0) {
+ wakeup = true;
+ }
+
+ /*
+ * Reset the entropy available to the guest and tell function
+ * vu_rng_handle_requests() to start the timer before using it.
+ */
+ rng->quota_remaining = max_bytes;
+ rng->activate_timer = true;
+ pthread_mutex_unlock(&rng->rng_mutex);
+
+ if (wakeup) {
+ pthread_cond_signal(&rng->rng_cond);
+ }
+}
+
+static void setup_timer(VuRNG *rng)
+{
+ struct sigevent sev;
+ int ret;
+
+ memset(&rng->ts, 0, sizeof(struct itimerspec));
+ rng->ts.it_value.tv_sec = period_ms / 1000;
+ rng->ts.it_value.tv_nsec = (period_ms % 1000) * 1000000;
+
+ /*
+ * Call function check_rate_limit() as if it was the start of
+ * a new thread when the timer expires.
+ */
+ sev.sigev_notify = SIGEV_THREAD;
+ sev.sigev_notify_function = check_rate_limit;
+ sev.sigev_value.sival_ptr = rng;
+ /* Needs to be NULL if defaults attributes are to be used. */
+ sev.sigev_notify_attributes = NULL;
+ ret = timer_create(CLOCK_MONOTONIC, &sev, &rng->rate_limit_timer);
+ if (ret < 0) {
+ fprintf(stderr, "timer_create() failed\n");
+ }
+
+}
+
+
+/* Virtio helpers */
+static uint64_t rng_get_features(VuDev *dev)
+{
+ if (verbose) {
+ g_info("%s: replying", __func__);
+ }
+ return 0;
+}
+
+static void rng_set_features(VuDev *dev, uint64_t features)
+{
+ if (verbose && features) {
+ g_autoptr(GString) s = g_string_new("Requested un-handled feature");
+ g_string_append_printf(s, " 0x%" PRIx64 "", features);
+ g_info("%s: %s", __func__, s->str);
+ }
+}
+
+static void vu_rng_handle_requests(VuDev *dev, int qidx)
+{
+ VuRNG *rng = container_of(dev, VuRNG, dev.parent);
+ VuVirtq *vq = vu_get_queue(dev, qidx);
+ VuVirtqElement *elem;
+ size_t to_read;
+ int len, ret;
+
+ printf("vu_rng_handle_requests\n");
+
+ for (;;) {
+ /* Get element in the vhost virtqueue */
+ elem = vu_queue_pop(dev, vq, sizeof(VuVirtqElement));
+ if (!elem) {
+ break;
+ }
+
+ printf("elem->in_sg[0].iov_base: 0x%lx\n", (uint64_t)elem->in_sg[0].iov_base);
+
+ /* Get the amount of entropy to read from the vhost server */
+ to_read = elem->in_sg[0].iov_len;
+
+ pthread_mutex_lock(&rng->rng_mutex);
+
+ /*
+ * We have consumed all entropy available for this time slice.
+ * Wait for the timer (check_rate_limit()) to tell us about the
+ * start of a new time slice.
+ */
+ if (rng->quota_remaining == 0) {
+ pthread_cond_wait(&rng->rng_cond, &rng->rng_mutex);
+ }
+
+ /* Start the timer if the last time slice has expired */
+ if (rng->activate_timer == true) {
+ rng->activate_timer = false;
+ ret = timer_settime(rng->rate_limit_timer, 0, &rng->ts, NULL);
+ if (ret < 0) {
+ fprintf(stderr, "timer_settime() failed\n");
+ }
+ }
+
+ /* Make sure we don't read more than it's available */
+ if (rng->quota_remaining < to_read) {
+ to_read = rng->quota_remaining;
+ }
+
+ len = read(source_fd, elem->in_sg[0].iov_base, to_read);
+
+ /* Simply return 0 if an error occurs */
+ if (len < 0) {
+ len = 0;
+ }
+
+ rng->quota_remaining -= len;
+
+ pthread_mutex_unlock(&rng->rng_mutex);
+
+ vu_queue_push(dev, vq, elem, len);
+ free(elem);
+ }
+
+ vu_queue_notify(dev, vq);
+}
+
+static void
+vu_rng_queue_set_started(VuDev *dev, int qidx, bool started)
+{
+ VuVirtq *vq = vu_get_queue(dev, qidx);
+
+ g_debug("queue started %d:%d\n", qidx, started);
+
+ if (!qidx) {
+ vu_set_queue_handler(dev, vq, started ? vu_rng_handle_requests : NULL);
+ }
+}
+
+/*
+ * Any messages not handled here are processed by the libvhost library
+ * itself.
+ */
+static int rng_process_msg(VuDev *dev, VhostUserMsg *msg, int *do_reply)
+{
+ VuRNG *rng = container_of(dev, VuRNG, dev.parent);
+
+ if (msg->request == VHOST_USER_NONE) {
+ g_main_loop_quit(rng->loop);
+ return 1;
+ }
+
+ return 0;
+}
+
+static const VuDevIface vuiface = {
+ .set_features = rng_set_features,
+ .get_features = rng_get_features,
+ .queue_set_started = vu_rng_queue_set_started,
+ .process_msg = rng_process_msg,
+};
+
+static gboolean hangup(gpointer user_data)
+{
+ GMainLoop *loop = (GMainLoop *) user_data;
+
+ g_printerr("%s: caught hangup/quit signal, quitting", __func__);
+ g_main_loop_quit(loop);
+ return true;
+}
+
+static void panic(VuDev *dev, const char *msg)
+{
+ g_critical("%s\n", msg);
+ exit(EXIT_FAILURE);
+}
+
+/* Print vhost-user.json backend program capabilities */
+static void print_capabilities(void)
+{
+ printf("{\n");
+ printf(" \"type\": \"RNG\"\n");
+ printf(" \"filename\": [ RNG source ]\n");
+ printf("}\n");
+}
+
+static GOptionEntry options[] = {
+ { "socket-path", 's', 0, G_OPTION_ARG_FILENAME, &socket_path,
+ "Location of vhost-user Unix domain socket, incompatible with --fd",
+ "PATH" },
+ { "fd", 'f', 0, G_OPTION_ARG_INT, &socket_fd,
+ "Specify the backend file-descriptor, incompatible with --socket-path",
+ "FD" },
+ { "period", 'p', 0, G_OPTION_ARG_INT, &period_ms,
+ "Time needed (in ms) to transfer a maximum amount of byte", NULL },
+ { "max-bytes", 'm', 0, G_OPTION_ARG_INT64, &max_bytes,
+ "Maximum amount of byte that can be transferred in a period", NULL },
+ { "filename", 'n', 0, G_OPTION_ARG_FILENAME, &source_path,
+ "RNG source, defaults to /dev/urandom", "PATH" },
+ { "print-capabilities", 'c', 0, G_OPTION_ARG_NONE, &print_cap,
+ "Output to stdout the backend capabilities in JSON format and exit",
+ NULL},
+ { "verbose", 'v', 0, G_OPTION_ARG_NONE, &verbose,
+ "Be more verbose in output", NULL},
+ { NULL }
+};
+
+int main(int argc, char *argv[])
+{
+ GError *error = NULL;
+ GOptionContext *context;
+ g_autoptr(GSocket) socket = NULL;
+ char default_source[] = "/dev/urandom";
+ char *source = default_source;
+ VuRNG rng;
+
+ context = g_option_context_new("vhost-user emulation of RNG device");
+ g_option_context_add_main_entries(context, options, "vhost-user-rng");
+ if (!g_option_context_parse(context, &argc, &argv, &error)) {
+ g_printerr("option parsing failed: %s\n", error->message);
+ exit(1);
+ }
+
+ if (print_cap) {
+ print_capabilities();
+ exit(0);
+ }
+
+ if (!socket_path && socket_fd < 0) {
+ g_printerr("Please specify either --fd or --socket-path\n");
+ exit(EXIT_FAILURE);
+ }
+
+ if (socket_path && socket_fd > 0) {
+ g_printerr("Either --fd or --socket-path, not both\n");
+ exit(EXIT_FAILURE);
+ }
+
+ if (max_bytes > INT64_MAX) {
+ g_printerr("'max-bytes' parameter must be non-negative, "
+ "and less than 2^63\n");
+ exit(EXIT_FAILURE);
+ }
+
+ if (period_ms <= 0) {
+ g_printerr("'period' parameter expects a positive integer\n");
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * Now create a vhost-user socket that we will receive messages
+ * on. Once we have our handler set up we can enter the glib main
+ * loop.
+ */
+ if (socket_path) {
+ g_autoptr(GSocketAddress) addr = g_unix_socket_address_new(socket_path);
+ g_autoptr(GSocket) bind_socket = g_socket_new(G_SOCKET_FAMILY_UNIX,
+ G_SOCKET_TYPE_STREAM,
+ G_SOCKET_PROTOCOL_DEFAULT,
+ &error);
+
+ if (!g_socket_bind(bind_socket, addr, false, &error)) {
+ g_printerr("Failed to bind to socket at %s (%s).\n",
+ socket_path, error->message);
+ exit(EXIT_FAILURE);
+ }
+ if (!g_socket_listen(bind_socket, &error)) {
+ g_printerr("Failed to listen on socket %s (%s).\n",
+ socket_path, error->message);
+ }
+ g_message("awaiting connection to %s", socket_path);
+ socket = g_socket_accept(bind_socket, NULL, &error);
+ if (!socket) {
+ g_printerr("Failed to accept on socket %s (%s).\n",
+ socket_path, error->message);
+ }
+ } else {
+ socket = g_socket_new_from_fd(socket_fd, &error);
+ if (!socket) {
+ g_printerr("Failed to connect to FD %d (%s).\n",
+ socket_fd, error->message);
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ /* Overwrite default RNG source with what user provided, if any */
+ if (source_path) {
+ source = source_path;
+ }
+
+ source_fd = open(source, O_RDWR);
+ if (source_fd < 0) {
+ g_printerr("Failed to open RNG source %s\n", source);
+ g_socket_close(socket, &error);
+ unlink(socket_path);
+ exit(EXIT_FAILURE);
+ }
+
+ /* catch exit signals */
+ g_unix_signal_add(SIGHUP, hangup, rng.loop);
+ g_unix_signal_add(SIGINT, hangup, rng.loop);
+
+ /*
+ * Create the main loop first so all the various sources can be
+ * added. As well as catching signals we need to ensure vug_init
+ * can add it's GSource watches.
+ */
+ rng.loop = g_main_loop_new(NULL, FALSE);
+
+ if (!vug_init(&rng.dev, 1, g_socket_get_fd(socket),
+ panic, &vuiface)) {
+ g_printerr("Failed to initialize libvhost-user-glib.\n");
+ exit(EXIT_FAILURE);
+ }
+
+ rng.quota_remaining = max_bytes;
+ rng.activate_timer = true;
+ pthread_mutex_init(&rng.rng_mutex, NULL);
+ pthread_cond_init(&rng.rng_cond, NULL);
+ setup_timer(&rng);
+
+ if (verbose) {
+ g_info("period_ms: %d tv_sec: %ld tv_nsec: %lu\n",
+ period_ms, rng.ts.it_value.tv_sec, rng.ts.it_value.tv_nsec);
+ }
+
+ g_message("entering main loop, awaiting messages");
+ g_main_loop_run(rng.loop);
+ g_message("finished main loop, cleaning up");
+
+ g_main_loop_unref(rng.loop);
+ vug_deinit(&rng.dev);
+ timer_delete(rng.rate_limit_timer);
+ close(source_fd);
+ unlink(socket_path);
+}
diff --git a/tools/vhost-user-rng/meson.build b/tools/vhost-user-rng/meson.build new file mode 100644 index 000000000..4bcc4ad87 --- /dev/null +++ b/tools/vhost-user-rng/meson.build @@ -0,0 +1,10 @@ +executable('vhost-user-rng', files(
+ 'main.c'),
+ dependencies: [qemuutil, glib, gio, rt],
+ install: true,
+ install_dir: get_option('libexecdir'))
+
+configure_file(input: '50-qemu-rng.json.in',
+ output: '50-qemu-rng.json',
+ configuration: config_host,
+ install_dir: qemu_datadir / 'vhost-user')
diff --git a/tools/virtiofsd/50-qemu-virtiofsd.json.in b/tools/virtiofsd/50-qemu-virtiofsd.json.in new file mode 100644 index 000000000..9bcd86f8d --- /dev/null +++ b/tools/virtiofsd/50-qemu-virtiofsd.json.in @@ -0,0 +1,5 @@ +{ + "description": "QEMU virtiofsd vhost-user-fs", + "type": "fs", + "binary": "@libexecdir@/virtiofsd" +} diff --git a/tools/virtiofsd/buffer.c b/tools/virtiofsd/buffer.c new file mode 100644 index 000000000..b5f04be35 --- /dev/null +++ b/tools/virtiofsd/buffer.c @@ -0,0 +1,350 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2010 Miklos Szeredi <miklos@szeredi.hu> + * + * Functions for dealing with `struct fuse_buf` and `struct + * fuse_bufvec`. + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB + */ + +#include "qemu/osdep.h" +#include "fuse_i.h" +#include "fuse_lowlevel.h" + +size_t fuse_buf_size(const struct fuse_bufvec *bufv) +{ + size_t i; + size_t size = 0; + + for (i = 0; i < bufv->count; i++) { + if (bufv->buf[i].size == SIZE_MAX) { + size = SIZE_MAX; + } else { + size += bufv->buf[i].size; + } + } + + return size; +} + +static ssize_t fuse_buf_writev(struct fuse_buf *out_buf, + struct fuse_bufvec *in_buf) +{ + ssize_t res, i, j; + size_t iovcnt = in_buf->count; + struct iovec *iov; + int fd = out_buf->fd; + + iov = g_try_new0(struct iovec, iovcnt); + if (!iov) { + return -ENOMEM; + } + + for (i = 0, j = 0; i < iovcnt; i++) { + /* Skip the buf with 0 size */ + if (in_buf->buf[i].size) { + iov[j].iov_base = in_buf->buf[i].mem; + iov[j].iov_len = in_buf->buf[i].size; + j++; + } + } + + if (out_buf->flags & FUSE_BUF_FD_SEEK) { + res = pwritev(fd, iov, iovcnt, out_buf->pos); + } else { + res = writev(fd, iov, iovcnt); + } + + if (res == -1) { + res = -errno; + } + + g_free(iov); + return res; +} + +static size_t min_size(size_t s1, size_t s2) +{ + return s1 < s2 ? s1 : s2; +} + +static ssize_t fuse_buf_write(const struct fuse_buf *dst, size_t dst_off, + const struct fuse_buf *src, size_t src_off, + size_t len) +{ + ssize_t res = 0; + size_t copied = 0; + + while (len) { + if (dst->flags & FUSE_BUF_FD_SEEK) { + res = pwrite(dst->fd, (char *)src->mem + src_off, len, + dst->pos + dst_off); + } else { + res = write(dst->fd, (char *)src->mem + src_off, len); + } + if (res == -1) { + if (!copied) { + return -errno; + } + break; + } + if (res == 0) { + break; + } + + copied += res; + if (!(dst->flags & FUSE_BUF_FD_RETRY)) { + break; + } + + src_off += res; + dst_off += res; + len -= res; + } + + return copied; +} + +static ssize_t fuse_buf_read(const struct fuse_buf *dst, size_t dst_off, + const struct fuse_buf *src, size_t src_off, + size_t len) +{ + ssize_t res = 0; + size_t copied = 0; + + while (len) { + if (src->flags & FUSE_BUF_FD_SEEK) { + res = pread(src->fd, (char *)dst->mem + dst_off, len, + src->pos + src_off); + } else { + res = read(src->fd, (char *)dst->mem + dst_off, len); + } + if (res == -1) { + if (!copied) { + return -errno; + } + break; + } + if (res == 0) { + break; + } + + copied += res; + if (!(src->flags & FUSE_BUF_FD_RETRY)) { + break; + } + + dst_off += res; + src_off += res; + len -= res; + } + + return copied; +} + +static ssize_t fuse_buf_fd_to_fd(const struct fuse_buf *dst, size_t dst_off, + const struct fuse_buf *src, size_t src_off, + size_t len) +{ + char buf[4096]; + struct fuse_buf tmp = { + .size = sizeof(buf), + .flags = 0, + }; + ssize_t res; + size_t copied = 0; + + tmp.mem = buf; + + while (len) { + size_t this_len = min_size(tmp.size, len); + size_t read_len; + + res = fuse_buf_read(&tmp, 0, src, src_off, this_len); + if (res < 0) { + if (!copied) { + return res; + } + break; + } + if (res == 0) { + break; + } + + read_len = res; + res = fuse_buf_write(dst, dst_off, &tmp, 0, read_len); + if (res < 0) { + if (!copied) { + return res; + } + break; + } + if (res == 0) { + break; + } + + copied += res; + + if (res < this_len) { + break; + } + + dst_off += res; + src_off += res; + len -= res; + } + + return copied; +} + +static ssize_t fuse_buf_copy_one(const struct fuse_buf *dst, size_t dst_off, + const struct fuse_buf *src, size_t src_off, + size_t len) +{ + int src_is_fd = src->flags & FUSE_BUF_IS_FD; + int dst_is_fd = dst->flags & FUSE_BUF_IS_FD; + + if (!src_is_fd && !dst_is_fd) { + char *dstmem = (char *)dst->mem + dst_off; + char *srcmem = (char *)src->mem + src_off; + + if (dstmem != srcmem) { + if (dstmem + len <= srcmem || srcmem + len <= dstmem) { + memcpy(dstmem, srcmem, len); + } else { + memmove(dstmem, srcmem, len); + } + } + + return len; + } else if (!src_is_fd) { + return fuse_buf_write(dst, dst_off, src, src_off, len); + } else if (!dst_is_fd) { + return fuse_buf_read(dst, dst_off, src, src_off, len); + } else { + return fuse_buf_fd_to_fd(dst, dst_off, src, src_off, len); + } +} + +static const struct fuse_buf *fuse_bufvec_current(struct fuse_bufvec *bufv) +{ + if (bufv->idx < bufv->count) { + return &bufv->buf[bufv->idx]; + } else { + return NULL; + } +} + +static int fuse_bufvec_advance(struct fuse_bufvec *bufv, size_t len) +{ + const struct fuse_buf *buf = fuse_bufvec_current(bufv); + + if (!buf) { + return 0; + } + + bufv->off += len; + assert(bufv->off <= buf->size); + if (bufv->off == buf->size) { + assert(bufv->idx < bufv->count); + bufv->idx++; + if (bufv->idx == bufv->count) { + return 0; + } + bufv->off = 0; + } + return 1; +} + +ssize_t fuse_buf_copy(struct fuse_bufvec *dstv, struct fuse_bufvec *srcv) +{ + size_t copied = 0, i; + + if (dstv == srcv) { + return fuse_buf_size(dstv); + } + + /* + * use writev to improve bandwidth when all the + * src buffers already mapped by the daemon + * process + */ + for (i = 0; i < srcv->count; i++) { + if (srcv->buf[i].flags & FUSE_BUF_IS_FD) { + break; + } + } + if ((i == srcv->count) && (dstv->count == 1) && + (dstv->idx == 0) && + (dstv->buf[0].flags & FUSE_BUF_IS_FD)) { + dstv->buf[0].pos += dstv->off; + return fuse_buf_writev(&dstv->buf[0], srcv); + } + + for (;;) { + const struct fuse_buf *src = fuse_bufvec_current(srcv); + const struct fuse_buf *dst = fuse_bufvec_current(dstv); + size_t src_len; + size_t dst_len; + size_t len; + ssize_t res; + + if (src == NULL || dst == NULL) { + break; + } + + src_len = src->size - srcv->off; + dst_len = dst->size - dstv->off; + len = min_size(src_len, dst_len); + + res = fuse_buf_copy_one(dst, dstv->off, src, srcv->off, len); + if (res < 0) { + if (!copied) { + return res; + } + break; + } + copied += res; + + if (!fuse_bufvec_advance(srcv, res) || + !fuse_bufvec_advance(dstv, res)) { + break; + } + + if (res < len) { + break; + } + } + + return copied; +} + +void *fuse_mbuf_iter_advance(struct fuse_mbuf_iter *iter, size_t len) +{ + void *ptr; + + if (len > iter->size - iter->pos) { + return NULL; + } + + ptr = iter->mem + iter->pos; + iter->pos += len; + return ptr; +} + +const char *fuse_mbuf_iter_advance_str(struct fuse_mbuf_iter *iter) +{ + const char *str = iter->mem + iter->pos; + size_t remaining = iter->size - iter->pos; + size_t i; + + for (i = 0; i < remaining; i++) { + if (str[i] == '\0') { + iter->pos += i + 1; + return str; + } + } + return NULL; +} diff --git a/tools/virtiofsd/fuse_common.h b/tools/virtiofsd/fuse_common.h new file mode 100644 index 000000000..0c2665b97 --- /dev/null +++ b/tools/virtiofsd/fuse_common.h @@ -0,0 +1,832 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB. + */ + +/** @file */ + +#if !defined(FUSE_H_) && !defined(FUSE_LOWLEVEL_H_) +#error \ + "Never include <fuse_common.h> directly; use <fuse.h> or <fuse_lowlevel.h> instead." +#endif + +#ifndef FUSE_COMMON_H_ +#define FUSE_COMMON_H_ + +#include "fuse_log.h" +#include "fuse_opt.h" + +/** Major version of FUSE library interface */ +#define FUSE_MAJOR_VERSION 3 + +/** Minor version of FUSE library interface */ +#define FUSE_MINOR_VERSION 2 + +#define FUSE_MAKE_VERSION(maj, min) ((maj) * 10 + (min)) +#define FUSE_VERSION FUSE_MAKE_VERSION(FUSE_MAJOR_VERSION, FUSE_MINOR_VERSION) + +/** + * Information about an open file. + * + * File Handles are created by the open, opendir, and create methods and closed + * by the release and releasedir methods. Multiple file handles may be + * concurrently open for the same file. Generally, a client will create one + * file handle per file descriptor, though in some cases multiple file + * descriptors can share a single file handle. + */ +struct fuse_file_info { + /** Open flags. Available in open() and release() */ + int flags; + + /* + * In case of a write operation indicates if this was caused + * by a delayed write from the page cache. If so, then the + * context's pid, uid, and gid fields will not be valid, and + * the *fh* value may not match the *fh* value that would + * have been sent with the corresponding individual write + * requests if write caching had been disabled. + */ + unsigned int writepage:1; + + /** Can be filled in by open, to use direct I/O on this file. */ + unsigned int direct_io:1; + + /* + * Can be filled in by open. It signals the kernel that any + * currently cached file data (ie., data that the filesystem + * provided the last time the file was open) need not be + * invalidated. Has no effect when set in other contexts (in + * particular it does nothing when set by opendir()). + */ + unsigned int keep_cache:1; + + /* + * Indicates a flush operation. Set in flush operation, also + * maybe set in highlevel lock operation and lowlevel release + * operation. + */ + unsigned int flush:1; + + /* + * Can be filled in by open, to indicate that the file is not + * seekable. + */ + unsigned int nonseekable:1; + + /* + * Indicates that flock locks for this file should be + * released. If set, lock_owner shall contain a valid value. + * May only be set in ->release(). + */ + unsigned int flock_release:1; + + /* + * Can be filled in by opendir. It signals the kernel to + * enable caching of entries returned by readdir(). Has no + * effect when set in other contexts (in particular it does + * nothing when set by open()). + */ + unsigned int cache_readdir:1; + + /* Indicates that suid/sgid bits should be removed upon write */ + unsigned int kill_priv:1; + + + /** Padding. Reserved for future use*/ + unsigned int padding:24; + unsigned int padding2:32; + + /* + * File handle id. May be filled in by filesystem in create, + * open, and opendir(). Available in most other file operations on the + * same file handle. + */ + uint64_t fh; + + /** Lock owner id. Available in locking operations and flush */ + uint64_t lock_owner; + + /* + * Requested poll events. Available in ->poll. Only set on kernels + * which support it. If unsupported, this field is set to zero. + */ + uint32_t poll_events; +}; + +/* + * Capability bits for 'fuse_conn_info.capable' and 'fuse_conn_info.want' + */ + +/** + * Indicates that the filesystem supports asynchronous read requests. + * + * If this capability is not requested/available, the kernel will + * ensure that there is at most one pending read request per + * file-handle at any time, and will attempt to order read requests by + * increasing offset. + * + * This feature is enabled by default when supported by the kernel. + */ +#define FUSE_CAP_ASYNC_READ (1 << 0) + +/** + * Indicates that the filesystem supports "remote" locking. + * + * This feature is enabled by default when supported by the kernel, + * and if getlk() and setlk() handlers are implemented. + */ +#define FUSE_CAP_POSIX_LOCKS (1 << 1) + +/** + * Indicates that the filesystem supports the O_TRUNC open flag. If + * disabled, and an application specifies O_TRUNC, fuse first calls + * truncate() and then open() with O_TRUNC filtered out. + * + * This feature is enabled by default when supported by the kernel. + */ +#define FUSE_CAP_ATOMIC_O_TRUNC (1 << 3) + +/** + * Indicates that the filesystem supports lookups of "." and "..". + * + * This feature is disabled by default. + */ +#define FUSE_CAP_EXPORT_SUPPORT (1 << 4) + +/** + * Indicates that the kernel should not apply the umask to the + * file mode on create operations. + * + * This feature is disabled by default. + */ +#define FUSE_CAP_DONT_MASK (1 << 6) + +/** + * Indicates that libfuse should try to use splice() when writing to + * the fuse device. This may improve performance. + * + * This feature is disabled by default. + */ +#define FUSE_CAP_SPLICE_WRITE (1 << 7) + +/** + * Indicates that libfuse should try to move pages instead of copying when + * writing to / reading from the fuse device. This may improve performance. + * + * This feature is disabled by default. + */ +#define FUSE_CAP_SPLICE_MOVE (1 << 8) + +/** + * Indicates that libfuse should try to use splice() when reading from + * the fuse device. This may improve performance. + * + * This feature is enabled by default when supported by the kernel and + * if the filesystem implements a write_buf() handler. + */ +#define FUSE_CAP_SPLICE_READ (1 << 9) + +/** + * If set, the calls to flock(2) will be emulated using POSIX locks and must + * then be handled by the filesystem's setlock() handler. + * + * If not set, flock(2) calls will be handled by the FUSE kernel module + * internally (so any access that does not go through the kernel cannot be taken + * into account). + * + * This feature is enabled by default when supported by the kernel and + * if the filesystem implements a flock() handler. + */ +#define FUSE_CAP_FLOCK_LOCKS (1 << 10) + +/** + * Indicates that the filesystem supports ioctl's on directories. + * + * This feature is enabled by default when supported by the kernel. + */ +#define FUSE_CAP_IOCTL_DIR (1 << 11) + +/** + * Traditionally, while a file is open the FUSE kernel module only + * asks the filesystem for an update of the file's attributes when a + * client attempts to read beyond EOF. This is unsuitable for + * e.g. network filesystems, where the file contents may change + * without the kernel knowing about it. + * + * If this flag is set, FUSE will check the validity of the attributes + * on every read. If the attributes are no longer valid (i.e., if the + * *attr_timeout* passed to fuse_reply_attr() or set in `struct + * fuse_entry_param` has passed), it will first issue a `getattr` + * request. If the new mtime differs from the previous value, any + * cached file *contents* will be invalidated as well. + * + * This flag should always be set when available. If all file changes + * go through the kernel, *attr_timeout* should be set to a very large + * number to avoid unnecessary getattr() calls. + * + * This feature is enabled by default when supported by the kernel. + */ +#define FUSE_CAP_AUTO_INVAL_DATA (1 << 12) + +/** + * Indicates that the filesystem supports readdirplus. + * + * This feature is enabled by default when supported by the kernel and if the + * filesystem implements a readdirplus() handler. + */ +#define FUSE_CAP_READDIRPLUS (1 << 13) + +/** + * Indicates that the filesystem supports adaptive readdirplus. + * + * If FUSE_CAP_READDIRPLUS is not set, this flag has no effect. + * + * If FUSE_CAP_READDIRPLUS is set and this flag is not set, the kernel + * will always issue readdirplus() requests to retrieve directory + * contents. + * + * If FUSE_CAP_READDIRPLUS is set and this flag is set, the kernel + * will issue both readdir() and readdirplus() requests, depending on + * how much information is expected to be required. + * + * As of Linux 4.20, the algorithm is as follows: when userspace + * starts to read directory entries, issue a READDIRPLUS request to + * the filesystem. If any entry attributes have been looked up by the + * time userspace requests the next batch of entries continue with + * READDIRPLUS, otherwise switch to plain READDIR. This will reasult + * in eg plain "ls" triggering READDIRPLUS first then READDIR after + * that because it doesn't do lookups. "ls -l" should result in all + * READDIRPLUS, except if dentries are already cached. + * + * This feature is enabled by default when supported by the kernel and + * if the filesystem implements both a readdirplus() and a readdir() + * handler. + */ +#define FUSE_CAP_READDIRPLUS_AUTO (1 << 14) + +/** + * Indicates that the filesystem supports asynchronous direct I/O submission. + * + * If this capability is not requested/available, the kernel will ensure that + * there is at most one pending read and one pending write request per direct + * I/O file-handle at any time. + * + * This feature is enabled by default when supported by the kernel. + */ +#define FUSE_CAP_ASYNC_DIO (1 << 15) + +/** + * Indicates that writeback caching should be enabled. This means that + * individual write request may be buffered and merged in the kernel + * before they are send to the filesystem. + * + * This feature is disabled by default. + */ +#define FUSE_CAP_WRITEBACK_CACHE (1 << 16) + +/** + * Indicates support for zero-message opens. If this flag is set in + * the `capable` field of the `fuse_conn_info` structure, then the + * filesystem may return `ENOSYS` from the open() handler to indicate + * success. Further attempts to open files will be handled in the + * kernel. (If this flag is not set, returning ENOSYS will be treated + * as an error and signaled to the caller). + * + * Setting (or unsetting) this flag in the `want` field has *no + * effect*. + */ +#define FUSE_CAP_NO_OPEN_SUPPORT (1 << 17) + +/** + * Indicates support for parallel directory operations. If this flag + * is unset, the FUSE kernel module will ensure that lookup() and + * readdir() requests are never issued concurrently for the same + * directory. + * + * This feature is enabled by default when supported by the kernel. + */ +#define FUSE_CAP_PARALLEL_DIROPS (1 << 18) + +/** + * Indicates support for POSIX ACLs. + * + * If this feature is enabled, the kernel will cache and have + * responsibility for enforcing ACLs. ACL will be stored as xattrs and + * passed to userspace, which is responsible for updating the ACLs in + * the filesystem, keeping the file mode in sync with the ACL, and + * ensuring inheritance of default ACLs when new filesystem nodes are + * created. Note that this requires that the file system is able to + * parse and interpret the xattr representation of ACLs. + * + * Enabling this feature implicitly turns on the + * ``default_permissions`` mount option (even if it was not passed to + * mount(2)). + * + * This feature is disabled by default. + */ +#define FUSE_CAP_POSIX_ACL (1 << 19) + +/** + * Indicates that the filesystem is responsible for unsetting + * setuid and setgid bits when a file is written, truncated, or + * its owner is changed. + * + * This feature is enabled by default when supported by the kernel. + */ +#define FUSE_CAP_HANDLE_KILLPRIV (1 << 20) + +/** + * Indicates support for zero-message opendirs. If this flag is set in + * the `capable` field of the `fuse_conn_info` structure, then the filesystem + * may return `ENOSYS` from the opendir() handler to indicate success. Further + * opendir and releasedir messages will be handled in the kernel. (If this + * flag is not set, returning ENOSYS will be treated as an error and signalled + * to the caller.) + * + * Setting (or unsetting) this flag in the `want` field has *no effect*. + */ +#define FUSE_CAP_NO_OPENDIR_SUPPORT (1 << 24) + +/** + * Indicates that the kernel supports the FUSE_ATTR_SUBMOUNT flag. + * + * Setting (or unsetting) this flag in the `want` field has *no effect*. + */ +#define FUSE_CAP_SUBMOUNTS (1 << 27) + +/** + * Indicates that the filesystem is responsible for clearing + * security.capability xattr and clearing setuid and setgid bits. Following + * are the rules. + * - clear "security.capability" on write, truncate and chown unconditionally + * - clear suid/sgid if following is true. Note, sgid is cleared only if + * group executable bit is set. + * o setattr has FATTR_SIZE and FATTR_KILL_SUIDGID set. + * o setattr has FATTR_UID or FATTR_GID + * o open has O_TRUNC and FUSE_OPEN_KILL_SUIDGID + * o create has O_TRUNC and FUSE_OPEN_KILL_SUIDGID flag set. + * o write has FUSE_WRITE_KILL_SUIDGID + */ +#define FUSE_CAP_HANDLE_KILLPRIV_V2 (1 << 28) + +/** + * Indicates that file server supports extended struct fuse_setxattr_in + */ +#define FUSE_CAP_SETXATTR_EXT (1 << 29) + +/** + * Ioctl flags + * + * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine + * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed + * FUSE_IOCTL_RETRY: retry with new iovecs + * FUSE_IOCTL_DIR: is a directory + * + * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs + */ +#define FUSE_IOCTL_COMPAT (1 << 0) +#define FUSE_IOCTL_UNRESTRICTED (1 << 1) +#define FUSE_IOCTL_RETRY (1 << 2) +#define FUSE_IOCTL_DIR (1 << 4) + +#define FUSE_IOCTL_MAX_IOV 256 + +/** + * Connection information, passed to the ->init() method + * + * Some of the elements are read-write, these can be changed to + * indicate the value requested by the filesystem. The requested + * value must usually be smaller than the indicated value. + */ +struct fuse_conn_info { + /** + * Major version of the protocol (read-only) + */ + unsigned proto_major; + + /** + * Minor version of the protocol (read-only) + */ + unsigned proto_minor; + + /** + * Maximum size of the write buffer + */ + unsigned max_write; + + /** + * Maximum size of read requests. A value of zero indicates no + * limit. However, even if the filesystem does not specify a + * limit, the maximum size of read requests will still be + * limited by the kernel. + * + * NOTE: For the time being, the maximum size of read requests + * must be set both here *and* passed to fuse_session_new() + * using the ``-o max_read=<n>`` mount option. At some point + * in the future, specifying the mount option will no longer + * be necessary. + */ + unsigned max_read; + + /** + * Maximum readahead + */ + unsigned max_readahead; + + /** + * Capability flags that the kernel supports (read-only) + */ + unsigned capable; + + /** + * Capability flags that the filesystem wants to enable. + * + * libfuse attempts to initialize this field with + * reasonable default values before calling the init() handler. + */ + unsigned want; + + /** + * Maximum number of pending "background" requests. A + * background request is any type of request for which the + * total number is not limited by other means. As of kernel + * 4.8, only two types of requests fall into this category: + * + * 1. Read-ahead requests + * 2. Asynchronous direct I/O requests + * + * Read-ahead requests are generated (if max_readahead is + * non-zero) by the kernel to preemptively fill its caches + * when it anticipates that userspace will soon read more + * data. + * + * Asynchronous direct I/O requests are generated if + * FUSE_CAP_ASYNC_DIO is enabled and userspace submits a large + * direct I/O request. In this case the kernel will internally + * split it up into multiple smaller requests and submit them + * to the filesystem concurrently. + * + * Note that the following requests are *not* background + * requests: writeback requests (limited by the kernel's + * flusher algorithm), regular (i.e., synchronous and + * buffered) userspace read/write requests (limited to one per + * thread), asynchronous read requests (Linux's io_submit(2) + * call actually blocks, so these are also limited to one per + * thread). + */ + unsigned max_background; + + /** + * Kernel congestion threshold parameter. If the number of pending + * background requests exceeds this number, the FUSE kernel module will + * mark the filesystem as "congested". This instructs the kernel to + * expect that queued requests will take some time to complete, and to + * adjust its algorithms accordingly (e.g. by putting a waiting thread + * to sleep instead of using a busy-loop). + */ + unsigned congestion_threshold; + + /** + * When FUSE_CAP_WRITEBACK_CACHE is enabled, the kernel is responsible + * for updating mtime and ctime when write requests are received. The + * updated values are passed to the filesystem with setattr() requests. + * However, if the filesystem does not support the full resolution of + * the kernel timestamps (nanoseconds), the mtime and ctime values used + * by kernel and filesystem will differ (and result in an apparent + * change of times after a cache flush). + * + * To prevent this problem, this variable can be used to inform the + * kernel about the timestamp granularity supported by the file-system. + * The value should be power of 10. The default is 1, i.e. full + * nano-second resolution. Filesystems supporting only second resolution + * should set this to 1000000000. + */ + unsigned time_gran; + + /** + * For future use. + */ + unsigned reserved[22]; +}; + +struct fuse_session; +struct fuse_pollhandle; +struct fuse_conn_info_opts; + +/** + * This function parses several command-line options that can be used + * to override elements of struct fuse_conn_info. The pointer returned + * by this function should be passed to the + * fuse_apply_conn_info_opts() method by the file system's init() + * handler. + * + * Before using this function, think twice if you really want these + * parameters to be adjustable from the command line. In most cases, + * they should be determined by the file system internally. + * + * The following options are recognized: + * + * -o max_write=N sets conn->max_write + * -o max_readahead=N sets conn->max_readahead + * -o max_background=N sets conn->max_background + * -o congestion_threshold=N sets conn->congestion_threshold + * -o async_read sets FUSE_CAP_ASYNC_READ in conn->want + * -o sync_read unsets FUSE_CAP_ASYNC_READ in conn->want + * -o atomic_o_trunc sets FUSE_CAP_ATOMIC_O_TRUNC in conn->want + * -o no_remote_lock Equivalent to -o + *no_remote_flock,no_remote_posix_lock -o no_remote_flock Unsets + *FUSE_CAP_FLOCK_LOCKS in conn->want -o no_remote_posix_lock Unsets + *FUSE_CAP_POSIX_LOCKS in conn->want -o [no_]splice_write (un-)sets + *FUSE_CAP_SPLICE_WRITE in conn->want -o [no_]splice_move (un-)sets + *FUSE_CAP_SPLICE_MOVE in conn->want -o [no_]splice_read (un-)sets + *FUSE_CAP_SPLICE_READ in conn->want -o [no_]auto_inval_data (un-)sets + *FUSE_CAP_AUTO_INVAL_DATA in conn->want -o readdirplus=no unsets + *FUSE_CAP_READDIRPLUS in conn->want -o readdirplus=yes sets + *FUSE_CAP_READDIRPLUS and unsets FUSE_CAP_READDIRPLUS_AUTO in conn->want -o + *readdirplus=auto sets FUSE_CAP_READDIRPLUS and FUSE_CAP_READDIRPLUS_AUTO + *in conn->want -o [no_]async_dio (un-)sets FUSE_CAP_ASYNC_DIO in + *conn->want -o [no_]writeback_cache (un-)sets FUSE_CAP_WRITEBACK_CACHE in + *conn->want -o time_gran=N sets conn->time_gran + * + * Known options will be removed from *args*, unknown options will be + * passed through unchanged. + * + * @param args argument vector (input+output) + * @return parsed options + **/ +struct fuse_conn_info_opts *fuse_parse_conn_info_opts(struct fuse_args *args); + +/** + * This function applies the (parsed) parameters in *opts* to the + * *conn* pointer. It may modify the following fields: wants, + * max_write, max_readahead, congestion_threshold, max_background, + * time_gran. A field is only set (or unset) if the corresponding + * option has been explicitly set. + */ +void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, + struct fuse_conn_info *conn); + +/** + * Go into the background + * + * @param foreground if true, stay in the foreground + * @return 0 on success, -1 on failure + */ +int fuse_daemonize(int foreground); + +/** + * Get the version of the library + * + * @return the version + */ +int fuse_version(void); + +/** + * Get the full package version string of the library + * + * @return the package version + */ +const char *fuse_pkgversion(void); + +/** + * Destroy poll handle + * + * @param ph the poll handle + */ +void fuse_pollhandle_destroy(struct fuse_pollhandle *ph); + +/* + * Data buffer + */ + +/** + * Buffer flags + */ +enum fuse_buf_flags { + /** + * Buffer contains a file descriptor + * + * If this flag is set, the .fd field is valid, otherwise the + * .mem fields is valid. + */ + FUSE_BUF_IS_FD = (1 << 1), + + /** + * Seek on the file descriptor + * + * If this flag is set then the .pos field is valid and is + * used to seek to the given offset before performing + * operation on file descriptor. + */ + FUSE_BUF_FD_SEEK = (1 << 2), + + /** + * Retry operation on file descriptor + * + * If this flag is set then retry operation on file descriptor + * until .size bytes have been copied or an error or EOF is + * detected. + */ + FUSE_BUF_FD_RETRY = (1 << 3), +}; + +/** + * Single data buffer + * + * Generic data buffer for I/O, extended attributes, etc... Data may + * be supplied as a memory pointer or as a file descriptor + */ +struct fuse_buf { + /** + * Size of data in bytes + */ + size_t size; + + /** + * Buffer flags + */ + enum fuse_buf_flags flags; + + /** + * Memory pointer + * + * Used unless FUSE_BUF_IS_FD flag is set. + */ + void *mem; + + /** + * File descriptor + * + * Used if FUSE_BUF_IS_FD flag is set. + */ + int fd; + + /** + * File position + * + * Used if FUSE_BUF_FD_SEEK flag is set. + */ + off_t pos; +}; + +/** + * Data buffer vector + * + * An array of data buffers, each containing a memory pointer or a + * file descriptor. + * + * Allocate dynamically to add more than one buffer. + */ +struct fuse_bufvec { + /** + * Number of buffers in the array + */ + size_t count; + + /** + * Index of current buffer within the array + */ + size_t idx; + + /** + * Current offset within the current buffer + */ + size_t off; + + /** + * Array of buffers + */ + struct fuse_buf buf[1]; +}; + +/* Initialize bufvec with a single buffer of given size */ +#define FUSE_BUFVEC_INIT(size__) \ + ((struct fuse_bufvec){ /* .count= */ 1, \ + /* .idx = */ 0, \ + /* .off = */ 0, /* .buf = */ \ + { /* [0] = */ { \ + /* .size = */ (size__), \ + /* .flags = */ (enum fuse_buf_flags)0, \ + /* .mem = */ NULL, \ + /* .fd = */ -1, \ + /* .pos = */ 0, \ + } } }) + +/** + * Get total size of data in a fuse buffer vector + * + * @param bufv buffer vector + * @return size of data + */ +size_t fuse_buf_size(const struct fuse_bufvec *bufv); + +/** + * Copy data from one buffer vector to another + * + * @param dst destination buffer vector + * @param src source buffer vector + * @return actual number of bytes copied or -errno on error + */ +ssize_t fuse_buf_copy(struct fuse_bufvec *dst, struct fuse_bufvec *src); + +/** + * Memory buffer iterator + * + */ +struct fuse_mbuf_iter { + /** + * Data pointer + */ + void *mem; + + /** + * Total length, in bytes + */ + size_t size; + + /** + * Offset from start of buffer + */ + size_t pos; +}; + +/* Initialize memory buffer iterator from a fuse_buf */ +#define FUSE_MBUF_ITER_INIT(fbuf) \ + ((struct fuse_mbuf_iter){ \ + .mem = fbuf->mem, \ + .size = fbuf->size, \ + .pos = 0, \ + }) + +/** + * Consume bytes from a memory buffer iterator + * + * @param iter memory buffer iterator + * @param len number of bytes to consume + * @return pointer to start of consumed bytes or + * NULL if advancing beyond end of buffer + */ +void *fuse_mbuf_iter_advance(struct fuse_mbuf_iter *iter, size_t len); + +/** + * Consume a NUL-terminated string from a memory buffer iterator + * + * @param iter memory buffer iterator + * @return pointer to the string or + * NULL if advancing beyond end of buffer or there is no NUL-terminator + */ +const char *fuse_mbuf_iter_advance_str(struct fuse_mbuf_iter *iter); + +/* + * Signal handling + */ +/** + * Exit session on HUP, TERM and INT signals and ignore PIPE signal + * + * Stores session in a global variable. May only be called once per + * process until fuse_remove_signal_handlers() is called. + * + * Once either of the POSIX signals arrives, the signal handler calls + * fuse_session_exit(). + * + * @param se the session to exit + * @return 0 on success, -1 on failure + * + * See also: + * fuse_remove_signal_handlers() + */ +int fuse_set_signal_handlers(struct fuse_session *se); + +/** + * Restore default signal handlers + * + * Resets global session. After this fuse_set_signal_handlers() may + * be called again. + * + * @param se the same session as given in fuse_set_signal_handlers() + * + * See also: + * fuse_set_signal_handlers() + */ +void fuse_remove_signal_handlers(struct fuse_session *se); + +/* + * Compatibility stuff + */ + +#if !defined(FUSE_USE_VERSION) || FUSE_USE_VERSION < 30 +#error only API version 30 or greater is supported +#endif + + +/* + * This interface uses 64 bit off_t. + * + * On 32bit systems please add -D_FILE_OFFSET_BITS=64 to your compile flags! + */ +QEMU_BUILD_BUG_ON(sizeof(off_t) != 8); + +#endif /* FUSE_COMMON_H_ */ diff --git a/tools/virtiofsd/fuse_i.h b/tools/virtiofsd/fuse_i.h new file mode 100644 index 000000000..492e00218 --- /dev/null +++ b/tools/virtiofsd/fuse_i.h @@ -0,0 +1,100 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB + */ + +#ifndef FUSE_I_H +#define FUSE_I_H + +#define FUSE_USE_VERSION 31 +#include "fuse_lowlevel.h" + +struct fv_VuDev; +struct fv_QueueInfo; + +struct fuse_req { + struct fuse_session *se; + uint64_t unique; + int ctr; + pthread_mutex_t lock; + struct fuse_ctx ctx; + struct fuse_chan *ch; + int interrupted; + unsigned int ioctl_64bit:1; + union { + struct { + uint64_t unique; + } i; + struct { + fuse_interrupt_func_t func; + void *data; + } ni; + } u; + struct fuse_req *next; + struct fuse_req *prev; +}; + +struct fuse_notify_req { + uint64_t unique; + void (*reply)(struct fuse_notify_req *, fuse_req_t, fuse_ino_t, + const void *, const struct fuse_buf *); + struct fuse_notify_req *next; + struct fuse_notify_req *prev; +}; + +struct fuse_session { + char *mountpoint; + volatile int exited; + int fd; + int debug; + int deny_others; + struct fuse_lowlevel_ops op; + int got_init; + struct cuse_data *cuse_data; + void *userdata; + uid_t owner; + struct fuse_conn_info conn; + struct fuse_req list; + struct fuse_req interrupts; + pthread_mutex_t lock; + pthread_rwlock_t init_rwlock; + int got_destroy; + int broken_splice_nonblock; + uint64_t notify_ctr; + struct fuse_notify_req notify_list; + size_t bufsize; + int error; + char *vu_socket_path; + char *vu_socket_group; + int vu_listen_fd; + int vu_socketfd; + struct fv_VuDev *virtio_dev; + int thread_pool_size; +}; + +struct fuse_chan { + pthread_mutex_t lock; + int ctr; + int fd; + struct fv_QueueInfo *qi; +}; + +int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, + int count); +void fuse_free_req(fuse_req_t req); + +void fuse_session_process_buf_int(struct fuse_session *se, + struct fuse_bufvec *bufv, + struct fuse_chan *ch); + + +#define FUSE_MAX_MAX_PAGES 256 +#define FUSE_DEFAULT_MAX_PAGES_PER_REQ 32 + +/* room needed in buffer to accommodate header */ +#define FUSE_BUFFER_HEADER_SIZE 0x1000 + +#endif diff --git a/tools/virtiofsd/fuse_log.c b/tools/virtiofsd/fuse_log.c new file mode 100644 index 000000000..745d88cd2 --- /dev/null +++ b/tools/virtiofsd/fuse_log.c @@ -0,0 +1,39 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2019 Red Hat, Inc. + * + * Logging API. + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB + */ + +#include "qemu/osdep.h" +#include "fuse_log.h" + + +static void default_log_func(__attribute__((unused)) enum fuse_log_level level, + const char *fmt, va_list ap) +{ + vfprintf(stderr, fmt, ap); +} + +static fuse_log_func_t log_func = default_log_func; + +void fuse_set_log_func(fuse_log_func_t func) +{ + if (!func) { + func = default_log_func; + } + + log_func = func; +} + +void fuse_log(enum fuse_log_level level, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + log_func(level, fmt, ap); + va_end(ap); +} diff --git a/tools/virtiofsd/fuse_log.h b/tools/virtiofsd/fuse_log.h new file mode 100644 index 000000000..8d7091bd4 --- /dev/null +++ b/tools/virtiofsd/fuse_log.h @@ -0,0 +1,73 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2019 Red Hat, Inc. + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB. + */ + +#ifndef FUSE_LOG_H_ +#define FUSE_LOG_H_ + +/** @file + * + * This file defines the logging interface of FUSE + */ + + +/** + * Log severity level + * + * These levels correspond to syslog(2) log levels since they are widely used. + */ +enum fuse_log_level { + FUSE_LOG_EMERG, + FUSE_LOG_ALERT, + FUSE_LOG_CRIT, + FUSE_LOG_ERR, + FUSE_LOG_WARNING, + FUSE_LOG_NOTICE, + FUSE_LOG_INFO, + FUSE_LOG_DEBUG +}; + +/** + * Log message handler function. + * + * This function must be thread-safe. It may be called from any libfuse + * function, including fuse_parse_cmdline() and other functions invoked before + * a FUSE filesystem is created. + * + * Install a custom log message handler function using fuse_set_log_func(). + * + * @param level log severity level + * @param fmt sprintf-style format string including newline + * @param ap format string arguments + */ +typedef void (*fuse_log_func_t)(enum fuse_log_level level, const char *fmt, + va_list ap); + +/** + * Install a custom log handler function. + * + * Log messages are emitted by libfuse functions to report errors and debug + * information. Messages are printed to stderr by default but this can be + * overridden by installing a custom log message handler function. + * + * The log message handler function is global and affects all FUSE filesystems + * created within this process. + * + * @param func a custom log message handler function or NULL to revert to + * the default + */ +void fuse_set_log_func(fuse_log_func_t func); + +/** + * Emit a log message + * + * @param level severity level (FUSE_LOG_ERR, FUSE_LOG_DEBUG, etc) + * @param fmt sprintf-style format string including newline + */ +void fuse_log(enum fuse_log_level level, const char *fmt, ...); + +#endif /* FUSE_LOG_H_ */ diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c new file mode 100644 index 000000000..e4679c73a --- /dev/null +++ b/tools/virtiofsd/fuse_lowlevel.c @@ -0,0 +1,2614 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * Implementation of (most of) the low-level FUSE API. The session loop + * functions are implemented in separate files. + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB + */ + +#include "qemu/osdep.h" +#include "fuse_i.h" +#include "standard-headers/linux/fuse.h" +#include "fuse_misc.h" +#include "fuse_opt.h" +#include "fuse_virtio.h" + +#include <sys/file.h> + +#define THREAD_POOL_SIZE 0 + +#define OFFSET_MAX 0x7fffffffffffffffLL + +struct fuse_pollhandle { + uint64_t kh; + struct fuse_session *se; +}; + +static size_t pagesize; + +static __attribute__((constructor)) void fuse_ll_init_pagesize(void) +{ + pagesize = getpagesize(); +} + +static void convert_stat(const struct stat *stbuf, struct fuse_attr *attr) +{ + *attr = (struct fuse_attr){ + .ino = stbuf->st_ino, + .mode = stbuf->st_mode, + .nlink = stbuf->st_nlink, + .uid = stbuf->st_uid, + .gid = stbuf->st_gid, + .rdev = stbuf->st_rdev, + .size = stbuf->st_size, + .blksize = stbuf->st_blksize, + .blocks = stbuf->st_blocks, + .atime = stbuf->st_atime, + .mtime = stbuf->st_mtime, + .ctime = stbuf->st_ctime, + .atimensec = ST_ATIM_NSEC(stbuf), + .mtimensec = ST_MTIM_NSEC(stbuf), + .ctimensec = ST_CTIM_NSEC(stbuf), + }; +} + +static void convert_attr(const struct fuse_setattr_in *attr, struct stat *stbuf) +{ + stbuf->st_mode = attr->mode; + stbuf->st_uid = attr->uid; + stbuf->st_gid = attr->gid; + stbuf->st_size = attr->size; + stbuf->st_atime = attr->atime; + stbuf->st_mtime = attr->mtime; + stbuf->st_ctime = attr->ctime; + ST_ATIM_NSEC_SET(stbuf, attr->atimensec); + ST_MTIM_NSEC_SET(stbuf, attr->mtimensec); + ST_CTIM_NSEC_SET(stbuf, attr->ctimensec); +} + +static size_t iov_length(const struct iovec *iov, size_t count) +{ + size_t seg; + size_t ret = 0; + + for (seg = 0; seg < count; seg++) { + ret += iov[seg].iov_len; + } + return ret; +} + +static void list_init_req(struct fuse_req *req) +{ + req->next = req; + req->prev = req; +} + +static void list_del_req(struct fuse_req *req) +{ + struct fuse_req *prev = req->prev; + struct fuse_req *next = req->next; + prev->next = next; + next->prev = prev; +} + +static void list_add_req(struct fuse_req *req, struct fuse_req *next) +{ + struct fuse_req *prev = next->prev; + req->next = next; + req->prev = prev; + prev->next = req; + next->prev = req; +} + +static void destroy_req(fuse_req_t req) +{ + pthread_mutex_destroy(&req->lock); + g_free(req); +} + +void fuse_free_req(fuse_req_t req) +{ + int ctr; + struct fuse_session *se = req->se; + + pthread_mutex_lock(&se->lock); + req->u.ni.func = NULL; + req->u.ni.data = NULL; + list_del_req(req); + ctr = --req->ctr; + req->ch = NULL; + pthread_mutex_unlock(&se->lock); + if (!ctr) { + destroy_req(req); + } +} + +static struct fuse_req *fuse_ll_alloc_req(struct fuse_session *se) +{ + struct fuse_req *req; + + req = g_try_new0(struct fuse_req, 1); + if (req == NULL) { + fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate request\n"); + } else { + req->se = se; + req->ctr = 1; + list_init_req(req); + fuse_mutex_init(&req->lock); + } + + return req; +} + +/* Send data. If *ch* is NULL, send via session master fd */ +static int fuse_send_msg(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count) +{ + struct fuse_out_header *out = iov[0].iov_base; + + out->len = iov_length(iov, count); + if (out->unique == 0) { + fuse_log(FUSE_LOG_DEBUG, "NOTIFY: code=%d length=%u\n", out->error, + out->len); + } else if (out->error) { + fuse_log(FUSE_LOG_DEBUG, + " unique: %llu, error: %i (%s), outsize: %i\n", + (unsigned long long)out->unique, out->error, + strerror(-out->error), out->len); + } else { + fuse_log(FUSE_LOG_DEBUG, " unique: %llu, success, outsize: %i\n", + (unsigned long long)out->unique, out->len); + } + + if (fuse_lowlevel_is_virtio(se)) { + return virtio_send_msg(se, ch, iov, count); + } + + abort(); /* virtio should have taken it before here */ + return 0; +} + + +int fuse_send_reply_iov_nofree(fuse_req_t req, int error, struct iovec *iov, + int count) +{ + struct fuse_out_header out = { + .unique = req->unique, + .error = error, + }; + + if (error <= -1000 || error > 0) { + fuse_log(FUSE_LOG_ERR, "fuse: bad error value: %i\n", error); + out.error = -ERANGE; + } + + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(struct fuse_out_header); + + return fuse_send_msg(req->se, req->ch, iov, count); +} + +static int send_reply_iov(fuse_req_t req, int error, struct iovec *iov, + int count) +{ + int res; + + res = fuse_send_reply_iov_nofree(req, error, iov, count); + fuse_free_req(req); + return res; +} + +static int send_reply(fuse_req_t req, int error, const void *arg, + size_t argsize) +{ + struct iovec iov[2]; + int count = 1; + if (argsize) { + iov[1].iov_base = (void *)arg; + iov[1].iov_len = argsize; + count++; + } + return send_reply_iov(req, error, iov, count); +} + +int fuse_reply_iov(fuse_req_t req, const struct iovec *iov, int count) +{ + int res; + g_autofree struct iovec *padded_iov = NULL; + + padded_iov = g_try_new(struct iovec, count + 1); + if (padded_iov == NULL) { + return fuse_reply_err(req, ENOMEM); + } + + memcpy(padded_iov + 1, iov, count * sizeof(struct iovec)); + count++; + + res = send_reply_iov(req, 0, padded_iov, count); + + return res; +} + + +/* + * 'buf` is allowed to be empty so that the proper size may be + * allocated by the caller + */ +size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, + const char *name, const struct stat *stbuf, off_t off) +{ + (void)req; + size_t namelen; + size_t entlen; + size_t entlen_padded; + struct fuse_dirent *dirent; + + namelen = strlen(name); + entlen = FUSE_NAME_OFFSET + namelen; + entlen_padded = FUSE_DIRENT_ALIGN(entlen); + + if ((buf == NULL) || (entlen_padded > bufsize)) { + return entlen_padded; + } + + dirent = (struct fuse_dirent *)buf; + dirent->ino = stbuf->st_ino; + dirent->off = off; + dirent->namelen = namelen; + dirent->type = (stbuf->st_mode & S_IFMT) >> 12; + memcpy(dirent->name, name, namelen); + memset(dirent->name + namelen, 0, entlen_padded - entlen); + + return entlen_padded; +} + +static void convert_statfs(const struct statvfs *stbuf, + struct fuse_kstatfs *kstatfs) +{ + *kstatfs = (struct fuse_kstatfs){ + .bsize = stbuf->f_bsize, + .frsize = stbuf->f_frsize, + .blocks = stbuf->f_blocks, + .bfree = stbuf->f_bfree, + .bavail = stbuf->f_bavail, + .files = stbuf->f_files, + .ffree = stbuf->f_ffree, + .namelen = stbuf->f_namemax, + }; +} + +static int send_reply_ok(fuse_req_t req, const void *arg, size_t argsize) +{ + return send_reply(req, 0, arg, argsize); +} + +int fuse_reply_err(fuse_req_t req, int err) +{ + return send_reply(req, -err, NULL, 0); +} + +void fuse_reply_none(fuse_req_t req) +{ + fuse_free_req(req); +} + +static unsigned long calc_timeout_sec(double t) +{ + if (t > (double)ULONG_MAX) { + return ULONG_MAX; + } else if (t < 0.0) { + return 0; + } else { + return (unsigned long)t; + } +} + +static unsigned int calc_timeout_nsec(double t) +{ + double f = t - (double)calc_timeout_sec(t); + if (f < 0.0) { + return 0; + } else if (f >= 0.999999999) { + return 999999999; + } else { + return (unsigned int)(f * 1.0e9); + } +} + +static void fill_entry(struct fuse_entry_out *arg, + const struct fuse_entry_param *e) +{ + *arg = (struct fuse_entry_out){ + .nodeid = e->ino, + .generation = e->generation, + .entry_valid = calc_timeout_sec(e->entry_timeout), + .entry_valid_nsec = calc_timeout_nsec(e->entry_timeout), + .attr_valid = calc_timeout_sec(e->attr_timeout), + .attr_valid_nsec = calc_timeout_nsec(e->attr_timeout), + }; + convert_stat(&e->attr, &arg->attr); + + arg->attr.flags = e->attr_flags; +} + +/* + * `buf` is allowed to be empty so that the proper size may be + * allocated by the caller + */ +size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, + const char *name, + const struct fuse_entry_param *e, off_t off) +{ + (void)req; + size_t namelen; + size_t entlen; + size_t entlen_padded; + + namelen = strlen(name); + entlen = FUSE_NAME_OFFSET_DIRENTPLUS + namelen; + entlen_padded = FUSE_DIRENT_ALIGN(entlen); + if ((buf == NULL) || (entlen_padded > bufsize)) { + return entlen_padded; + } + + struct fuse_direntplus *dp = (struct fuse_direntplus *)buf; + memset(&dp->entry_out, 0, sizeof(dp->entry_out)); + fill_entry(&dp->entry_out, e); + + struct fuse_dirent *dirent = &dp->dirent; + *dirent = (struct fuse_dirent){ + .ino = e->attr.st_ino, + .off = off, + .namelen = namelen, + .type = (e->attr.st_mode & S_IFMT) >> 12, + }; + memcpy(dirent->name, name, namelen); + memset(dirent->name + namelen, 0, entlen_padded - entlen); + + return entlen_padded; +} + +static void fill_open(struct fuse_open_out *arg, const struct fuse_file_info *f) +{ + arg->fh = f->fh; + if (f->direct_io) { + arg->open_flags |= FOPEN_DIRECT_IO; + } + if (f->keep_cache) { + arg->open_flags |= FOPEN_KEEP_CACHE; + } + if (f->cache_readdir) { + arg->open_flags |= FOPEN_CACHE_DIR; + } + if (f->nonseekable) { + arg->open_flags |= FOPEN_NONSEEKABLE; + } +} + +int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e) +{ + struct fuse_entry_out arg; + size_t size = sizeof(arg); + + memset(&arg, 0, sizeof(arg)); + fill_entry(&arg, e); + return send_reply_ok(req, &arg, size); +} + +int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, + const struct fuse_file_info *f) +{ + char buf[sizeof(struct fuse_entry_out) + sizeof(struct fuse_open_out)]; + size_t entrysize = sizeof(struct fuse_entry_out); + struct fuse_entry_out *earg = (struct fuse_entry_out *)buf; + struct fuse_open_out *oarg = (struct fuse_open_out *)(buf + entrysize); + + memset(buf, 0, sizeof(buf)); + fill_entry(earg, e); + fill_open(oarg, f); + return send_reply_ok(req, buf, entrysize + sizeof(struct fuse_open_out)); +} + +int fuse_reply_attr(fuse_req_t req, const struct stat *attr, + double attr_timeout) +{ + struct fuse_attr_out arg; + size_t size = sizeof(arg); + + memset(&arg, 0, sizeof(arg)); + arg.attr_valid = calc_timeout_sec(attr_timeout); + arg.attr_valid_nsec = calc_timeout_nsec(attr_timeout); + convert_stat(attr, &arg.attr); + + return send_reply_ok(req, &arg, size); +} + +int fuse_reply_readlink(fuse_req_t req, const char *linkname) +{ + return send_reply_ok(req, linkname, strlen(linkname)); +} + +int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *f) +{ + struct fuse_open_out arg; + + memset(&arg, 0, sizeof(arg)); + fill_open(&arg, f); + return send_reply_ok(req, &arg, sizeof(arg)); +} + +int fuse_reply_write(fuse_req_t req, size_t count) +{ + struct fuse_write_out arg; + + memset(&arg, 0, sizeof(arg)); + arg.size = count; + + return send_reply_ok(req, &arg, sizeof(arg)); +} + +int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size) +{ + return send_reply_ok(req, buf, size); +} + +static int fuse_send_data_iov_fallback(struct fuse_session *se, + struct fuse_chan *ch, struct iovec *iov, + int iov_count, struct fuse_bufvec *buf, + size_t len) +{ + /* Optimize common case */ + if (buf->count == 1 && buf->idx == 0 && buf->off == 0 && + !(buf->buf[0].flags & FUSE_BUF_IS_FD)) { + /* + * FIXME: also avoid memory copy if there are multiple buffers + * but none of them contain an fd + */ + + iov[iov_count].iov_base = buf->buf[0].mem; + iov[iov_count].iov_len = len; + iov_count++; + return fuse_send_msg(se, ch, iov, iov_count); + } + + if (fuse_lowlevel_is_virtio(se) && buf->count == 1 && + buf->buf[0].flags == (FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK)) { + return virtio_send_data_iov(se, ch, iov, iov_count, buf, len); + } + + abort(); /* Will have taken vhost path */ + return 0; +} + +static int fuse_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int iov_count, + struct fuse_bufvec *buf) +{ + size_t len = fuse_buf_size(buf); + + return fuse_send_data_iov_fallback(se, ch, iov, iov_count, buf, len); +} + +int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv) +{ + struct iovec iov[2]; + struct fuse_out_header out = { + .unique = req->unique, + }; + int res; + + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(struct fuse_out_header); + + res = fuse_send_data_iov(req->se, req->ch, iov, 1, bufv); + if (res <= 0) { + fuse_free_req(req); + return res; + } else { + return fuse_reply_err(req, res); + } +} + +int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf) +{ + struct fuse_statfs_out arg; + size_t size = sizeof(arg); + + memset(&arg, 0, sizeof(arg)); + convert_statfs(stbuf, &arg.st); + + return send_reply_ok(req, &arg, size); +} + +int fuse_reply_xattr(fuse_req_t req, size_t count) +{ + struct fuse_getxattr_out arg; + + memset(&arg, 0, sizeof(arg)); + arg.size = count; + + return send_reply_ok(req, &arg, sizeof(arg)); +} + +int fuse_reply_lock(fuse_req_t req, const struct flock *lock) +{ + struct fuse_lk_out arg; + + memset(&arg, 0, sizeof(arg)); + arg.lk.type = lock->l_type; + if (lock->l_type != F_UNLCK) { + arg.lk.start = lock->l_start; + if (lock->l_len == 0) { + arg.lk.end = OFFSET_MAX; + } else { + arg.lk.end = lock->l_start + lock->l_len - 1; + } + } + arg.lk.pid = lock->l_pid; + return send_reply_ok(req, &arg, sizeof(arg)); +} + +int fuse_reply_bmap(fuse_req_t req, uint64_t idx) +{ + struct fuse_bmap_out arg; + + memset(&arg, 0, sizeof(arg)); + arg.block = idx; + + return send_reply_ok(req, &arg, sizeof(arg)); +} + +static struct fuse_ioctl_iovec *fuse_ioctl_iovec_copy(const struct iovec *iov, + size_t count) +{ + struct fuse_ioctl_iovec *fiov; + size_t i; + + fiov = g_try_new(struct fuse_ioctl_iovec, count); + if (!fiov) { + return NULL; + } + + for (i = 0; i < count; i++) { + fiov[i].base = (uintptr_t)iov[i].iov_base; + fiov[i].len = iov[i].iov_len; + } + + return fiov; +} + +int fuse_reply_ioctl_retry(fuse_req_t req, const struct iovec *in_iov, + size_t in_count, const struct iovec *out_iov, + size_t out_count) +{ + struct fuse_ioctl_out arg; + g_autofree struct fuse_ioctl_iovec *in_fiov = NULL; + g_autofree struct fuse_ioctl_iovec *out_fiov = NULL; + struct iovec iov[4]; + size_t count = 1; + int res; + + memset(&arg, 0, sizeof(arg)); + arg.flags |= FUSE_IOCTL_RETRY; + arg.in_iovs = in_count; + arg.out_iovs = out_count; + iov[count].iov_base = &arg; + iov[count].iov_len = sizeof(arg); + count++; + + /* Can't handle non-compat 64bit ioctls on 32bit */ + if (sizeof(void *) == 4 && req->ioctl_64bit) { + res = fuse_reply_err(req, EINVAL); + return res; + } + + if (in_count) { + in_fiov = fuse_ioctl_iovec_copy(in_iov, in_count); + if (!in_fiov) { + res = fuse_reply_err(req, ENOMEM); + return res; + } + + iov[count].iov_base = (void *)in_fiov; + iov[count].iov_len = sizeof(in_fiov[0]) * in_count; + count++; + } + if (out_count) { + out_fiov = fuse_ioctl_iovec_copy(out_iov, out_count); + if (!out_fiov) { + res = fuse_reply_err(req, ENOMEM); + return res; + } + + iov[count].iov_base = (void *)out_fiov; + iov[count].iov_len = sizeof(out_fiov[0]) * out_count; + count++; + } + + res = send_reply_iov(req, 0, iov, count); + + return res; +} + +int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size) +{ + struct fuse_ioctl_out arg; + struct iovec iov[3]; + size_t count = 1; + + memset(&arg, 0, sizeof(arg)); + arg.result = result; + iov[count].iov_base = &arg; + iov[count].iov_len = sizeof(arg); + count++; + + if (size) { + iov[count].iov_base = (char *)buf; + iov[count].iov_len = size; + count++; + } + + return send_reply_iov(req, 0, iov, count); +} + +int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov, + int count) +{ + g_autofree struct iovec *padded_iov = NULL; + struct fuse_ioctl_out arg; + int res; + + padded_iov = g_try_new(struct iovec, count + 2); + if (padded_iov == NULL) { + return fuse_reply_err(req, ENOMEM); + } + + memset(&arg, 0, sizeof(arg)); + arg.result = result; + padded_iov[1].iov_base = &arg; + padded_iov[1].iov_len = sizeof(arg); + + memcpy(&padded_iov[2], iov, count * sizeof(struct iovec)); + + res = send_reply_iov(req, 0, padded_iov, count + 2); + + return res; +} + +int fuse_reply_poll(fuse_req_t req, unsigned revents) +{ + struct fuse_poll_out arg; + + memset(&arg, 0, sizeof(arg)); + arg.revents = revents; + + return send_reply_ok(req, &arg, sizeof(arg)); +} + +int fuse_reply_lseek(fuse_req_t req, off_t off) +{ + struct fuse_lseek_out arg; + + memset(&arg, 0, sizeof(arg)); + arg.offset = off; + + return send_reply_ok(req, &arg, sizeof(arg)); +} + +static void do_lookup(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + const char *name = fuse_mbuf_iter_advance_str(iter); + if (!name) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.lookup) { + req->se->op.lookup(req, nodeid, name); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_forget(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_forget_in *arg; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.forget) { + req->se->op.forget(req, nodeid, arg->nlookup); + } else { + fuse_reply_none(req); + } +} + +static void do_batch_forget(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_batch_forget_in *arg; + struct fuse_forget_data *forgets; + size_t scount; + + (void)nodeid; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_none(req); + return; + } + + /* + * Prevent integer overflow. The compiler emits the following warning + * unless we use the scount local variable: + * + * error: comparison is always false due to limited range of data type + * [-Werror=type-limits] + * + * This may be true on 64-bit hosts but we need this check for 32-bit + * hosts. + */ + scount = arg->count; + if (scount > SIZE_MAX / sizeof(forgets[0])) { + fuse_reply_none(req); + return; + } + + forgets = fuse_mbuf_iter_advance(iter, arg->count * sizeof(forgets[0])); + if (!forgets) { + fuse_reply_none(req); + return; + } + + if (req->se->op.forget_multi) { + req->se->op.forget_multi(req, arg->count, forgets); + } else if (req->se->op.forget) { + unsigned int i; + + for (i = 0; i < arg->count; i++) { + struct fuse_req *dummy_req; + + dummy_req = fuse_ll_alloc_req(req->se); + if (dummy_req == NULL) { + break; + } + + dummy_req->unique = req->unique; + dummy_req->ctx = req->ctx; + dummy_req->ch = NULL; + + req->se->op.forget(dummy_req, forgets[i].ino, forgets[i].nlookup); + } + fuse_reply_none(req); + } else { + fuse_reply_none(req); + } +} + +static void do_getattr(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_file_info *fip = NULL; + struct fuse_file_info fi; + + struct fuse_getattr_in *arg; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + if (arg->getattr_flags & FUSE_GETATTR_FH) { + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fip = &fi; + } + + if (req->se->op.getattr) { + req->se->op.getattr(req, nodeid, fip); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_setattr(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + if (req->se->op.setattr) { + struct fuse_setattr_in *arg; + struct fuse_file_info *fi = NULL; + struct fuse_file_info fi_store; + struct stat stbuf; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&stbuf, 0, sizeof(stbuf)); + convert_attr(arg, &stbuf); + if (arg->valid & FATTR_FH) { + arg->valid &= ~FATTR_FH; + memset(&fi_store, 0, sizeof(fi_store)); + fi = &fi_store; + fi->fh = arg->fh; + } + arg->valid &= FUSE_SET_ATTR_MODE | FUSE_SET_ATTR_UID | + FUSE_SET_ATTR_GID | FUSE_SET_ATTR_SIZE | + FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME | + FUSE_SET_ATTR_ATIME_NOW | FUSE_SET_ATTR_MTIME_NOW | + FUSE_SET_ATTR_CTIME | FUSE_SET_ATTR_KILL_SUIDGID; + + req->se->op.setattr(req, nodeid, &stbuf, arg->valid, fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_access(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_access_in *arg; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.access) { + req->se->op.access(req, nodeid, arg->mask); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_readlink(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + (void)iter; + + if (req->se->op.readlink) { + req->se->op.readlink(req, nodeid); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_mknod(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_mknod_in *arg; + const char *name; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + name = fuse_mbuf_iter_advance_str(iter); + if (!arg || !name) { + fuse_reply_err(req, EINVAL); + return; + } + + req->ctx.umask = arg->umask; + + if (req->se->op.mknod) { + req->se->op.mknod(req, nodeid, name, arg->mode, arg->rdev); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_mkdir(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_mkdir_in *arg; + const char *name; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + name = fuse_mbuf_iter_advance_str(iter); + if (!arg || !name) { + fuse_reply_err(req, EINVAL); + return; + } + + req->ctx.umask = arg->umask; + + if (req->se->op.mkdir) { + req->se->op.mkdir(req, nodeid, name, arg->mode); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_unlink(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + const char *name = fuse_mbuf_iter_advance_str(iter); + + if (!name) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.unlink) { + req->se->op.unlink(req, nodeid, name); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_rmdir(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + const char *name = fuse_mbuf_iter_advance_str(iter); + + if (!name) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.rmdir) { + req->se->op.rmdir(req, nodeid, name); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_symlink(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + const char *name = fuse_mbuf_iter_advance_str(iter); + const char *linkname = fuse_mbuf_iter_advance_str(iter); + + if (!name || !linkname) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.symlink) { + req->se->op.symlink(req, linkname, nodeid, name); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_rename(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_rename_in *arg; + const char *oldname; + const char *newname; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + oldname = fuse_mbuf_iter_advance_str(iter); + newname = fuse_mbuf_iter_advance_str(iter); + if (!arg || !oldname || !newname) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.rename) { + req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, 0); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_rename2(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_rename2_in *arg; + const char *oldname; + const char *newname; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + oldname = fuse_mbuf_iter_advance_str(iter); + newname = fuse_mbuf_iter_advance_str(iter); + if (!arg || !oldname || !newname) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.rename) { + req->se->op.rename(req, nodeid, oldname, arg->newdir, newname, + arg->flags); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_link(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_link_in *arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + const char *name = fuse_mbuf_iter_advance_str(iter); + + if (!arg || !name) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.link) { + req->se->op.link(req, arg->oldnodeid, nodeid, name); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_create(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + if (req->se->op.create) { + struct fuse_create_in *arg; + struct fuse_file_info fi; + const char *name; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + name = fuse_mbuf_iter_advance_str(iter); + if (!arg || !name) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + fi.kill_priv = arg->open_flags & FUSE_OPEN_KILL_SUIDGID; + + req->ctx.umask = arg->umask; + + req->se->op.create(req, nodeid, name, arg->mode, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_open(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_open_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + /* File creation is handled by do_create() or do_mknod() */ + if (arg->flags & (O_CREAT | O_TMPFILE)) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + fi.kill_priv = arg->open_flags & FUSE_OPEN_KILL_SUIDGID; + + if (req->se->op.open) { + req->se->op.open(req, nodeid, &fi); + } else { + fuse_reply_open(req, &fi); + } +} + +static void do_read(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + if (req->se->op.read) { + struct fuse_read_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.lock_owner = arg->lock_owner; + fi.flags = arg->flags; + req->se->op.read(req, nodeid, arg->size, arg->offset, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_write(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_write_in *arg; + struct fuse_file_info fi; + const char *param; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + param = fuse_mbuf_iter_advance(iter, arg->size); + if (!param) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.writepage = (arg->write_flags & FUSE_WRITE_CACHE) != 0; + fi.kill_priv = !!(arg->write_flags & FUSE_WRITE_KILL_PRIV); + + fi.lock_owner = arg->lock_owner; + fi.flags = arg->flags; + + if (req->se->op.write) { + req->se->op.write(req, nodeid, param, arg->size, arg->offset, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_write_buf(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter, struct fuse_bufvec *ibufv) +{ + struct fuse_session *se = req->se; + struct fuse_bufvec *pbufv = ibufv; + struct fuse_bufvec tmpbufv = { + .buf[0] = ibufv->buf[0], + .count = 1, + }; + struct fuse_write_in *arg; + size_t arg_size = sizeof(*arg); + struct fuse_file_info fi; + + memset(&fi, 0, sizeof(fi)); + + arg = fuse_mbuf_iter_advance(iter, arg_size); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + fi.lock_owner = arg->lock_owner; + fi.flags = arg->flags; + fi.fh = arg->fh; + fi.writepage = !!(arg->write_flags & FUSE_WRITE_CACHE); + fi.kill_priv = !!(arg->write_flags & FUSE_WRITE_KILL_PRIV); + + if (ibufv->count == 1) { + assert(!(tmpbufv.buf[0].flags & FUSE_BUF_IS_FD)); + tmpbufv.buf[0].mem = ((char *)arg) + arg_size; + tmpbufv.buf[0].size -= sizeof(struct fuse_in_header) + arg_size; + pbufv = &tmpbufv; + } else { + /* + * Input bufv contains the headers in the first element + * and the data in the rest, we need to skip that first element + */ + ibufv->buf[0].size = 0; + } + + if (fuse_buf_size(pbufv) != arg->size) { + fuse_log(FUSE_LOG_ERR, + "fuse: do_write_buf: buffer size doesn't match arg->size\n"); + fuse_reply_err(req, EIO); + return; + } + + se->op.write_buf(req, nodeid, pbufv, arg->offset, &fi); +} + +static void do_flush(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_flush_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.flush = 1; + fi.lock_owner = arg->lock_owner; + + if (req->se->op.flush) { + req->se->op.flush(req, nodeid, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_release(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_release_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + fi.fh = arg->fh; + fi.flush = (arg->release_flags & FUSE_RELEASE_FLUSH) ? 1 : 0; + fi.lock_owner = arg->lock_owner; + + if (arg->release_flags & FUSE_RELEASE_FLOCK_UNLOCK) { + fi.flock_release = 1; + } + + if (req->se->op.release) { + req->se->op.release(req, nodeid, &fi); + } else { + fuse_reply_err(req, 0); + } +} + +static void do_fsync(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_fsync_in *arg; + struct fuse_file_info fi; + int datasync; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + datasync = arg->fsync_flags & 1; + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + + if (req->se->op.fsync) { + if (fi.fh == (uint64_t)-1) { + req->se->op.fsync(req, nodeid, datasync, NULL); + } else { + req->se->op.fsync(req, nodeid, datasync, &fi); + } + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_opendir(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_open_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + + if (req->se->op.opendir) { + req->se->op.opendir(req, nodeid, &fi); + } else { + fuse_reply_open(req, &fi); + } +} + +static void do_readdir(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_read_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + + if (req->se->op.readdir) { + req->se->op.readdir(req, nodeid, arg->size, arg->offset, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_readdirplus(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_read_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + + if (req->se->op.readdirplus) { + req->se->op.readdirplus(req, nodeid, arg->size, arg->offset, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_releasedir(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_release_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.flags = arg->flags; + fi.fh = arg->fh; + + if (req->se->op.releasedir) { + req->se->op.releasedir(req, nodeid, &fi); + } else { + fuse_reply_err(req, 0); + } +} + +static void do_fsyncdir(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_fsync_in *arg; + struct fuse_file_info fi; + int datasync; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + datasync = arg->fsync_flags & 1; + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + + if (req->se->op.fsyncdir) { + req->se->op.fsyncdir(req, nodeid, datasync, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_statfs(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + (void)nodeid; + (void)iter; + + if (req->se->op.statfs) { + req->se->op.statfs(req, nodeid); + } else { + struct statvfs buf = { + .f_namemax = 255, + .f_bsize = 512, + }; + fuse_reply_statfs(req, &buf); + } +} + +static void do_setxattr(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_setxattr_in *arg; + const char *name; + const char *value; + bool setxattr_ext = req->se->conn.want & FUSE_CAP_SETXATTR_EXT; + + if (setxattr_ext) { + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + } else { + arg = fuse_mbuf_iter_advance(iter, FUSE_COMPAT_SETXATTR_IN_SIZE); + } + name = fuse_mbuf_iter_advance_str(iter); + if (!arg || !name) { + fuse_reply_err(req, EINVAL); + return; + } + + value = fuse_mbuf_iter_advance(iter, arg->size); + if (!value) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.setxattr) { + uint32_t setxattr_flags = setxattr_ext ? arg->setxattr_flags : 0; + req->se->op.setxattr(req, nodeid, name, value, arg->size, arg->flags, + setxattr_flags); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_getxattr(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_getxattr_in *arg; + const char *name; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + name = fuse_mbuf_iter_advance_str(iter); + if (!arg || !name) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.getxattr) { + req->se->op.getxattr(req, nodeid, name, arg->size); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_listxattr(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_getxattr_in *arg; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.listxattr) { + req->se->op.listxattr(req, nodeid, arg->size); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_removexattr(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + const char *name = fuse_mbuf_iter_advance_str(iter); + + if (!name) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.removexattr) { + req->se->op.removexattr(req, nodeid, name); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void convert_fuse_file_lock(struct fuse_file_lock *fl, + struct flock *flock) +{ + memset(flock, 0, sizeof(struct flock)); + flock->l_type = fl->type; + flock->l_whence = SEEK_SET; + flock->l_start = fl->start; + if (fl->end == OFFSET_MAX) { + flock->l_len = 0; + } else { + flock->l_len = fl->end - fl->start + 1; + } + flock->l_pid = fl->pid; +} + +static void do_getlk(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_lk_in *arg; + struct fuse_file_info fi; + struct flock flock; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.lock_owner = arg->owner; + + convert_fuse_file_lock(&arg->lk, &flock); + if (req->se->op.getlk) { + req->se->op.getlk(req, nodeid, &fi, &flock); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_setlk_common(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter, int sleep) +{ + struct fuse_lk_in *arg; + struct fuse_file_info fi; + struct flock flock; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.lock_owner = arg->owner; + + if (arg->lk_flags & FUSE_LK_FLOCK) { + int op = 0; + + switch (arg->lk.type) { + case F_RDLCK: + op = LOCK_SH; + break; + case F_WRLCK: + op = LOCK_EX; + break; + case F_UNLCK: + op = LOCK_UN; + break; + } + if (!sleep) { + op |= LOCK_NB; + } + + if (req->se->op.flock) { + req->se->op.flock(req, nodeid, &fi, op); + } else { + fuse_reply_err(req, ENOSYS); + } + } else { + convert_fuse_file_lock(&arg->lk, &flock); + if (req->se->op.setlk) { + req->se->op.setlk(req, nodeid, &fi, &flock, sleep); + } else { + fuse_reply_err(req, ENOSYS); + } + } +} + +static void do_setlk(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + do_setlk_common(req, nodeid, iter, 0); +} + +static void do_setlkw(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + do_setlk_common(req, nodeid, iter, 1); +} + +static int find_interrupted(struct fuse_session *se, struct fuse_req *req) +{ + struct fuse_req *curr; + + for (curr = se->list.next; curr != &se->list; curr = curr->next) { + if (curr->unique == req->u.i.unique) { + fuse_interrupt_func_t func; + void *data; + + curr->ctr++; + pthread_mutex_unlock(&se->lock); + + /* Ugh, ugly locking */ + pthread_mutex_lock(&curr->lock); + pthread_mutex_lock(&se->lock); + curr->interrupted = 1; + func = curr->u.ni.func; + data = curr->u.ni.data; + pthread_mutex_unlock(&se->lock); + if (func) { + func(curr, data); + } + pthread_mutex_unlock(&curr->lock); + + pthread_mutex_lock(&se->lock); + curr->ctr--; + if (!curr->ctr) { + destroy_req(curr); + } + + return 1; + } + } + for (curr = se->interrupts.next; curr != &se->interrupts; + curr = curr->next) { + if (curr->u.i.unique == req->u.i.unique) { + return 1; + } + } + return 0; +} + +static void do_interrupt(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_interrupt_in *arg; + struct fuse_session *se = req->se; + + (void)nodeid; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + fuse_log(FUSE_LOG_DEBUG, "INTERRUPT: %llu\n", + (unsigned long long)arg->unique); + + req->u.i.unique = arg->unique; + + pthread_mutex_lock(&se->lock); + if (find_interrupted(se, req)) { + destroy_req(req); + } else { + list_add_req(req, &se->interrupts); + } + pthread_mutex_unlock(&se->lock); +} + +static struct fuse_req *check_interrupt(struct fuse_session *se, + struct fuse_req *req) +{ + struct fuse_req *curr; + + for (curr = se->interrupts.next; curr != &se->interrupts; + curr = curr->next) { + if (curr->u.i.unique == req->unique) { + req->interrupted = 1; + list_del_req(curr); + g_free(curr); + return NULL; + } + } + curr = se->interrupts.next; + if (curr != &se->interrupts) { + list_del_req(curr); + list_init_req(curr); + return curr; + } else { + return NULL; + } +} + +static void do_bmap(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_bmap_in *arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + if (req->se->op.bmap) { + req->se->op.bmap(req, nodeid, arg->blocksize, arg->block); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_ioctl(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_ioctl_in *arg; + unsigned int flags; + void *in_buf = NULL; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + flags = arg->flags; + if (flags & FUSE_IOCTL_DIR && !(req->se->conn.want & FUSE_CAP_IOCTL_DIR)) { + fuse_reply_err(req, ENOTTY); + return; + } + + if (arg->in_size) { + in_buf = fuse_mbuf_iter_advance(iter, arg->in_size); + if (!in_buf) { + fuse_reply_err(req, EINVAL); + return; + } + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + + if (sizeof(void *) == 4 && !(flags & FUSE_IOCTL_32BIT)) { + req->ioctl_64bit = 1; + } + + if (req->se->op.ioctl) { + req->se->op.ioctl(req, nodeid, arg->cmd, (void *)(uintptr_t)arg->arg, + &fi, flags, in_buf, arg->in_size, arg->out_size); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +void fuse_pollhandle_destroy(struct fuse_pollhandle *ph) +{ + free(ph); +} + +static void do_poll(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_poll_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + fi.poll_events = arg->events; + + if (req->se->op.poll) { + struct fuse_pollhandle *ph = NULL; + + if (arg->flags & FUSE_POLL_SCHEDULE_NOTIFY) { + ph = malloc(sizeof(struct fuse_pollhandle)); + if (ph == NULL) { + fuse_reply_err(req, ENOMEM); + return; + } + ph->kh = arg->kh; + ph->se = req->se; + } + + req->se->op.poll(req, nodeid, &fi, ph); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_fallocate(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_fallocate_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + + if (req->se->op.fallocate) { + req->se->op.fallocate(req, nodeid, arg->mode, arg->offset, arg->length, + &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_copy_file_range(fuse_req_t req, fuse_ino_t nodeid_in, + struct fuse_mbuf_iter *iter) +{ + struct fuse_copy_file_range_in *arg; + struct fuse_file_info fi_in, fi_out; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + memset(&fi_in, 0, sizeof(fi_in)); + fi_in.fh = arg->fh_in; + + memset(&fi_out, 0, sizeof(fi_out)); + fi_out.fh = arg->fh_out; + + + if (req->se->op.copy_file_range) { + req->se->op.copy_file_range(req, nodeid_in, arg->off_in, &fi_in, + arg->nodeid_out, arg->off_out, &fi_out, + arg->len, arg->flags); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_lseek(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_lseek_in *arg; + struct fuse_file_info fi; + + arg = fuse_mbuf_iter_advance(iter, sizeof(*arg)); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + memset(&fi, 0, sizeof(fi)); + fi.fh = arg->fh; + + if (req->se->op.lseek) { + req->se->op.lseek(req, nodeid, arg->offset, arg->whence, &fi); + } else { + fuse_reply_err(req, ENOSYS); + } +} + +static void do_init(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + size_t compat_size = offsetof(struct fuse_init_in, max_readahead); + struct fuse_init_in *arg; + struct fuse_init_out outarg; + struct fuse_session *se = req->se; + size_t bufsize = se->bufsize; + size_t outargsize = sizeof(outarg); + + (void)nodeid; + + /* First consume the old fields... */ + arg = fuse_mbuf_iter_advance(iter, compat_size); + if (!arg) { + fuse_reply_err(req, EINVAL); + return; + } + + /* ...and now consume the new fields. */ + if (arg->major == 7 && arg->minor >= 6) { + if (!fuse_mbuf_iter_advance(iter, sizeof(*arg) - compat_size)) { + fuse_reply_err(req, EINVAL); + return; + } + } + + fuse_log(FUSE_LOG_DEBUG, "INIT: %u.%u\n", arg->major, arg->minor); + if (arg->major == 7 && arg->minor >= 6) { + fuse_log(FUSE_LOG_DEBUG, "flags=0x%08x\n", arg->flags); + fuse_log(FUSE_LOG_DEBUG, "max_readahead=0x%08x\n", arg->max_readahead); + } + se->conn.proto_major = arg->major; + se->conn.proto_minor = arg->minor; + se->conn.capable = 0; + se->conn.want = 0; + + memset(&outarg, 0, sizeof(outarg)); + outarg.major = FUSE_KERNEL_VERSION; + outarg.minor = FUSE_KERNEL_MINOR_VERSION; + + if (arg->major < 7 || (arg->major == 7 && arg->minor < 31)) { + fuse_log(FUSE_LOG_ERR, "fuse: unsupported protocol version: %u.%u\n", + arg->major, arg->minor); + fuse_reply_err(req, EPROTO); + return; + } + + if (arg->major > 7) { + /* Wait for a second INIT request with a 7.X version */ + send_reply_ok(req, &outarg, sizeof(outarg)); + return; + } + + if (arg->max_readahead < se->conn.max_readahead) { + se->conn.max_readahead = arg->max_readahead; + } + if (arg->flags & FUSE_ASYNC_READ) { + se->conn.capable |= FUSE_CAP_ASYNC_READ; + } + if (arg->flags & FUSE_POSIX_LOCKS) { + se->conn.capable |= FUSE_CAP_POSIX_LOCKS; + } + if (arg->flags & FUSE_ATOMIC_O_TRUNC) { + se->conn.capable |= FUSE_CAP_ATOMIC_O_TRUNC; + } + if (arg->flags & FUSE_EXPORT_SUPPORT) { + se->conn.capable |= FUSE_CAP_EXPORT_SUPPORT; + } + if (arg->flags & FUSE_DONT_MASK) { + se->conn.capable |= FUSE_CAP_DONT_MASK; + } + if (arg->flags & FUSE_FLOCK_LOCKS) { + se->conn.capable |= FUSE_CAP_FLOCK_LOCKS; + } + if (arg->flags & FUSE_AUTO_INVAL_DATA) { + se->conn.capable |= FUSE_CAP_AUTO_INVAL_DATA; + } + if (arg->flags & FUSE_DO_READDIRPLUS) { + se->conn.capable |= FUSE_CAP_READDIRPLUS; + } + if (arg->flags & FUSE_READDIRPLUS_AUTO) { + se->conn.capable |= FUSE_CAP_READDIRPLUS_AUTO; + } + if (arg->flags & FUSE_ASYNC_DIO) { + se->conn.capable |= FUSE_CAP_ASYNC_DIO; + } + if (arg->flags & FUSE_WRITEBACK_CACHE) { + se->conn.capable |= FUSE_CAP_WRITEBACK_CACHE; + } + if (arg->flags & FUSE_NO_OPEN_SUPPORT) { + se->conn.capable |= FUSE_CAP_NO_OPEN_SUPPORT; + } + if (arg->flags & FUSE_PARALLEL_DIROPS) { + se->conn.capable |= FUSE_CAP_PARALLEL_DIROPS; + } + if (arg->flags & FUSE_POSIX_ACL) { + se->conn.capable |= FUSE_CAP_POSIX_ACL; + } + if (arg->flags & FUSE_HANDLE_KILLPRIV) { + se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV; + } + if (arg->flags & FUSE_NO_OPENDIR_SUPPORT) { + se->conn.capable |= FUSE_CAP_NO_OPENDIR_SUPPORT; + } + if (!(arg->flags & FUSE_MAX_PAGES)) { + size_t max_bufsize = FUSE_DEFAULT_MAX_PAGES_PER_REQ * getpagesize() + + FUSE_BUFFER_HEADER_SIZE; + if (bufsize > max_bufsize) { + bufsize = max_bufsize; + } + } + if (arg->flags & FUSE_SUBMOUNTS) { + se->conn.capable |= FUSE_CAP_SUBMOUNTS; + } + if (arg->flags & FUSE_HANDLE_KILLPRIV_V2) { + se->conn.capable |= FUSE_CAP_HANDLE_KILLPRIV_V2; + } + if (arg->flags & FUSE_SETXATTR_EXT) { + se->conn.capable |= FUSE_CAP_SETXATTR_EXT; + } +#ifdef HAVE_SPLICE +#ifdef HAVE_VMSPLICE + se->conn.capable |= FUSE_CAP_SPLICE_WRITE | FUSE_CAP_SPLICE_MOVE; +#endif + se->conn.capable |= FUSE_CAP_SPLICE_READ; +#endif + se->conn.capable |= FUSE_CAP_IOCTL_DIR; + + /* + * Default settings for modern filesystems. + * + * Most of these capabilities were disabled by default in + * libfuse2 for backwards compatibility reasons. In libfuse3, + * we can finally enable them by default (as long as they're + * supported by the kernel). + */ +#define LL_SET_DEFAULT(cond, cap) \ + if ((cond) && (se->conn.capable & (cap))) \ + se->conn.want |= (cap) + LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_READ); + LL_SET_DEFAULT(1, FUSE_CAP_PARALLEL_DIROPS); + LL_SET_DEFAULT(1, FUSE_CAP_AUTO_INVAL_DATA); + LL_SET_DEFAULT(1, FUSE_CAP_HANDLE_KILLPRIV); + LL_SET_DEFAULT(1, FUSE_CAP_ASYNC_DIO); + LL_SET_DEFAULT(1, FUSE_CAP_IOCTL_DIR); + LL_SET_DEFAULT(1, FUSE_CAP_ATOMIC_O_TRUNC); + LL_SET_DEFAULT(se->op.write_buf, FUSE_CAP_SPLICE_READ); + LL_SET_DEFAULT(se->op.getlk && se->op.setlk, FUSE_CAP_POSIX_LOCKS); + LL_SET_DEFAULT(se->op.flock, FUSE_CAP_FLOCK_LOCKS); + LL_SET_DEFAULT(se->op.readdirplus, FUSE_CAP_READDIRPLUS); + LL_SET_DEFAULT(se->op.readdirplus && se->op.readdir, + FUSE_CAP_READDIRPLUS_AUTO); + se->conn.time_gran = 1; + + if (bufsize < FUSE_MIN_READ_BUFFER) { + fuse_log(FUSE_LOG_ERR, "fuse: warning: buffer size too small: %zu\n", + bufsize); + bufsize = FUSE_MIN_READ_BUFFER; + } + se->bufsize = bufsize; + + if (se->conn.max_write > bufsize - FUSE_BUFFER_HEADER_SIZE) { + se->conn.max_write = bufsize - FUSE_BUFFER_HEADER_SIZE; + } + + se->got_init = 1; + se->got_destroy = 0; + if (se->op.init) { + se->op.init(se->userdata, &se->conn); + } + + if (se->conn.want & (~se->conn.capable)) { + fuse_log(FUSE_LOG_ERR, + "fuse: error: filesystem requested capabilities " + "0x%x that are not supported by kernel, aborting.\n", + se->conn.want & (~se->conn.capable)); + fuse_reply_err(req, EPROTO); + se->error = -EPROTO; + fuse_session_exit(se); + return; + } + + if (se->conn.max_write < bufsize - FUSE_BUFFER_HEADER_SIZE) { + se->bufsize = se->conn.max_write + FUSE_BUFFER_HEADER_SIZE; + } + if (arg->flags & FUSE_MAX_PAGES) { + outarg.flags |= FUSE_MAX_PAGES; + outarg.max_pages = (se->conn.max_write - 1) / getpagesize() + 1; + } + + /* + * Always enable big writes, this is superseded + * by the max_write option + */ + outarg.flags |= FUSE_BIG_WRITES; + + if (se->conn.want & FUSE_CAP_ASYNC_READ) { + outarg.flags |= FUSE_ASYNC_READ; + } + if (se->conn.want & FUSE_CAP_PARALLEL_DIROPS) { + outarg.flags |= FUSE_PARALLEL_DIROPS; + } + if (se->conn.want & FUSE_CAP_POSIX_LOCKS) { + outarg.flags |= FUSE_POSIX_LOCKS; + } + if (se->conn.want & FUSE_CAP_ATOMIC_O_TRUNC) { + outarg.flags |= FUSE_ATOMIC_O_TRUNC; + } + if (se->conn.want & FUSE_CAP_EXPORT_SUPPORT) { + outarg.flags |= FUSE_EXPORT_SUPPORT; + } + if (se->conn.want & FUSE_CAP_DONT_MASK) { + outarg.flags |= FUSE_DONT_MASK; + } + if (se->conn.want & FUSE_CAP_FLOCK_LOCKS) { + outarg.flags |= FUSE_FLOCK_LOCKS; + } + if (se->conn.want & FUSE_CAP_AUTO_INVAL_DATA) { + outarg.flags |= FUSE_AUTO_INVAL_DATA; + } + if (se->conn.want & FUSE_CAP_READDIRPLUS) { + outarg.flags |= FUSE_DO_READDIRPLUS; + } + if (se->conn.want & FUSE_CAP_READDIRPLUS_AUTO) { + outarg.flags |= FUSE_READDIRPLUS_AUTO; + } + if (se->conn.want & FUSE_CAP_ASYNC_DIO) { + outarg.flags |= FUSE_ASYNC_DIO; + } + if (se->conn.want & FUSE_CAP_WRITEBACK_CACHE) { + outarg.flags |= FUSE_WRITEBACK_CACHE; + } + if (se->conn.want & FUSE_CAP_POSIX_ACL) { + outarg.flags |= FUSE_POSIX_ACL; + } + outarg.max_readahead = se->conn.max_readahead; + outarg.max_write = se->conn.max_write; + if (se->conn.max_background >= (1 << 16)) { + se->conn.max_background = (1 << 16) - 1; + } + if (se->conn.congestion_threshold > se->conn.max_background) { + se->conn.congestion_threshold = se->conn.max_background; + } + if (!se->conn.congestion_threshold) { + se->conn.congestion_threshold = se->conn.max_background * 3 / 4; + } + + outarg.max_background = se->conn.max_background; + outarg.congestion_threshold = se->conn.congestion_threshold; + outarg.time_gran = se->conn.time_gran; + + if (se->conn.want & FUSE_CAP_HANDLE_KILLPRIV_V2) { + outarg.flags |= FUSE_HANDLE_KILLPRIV_V2; + } + + if (se->conn.want & FUSE_CAP_SETXATTR_EXT) { + outarg.flags |= FUSE_SETXATTR_EXT; + } + + fuse_log(FUSE_LOG_DEBUG, " INIT: %u.%u\n", outarg.major, outarg.minor); + fuse_log(FUSE_LOG_DEBUG, " flags=0x%08x\n", outarg.flags); + fuse_log(FUSE_LOG_DEBUG, " max_readahead=0x%08x\n", outarg.max_readahead); + fuse_log(FUSE_LOG_DEBUG, " max_write=0x%08x\n", outarg.max_write); + fuse_log(FUSE_LOG_DEBUG, " max_background=%i\n", outarg.max_background); + fuse_log(FUSE_LOG_DEBUG, " congestion_threshold=%i\n", + outarg.congestion_threshold); + fuse_log(FUSE_LOG_DEBUG, " time_gran=%u\n", outarg.time_gran); + + send_reply_ok(req, &outarg, outargsize); +} + +static void do_destroy(fuse_req_t req, fuse_ino_t nodeid, + struct fuse_mbuf_iter *iter) +{ + struct fuse_session *se = req->se; + + (void)nodeid; + (void)iter; + + se->got_destroy = 1; + se->got_init = 0; + if (se->op.destroy) { + se->op.destroy(se->userdata); + } + + send_reply_ok(req, NULL, 0); +} + +int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + off_t offset, struct fuse_bufvec *bufv) +{ + struct fuse_out_header out = { + .error = FUSE_NOTIFY_STORE, + }; + struct fuse_notify_store_out outarg = { + .nodeid = ino, + .offset = offset, + .size = fuse_buf_size(bufv), + }; + struct iovec iov[3]; + int res; + + if (!se) { + return -EINVAL; + } + + iov[0].iov_base = &out; + iov[0].iov_len = sizeof(out); + iov[1].iov_base = &outarg; + iov[1].iov_len = sizeof(outarg); + + res = fuse_send_data_iov(se, NULL, iov, 2, bufv); + if (res > 0) { + res = -res; + } + + return res; +} + +void *fuse_req_userdata(fuse_req_t req) +{ + return req->se->userdata; +} + +const struct fuse_ctx *fuse_req_ctx(fuse_req_t req) +{ + return &req->ctx; +} + +void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, + void *data) +{ + pthread_mutex_lock(&req->lock); + pthread_mutex_lock(&req->se->lock); + req->u.ni.func = func; + req->u.ni.data = data; + pthread_mutex_unlock(&req->se->lock); + if (req->interrupted && func) { + func(req, data); + } + pthread_mutex_unlock(&req->lock); +} + +int fuse_req_interrupted(fuse_req_t req) +{ + int interrupted; + + pthread_mutex_lock(&req->se->lock); + interrupted = req->interrupted; + pthread_mutex_unlock(&req->se->lock); + + return interrupted; +} + +static struct { + void (*func)(fuse_req_t, fuse_ino_t, struct fuse_mbuf_iter *); + const char *name; +} fuse_ll_ops[] = { + [FUSE_LOOKUP] = { do_lookup, "LOOKUP" }, + [FUSE_FORGET] = { do_forget, "FORGET" }, + [FUSE_GETATTR] = { do_getattr, "GETATTR" }, + [FUSE_SETATTR] = { do_setattr, "SETATTR" }, + [FUSE_READLINK] = { do_readlink, "READLINK" }, + [FUSE_SYMLINK] = { do_symlink, "SYMLINK" }, + [FUSE_MKNOD] = { do_mknod, "MKNOD" }, + [FUSE_MKDIR] = { do_mkdir, "MKDIR" }, + [FUSE_UNLINK] = { do_unlink, "UNLINK" }, + [FUSE_RMDIR] = { do_rmdir, "RMDIR" }, + [FUSE_RENAME] = { do_rename, "RENAME" }, + [FUSE_LINK] = { do_link, "LINK" }, + [FUSE_OPEN] = { do_open, "OPEN" }, + [FUSE_READ] = { do_read, "READ" }, + [FUSE_WRITE] = { do_write, "WRITE" }, + [FUSE_STATFS] = { do_statfs, "STATFS" }, + [FUSE_RELEASE] = { do_release, "RELEASE" }, + [FUSE_FSYNC] = { do_fsync, "FSYNC" }, + [FUSE_SETXATTR] = { do_setxattr, "SETXATTR" }, + [FUSE_GETXATTR] = { do_getxattr, "GETXATTR" }, + [FUSE_LISTXATTR] = { do_listxattr, "LISTXATTR" }, + [FUSE_REMOVEXATTR] = { do_removexattr, "REMOVEXATTR" }, + [FUSE_FLUSH] = { do_flush, "FLUSH" }, + [FUSE_INIT] = { do_init, "INIT" }, + [FUSE_OPENDIR] = { do_opendir, "OPENDIR" }, + [FUSE_READDIR] = { do_readdir, "READDIR" }, + [FUSE_RELEASEDIR] = { do_releasedir, "RELEASEDIR" }, + [FUSE_FSYNCDIR] = { do_fsyncdir, "FSYNCDIR" }, + [FUSE_GETLK] = { do_getlk, "GETLK" }, + [FUSE_SETLK] = { do_setlk, "SETLK" }, + [FUSE_SETLKW] = { do_setlkw, "SETLKW" }, + [FUSE_ACCESS] = { do_access, "ACCESS" }, + [FUSE_CREATE] = { do_create, "CREATE" }, + [FUSE_INTERRUPT] = { do_interrupt, "INTERRUPT" }, + [FUSE_BMAP] = { do_bmap, "BMAP" }, + [FUSE_IOCTL] = { do_ioctl, "IOCTL" }, + [FUSE_POLL] = { do_poll, "POLL" }, + [FUSE_FALLOCATE] = { do_fallocate, "FALLOCATE" }, + [FUSE_DESTROY] = { do_destroy, "DESTROY" }, + [FUSE_NOTIFY_REPLY] = { NULL, "NOTIFY_REPLY" }, + [FUSE_BATCH_FORGET] = { do_batch_forget, "BATCH_FORGET" }, + [FUSE_READDIRPLUS] = { do_readdirplus, "READDIRPLUS" }, + [FUSE_RENAME2] = { do_rename2, "RENAME2" }, + [FUSE_COPY_FILE_RANGE] = { do_copy_file_range, "COPY_FILE_RANGE" }, + [FUSE_LSEEK] = { do_lseek, "LSEEK" }, +}; + +#define FUSE_MAXOP (sizeof(fuse_ll_ops) / sizeof(fuse_ll_ops[0])) + +static const char *opname(enum fuse_opcode opcode) +{ + if (opcode >= FUSE_MAXOP || !fuse_ll_ops[opcode].name) { + return "???"; + } else { + return fuse_ll_ops[opcode].name; + } +} + +void fuse_session_process_buf(struct fuse_session *se, + const struct fuse_buf *buf) +{ + struct fuse_bufvec bufv = { .buf[0] = *buf, .count = 1 }; + fuse_session_process_buf_int(se, &bufv, NULL); +} + +/* + * Restriction: + * bufv is normally a single entry buffer, except for a write + * where (if it's in memory) then the bufv may be multiple entries, + * where the first entry contains all headers and subsequent entries + * contain data + * bufv shall not use any offsets etc to make the data anything + * other than contiguous starting from 0. + */ +void fuse_session_process_buf_int(struct fuse_session *se, + struct fuse_bufvec *bufv, + struct fuse_chan *ch) +{ + const struct fuse_buf *buf = bufv->buf; + struct fuse_mbuf_iter iter = FUSE_MBUF_ITER_INIT(buf); + struct fuse_in_header *in; + struct fuse_req *req; + int err; + + /* The first buffer must be a memory buffer */ + assert(!(buf->flags & FUSE_BUF_IS_FD)); + + in = fuse_mbuf_iter_advance(&iter, sizeof(*in)); + assert(in); /* caller guarantees the input buffer is large enough */ + + fuse_log( + FUSE_LOG_DEBUG, + "unique: %llu, opcode: %s (%i), nodeid: %llu, insize: %zu, pid: %u\n", + (unsigned long long)in->unique, opname((enum fuse_opcode)in->opcode), + in->opcode, (unsigned long long)in->nodeid, buf->size, in->pid); + + req = fuse_ll_alloc_req(se); + if (req == NULL) { + struct fuse_out_header out = { + .unique = in->unique, + .error = -ENOMEM, + }; + struct iovec iov = { + .iov_base = &out, + .iov_len = sizeof(struct fuse_out_header), + }; + + fuse_send_msg(se, ch, &iov, 1); + return; + } + + req->unique = in->unique; + req->ctx.uid = in->uid; + req->ctx.gid = in->gid; + req->ctx.pid = in->pid; + req->ch = ch; + + /* + * INIT and DESTROY requests are serialized, all other request types + * run in parallel. This prevents races between FUSE_INIT and ordinary + * requests, FUSE_INIT and FUSE_INIT, FUSE_INIT and FUSE_DESTROY, and + * FUSE_DESTROY and FUSE_DESTROY. + */ + if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT || + in->opcode == FUSE_DESTROY) { + pthread_rwlock_wrlock(&se->init_rwlock); + } else { + pthread_rwlock_rdlock(&se->init_rwlock); + } + + err = EIO; + if (!se->got_init) { + enum fuse_opcode expected; + + expected = se->cuse_data ? CUSE_INIT : FUSE_INIT; + if (in->opcode != expected) { + goto reply_err; + } + } else if (in->opcode == FUSE_INIT || in->opcode == CUSE_INIT) { + if (fuse_lowlevel_is_virtio(se)) { + /* + * TODO: This is after a hard reboot typically, we need to do + * a destroy, but we can't reply to this request yet so + * we can't use do_destroy + */ + fuse_log(FUSE_LOG_DEBUG, "%s: reinit\n", __func__); + se->got_destroy = 1; + se->got_init = 0; + if (se->op.destroy) { + se->op.destroy(se->userdata); + } + } else { + goto reply_err; + } + } + + err = EACCES; + /* Implement -o allow_root */ + if (se->deny_others && in->uid != se->owner && in->uid != 0 && + in->opcode != FUSE_INIT && in->opcode != FUSE_READ && + in->opcode != FUSE_WRITE && in->opcode != FUSE_FSYNC && + in->opcode != FUSE_RELEASE && in->opcode != FUSE_READDIR && + in->opcode != FUSE_FSYNCDIR && in->opcode != FUSE_RELEASEDIR && + in->opcode != FUSE_NOTIFY_REPLY && in->opcode != FUSE_READDIRPLUS) { + goto reply_err; + } + + err = ENOSYS; + if (in->opcode >= FUSE_MAXOP || !fuse_ll_ops[in->opcode].func) { + goto reply_err; + } + if (in->opcode != FUSE_INTERRUPT) { + struct fuse_req *intr; + pthread_mutex_lock(&se->lock); + intr = check_interrupt(se, req); + list_add_req(req, &se->list); + pthread_mutex_unlock(&se->lock); + if (intr) { + fuse_reply_err(intr, EAGAIN); + } + } + + if (in->opcode == FUSE_WRITE && se->op.write_buf) { + do_write_buf(req, in->nodeid, &iter, bufv); + } else { + fuse_ll_ops[in->opcode].func(req, in->nodeid, &iter); + } + + pthread_rwlock_unlock(&se->init_rwlock); + return; + +reply_err: + fuse_reply_err(req, err); + pthread_rwlock_unlock(&se->init_rwlock); +} + +#define LL_OPTION(n, o, v) \ + { \ + n, offsetof(struct fuse_session, o), v \ + } + +static const struct fuse_opt fuse_ll_opts[] = { + LL_OPTION("debug", debug, 1), + LL_OPTION("-d", debug, 1), + LL_OPTION("--debug", debug, 1), + LL_OPTION("allow_root", deny_others, 1), + LL_OPTION("--socket-path=%s", vu_socket_path, 0), + LL_OPTION("--socket-group=%s", vu_socket_group, 0), + LL_OPTION("--fd=%d", vu_listen_fd, 0), + LL_OPTION("--thread-pool-size=%d", thread_pool_size, 0), + FUSE_OPT_END +}; + +void fuse_lowlevel_version(void) +{ + printf("using FUSE kernel interface version %i.%i\n", FUSE_KERNEL_VERSION, + FUSE_KERNEL_MINOR_VERSION); +} + +void fuse_lowlevel_help(void) +{ + /* + * These are not all options, but the ones that are + * potentially of interest to an end-user + */ + printf( + " -o allow_root allow access by root\n" + " --socket-path=PATH path for the vhost-user socket\n" + " --socket-group=GRNAME name of group for the vhost-user socket\n" + " --fd=FDNUM fd number of vhost-user socket\n" + " --thread-pool-size=NUM thread pool size limit (default %d)\n", + THREAD_POOL_SIZE); +} + +void fuse_session_destroy(struct fuse_session *se) +{ + if (se->got_init && !se->got_destroy) { + if (se->op.destroy) { + se->op.destroy(se->userdata); + } + } + pthread_rwlock_destroy(&se->init_rwlock); + pthread_mutex_destroy(&se->lock); + free(se->cuse_data); + if (se->fd != -1) { + close(se->fd); + } + + if (fuse_lowlevel_is_virtio(se)) { + virtio_session_close(se); + } + + free(se->vu_socket_path); + se->vu_socket_path = NULL; + + g_free(se); +} + + +struct fuse_session *fuse_session_new(struct fuse_args *args, + const struct fuse_lowlevel_ops *op, + size_t op_size, void *userdata) +{ + struct fuse_session *se; + + if (sizeof(struct fuse_lowlevel_ops) < op_size) { + fuse_log( + FUSE_LOG_ERR, + "fuse: warning: library too old, some operations may not work\n"); + op_size = sizeof(struct fuse_lowlevel_ops); + } + + if (args->argc == 0) { + fuse_log(FUSE_LOG_ERR, + "fuse: empty argv passed to fuse_session_new().\n"); + return NULL; + } + + se = g_try_new0(struct fuse_session, 1); + if (se == NULL) { + fuse_log(FUSE_LOG_ERR, "fuse: failed to allocate fuse object\n"); + goto out1; + } + se->fd = -1; + se->vu_listen_fd = -1; + se->thread_pool_size = THREAD_POOL_SIZE; + se->conn.max_write = UINT_MAX; + se->conn.max_readahead = UINT_MAX; + + /* Parse options */ + if (fuse_opt_parse(args, se, fuse_ll_opts, NULL) == -1) { + goto out2; + } + if (args->argc == 1 && args->argv[0][0] == '-') { + fuse_log(FUSE_LOG_ERR, + "fuse: warning: argv[0] looks like an option, but " + "will be ignored\n"); + } else if (args->argc != 1) { + int i; + fuse_log(FUSE_LOG_ERR, "fuse: unknown option(s): `"); + for (i = 1; i < args->argc - 1; i++) { + fuse_log(FUSE_LOG_ERR, "%s ", args->argv[i]); + } + fuse_log(FUSE_LOG_ERR, "%s'\n", args->argv[i]); + goto out4; + } + + if (!se->vu_socket_path && se->vu_listen_fd < 0) { + fuse_log(FUSE_LOG_ERR, "fuse: missing --socket-path or --fd option\n"); + goto out4; + } + if (se->vu_socket_path && se->vu_listen_fd >= 0) { + fuse_log(FUSE_LOG_ERR, + "fuse: --socket-path and --fd cannot be given together\n"); + goto out4; + } + if (se->vu_socket_group && !se->vu_socket_path) { + fuse_log(FUSE_LOG_ERR, + "fuse: --socket-group can only be used with --socket-path\n"); + goto out4; + } + + se->bufsize = FUSE_MAX_MAX_PAGES * getpagesize() + FUSE_BUFFER_HEADER_SIZE; + + list_init_req(&se->list); + list_init_req(&se->interrupts); + fuse_mutex_init(&se->lock); + pthread_rwlock_init(&se->init_rwlock, NULL); + + memcpy(&se->op, op, op_size); + se->owner = getuid(); + se->userdata = userdata; + + return se; + +out4: + fuse_opt_free_args(args); +out2: + g_free(se); +out1: + return NULL; +} + +int fuse_session_mount(struct fuse_session *se) +{ + return virtio_session_mount(se); +} + +int fuse_session_fd(struct fuse_session *se) +{ + return se->fd; +} + +void fuse_session_unmount(struct fuse_session *se) +{ +} + +int fuse_lowlevel_is_virtio(struct fuse_session *se) +{ + return !!se->virtio_dev; +} + +void fuse_session_exit(struct fuse_session *se) +{ + se->exited = 1; +} + +void fuse_session_reset(struct fuse_session *se) +{ + se->exited = 0; + se->error = 0; +} + +int fuse_session_exited(struct fuse_session *se) +{ + return se->exited; +} diff --git a/tools/virtiofsd/fuse_lowlevel.h b/tools/virtiofsd/fuse_lowlevel.h new file mode 100644 index 000000000..c55c0ca2f --- /dev/null +++ b/tools/virtiofsd/fuse_lowlevel.h @@ -0,0 +1,1975 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB. + */ + +#ifndef FUSE_LOWLEVEL_H_ +#define FUSE_LOWLEVEL_H_ + +/** + * @file + * + * Low level API + * + * IMPORTANT: you should define FUSE_USE_VERSION before including this + * header. To use the newest API define it to 31 (recommended for any + * new application). + */ + +#ifndef FUSE_USE_VERSION +#error FUSE_USE_VERSION not defined +#endif + +#include "fuse_common.h" + +#include <sys/statvfs.h> +#include <sys/uio.h> +#include <utime.h> + +/* + * Miscellaneous definitions + */ + +/** The node ID of the root inode */ +#define FUSE_ROOT_ID 1 + +/** Inode number type */ +typedef uint64_t fuse_ino_t; + +/** Request pointer type */ +typedef struct fuse_req *fuse_req_t; + +/** + * Session + * + * This provides hooks for processing requests, and exiting + */ +struct fuse_session; + +/** Directory entry parameters supplied to fuse_reply_entry() */ +struct fuse_entry_param { + /** + * Unique inode number + * + * In lookup, zero means negative entry (from version 2.5) + * Returning ENOENT also means negative entry, but by setting zero + * ino the kernel may cache negative entries for entry_timeout + * seconds. + */ + fuse_ino_t ino; + + /** + * Generation number for this entry. + * + * If the file system will be exported over NFS, the + * ino/generation pairs need to be unique over the file + * system's lifetime (rather than just the mount time). So if + * the file system reuses an inode after it has been deleted, + * it must assign a new, previously unused generation number + * to the inode at the same time. + * + */ + uint64_t generation; + + /** + * Inode attributes. + * + * Even if attr_timeout == 0, attr must be correct. For example, + * for open(), FUSE uses attr.st_size from lookup() to determine + * how many bytes to request. If this value is not correct, + * incorrect data will be returned. + */ + struct stat attr; + + /** + * Validity timeout (in seconds) for inode attributes. If + * attributes only change as a result of requests that come + * through the kernel, this should be set to a very large + * value. + */ + double attr_timeout; + + /** + * Validity timeout (in seconds) for the name. If directory + * entries are changed/deleted only as a result of requests + * that come through the kernel, this should be set to a very + * large value. + */ + double entry_timeout; + + /** + * Flags for fuse_attr.flags that do not fit into attr. + */ + uint32_t attr_flags; +}; + +/** + * Additional context associated with requests. + * + * Note that the reported client uid, gid and pid may be zero in some + * situations. For example, if the FUSE file system is running in a + * PID or user namespace but then accessed from outside the namespace, + * there is no valid uid/pid/gid that could be reported. + */ +struct fuse_ctx { + /** User ID of the calling process */ + uid_t uid; + + /** Group ID of the calling process */ + gid_t gid; + + /** Thread ID of the calling process */ + pid_t pid; + + /** Umask of the calling process */ + mode_t umask; +}; + +struct fuse_forget_data { + fuse_ino_t ino; + uint64_t nlookup; +}; + +/* 'to_set' flags in setattr */ +#define FUSE_SET_ATTR_MODE (1 << 0) +#define FUSE_SET_ATTR_UID (1 << 1) +#define FUSE_SET_ATTR_GID (1 << 2) +#define FUSE_SET_ATTR_SIZE (1 << 3) +#define FUSE_SET_ATTR_ATIME (1 << 4) +#define FUSE_SET_ATTR_MTIME (1 << 5) +#define FUSE_SET_ATTR_ATIME_NOW (1 << 7) +#define FUSE_SET_ATTR_MTIME_NOW (1 << 8) +#define FUSE_SET_ATTR_CTIME (1 << 10) +#define FUSE_SET_ATTR_KILL_SUIDGID (1 << 11) + +/* + * Request methods and replies + */ + +/** + * Low level filesystem operations + * + * Most of the methods (with the exception of init and destroy) + * receive a request handle (fuse_req_t) as their first argument. + * This handle must be passed to one of the specified reply functions. + * + * This may be done inside the method invocation, or after the call + * has returned. The request handle is valid until one of the reply + * functions is called. + * + * Other pointer arguments (name, fuse_file_info, etc) are not valid + * after the call has returned, so if they are needed later, their + * contents have to be copied. + * + * In general, all methods are expected to perform any necessary + * permission checking. However, a filesystem may delegate this task + * to the kernel by passing the `default_permissions` mount option to + * `fuse_session_new()`. In this case, methods will only be called if + * the kernel's permission check has succeeded. + * + * The filesystem sometimes needs to handle a return value of -ENOENT + * from the reply function, which means, that the request was + * interrupted, and the reply discarded. For example if + * fuse_reply_open() return -ENOENT means, that the release method for + * this file will not be called. + */ +struct fuse_lowlevel_ops { + /** + * Initialize filesystem + * + * This function is called when libfuse establishes + * communication with the FUSE kernel module. The file system + * should use this module to inspect and/or modify the + * connection parameters provided in the `conn` structure. + * + * Note that some parameters may be overwritten by options + * passed to fuse_session_new() which take precedence over the + * values set in this handler. + * + * There's no reply to this function + * + * @param userdata the user data passed to fuse_session_new() + */ + void (*init)(void *userdata, struct fuse_conn_info *conn); + + /** + * Clean up filesystem. + * + * Called on filesystem exit. When this method is called, the + * connection to the kernel may be gone already, so that eg. calls + * to fuse_lowlevel_notify_* will fail. + * + * There's no reply to this function + * + * @param userdata the user data passed to fuse_session_new() + */ + void (*destroy)(void *userdata); + + /** + * Look up a directory entry by name and get its attributes. + * + * Valid replies: + * fuse_reply_entry + * fuse_reply_err + * + * @param req request handle + * @param parent inode number of the parent directory + * @param name the name to look up + */ + void (*lookup)(fuse_req_t req, fuse_ino_t parent, const char *name); + + /** + * Forget about an inode + * + * This function is called when the kernel removes an inode + * from its internal caches. + * + * The inode's lookup count increases by one for every call to + * fuse_reply_entry and fuse_reply_create. The nlookup parameter + * indicates by how much the lookup count should be decreased. + * + * Inodes with a non-zero lookup count may receive request from + * the kernel even after calls to unlink, rmdir or (when + * overwriting an existing file) rename. Filesystems must handle + * such requests properly and it is recommended to defer removal + * of the inode until the lookup count reaches zero. Calls to + * unlink, rmdir or rename will be followed closely by forget + * unless the file or directory is open, in which case the + * kernel issues forget only after the release or releasedir + * calls. + * + * Note that if a file system will be exported over NFS the + * inodes lifetime must extend even beyond forget. See the + * generation field in struct fuse_entry_param above. + * + * On unmount the lookup count for all inodes implicitly drops + * to zero. It is not guaranteed that the file system will + * receive corresponding forget messages for the affected + * inodes. + * + * Valid replies: + * fuse_reply_none + * + * @param req request handle + * @param ino the inode number + * @param nlookup the number of lookups to forget + */ + void (*forget)(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup); + + /** + * Get file attributes. + * + * If writeback caching is enabled, the kernel may have a + * better idea of a file's length than the FUSE file system + * (eg if there has been a write that extended the file size, + * but that has not yet been passed to the filesystem.n + * + * In this case, the st_size value provided by the file system + * will be ignored. + * + * Valid replies: + * fuse_reply_attr + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi for future use, currently always NULL + */ + void (*getattr)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); + + /** + * Set file attributes + * + * In the 'attr' argument only members indicated by the 'to_set' + * bitmask contain valid values. Other members contain undefined + * values. + * + * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is + * expected to reset the setuid and setgid bits if the file + * size or owner is being changed. + * + * If the setattr was invoked from the ftruncate() system call + * under Linux kernel versions 2.6.15 or later, the fi->fh will + * contain the value set by the open method or will be undefined + * if the open method didn't set any value. Otherwise (not + * ftruncate call, or kernel version earlier than 2.6.15) the fi + * parameter will be NULL. + * + * Valid replies: + * fuse_reply_attr + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param attr the attributes + * @param to_set bit mask of attributes which should be set + * @param fi file information, or NULL + */ + void (*setattr)(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + int to_set, struct fuse_file_info *fi); + + /** + * Read symbolic link + * + * Valid replies: + * fuse_reply_readlink + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + */ + void (*readlink)(fuse_req_t req, fuse_ino_t ino); + + /** + * Create file node + * + * Create a regular file, character device, block device, fifo or + * socket node. + * + * Valid replies: + * fuse_reply_entry + * fuse_reply_err + * + * @param req request handle + * @param parent inode number of the parent directory + * @param name to create + * @param mode file type and mode with which to create the new file + * @param rdev the device number (only valid if created file is a device) + */ + void (*mknod)(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, dev_t rdev); + + /** + * Create a directory + * + * Valid replies: + * fuse_reply_entry + * fuse_reply_err + * + * @param req request handle + * @param parent inode number of the parent directory + * @param name to create + * @param mode with which to create the new file + */ + void (*mkdir)(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode); + + /** + * Remove a file + * + * If the file's inode's lookup count is non-zero, the file + * system is expected to postpone any removal of the inode + * until the lookup count reaches zero (see description of the + * forget function). + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param parent inode number of the parent directory + * @param name to remove + */ + void (*unlink)(fuse_req_t req, fuse_ino_t parent, const char *name); + + /** + * Remove a directory + * + * If the directory's inode's lookup count is non-zero, the + * file system is expected to postpone any removal of the + * inode until the lookup count reaches zero (see description + * of the forget function). + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param parent inode number of the parent directory + * @param name to remove + */ + void (*rmdir)(fuse_req_t req, fuse_ino_t parent, const char *name); + + /** + * Create a symbolic link + * + * Valid replies: + * fuse_reply_entry + * fuse_reply_err + * + * @param req request handle + * @param link the contents of the symbolic link + * @param parent inode number of the parent directory + * @param name to create + */ + void (*symlink)(fuse_req_t req, const char *link, fuse_ino_t parent, + const char *name); + + /** + * Rename a file + * + * If the target exists it should be atomically replaced. If + * the target's inode's lookup count is non-zero, the file + * system is expected to postpone any removal of the inode + * until the lookup count reaches zero (see description of the + * forget function). + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure with error code EINVAL, i.e. all + * future bmap requests will fail with EINVAL without being + * send to the filesystem process. + * + * *flags* may be `RENAME_EXCHANGE` or `RENAME_NOREPLACE`. If + * RENAME_NOREPLACE is specified, the filesystem must not + * overwrite *newname* if it exists and return an error + * instead. If `RENAME_EXCHANGE` is specified, the filesystem + * must atomically exchange the two files, i.e. both must + * exist and neither may be deleted. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param parent inode number of the old parent directory + * @param name old name + * @param newparent inode number of the new parent directory + * @param newname new name + */ + void (*rename)(fuse_req_t req, fuse_ino_t parent, const char *name, + fuse_ino_t newparent, const char *newname, + unsigned int flags); + + /** + * Create a hard link + * + * Valid replies: + * fuse_reply_entry + * fuse_reply_err + * + * @param req request handle + * @param ino the old inode number + * @param newparent inode number of the new parent directory + * @param newname new name to create + */ + void (*link)(fuse_req_t req, fuse_ino_t ino, fuse_ino_t newparent, + const char *newname); + + /** + * Open a file + * + * Open flags are available in fi->flags. The following rules + * apply. + * + * - Creation (O_CREAT, O_EXCL, O_NOCTTY) flags will be + * filtered out / handled by the kernel. + * + * - Access modes (O_RDONLY, O_WRONLY, O_RDWR) should be used + * by the filesystem to check if the operation is + * permitted. If the ``-o default_permissions`` mount + * option is given, this check is already done by the + * kernel before calling open() and may thus be omitted by + * the filesystem. + * + * - When writeback caching is enabled, the kernel may send + * read requests even for files opened with O_WRONLY. The + * filesystem should be prepared to handle this. + * + * - When writeback caching is disabled, the filesystem is + * expected to properly handle the O_APPEND flag and ensure + * that each write is appending to the end of the file. + * + * - When writeback caching is enabled, the kernel will + * handle O_APPEND. However, unless all changes to the file + * come through the kernel this will not work reliably. The + * filesystem should thus either ignore the O_APPEND flag + * (and let the kernel handle it), or return an error + * (indicating that reliably O_APPEND is not available). + * + * Filesystem may store an arbitrary file handle (pointer, + * index, etc) in fi->fh, and use this in other all other file + * operations (read, write, flush, release, fsync). + * + * Filesystem may also implement stateless file I/O and not store + * anything in fi->fh. + * + * There are also some flags (direct_io, keep_cache) which the + * filesystem may set in fi, to change the way the file is opened. + * See fuse_file_info structure in <fuse_common.h> for more details. + * + * If this request is answered with an error code of ENOSYS + * and FUSE_CAP_NO_OPEN_SUPPORT is set in + * `fuse_conn_info.capable`, this is treated as success and + * future calls to open and release will also succeed without being + * sent to the filesystem process. + * + * Valid replies: + * fuse_reply_open + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + */ + void (*open)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); + + /** + * Read data + * + * Read should send exactly the number of bytes requested except + * on EOF or error, otherwise the rest of the data will be + * substituted with zeroes. An exception to this is when the file + * has been opened in 'direct_io' mode, in which case the return + * value of the read system call will reflect the return value of + * this operation. + * + * fi->fh will contain the value set by the open method, or will + * be undefined if the open method didn't set any value. + * + * Valid replies: + * fuse_reply_buf + * fuse_reply_iov + * fuse_reply_data + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param size number of bytes to read + * @param off offset to read from + * @param fi file information + */ + void (*read)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, + struct fuse_file_info *fi); + + /** + * Write data + * + * Write should return exactly the number of bytes requested + * except on error. An exception to this is when the file has + * been opened in 'direct_io' mode, in which case the return value + * of the write system call will reflect the return value of this + * operation. + * + * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is + * expected to reset the setuid and setgid bits. + * + * fi->fh will contain the value set by the open method, or will + * be undefined if the open method didn't set any value. + * + * Valid replies: + * fuse_reply_write + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param buf data to write + * @param size number of bytes to write + * @param off offset to write to + * @param fi file information + */ + void (*write)(fuse_req_t req, fuse_ino_t ino, const char *buf, size_t size, + off_t off, struct fuse_file_info *fi); + + /** + * Flush method + * + * This is called on each close() of the opened file. + * + * Since file descriptors can be duplicated (dup, dup2, fork), for + * one open call there may be many flush calls. + * + * Filesystems shouldn't assume that flush will always be called + * after some writes, or that if will be called at all. + * + * fi->fh will contain the value set by the open method, or will + * be undefined if the open method didn't set any value. + * + * NOTE: the name of the method is misleading, since (unlike + * fsync) the filesystem is not forced to flush pending writes. + * One reason to flush data is if the filesystem wants to return + * write errors during close. However, such use is non-portable + * because POSIX does not require [close] to wait for delayed I/O to + * complete. + * + * If the filesystem supports file locking operations (setlk, + * getlk) it should remove all locks belonging to 'fi->owner'. + * + * If this request is answered with an error code of ENOSYS, + * this is treated as success and future calls to flush() will + * succeed automatically without being send to the filesystem + * process. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + * + * [close]: + * http://pubs.opengroup.org/onlinepubs/9699919799/functions/close.html + */ + void (*flush)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); + + /** + * Release an open file + * + * Release is called when there are no more references to an open + * file: all file descriptors are closed and all memory mappings + * are unmapped. + * + * For every open call there will be exactly one release call (unless + * the filesystem is force-unmounted). + * + * The filesystem may reply with an error, but error values are + * not returned to close() or munmap() which triggered the + * release. + * + * fi->fh will contain the value set by the open method, or will + * be undefined if the open method didn't set any value. + * fi->flags will contain the same flags as for open. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + */ + void (*release)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); + + /** + * Synchronize file contents + * + * If the datasync parameter is non-zero, then only the user data + * should be flushed, not the meta data. + * + * If this request is answered with an error code of ENOSYS, + * this is treated as success and future calls to fsync() will + * succeed automatically without being send to the filesystem + * process. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param datasync flag indicating if only data should be flushed + * @param fi file information + */ + void (*fsync)(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi); + + /** + * Open a directory + * + * Filesystem may store an arbitrary file handle (pointer, index, + * etc) in fi->fh, and use this in other all other directory + * stream operations (readdir, releasedir, fsyncdir). + * + * If this request is answered with an error code of ENOSYS and + * FUSE_CAP_NO_OPENDIR_SUPPORT is set in `fuse_conn_info.capable`, + * this is treated as success and future calls to opendir and + * releasedir will also succeed without being sent to the filesystem + * process. In addition, the kernel will cache readdir results + * as if opendir returned FOPEN_KEEP_CACHE | FOPEN_CACHE_DIR. + * + * Valid replies: + * fuse_reply_open + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + */ + void (*opendir)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi); + + /** + * Read directory + * + * Send a buffer filled using fuse_add_direntry(), with size not + * exceeding the requested size. Send an empty buffer on end of + * stream. + * + * fi->fh will contain the value set by the opendir method, or + * will be undefined if the opendir method didn't set any value. + * + * Returning a directory entry from readdir() does not affect + * its lookup count. + * + * If off_t is non-zero, then it will correspond to one of the off_t + * values that was previously returned by readdir() for the same + * directory handle. In this case, readdir() should skip over entries + * coming before the position defined by the off_t value. If entries + * are added or removed while the directory handle is open, they filesystem + * may still include the entries that have been removed, and may not + * report the entries that have been created. However, addition or + * removal of entries must never cause readdir() to skip over unrelated + * entries or to report them more than once. This means + * that off_t can not be a simple index that enumerates the entries + * that have been returned but must contain sufficient information to + * uniquely determine the next directory entry to return even when the + * set of entries is changing. + * + * The function does not have to report the '.' and '..' + * entries, but is allowed to do so. Note that, if readdir does + * not return '.' or '..', they will not be implicitly returned, + * and this behavior is observable by the caller. + * + * Valid replies: + * fuse_reply_buf + * fuse_reply_data + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param size maximum number of bytes to send + * @param off offset to continue reading the directory stream + * @param fi file information + */ + void (*readdir)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, + struct fuse_file_info *fi); + + /** + * Release an open directory + * + * For every opendir call there will be exactly one releasedir + * call (unless the filesystem is force-unmounted). + * + * fi->fh will contain the value set by the opendir method, or + * will be undefined if the opendir method didn't set any value. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + */ + void (*releasedir)(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi); + + /** + * Synchronize directory contents + * + * If the datasync parameter is non-zero, then only the directory + * contents should be flushed, not the meta data. + * + * fi->fh will contain the value set by the opendir method, or + * will be undefined if the opendir method didn't set any value. + * + * If this request is answered with an error code of ENOSYS, + * this is treated as success and future calls to fsyncdir() will + * succeed automatically without being send to the filesystem + * process. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param datasync flag indicating if only data should be flushed + * @param fi file information + */ + void (*fsyncdir)(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi); + + /** + * Get file system statistics + * + * Valid replies: + * fuse_reply_statfs + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number, zero means "undefined" + */ + void (*statfs)(fuse_req_t req, fuse_ino_t ino); + + /** + * Set an extended attribute + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure with error code EOPNOTSUPP, i.e. all + * future setxattr() requests will fail with EOPNOTSUPP without being + * send to the filesystem process. + * + * Valid replies: + * fuse_reply_err + */ + void (*setxattr)(fuse_req_t req, fuse_ino_t ino, const char *name, + const char *value, size_t size, int flags, + uint32_t setxattr_flags); + + /** + * Get an extended attribute + * + * If size is zero, the size of the value should be sent with + * fuse_reply_xattr. + * + * If the size is non-zero, and the value fits in the buffer, the + * value should be sent with fuse_reply_buf. + * + * If the size is too small for the value, the ERANGE error should + * be sent. + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure with error code EOPNOTSUPP, i.e. all + * future getxattr() requests will fail with EOPNOTSUPP without being + * send to the filesystem process. + * + * Valid replies: + * fuse_reply_buf + * fuse_reply_data + * fuse_reply_xattr + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param name of the extended attribute + * @param size maximum size of the value to send + */ + void (*getxattr)(fuse_req_t req, fuse_ino_t ino, const char *name, + size_t size); + + /** + * List extended attribute names + * + * If size is zero, the total size of the attribute list should be + * sent with fuse_reply_xattr. + * + * If the size is non-zero, and the null character separated + * attribute list fits in the buffer, the list should be sent with + * fuse_reply_buf. + * + * If the size is too small for the list, the ERANGE error should + * be sent. + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure with error code EOPNOTSUPP, i.e. all + * future listxattr() requests will fail with EOPNOTSUPP without being + * send to the filesystem process. + * + * Valid replies: + * fuse_reply_buf + * fuse_reply_data + * fuse_reply_xattr + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param size maximum size of the list to send + */ + void (*listxattr)(fuse_req_t req, fuse_ino_t ino, size_t size); + + /** + * Remove an extended attribute + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure with error code EOPNOTSUPP, i.e. all + * future removexattr() requests will fail with EOPNOTSUPP without being + * send to the filesystem process. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param name of the extended attribute + */ + void (*removexattr)(fuse_req_t req, fuse_ino_t ino, const char *name); + + /** + * Check file access permissions + * + * This will be called for the access() and chdir() system + * calls. If the 'default_permissions' mount option is given, + * this method is not called. + * + * This method is not called under Linux kernel versions 2.4.x + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent success, i.e. this and all future access() + * requests will succeed without being send to the filesystem process. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param mask requested access mode + */ + void (*access)(fuse_req_t req, fuse_ino_t ino, int mask); + + /** + * Create and open a file + * + * If the file does not exist, first create it with the specified + * mode, and then open it. + * + * See the description of the open handler for more + * information. + * + * If this method is not implemented or under Linux kernel + * versions earlier than 2.6.15, the mknod() and open() methods + * will be called instead. + * + * If this request is answered with an error code of ENOSYS, the handler + * is treated as not implemented (i.e., for this and future requests the + * mknod() and open() handlers will be called instead). + * + * Valid replies: + * fuse_reply_create + * fuse_reply_err + * + * @param req request handle + * @param parent inode number of the parent directory + * @param name to create + * @param mode file type and mode with which to create the new file + * @param fi file information + */ + void (*create)(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, struct fuse_file_info *fi); + + /** + * Test for a POSIX file lock + * + * Valid replies: + * fuse_reply_lock + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + * @param lock the region/type to test + */ + void (*getlk)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + struct flock *lock); + + /** + * Acquire, modify or release a POSIX file lock + * + * For POSIX threads (NPTL) there's a 1-1 relation between pid and + * owner, but otherwise this is not always the case. For checking + * lock ownership, 'fi->owner' must be used. The l_pid field in + * 'struct flock' should only be used to fill in this field in + * getlk(). + * + * Note: if the locking methods are not implemented, the kernel + * will still allow file locking to work locally. Hence these are + * only interesting for network filesystems and similar. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + * @param lock the region/type to set + * @param sleep locking operation may sleep + */ + void (*setlk)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + struct flock *lock, int sleep); + + /** + * Map block index within file to block index within device + * + * Note: This makes sense only for block device backed filesystems + * mounted with the 'blkdev' option + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure, i.e. all future bmap() requests will + * fail with the same error code without being send to the filesystem + * process. + * + * Valid replies: + * fuse_reply_bmap + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param blocksize unit of block index + * @param idx block index within file + */ + void (*bmap)(fuse_req_t req, fuse_ino_t ino, size_t blocksize, + uint64_t idx); + + /** + * Ioctl + * + * Note: For unrestricted ioctls (not allowed for FUSE + * servers), data in and out areas can be discovered by giving + * iovs and setting FUSE_IOCTL_RETRY in *flags*. For + * restricted ioctls, kernel prepares in/out data area + * according to the information encoded in cmd. + * + * Valid replies: + * fuse_reply_ioctl_retry + * fuse_reply_ioctl + * fuse_reply_ioctl_iov + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param cmd ioctl command + * @param arg ioctl argument + * @param fi file information + * @param flags for FUSE_IOCTL_* flags + * @param in_buf data fetched from the caller + * @param in_bufsz number of fetched bytes + * @param out_bufsz maximum size of output data + * + * Note : the unsigned long request submitted by the application + * is truncated to 32 bits. + */ + void (*ioctl)(fuse_req_t req, fuse_ino_t ino, unsigned int cmd, void *arg, + struct fuse_file_info *fi, unsigned flags, const void *in_buf, + size_t in_bufsz, size_t out_bufsz); + + /** + * Poll for IO readiness + * + * Note: If ph is non-NULL, the client should notify + * when IO readiness events occur by calling + * fuse_lowlevel_notify_poll() with the specified ph. + * + * Regardless of the number of times poll with a non-NULL ph + * is received, single notification is enough to clear all. + * Notifying more times incurs overhead but doesn't harm + * correctness. + * + * The callee is responsible for destroying ph with + * fuse_pollhandle_destroy() when no longer in use. + * + * If this request is answered with an error code of ENOSYS, this is + * treated as success (with a kernel-defined default poll-mask) and + * future calls to pull() will succeed the same way without being send + * to the filesystem process. + * + * Valid replies: + * fuse_reply_poll + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + * @param ph poll handle to be used for notification + */ + void (*poll)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + struct fuse_pollhandle *ph); + + /** + * Write data made available in a buffer + * + * This is a more generic version of the ->write() method. If + * FUSE_CAP_SPLICE_READ is set in fuse_conn_info.want and the + * kernel supports splicing from the fuse device, then the + * data will be made available in pipe for supporting zero + * copy data transfer. + * + * buf->count is guaranteed to be one (and thus buf->idx is + * always zero). The write_buf handler must ensure that + * bufv->off is correctly updated (reflecting the number of + * bytes read from bufv->buf[0]). + * + * Unless FUSE_CAP_HANDLE_KILLPRIV is disabled, this method is + * expected to reset the setuid and setgid bits. + * + * Valid replies: + * fuse_reply_write + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param bufv buffer containing the data + * @param off offset to write to + * @param fi file information + */ + void (*write_buf)(fuse_req_t req, fuse_ino_t ino, struct fuse_bufvec *bufv, + off_t off, struct fuse_file_info *fi); + + /** + * Forget about multiple inodes + * + * See description of the forget function for more + * information. + * + * Valid replies: + * fuse_reply_none + * + * @param req request handle + */ + void (*forget_multi)(fuse_req_t req, size_t count, + struct fuse_forget_data *forgets); + + /** + * Acquire, modify or release a BSD file lock + * + * Note: if the locking methods are not implemented, the kernel + * will still allow file locking to work locally. Hence these are + * only interesting for network filesystems and similar. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param fi file information + * @param op the locking operation, see flock(2) + */ + void (*flock)(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + int op); + + /** + * Allocate requested space. If this function returns success then + * subsequent writes to the specified range shall not fail due to the lack + * of free space on the file system storage media. + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure with error code EOPNOTSUPP, i.e. all + * future fallocate() requests will fail with EOPNOTSUPP without being + * send to the filesystem process. + * + * Valid replies: + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param offset starting point for allocated region + * @param length size of allocated region + * @param mode determines the operation to be performed on the given range, + * see fallocate(2) + */ + void (*fallocate)(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, + off_t length, struct fuse_file_info *fi); + + /** + * Read directory with attributes + * + * Send a buffer filled using fuse_add_direntry_plus(), with size not + * exceeding the requested size. Send an empty buffer on end of + * stream. + * + * fi->fh will contain the value set by the opendir method, or + * will be undefined if the opendir method didn't set any value. + * + * In contrast to readdir() (which does not affect the lookup counts), + * the lookup count of every entry returned by readdirplus(), except "." + * and "..", is incremented by one. + * + * Valid replies: + * fuse_reply_buf + * fuse_reply_data + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param size maximum number of bytes to send + * @param off offset to continue reading the directory stream + * @param fi file information + */ + void (*readdirplus)(fuse_req_t req, fuse_ino_t ino, size_t size, off_t off, + struct fuse_file_info *fi); + + /** + * Copy a range of data from one file to another + * + * Performs an optimized copy between two file descriptors without the + * additional cost of transferring data through the FUSE kernel module + * to user space (glibc) and then back into the FUSE filesystem again. + * + * In case this method is not implemented, glibc falls back to reading + * data from the source and writing to the destination. Effectively + * doing an inefficient copy of the data. + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure with error code EOPNOTSUPP, i.e. all + * future copy_file_range() requests will fail with EOPNOTSUPP without + * being send to the filesystem process. + * + * Valid replies: + * fuse_reply_write + * fuse_reply_err + * + * @param req request handle + * @param ino_in the inode number or the source file + * @param off_in starting point from were the data should be read + * @param fi_in file information of the source file + * @param ino_out the inode number or the destination file + * @param off_out starting point where the data should be written + * @param fi_out file information of the destination file + * @param len maximum size of the data to copy + * @param flags passed along with the copy_file_range() syscall + */ + void (*copy_file_range)(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, + struct fuse_file_info *fi_in, fuse_ino_t ino_out, + off_t off_out, struct fuse_file_info *fi_out, + size_t len, int flags); + + /** + * Find next data or hole after the specified offset + * + * If this request is answered with an error code of ENOSYS, this is + * treated as a permanent failure, i.e. all future lseek() requests will + * fail with the same error code without being send to the filesystem + * process. + * + * Valid replies: + * fuse_reply_lseek + * fuse_reply_err + * + * @param req request handle + * @param ino the inode number + * @param off offset to start search from + * @param whence either SEEK_DATA or SEEK_HOLE + * @param fi file information + */ + void (*lseek)(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, + struct fuse_file_info *fi); +}; + +/** + * Reply with an error code or success. + * + * Possible requests: + * all except forget + * + * Whereever possible, error codes should be chosen from the list of + * documented error conditions in the corresponding system calls + * manpage. + * + * An error code of ENOSYS is sometimes treated specially. This is + * indicated in the documentation of the affected handler functions. + * + * The following requests may be answered with a zero error code: + * unlink, rmdir, rename, flush, release, fsync, fsyncdir, setxattr, + * removexattr, setlk. + * + * @param req request handle + * @param err the positive error value, or zero for success + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_err(fuse_req_t req, int err); + +/** + * Don't send reply + * + * Possible requests: + * forget + * forget_multi + * retrieve_reply + * + * @param req request handle + */ +void fuse_reply_none(fuse_req_t req); + +/** + * Reply with a directory entry + * + * Possible requests: + * lookup, mknod, mkdir, symlink, link + * + * Side effects: + * increments the lookup count on success + * + * @param req request handle + * @param e the entry parameters + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_entry(fuse_req_t req, const struct fuse_entry_param *e); + +/** + * Reply with a directory entry and open parameters + * + * currently the following members of 'fi' are used: + * fh, direct_io, keep_cache + * + * Possible requests: + * create + * + * Side effects: + * increments the lookup count on success + * + * @param req request handle + * @param e the entry parameters + * @param fi file information + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_create(fuse_req_t req, const struct fuse_entry_param *e, + const struct fuse_file_info *fi); + +/** + * Reply with attributes + * + * Possible requests: + * getattr, setattr + * + * @param req request handle + * @param attr the attributes + * @param attr_timeout validity timeout (in seconds) for the attributes + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_attr(fuse_req_t req, const struct stat *attr, + double attr_timeout); + +/** + * Reply with the contents of a symbolic link + * + * Possible requests: + * readlink + * + * @param req request handle + * @param link symbolic link contents + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_readlink(fuse_req_t req, const char *link); + +/** + * Reply with open parameters + * + * currently the following members of 'fi' are used: + * fh, direct_io, keep_cache + * + * Possible requests: + * open, opendir + * + * @param req request handle + * @param fi file information + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_open(fuse_req_t req, const struct fuse_file_info *fi); + +/** + * Reply with number of bytes written + * + * Possible requests: + * write + * + * @param req request handle + * @param count the number of bytes written + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_write(fuse_req_t req, size_t count); + +/** + * Reply with data + * + * Possible requests: + * read, readdir, getxattr, listxattr + * + * @param req request handle + * @param buf buffer containing data + * @param size the size of data in bytes + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_buf(fuse_req_t req, const char *buf, size_t size); + +/** + * Reply with data copied/moved from buffer(s) + * + * Possible requests: + * read, readdir, getxattr, listxattr + * + * Side effects: + * when used to return data from a readdirplus() (but not readdir()) + * call, increments the lookup count of each returned entry by one + * on success. + * + * @param req request handle + * @param bufv buffer vector + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_data(fuse_req_t req, struct fuse_bufvec *bufv); + +/** + * Reply with data vector + * + * Possible requests: + * read, readdir, getxattr, listxattr + * + * @param req request handle + * @param iov the vector containing the data + * @param count the size of vector + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_iov(fuse_req_t req, const struct iovec *iov, int count); + +/** + * Reply with filesystem statistics + * + * Possible requests: + * statfs + * + * @param req request handle + * @param stbuf filesystem statistics + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_statfs(fuse_req_t req, const struct statvfs *stbuf); + +/** + * Reply with needed buffer size + * + * Possible requests: + * getxattr, listxattr + * + * @param req request handle + * @param count the buffer size needed in bytes + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_xattr(fuse_req_t req, size_t count); + +/** + * Reply with file lock information + * + * Possible requests: + * getlk + * + * @param req request handle + * @param lock the lock information + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_lock(fuse_req_t req, const struct flock *lock); + +/** + * Reply with block index + * + * Possible requests: + * bmap + * + * @param req request handle + * @param idx block index within device + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_bmap(fuse_req_t req, uint64_t idx); + +/* + * Filling a buffer in readdir + */ + +/** + * Add a directory entry to the buffer + * + * Buffer needs to be large enough to hold the entry. If it's not, + * then the entry is not filled in but the size of the entry is still + * returned. The caller can check this by comparing the bufsize + * parameter with the returned entry size. If the entry size is + * larger than the buffer size, the operation failed. + * + * From the 'stbuf' argument the st_ino field and bits 12-15 of the + * st_mode field are used. The other fields are ignored. + * + * *off* should be any non-zero value that the filesystem can use to + * identify the current point in the directory stream. It does not + * need to be the actual physical position. A value of zero is + * reserved to mean "from the beginning", and should therefore never + * be used (the first call to fuse_add_direntry should be passed the + * offset of the second directory entry). + * + * @param req request handle + * @param buf the point where the new entry will be added to the buffer + * @param bufsize remaining size of the buffer + * @param name the name of the entry + * @param stbuf the file attributes + * @param off the offset of the next entry + * @return the space needed for the entry + */ +size_t fuse_add_direntry(fuse_req_t req, char *buf, size_t bufsize, + const char *name, const struct stat *stbuf, off_t off); + +/** + * Add a directory entry to the buffer with the attributes + * + * See documentation of `fuse_add_direntry()` for more details. + * + * @param req request handle + * @param buf the point where the new entry will be added to the buffer + * @param bufsize remaining size of the buffer + * @param name the name of the entry + * @param e the directory entry + * @param off the offset of the next entry + * @return the space needed for the entry + */ +size_t fuse_add_direntry_plus(fuse_req_t req, char *buf, size_t bufsize, + const char *name, + const struct fuse_entry_param *e, off_t off); + +/** + * Reply to ask for data fetch and output buffer preparation. ioctl + * will be retried with the specified input data fetched and output + * buffer prepared. + * + * Possible requests: + * ioctl + * + * @param req request handle + * @param in_iov iovec specifying data to fetch from the caller + * @param in_count number of entries in in_iov + * @param out_iov iovec specifying addresses to write output to + * @param out_count number of entries in out_iov + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_ioctl_retry(fuse_req_t req, const struct iovec *in_iov, + size_t in_count, const struct iovec *out_iov, + size_t out_count); + +/** + * Reply to finish ioctl + * + * Possible requests: + * ioctl + * + * @param req request handle + * @param result result to be passed to the caller + * @param buf buffer containing output data + * @param size length of output data + */ +int fuse_reply_ioctl(fuse_req_t req, int result, const void *buf, size_t size); + +/** + * Reply to finish ioctl with iov buffer + * + * Possible requests: + * ioctl + * + * @param req request handle + * @param result result to be passed to the caller + * @param iov the vector containing the data + * @param count the size of vector + */ +int fuse_reply_ioctl_iov(fuse_req_t req, int result, const struct iovec *iov, + int count); + +/** + * Reply with poll result event mask + * + * @param req request handle + * @param revents poll result event mask + */ +int fuse_reply_poll(fuse_req_t req, unsigned revents); + +/** + * Reply with offset + * + * Possible requests: + * lseek + * + * @param req request handle + * @param off offset of next data or hole + * @return zero for success, -errno for failure to send reply + */ +int fuse_reply_lseek(fuse_req_t req, off_t off); + +/* + * Notification + */ + +/** + * Notify IO readiness event + * + * For more information, please read comment for poll operation. + * + * @param ph poll handle to notify IO readiness event for + */ +int fuse_lowlevel_notify_poll(struct fuse_pollhandle *ph); + +/** + * Notify to invalidate cache for an inode. + * + * Added in FUSE protocol version 7.12. If the kernel does not support + * this (or a newer) version, the function will return -ENOSYS and do + * nothing. + * + * If the filesystem has writeback caching enabled, invalidating an + * inode will first trigger a writeback of all dirty pages. The call + * will block until all writeback requests have completed and the + * inode has been invalidated. It will, however, not wait for + * completion of pending writeback requests that have been issued + * before. + * + * If there are no dirty pages, this function will never block. + * + * @param se the session object + * @param ino the inode number + * @param off the offset in the inode where to start invalidating + * or negative to invalidate attributes only + * @param len the amount of cache to invalidate or 0 for all + * @return zero for success, -errno for failure + */ +int fuse_lowlevel_notify_inval_inode(struct fuse_session *se, fuse_ino_t ino, + off_t off, off_t len); + +/** + * Notify to invalidate parent attributes and the dentry matching + * parent/name + * + * To avoid a deadlock this function must not be called in the + * execution path of a related filesystem operation or within any code + * that could hold a lock that could be needed to execute such an + * operation. As of kernel 4.18, a "related operation" is a lookup(), + * symlink(), mknod(), mkdir(), unlink(), rename(), link() or create() + * request for the parent, and a setattr(), unlink(), rmdir(), + * rename(), setxattr(), removexattr(), readdir() or readdirplus() + * request for the inode itself. + * + * When called correctly, this function will never block. + * + * Added in FUSE protocol version 7.12. If the kernel does not support + * this (or a newer) version, the function will return -ENOSYS and do + * nothing. + * + * @param se the session object + * @param parent inode number + * @param name file name + * @param namelen strlen() of file name + * @return zero for success, -errno for failure + */ +int fuse_lowlevel_notify_inval_entry(struct fuse_session *se, fuse_ino_t parent, + const char *name, size_t namelen); + +/** + * This function behaves like fuse_lowlevel_notify_inval_entry() with + * the following additional effect (at least as of Linux kernel 4.8): + * + * If the provided *child* inode matches the inode that is currently + * associated with the cached dentry, and if there are any inotify + * watches registered for the dentry, then the watchers are informed + * that the dentry has been deleted. + * + * To avoid a deadlock this function must not be called while + * executing a related filesystem operation or while holding a lock + * that could be needed to execute such an operation (see the + * description of fuse_lowlevel_notify_inval_entry() for more + * details). + * + * When called correctly, this function will never block. + * + * Added in FUSE protocol version 7.18. If the kernel does not support + * this (or a newer) version, the function will return -ENOSYS and do + * nothing. + * + * @param se the session object + * @param parent inode number + * @param child inode number + * @param name file name + * @param namelen strlen() of file name + * @return zero for success, -errno for failure + */ +int fuse_lowlevel_notify_delete(struct fuse_session *se, fuse_ino_t parent, + fuse_ino_t child, const char *name, + size_t namelen); + +/** + * Store data to the kernel buffers + * + * Synchronously store data in the kernel buffers belonging to the + * given inode. The stored data is marked up-to-date (no read will be + * performed against it, unless it's invalidated or evicted from the + * cache). + * + * If the stored data overflows the current file size, then the size + * is extended, similarly to a write(2) on the filesystem. + * + * If this function returns an error, then the store wasn't fully + * completed, but it may have been partially completed. + * + * Added in FUSE protocol version 7.15. If the kernel does not support + * this (or a newer) version, the function will return -ENOSYS and do + * nothing. + * + * @param se the session object + * @param ino the inode number + * @param offset the starting offset into the file to store to + * @param bufv buffer vector + * @return zero for success, -errno for failure + */ +int fuse_lowlevel_notify_store(struct fuse_session *se, fuse_ino_t ino, + off_t offset, struct fuse_bufvec *bufv); + +/* + * Utility functions + */ + +/** + * Get the userdata from the request + * + * @param req request handle + * @return the user data passed to fuse_session_new() + */ +void *fuse_req_userdata(fuse_req_t req); + +/** + * Get the context from the request + * + * The pointer returned by this function will only be valid for the + * request's lifetime + * + * @param req request handle + * @return the context structure + */ +const struct fuse_ctx *fuse_req_ctx(fuse_req_t req); + +/** + * Callback function for an interrupt + * + * @param req interrupted request + * @param data user data + */ +typedef void (*fuse_interrupt_func_t)(fuse_req_t req, void *data); + +/** + * Register/unregister callback for an interrupt + * + * If an interrupt has already happened, then the callback function is + * called from within this function, hence it's not possible for + * interrupts to be lost. + * + * @param req request handle + * @param func the callback function or NULL for unregister + * @param data user data passed to the callback function + */ +void fuse_req_interrupt_func(fuse_req_t req, fuse_interrupt_func_t func, + void *data); + +/** + * Check if a request has already been interrupted + * + * @param req request handle + * @return 1 if the request has been interrupted, 0 otherwise + */ +int fuse_req_interrupted(fuse_req_t req); + +/** + * Check if the session is connected via virtio + * + * @param se session object + * @return 1 if the session is a virtio session + */ +int fuse_lowlevel_is_virtio(struct fuse_session *se); + +/* + * Inquiry functions + */ + +/** + * Print low-level version information to stdout. + */ +void fuse_lowlevel_version(void); + +/** + * Print available low-level options to stdout. This is not an + * exhaustive list, but includes only those options that may be of + * interest to an end-user of a file system. + */ +void fuse_lowlevel_help(void); + +/** + * Print available options for `fuse_parse_cmdline()`. + */ +void fuse_cmdline_help(void); + +/* + * Filesystem setup & teardown + */ + +struct fuse_cmdline_opts { + int foreground; + int debug; + int nodefault_subtype; + int show_version; + int show_help; + int print_capabilities; + int syslog; + int log_level; + unsigned int max_idle_threads; + unsigned long rlimit_nofile; +}; + +/** + * Utility function to parse common options for simple file systems + * using the low-level API. A help text that describes the available + * options can be printed with `fuse_cmdline_help`. A single + * non-option argument is treated as the mountpoint. Multiple + * non-option arguments will result in an error. + * + * If neither -o subtype= or -o fsname= options are given, a new + * subtype option will be added and set to the basename of the program + * (the fsname will remain unset, and then defaults to "fuse"). + * + * Known options will be removed from *args*, unknown options will + * remain. + * + * @param args argument vector (input+output) + * @param opts output argument for parsed options + * @return 0 on success, -1 on failure + */ +int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts); + +/** + * Create a low level session. + * + * Returns a session structure suitable for passing to + * fuse_session_mount() and fuse_session_loop(). + * + * This function accepts most file-system independent mount options + * (like context, nodev, ro - see mount(8)), as well as the general + * fuse mount options listed in mount.fuse(8) (e.g. -o allow_root and + * -o default_permissions, but not ``-o use_ino``). Instead of `-o + * debug`, debugging may also enabled with `-d` or `--debug`. + * + * If not all options are known, an error message is written to stderr + * and the function returns NULL. + * + * Option parsing skips argv[0], which is assumed to contain the + * program name. To prevent accidentally passing an option in + * argv[0], this element must always be present (even if no options + * are specified). It may be set to the empty string ('\0') if no + * reasonable value can be provided. + * + * @param args argument vector + * @param op the (low-level) filesystem operations + * @param op_size sizeof(struct fuse_lowlevel_ops) + * @param userdata user data + * + * @return the fuse session on success, NULL on failure + **/ +struct fuse_session *fuse_session_new(struct fuse_args *args, + const struct fuse_lowlevel_ops *op, + size_t op_size, void *userdata); + +/** + * Mount a FUSE file system. + * + * @param se session object + * + * @return 0 on success, -1 on failure. + **/ +int fuse_session_mount(struct fuse_session *se); + +/** + * Enter a single threaded, blocking event loop. + * + * When the event loop terminates because the connection to the FUSE + * kernel module has been closed, this function returns zero. This + * happens when the filesystem is unmounted regularly (by the + * filesystem owner or root running the umount(8) or fusermount(1) + * command), or if connection is explicitly severed by writing ``1`` + * to the``abort`` file in ``/sys/fs/fuse/connections/NNN``. The only + * way to distinguish between these two conditions is to check if the + * filesystem is still mounted after the session loop returns. + * + * When some error occurs during request processing, the function + * returns a negated errno(3) value. + * + * If the loop has been terminated because of a signal handler + * installed by fuse_set_signal_handlers(), this function returns the + * (positive) signal value that triggered the exit. + * + * @param se the session + * @return 0, -errno, or a signal value + */ +int fuse_session_loop(struct fuse_session *se); + +/** + * Flag a session as terminated. + * + * This function is invoked by the POSIX signal handlers, when + * registered using fuse_set_signal_handlers(). It will cause any + * running event loops to terminate on the next opportunity. + * + * @param se the session + */ +void fuse_session_exit(struct fuse_session *se); + +/** + * Reset the terminated flag of a session + * + * @param se the session + */ +void fuse_session_reset(struct fuse_session *se); + +/** + * Query the terminated flag of a session + * + * @param se the session + * @return 1 if exited, 0 if not exited + */ +int fuse_session_exited(struct fuse_session *se); + +/** + * Ensure that file system is unmounted. + * + * In regular operation, the file system is typically unmounted by the + * user calling umount(8) or fusermount(1), which then terminates the + * FUSE session loop. However, the session loop may also terminate as + * a result of an explicit call to fuse_session_exit() (e.g. by a + * signal handler installed by fuse_set_signal_handler()). In this + * case the filesystem remains mounted, but any attempt to access it + * will block (while the filesystem process is still running) or give + * an ESHUTDOWN error (after the filesystem process has terminated). + * + * If the communication channel with the FUSE kernel module is still + * open (i.e., if the session loop was terminated by an explicit call + * to fuse_session_exit()), this function will close it and unmount + * the filesystem. If the communication channel has been closed by the + * kernel, this method will do (almost) nothing. + * + * NOTE: The above semantics mean that if the connection to the kernel + * is terminated via the ``/sys/fs/fuse/connections/NNN/abort`` file, + * this method will *not* unmount the filesystem. + * + * @param se the session + */ +void fuse_session_unmount(struct fuse_session *se); + +/** + * Destroy a session + * + * @param se the session + */ +void fuse_session_destroy(struct fuse_session *se); + +/* + * Custom event loop support + */ + +/** + * Return file descriptor for communication with kernel. + * + * The file selector can be used to integrate FUSE with a custom event + * loop. Whenever data is available for reading on the provided fd, + * the event loop should call `fuse_session_receive_buf` followed by + * `fuse_session_process_buf` to process the request. + * + * The returned file descriptor is valid until `fuse_session_unmount` + * is called. + * + * @param se the session + * @return a file descriptor + */ +int fuse_session_fd(struct fuse_session *se); + +/** + * Process a raw request supplied in a generic buffer + * + * The fuse_buf may contain a memory buffer or a pipe file descriptor. + * + * @param se the session + * @param buf the fuse_buf containing the request + */ +void fuse_session_process_buf(struct fuse_session *se, + const struct fuse_buf *buf); + +/** + * Read a raw request from the kernel into the supplied buffer. + * + * Depending on file system options, system capabilities, and request + * size the request is either read into a memory buffer or spliced + * into a temporary pipe. + * + * @param se the session + * @param buf the fuse_buf to store the request in + * @return the actual size of the raw request, or -errno on error + */ +int fuse_session_receive_buf(struct fuse_session *se, struct fuse_buf *buf); + +#endif /* FUSE_LOWLEVEL_H_ */ diff --git a/tools/virtiofsd/fuse_misc.h b/tools/virtiofsd/fuse_misc.h new file mode 100644 index 000000000..f252baa75 --- /dev/null +++ b/tools/virtiofsd/fuse_misc.h @@ -0,0 +1,59 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB + */ + +#include <pthread.h> + +/* + * Versioned symbols cannot be used in some cases because it + * - confuse the dynamic linker in uClibc + * - not supported on MacOSX (in MachO binary format) + */ +#if (!defined(__UCLIBC__) && !defined(__APPLE__)) +#define FUSE_SYMVER(x) __asm__(x) +#else +#define FUSE_SYMVER(x) +#endif + +#ifndef USE_UCLIBC +#define fuse_mutex_init(mut) pthread_mutex_init(mut, NULL) +#else +/* Is this hack still needed? */ +static inline void fuse_mutex_init(pthread_mutex_t *mut) +{ + pthread_mutexattr_t attr; + pthread_mutexattr_init(&attr); + pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ADAPTIVE_NP); + pthread_mutex_init(mut, &attr); + pthread_mutexattr_destroy(&attr); +} +#endif + +#ifdef HAVE_STRUCT_STAT_ST_ATIM +/* Linux */ +#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atim.tv_nsec) +#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctim.tv_nsec) +#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtim.tv_nsec) +#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atim.tv_nsec = (val) +#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctim.tv_nsec = (val) +#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtim.tv_nsec = (val) +#elif defined(HAVE_STRUCT_STAT_ST_ATIMESPEC) +/* FreeBSD */ +#define ST_ATIM_NSEC(stbuf) ((stbuf)->st_atimespec.tv_nsec) +#define ST_CTIM_NSEC(stbuf) ((stbuf)->st_ctimespec.tv_nsec) +#define ST_MTIM_NSEC(stbuf) ((stbuf)->st_mtimespec.tv_nsec) +#define ST_ATIM_NSEC_SET(stbuf, val) (stbuf)->st_atimespec.tv_nsec = (val) +#define ST_CTIM_NSEC_SET(stbuf, val) (stbuf)->st_ctimespec.tv_nsec = (val) +#define ST_MTIM_NSEC_SET(stbuf, val) (stbuf)->st_mtimespec.tv_nsec = (val) +#else +#define ST_ATIM_NSEC(stbuf) 0 +#define ST_CTIM_NSEC(stbuf) 0 +#define ST_MTIM_NSEC(stbuf) 0 +#define ST_ATIM_NSEC_SET(stbuf, val) do { } while (0) +#define ST_CTIM_NSEC_SET(stbuf, val) do { } while (0) +#define ST_MTIM_NSEC_SET(stbuf, val) do { } while (0) +#endif diff --git a/tools/virtiofsd/fuse_opt.c b/tools/virtiofsd/fuse_opt.c new file mode 100644 index 000000000..9d371448e --- /dev/null +++ b/tools/virtiofsd/fuse_opt.c @@ -0,0 +1,446 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * Implementation of option parsing routines (dealing with `struct + * fuse_args`). + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB + */ + +#include "qemu/osdep.h" +#include "fuse_opt.h" +#include "fuse_i.h" +#include "fuse_misc.h" + + +struct fuse_opt_context { + void *data; + const struct fuse_opt *opt; + fuse_opt_proc_t proc; + int argctr; + int argc; + char **argv; + struct fuse_args outargs; + char *opts; + int nonopt; +}; + +void fuse_opt_free_args(struct fuse_args *args) +{ + if (args) { + if (args->argv && args->allocated) { + int i; + for (i = 0; i < args->argc; i++) { + free(args->argv[i]); + } + free(args->argv); + } + args->argc = 0; + args->argv = NULL; + args->allocated = 0; + } +} + +static int alloc_failed(void) +{ + fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); + return -1; +} + +int fuse_opt_add_arg(struct fuse_args *args, const char *arg) +{ + char **newargv; + char *newarg; + + assert(!args->argv || args->allocated); + + newarg = strdup(arg); + if (!newarg) { + return alloc_failed(); + } + + newargv = realloc(args->argv, (args->argc + 2) * sizeof(char *)); + if (!newargv) { + free(newarg); + return alloc_failed(); + } + + args->argv = newargv; + args->allocated = 1; + args->argv[args->argc++] = newarg; + args->argv[args->argc] = NULL; + return 0; +} + +static int fuse_opt_insert_arg_common(struct fuse_args *args, int pos, + const char *arg) +{ + assert(pos <= args->argc); + if (fuse_opt_add_arg(args, arg) == -1) { + return -1; + } + + if (pos != args->argc - 1) { + char *newarg = args->argv[args->argc - 1]; + memmove(&args->argv[pos + 1], &args->argv[pos], + sizeof(char *) * (args->argc - pos - 1)); + args->argv[pos] = newarg; + } + return 0; +} + +int fuse_opt_insert_arg(struct fuse_args *args, int pos, const char *arg) +{ + return fuse_opt_insert_arg_common(args, pos, arg); +} + +static int next_arg(struct fuse_opt_context *ctx, const char *opt) +{ + if (ctx->argctr + 1 >= ctx->argc) { + fuse_log(FUSE_LOG_ERR, "fuse: missing argument after `%s'\n", opt); + return -1; + } + ctx->argctr++; + return 0; +} + +static int add_arg(struct fuse_opt_context *ctx, const char *arg) +{ + return fuse_opt_add_arg(&ctx->outargs, arg); +} + +static int add_opt_common(char **opts, const char *opt, int esc) +{ + unsigned oldlen = *opts ? strlen(*opts) : 0; + char *d = realloc(*opts, oldlen + 1 + strlen(opt) * 2 + 1); + + if (!d) { + return alloc_failed(); + } + + *opts = d; + if (oldlen) { + d += oldlen; + *d++ = ','; + } + + for (; *opt; opt++) { + if (esc && (*opt == ',' || *opt == '\\')) { + *d++ = '\\'; + } + *d++ = *opt; + } + *d = '\0'; + + return 0; +} + +int fuse_opt_add_opt(char **opts, const char *opt) +{ + return add_opt_common(opts, opt, 0); +} + +int fuse_opt_add_opt_escaped(char **opts, const char *opt) +{ + return add_opt_common(opts, opt, 1); +} + +static int add_opt(struct fuse_opt_context *ctx, const char *opt) +{ + return add_opt_common(&ctx->opts, opt, 1); +} + +static int call_proc(struct fuse_opt_context *ctx, const char *arg, int key, + int iso) +{ + if (key == FUSE_OPT_KEY_DISCARD) { + return 0; + } + + if (key != FUSE_OPT_KEY_KEEP && ctx->proc) { + int res = ctx->proc(ctx->data, arg, key, &ctx->outargs); + if (res == -1 || !res) { + return res; + } + } + if (iso) { + return add_opt(ctx, arg); + } else { + return add_arg(ctx, arg); + } +} + +static int match_template(const char *t, const char *arg, unsigned *sepp) +{ + int arglen = strlen(arg); + const char *sep = strchr(t, '='); + sep = sep ? sep : strchr(t, ' '); + if (sep && (!sep[1] || sep[1] == '%')) { + int tlen = sep - t; + if (sep[0] == '=') { + tlen++; + } + if (arglen >= tlen && strncmp(arg, t, tlen) == 0) { + *sepp = sep - t; + return 1; + } + } + if (strcmp(t, arg) == 0) { + *sepp = 0; + return 1; + } + return 0; +} + +static const struct fuse_opt *find_opt(const struct fuse_opt *opt, + const char *arg, unsigned *sepp) +{ + for (; opt && opt->templ; opt++) { + if (match_template(opt->templ, arg, sepp)) { + return opt; + } + } + return NULL; +} + +int fuse_opt_match(const struct fuse_opt *opts, const char *opt) +{ + unsigned dummy; + return find_opt(opts, opt, &dummy) ? 1 : 0; +} + +static int process_opt_param(void *var, const char *format, const char *param, + const char *arg) +{ + assert(format[0] == '%'); + if (format[1] == 's') { + char **s = var; + char *copy = strdup(param); + if (!copy) { + return alloc_failed(); + } + + free(*s); + *s = copy; + } else { + if (sscanf(param, format, var) != 1) { + fuse_log(FUSE_LOG_ERR, "fuse: invalid parameter in option `%s'\n", + arg); + return -1; + } + } + return 0; +} + +static int process_opt(struct fuse_opt_context *ctx, const struct fuse_opt *opt, + unsigned sep, const char *arg, int iso) +{ + if (opt->offset == -1U) { + if (call_proc(ctx, arg, opt->value, iso) == -1) { + return -1; + } + } else { + void *var = (char *)ctx->data + opt->offset; + if (sep && opt->templ[sep + 1]) { + const char *param = arg + sep; + if (opt->templ[sep] == '=') { + param++; + } + if (process_opt_param(var, opt->templ + sep + 1, param, arg) == + -1) { + return -1; + } + } else { + *(int *)var = opt->value; + } + } + return 0; +} + +static int process_opt_sep_arg(struct fuse_opt_context *ctx, + const struct fuse_opt *opt, unsigned sep, + const char *arg, int iso) +{ + int res; + char *newarg; + char *param; + + if (next_arg(ctx, arg) == -1) { + return -1; + } + + param = ctx->argv[ctx->argctr]; + newarg = g_try_malloc(sep + strlen(param) + 1); + if (!newarg) { + return alloc_failed(); + } + + memcpy(newarg, arg, sep); + strcpy(newarg + sep, param); + res = process_opt(ctx, opt, sep, newarg, iso); + g_free(newarg); + + return res; +} + +static int process_gopt(struct fuse_opt_context *ctx, const char *arg, int iso) +{ + unsigned sep; + const struct fuse_opt *opt = find_opt(ctx->opt, arg, &sep); + if (opt) { + for (; opt; opt = find_opt(opt + 1, arg, &sep)) { + int res; + if (sep && opt->templ[sep] == ' ' && !arg[sep]) { + res = process_opt_sep_arg(ctx, opt, sep, arg, iso); + } else { + res = process_opt(ctx, opt, sep, arg, iso); + } + if (res == -1) { + return -1; + } + } + return 0; + } else { + return call_proc(ctx, arg, FUSE_OPT_KEY_OPT, iso); + } +} + +static int process_real_option_group(struct fuse_opt_context *ctx, char *opts) +{ + char *s = opts; + char *d = s; + int end = 0; + + while (!end) { + if (*s == '\0') { + end = 1; + } + if (*s == ',' || end) { + int res; + + *d = '\0'; + res = process_gopt(ctx, opts, 1); + if (res == -1) { + return -1; + } + d = opts; + } else { + if (s[0] == '\\' && s[1] != '\0') { + s++; + if (s[0] >= '0' && s[0] <= '3' && s[1] >= '0' && s[1] <= '7' && + s[2] >= '0' && s[2] <= '7') { + *d++ = (s[0] - '0') * 0100 + (s[1] - '0') * 0010 + + (s[2] - '0'); + s += 2; + } else { + *d++ = *s; + } + } else { + *d++ = *s; + } + } + s++; + } + + return 0; +} + +static int process_option_group(struct fuse_opt_context *ctx, const char *opts) +{ + int res; + char *copy = strdup(opts); + + if (!copy) { + fuse_log(FUSE_LOG_ERR, "fuse: memory allocation failed\n"); + return -1; + } + res = process_real_option_group(ctx, copy); + free(copy); + return res; +} + +static int process_one(struct fuse_opt_context *ctx, const char *arg) +{ + if (ctx->nonopt || arg[0] != '-') { + return call_proc(ctx, arg, FUSE_OPT_KEY_NONOPT, 0); + } else if (arg[1] == 'o') { + if (arg[2]) { + return process_option_group(ctx, arg + 2); + } else { + if (next_arg(ctx, arg) == -1) { + return -1; + } + + return process_option_group(ctx, ctx->argv[ctx->argctr]); + } + } else if (arg[1] == '-' && !arg[2]) { + if (add_arg(ctx, arg) == -1) { + return -1; + } + ctx->nonopt = ctx->outargs.argc; + return 0; + } else { + return process_gopt(ctx, arg, 0); + } +} + +static int opt_parse(struct fuse_opt_context *ctx) +{ + if (ctx->argc) { + if (add_arg(ctx, ctx->argv[0]) == -1) { + return -1; + } + } + + for (ctx->argctr = 1; ctx->argctr < ctx->argc; ctx->argctr++) { + if (process_one(ctx, ctx->argv[ctx->argctr]) == -1) { + return -1; + } + } + + if (ctx->opts) { + if (fuse_opt_insert_arg(&ctx->outargs, 1, "-o") == -1 || + fuse_opt_insert_arg(&ctx->outargs, 2, ctx->opts) == -1) { + return -1; + } + } + + /* If option separator ("--") is the last argument, remove it */ + if (ctx->nonopt && ctx->nonopt == ctx->outargs.argc && + strcmp(ctx->outargs.argv[ctx->outargs.argc - 1], "--") == 0) { + free(ctx->outargs.argv[ctx->outargs.argc - 1]); + ctx->outargs.argv[--ctx->outargs.argc] = NULL; + } + + return 0; +} + +int fuse_opt_parse(struct fuse_args *args, void *data, + const struct fuse_opt opts[], fuse_opt_proc_t proc) +{ + int res; + struct fuse_opt_context ctx = { + .data = data, + .opt = opts, + .proc = proc, + }; + + if (!args || !args->argv || !args->argc) { + return 0; + } + + ctx.argc = args->argc; + ctx.argv = args->argv; + + res = opt_parse(&ctx); + if (res != -1) { + struct fuse_args tmp = *args; + *args = ctx.outargs; + ctx.outargs = tmp; + } + free(ctx.opts); + fuse_opt_free_args(&ctx.outargs); + return res; +} diff --git a/tools/virtiofsd/fuse_opt.h b/tools/virtiofsd/fuse_opt.h new file mode 100644 index 000000000..8f59b4d30 --- /dev/null +++ b/tools/virtiofsd/fuse_opt.h @@ -0,0 +1,272 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB. + */ + +#ifndef FUSE_OPT_H_ +#define FUSE_OPT_H_ + +/** @file + * + * This file defines the option parsing interface of FUSE + */ + +/** + * Option description + * + * This structure describes a single option, and action associated + * with it, in case it matches. + * + * More than one such match may occur, in which case the action for + * each match is executed. + * + * There are three possible actions in case of a match: + * + * i) An integer (int or unsigned) variable determined by 'offset' is + * set to 'value' + * + * ii) The processing function is called, with 'value' as the key + * + * iii) An integer (any) or string (char *) variable determined by + * 'offset' is set to the value of an option parameter + * + * 'offset' should normally be either set to + * + * - 'offsetof(struct foo, member)' actions i) and iii) + * + * - -1 action ii) + * + * The 'offsetof()' macro is defined in the <stddef.h> header. + * + * The template determines which options match, and also have an + * effect on the action. Normally the action is either i) or ii), but + * if a format is present in the template, then action iii) is + * performed. + * + * The types of templates are: + * + * 1) "-x", "-foo", "--foo", "--foo-bar", etc. These match only + * themselves. Invalid values are "--" and anything beginning + * with "-o" + * + * 2) "foo", "foo-bar", etc. These match "-ofoo", "-ofoo-bar" or + * the relevant option in a comma separated option list + * + * 3) "bar=", "--foo=", etc. These are variations of 1) and 2) + * which have a parameter + * + * 4) "bar=%s", "--foo=%lu", etc. Same matching as above but perform + * action iii). + * + * 5) "-x ", etc. Matches either "-xparam" or "-x param" as + * two separate arguments + * + * 6) "-x %s", etc. Combination of 4) and 5) + * + * If the format is "%s", memory is allocated for the string unlike with + * scanf(). The previous value (if non-NULL) stored at the this location is + * freed. + */ +struct fuse_opt { + /** Matching template and optional parameter formatting */ + const char *templ; + + /** + * Offset of variable within 'data' parameter of fuse_opt_parse() + * or -1 + */ + unsigned long offset; + + /** + * Value to set the variable to, or to be passed as 'key' to the + * processing function. Ignored if template has a format + */ + int value; +}; + +/** + * Key option. In case of a match, the processing function will be + * called with the specified key. + */ +#define FUSE_OPT_KEY(templ, key) \ + { \ + templ, -1U, key \ + } + +/** + * Last option. An array of 'struct fuse_opt' must end with a NULL + * template value + */ +#define FUSE_OPT_END \ + { \ + NULL, 0, 0 \ + } + +/** + * Argument list + */ +struct fuse_args { + /** Argument count */ + int argc; + + /** Argument vector. NULL terminated */ + char **argv; + + /** Is 'argv' allocated? */ + int allocated; +}; + +/** + * Initializer for 'struct fuse_args' + */ +#define FUSE_ARGS_INIT(argc, argv) \ + { \ + argc, argv, 0 \ + } + +/** + * Key value passed to the processing function if an option did not + * match any template + */ +#define FUSE_OPT_KEY_OPT -1 + +/** + * Key value passed to the processing function for all non-options + * + * Non-options are the arguments beginning with a character other than + * '-' or all arguments after the special '--' option + */ +#define FUSE_OPT_KEY_NONOPT -2 + +/** + * Special key value for options to keep + * + * Argument is not passed to processing function, but behave as if the + * processing function returned 1 + */ +#define FUSE_OPT_KEY_KEEP -3 + +/** + * Special key value for options to discard + * + * Argument is not passed to processing function, but behave as if the + * processing function returned zero + */ +#define FUSE_OPT_KEY_DISCARD -4 + +/** + * Processing function + * + * This function is called if + * - option did not match any 'struct fuse_opt' + * - argument is a non-option + * - option did match and offset was set to -1 + * + * The 'arg' parameter will always contain the whole argument or + * option including the parameter if exists. A two-argument option + * ("-x foo") is always converted to single argument option of the + * form "-xfoo" before this function is called. + * + * Options of the form '-ofoo' are passed to this function without the + * '-o' prefix. + * + * The return value of this function determines whether this argument + * is to be inserted into the output argument vector, or discarded. + * + * @param data is the user data passed to the fuse_opt_parse() function + * @param arg is the whole argument or option + * @param key determines why the processing function was called + * @param outargs the current output argument list + * @return -1 on error, 0 if arg is to be discarded, 1 if arg should be kept + */ +typedef int (*fuse_opt_proc_t)(void *data, const char *arg, int key, + struct fuse_args *outargs); + +/** + * Option parsing function + * + * If 'args' was returned from a previous call to fuse_opt_parse() or + * it was constructed from + * + * A NULL 'args' is equivalent to an empty argument vector + * + * A NULL 'opts' is equivalent to an 'opts' array containing a single + * end marker + * + * A NULL 'proc' is equivalent to a processing function always + * returning '1' + * + * @param args is the input and output argument list + * @param data is the user data + * @param opts is the option description array + * @param proc is the processing function + * @return -1 on error, 0 on success + */ +int fuse_opt_parse(struct fuse_args *args, void *data, + const struct fuse_opt opts[], fuse_opt_proc_t proc); + +/** + * Add an option to a comma separated option list + * + * @param opts is a pointer to an option list, may point to a NULL value + * @param opt is the option to add + * @return -1 on allocation error, 0 on success + */ +int fuse_opt_add_opt(char **opts, const char *opt); + +/** + * Add an option, escaping commas, to a comma separated option list + * + * @param opts is a pointer to an option list, may point to a NULL value + * @param opt is the option to add + * @return -1 on allocation error, 0 on success + */ +int fuse_opt_add_opt_escaped(char **opts, const char *opt); + +/** + * Add an argument to a NULL terminated argument vector + * + * @param args is the structure containing the current argument list + * @param arg is the new argument to add + * @return -1 on allocation error, 0 on success + */ +int fuse_opt_add_arg(struct fuse_args *args, const char *arg); + +/** + * Add an argument at the specified position in a NULL terminated + * argument vector + * + * Adds the argument to the N-th position. This is useful for adding + * options at the beginning of the array which must not come after the + * special '--' option. + * + * @param args is the structure containing the current argument list + * @param pos is the position at which to add the argument + * @param arg is the new argument to add + * @return -1 on allocation error, 0 on success + */ +int fuse_opt_insert_arg(struct fuse_args *args, int pos, const char *arg); + +/** + * Free the contents of argument list + * + * The structure itself is not freed + * + * @param args is the structure containing the argument list + */ +void fuse_opt_free_args(struct fuse_args *args); + + +/** + * Check if an option matches + * + * @param opts is the option description array + * @param opt is the option to match + * @return 1 if a match is found, 0 if not + */ +int fuse_opt_match(const struct fuse_opt opts[], const char *opt); + +#endif /* FUSE_OPT_H_ */ diff --git a/tools/virtiofsd/fuse_signals.c b/tools/virtiofsd/fuse_signals.c new file mode 100644 index 000000000..1de46de1c --- /dev/null +++ b/tools/virtiofsd/fuse_signals.c @@ -0,0 +1,93 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * Utility functions for setting signal handlers. + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB + */ + +#include "qemu/osdep.h" +#include "fuse_i.h" +#include "fuse_lowlevel.h" + + +static struct fuse_session *fuse_instance; + +static void exit_handler(int sig) +{ + if (fuse_instance) { + fuse_session_exit(fuse_instance); + if (sig <= 0) { + fuse_log(FUSE_LOG_ERR, "assertion error: signal value <= 0\n"); + abort(); + } + fuse_instance->error = sig; + } +} + +static void do_nothing(int sig) +{ + (void)sig; +} + +static int set_one_signal_handler(int sig, void (*handler)(int), int remove) +{ + struct sigaction sa; + struct sigaction old_sa; + + memset(&sa, 0, sizeof(struct sigaction)); + sa.sa_handler = remove ? SIG_DFL : handler; + sigemptyset(&(sa.sa_mask)); + sa.sa_flags = 0; + + if (sigaction(sig, NULL, &old_sa) == -1) { + fuse_log(FUSE_LOG_ERR, "fuse: cannot get old signal handler: %s\n", + strerror(errno)); + return -1; + } + + if (old_sa.sa_handler == (remove ? handler : SIG_DFL) && + sigaction(sig, &sa, NULL) == -1) { + fuse_log(FUSE_LOG_ERR, "fuse: cannot set signal handler: %s\n", + strerror(errno)); + return -1; + } + return 0; +} + +int fuse_set_signal_handlers(struct fuse_session *se) +{ + /* + * If we used SIG_IGN instead of the do_nothing function, + * then we would be unable to tell if we set SIG_IGN (and + * thus should reset to SIG_DFL in fuse_remove_signal_handlers) + * or if it was already set to SIG_IGN (and should be left + * untouched. + */ + if (set_one_signal_handler(SIGHUP, exit_handler, 0) == -1 || + set_one_signal_handler(SIGINT, exit_handler, 0) == -1 || + set_one_signal_handler(SIGTERM, exit_handler, 0) == -1 || + set_one_signal_handler(SIGPIPE, do_nothing, 0) == -1) { + return -1; + } + + fuse_instance = se; + return 0; +} + +void fuse_remove_signal_handlers(struct fuse_session *se) +{ + if (fuse_instance != se) { + fuse_log(FUSE_LOG_ERR, + "fuse: fuse_remove_signal_handlers: unknown session\n"); + } else { + fuse_instance = NULL; + } + + set_one_signal_handler(SIGHUP, exit_handler, 1); + set_one_signal_handler(SIGINT, exit_handler, 1); + set_one_signal_handler(SIGTERM, exit_handler, 1); + set_one_signal_handler(SIGPIPE, do_nothing, 1); +} diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c new file mode 100644 index 000000000..60b96470c --- /dev/null +++ b/tools/virtiofsd/fuse_virtio.c @@ -0,0 +1,1079 @@ +/* + * virtio-fs glue for FUSE + * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Dave Gilbert <dgilbert@redhat.com> + * + * Implements the glue between libfuse and libvhost-user + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB + */ + +#include "qemu/osdep.h" +#include "qemu/iov.h" +#include "qapi/error.h" +#include "fuse_i.h" +#include "standard-headers/linux/fuse.h" +#include "fuse_misc.h" +#include "fuse_opt.h" +#include "fuse_virtio.h" + +#include <sys/eventfd.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <grp.h> + +#include "libvhost-user.h" + +struct fv_VuDev; +struct fv_QueueInfo { + pthread_t thread; + /* + * This lock protects the VuVirtq preventing races between + * fv_queue_thread() and fv_queue_worker(). + */ + pthread_mutex_t vq_lock; + + struct fv_VuDev *virtio_dev; + + /* Our queue index, corresponds to array position */ + int qidx; + int kick_fd; + int kill_fd; /* For killing the thread */ +}; + +/* A FUSE request */ +typedef struct { + VuVirtqElement elem; + struct fuse_chan ch; + + /* Used to complete requests that involve no reply */ + bool reply_sent; +} FVRequest; + +/* + * We pass the dev element into libvhost-user + * and then use it to get back to the outer + * container for other data. + */ +struct fv_VuDev { + VuDev dev; + struct fuse_session *se; + + /* + * Either handle virtqueues or vhost-user protocol messages. Don't do + * both at the same time since that could lead to race conditions if + * virtqueues or memory tables change while another thread is accessing + * them. + * + * The assumptions are: + * 1. fv_queue_thread() reads/writes to virtqueues and only reads VuDev. + * 2. virtio_loop() reads/writes virtqueues and VuDev. + */ + pthread_rwlock_t vu_dispatch_rwlock; + + /* + * The following pair of fields are only accessed in the main + * virtio_loop + */ + size_t nqueues; + struct fv_QueueInfo **qi; +}; + +/* Callback from libvhost-user */ +static uint64_t fv_get_features(VuDev *dev) +{ + return 1ULL << VIRTIO_F_VERSION_1; +} + +/* Callback from libvhost-user */ +static void fv_set_features(VuDev *dev, uint64_t features) +{ +} + +/* + * Callback from libvhost-user if there's a new fd we're supposed to listen + * to, typically a queue kick? + */ +static void fv_set_watch(VuDev *dev, int fd, int condition, vu_watch_cb cb, + void *data) +{ + fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd); +} + +/* + * Callback from libvhost-user if we're no longer supposed to listen on an fd + */ +static void fv_remove_watch(VuDev *dev, int fd) +{ + fuse_log(FUSE_LOG_WARNING, "%s: TODO! fd=%d\n", __func__, fd); +} + +/* Callback from libvhost-user to panic */ +static void fv_panic(VuDev *dev, const char *err) +{ + fuse_log(FUSE_LOG_ERR, "%s: libvhost-user: %s\n", __func__, err); + /* TODO: Allow reconnects?? */ + exit(EXIT_FAILURE); +} + +/* + * Copy from an iovec into a fuse_buf (memory only) + * Caller must ensure there is space + */ +static size_t copy_from_iov(struct fuse_buf *buf, size_t out_num, + const struct iovec *out_sg, + size_t max) +{ + void *dest = buf->mem; + size_t copied = 0; + + while (out_num && max) { + size_t onelen = out_sg->iov_len; + onelen = MIN(onelen, max); + memcpy(dest, out_sg->iov_base, onelen); + dest += onelen; + copied += onelen; + out_sg++; + out_num--; + max -= onelen; + } + + return copied; +} + +/* + * Skip 'skip' bytes in the iov; 'sg_1stindex' is set as + * the index for the 1st iovec to read data from, and + * 'sg_1stskip' is the number of bytes to skip in that entry. + * + * Returns True if there are at least 'skip' bytes in the iovec + * + */ +static bool skip_iov(const struct iovec *sg, size_t sg_size, + size_t skip, + size_t *sg_1stindex, size_t *sg_1stskip) +{ + size_t vec; + + for (vec = 0; vec < sg_size; vec++) { + if (sg[vec].iov_len > skip) { + *sg_1stskip = skip; + *sg_1stindex = vec; + + return true; + } + + skip -= sg[vec].iov_len; + } + + *sg_1stindex = vec; + *sg_1stskip = 0; + return skip == 0; +} + +/* + * Copy from one iov to another, the given number of bytes + * The caller must have checked sizes. + */ +static void copy_iov(struct iovec *src_iov, int src_count, + struct iovec *dst_iov, int dst_count, size_t to_copy) +{ + size_t dst_offset = 0; + /* Outer loop copies 'src' elements */ + while (to_copy) { + assert(src_count); + size_t src_len = src_iov[0].iov_len; + size_t src_offset = 0; + + if (src_len > to_copy) { + src_len = to_copy; + } + /* Inner loop copies contents of one 'src' to maybe multiple dst. */ + while (src_len) { + assert(dst_count); + size_t dst_len = dst_iov[0].iov_len - dst_offset; + if (dst_len > src_len) { + dst_len = src_len; + } + + memcpy(dst_iov[0].iov_base + dst_offset, + src_iov[0].iov_base + src_offset, dst_len); + src_len -= dst_len; + to_copy -= dst_len; + src_offset += dst_len; + dst_offset += dst_len; + + assert(dst_offset <= dst_iov[0].iov_len); + if (dst_offset == dst_iov[0].iov_len) { + dst_offset = 0; + dst_iov++; + dst_count--; + } + } + src_iov++; + src_count--; + } +} + +/* + * pthread_rwlock_rdlock() and pthread_rwlock_wrlock can fail if + * a deadlock condition is detected or the current thread already + * owns the lock. They can also fail, like pthread_rwlock_unlock(), + * if the mutex wasn't properly initialized. None of these are ever + * expected to happen. + */ +static void vu_dispatch_rdlock(struct fv_VuDev *vud) +{ + int ret = pthread_rwlock_rdlock(&vud->vu_dispatch_rwlock); + assert(ret == 0); +} + +static void vu_dispatch_wrlock(struct fv_VuDev *vud) +{ + int ret = pthread_rwlock_wrlock(&vud->vu_dispatch_rwlock); + assert(ret == 0); +} + +static void vu_dispatch_unlock(struct fv_VuDev *vud) +{ + int ret = pthread_rwlock_unlock(&vud->vu_dispatch_rwlock); + assert(ret == 0); +} + +static void vq_send_element(struct fv_QueueInfo *qi, VuVirtqElement *elem, + ssize_t len) +{ + struct fuse_session *se = qi->virtio_dev->se; + VuDev *dev = &se->virtio_dev->dev; + VuVirtq *q = vu_get_queue(dev, qi->qidx); + + vu_dispatch_rdlock(qi->virtio_dev); + pthread_mutex_lock(&qi->vq_lock); + vu_queue_push(dev, q, elem, len); + vu_queue_notify(dev, q); + pthread_mutex_unlock(&qi->vq_lock); + vu_dispatch_unlock(qi->virtio_dev); +} + +/* + * Called back by ll whenever it wants to send a reply/message back + * The 1st element of the iov starts with the fuse_out_header + * 'unique'==0 means it's a notify message. + */ +int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count) +{ + FVRequest *req = container_of(ch, FVRequest, ch); + struct fv_QueueInfo *qi = ch->qi; + VuVirtqElement *elem = &req->elem; + int ret = 0; + + assert(count >= 1); + assert(iov[0].iov_len >= sizeof(struct fuse_out_header)); + + struct fuse_out_header *out = iov[0].iov_base; + /* TODO: Endianness! */ + + size_t tosend_len = iov_size(iov, count); + + /* unique == 0 is notification, which we don't support */ + assert(out->unique); + assert(!req->reply_sent); + + /* The 'in' part of the elem is to qemu */ + unsigned int in_num = elem->in_num; + struct iovec *in_sg = elem->in_sg; + size_t in_len = iov_size(in_sg, in_num); + fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n", + __func__, elem->index, in_num, in_len); + + /* + * The elem should have room for a 'fuse_out_header' (out from fuse) + * plus the data based on the len in the header. + */ + if (in_len < sizeof(struct fuse_out_header)) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n", + __func__, elem->index); + ret = -E2BIG; + goto err; + } + if (in_len < tosend_len) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n", + __func__, elem->index, tosend_len); + ret = -E2BIG; + goto err; + } + + copy_iov(iov, count, in_sg, in_num, tosend_len); + + vq_send_element(qi, elem, tosend_len); + req->reply_sent = true; + +err: + return ret; +} + +/* + * Callback from fuse_send_data_iov_* when it's virtio and the buffer + * is a single FD with FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK + * We need send the iov and then the buffer. + * Return 0 on success + */ +int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count, struct fuse_bufvec *buf, + size_t len) +{ + FVRequest *req = container_of(ch, FVRequest, ch); + struct fv_QueueInfo *qi = ch->qi; + VuVirtqElement *elem = &req->elem; + int ret = 0; + g_autofree struct iovec *in_sg_cpy = NULL; + + assert(count >= 1); + assert(iov[0].iov_len >= sizeof(struct fuse_out_header)); + + struct fuse_out_header *out = iov[0].iov_base; + /* TODO: Endianness! */ + + size_t iov_len = iov_size(iov, count); + size_t tosend_len = iov_len + len; + + out->len = tosend_len; + + fuse_log(FUSE_LOG_DEBUG, "%s: count=%d len=%zd iov_len=%zd\n", __func__, + count, len, iov_len); + + /* unique == 0 is notification which we don't support */ + assert(out->unique); + + assert(!req->reply_sent); + + /* The 'in' part of the elem is to qemu */ + unsigned int in_num = elem->in_num; + struct iovec *in_sg = elem->in_sg; + size_t in_len = iov_size(in_sg, in_num); + fuse_log(FUSE_LOG_DEBUG, "%s: elem %d: with %d in desc of length %zd\n", + __func__, elem->index, in_num, in_len); + + /* + * The elem should have room for a 'fuse_out_header' (out from fuse) + * plus the data based on the len in the header. + */ + if (in_len < sizeof(struct fuse_out_header)) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for out_header\n", + __func__, elem->index); + return E2BIG; + } + if (in_len < tosend_len) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too small for data len %zd\n", + __func__, elem->index, tosend_len); + return E2BIG; + } + + /* TODO: Limit to 'len' */ + + /* First copy the header data from iov->in_sg */ + copy_iov(iov, count, in_sg, in_num, iov_len); + + /* + * Build a copy of the the in_sg iov so we can skip bits in it, + * including changing the offsets + */ + in_sg_cpy = g_new(struct iovec, in_num); + memcpy(in_sg_cpy, in_sg, sizeof(struct iovec) * in_num); + /* These get updated as we skip */ + struct iovec *in_sg_ptr = in_sg_cpy; + unsigned int in_sg_cpy_count = in_num; + + /* skip over parts of in_sg that contained the header iov */ + iov_discard_front(&in_sg_ptr, &in_sg_cpy_count, iov_len); + + do { + fuse_log(FUSE_LOG_DEBUG, "%s: in_sg_cpy_count=%d len remaining=%zd\n", + __func__, in_sg_cpy_count, len); + + ret = preadv(buf->buf[0].fd, in_sg_ptr, in_sg_cpy_count, + buf->buf[0].pos); + + if (ret == -1) { + ret = errno; + if (ret == EINTR) { + continue; + } + fuse_log(FUSE_LOG_DEBUG, "%s: preadv failed (%m) len=%zd\n", + __func__, len); + return ret; + } + + if (!ret) { + /* EOF case? */ + fuse_log(FUSE_LOG_DEBUG, "%s: !ret len remaining=%zd\n", __func__, + len); + break; + } + fuse_log(FUSE_LOG_DEBUG, "%s: preadv ret=%d len=%zd\n", __func__, + ret, len); + + len -= ret; + /* Short read. Retry reading remaining bytes */ + if (len) { + fuse_log(FUSE_LOG_DEBUG, "%s: ret < len\n", __func__); + /* Skip over this much next time around */ + iov_discard_front(&in_sg_ptr, &in_sg_cpy_count, ret); + buf->buf[0].pos += ret; + } + } while (len); + + /* Need to fix out->len on EOF */ + if (len) { + struct fuse_out_header *out_sg = in_sg[0].iov_base; + + tosend_len -= len; + out_sg->len = tosend_len; + } + + vq_send_element(qi, elem, tosend_len); + req->reply_sent = true; + return 0; +} + +static __thread bool clone_fs_called; + +/* Process one FVRequest in a thread pool */ +static void fv_queue_worker(gpointer data, gpointer user_data) +{ + struct fv_QueueInfo *qi = user_data; + struct fuse_session *se = qi->virtio_dev->se; + FVRequest *req = data; + VuVirtqElement *elem = &req->elem; + struct fuse_buf fbuf = {}; + bool allocated_bufv = false; + struct fuse_bufvec bufv; + struct fuse_bufvec *pbufv; + struct fuse_in_header inh; + + assert(se->bufsize > sizeof(struct fuse_in_header)); + + if (!clone_fs_called) { + int ret; + + /* unshare FS for xattr operation */ + ret = unshare(CLONE_FS); + /* should not fail */ + assert(ret == 0); + + clone_fs_called = true; + } + + /* + * An element contains one request and the space to send our response + * They're spread over multiple descriptors in a scatter/gather set + * and we can't trust the guest to keep them still; so copy in/out. + */ + fbuf.mem = g_malloc(se->bufsize); + + fuse_mutex_init(&req->ch.lock); + req->ch.fd = -1; + req->ch.qi = qi; + + /* The 'out' part of the elem is from qemu */ + unsigned int out_num = elem->out_num; + struct iovec *out_sg = elem->out_sg; + size_t out_len = iov_size(out_sg, out_num); + fuse_log(FUSE_LOG_DEBUG, + "%s: elem %d: with %d out desc of length %zd\n", + __func__, elem->index, out_num, out_len); + + /* + * The elem should contain a 'fuse_in_header' (in to fuse) + * plus the data based on the len in the header. + */ + if (out_len < sizeof(struct fuse_in_header)) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too short for in_header\n", + __func__, elem->index); + assert(0); /* TODO */ + } + if (out_len > se->bufsize) { + fuse_log(FUSE_LOG_ERR, "%s: elem %d too large for buffer\n", __func__, + elem->index); + assert(0); /* TODO */ + } + /* Copy just the fuse_in_header and look at it */ + copy_from_iov(&fbuf, out_num, out_sg, + sizeof(struct fuse_in_header)); + memcpy(&inh, fbuf.mem, sizeof(struct fuse_in_header)); + + pbufv = NULL; /* Compiler thinks an unitialised path */ + if (inh.opcode == FUSE_WRITE && + out_len >= (sizeof(struct fuse_in_header) + + sizeof(struct fuse_write_in))) { + /* + * For a write we don't actually need to copy the + * data, we can just do it straight out of guest memory + * but we must still copy the headers in case the guest + * was nasty and changed them while we were using them. + */ + fuse_log(FUSE_LOG_DEBUG, "%s: Write special case\n", __func__); + + fbuf.size = copy_from_iov(&fbuf, out_num, out_sg, + sizeof(struct fuse_in_header) + + sizeof(struct fuse_write_in)); + /* That copy reread the in_header, make sure we use the original */ + memcpy(fbuf.mem, &inh, sizeof(struct fuse_in_header)); + + /* Allocate the bufv, with space for the rest of the iov */ + pbufv = g_try_malloc(sizeof(struct fuse_bufvec) + + sizeof(struct fuse_buf) * out_num); + if (!pbufv) { + fuse_log(FUSE_LOG_ERR, "%s: pbufv malloc failed\n", + __func__); + goto out; + } + + allocated_bufv = true; + pbufv->count = 1; + pbufv->buf[0] = fbuf; + + size_t iovindex, pbufvindex, iov_bytes_skip; + pbufvindex = 1; /* 2 headers, 1 fusebuf */ + + if (!skip_iov(out_sg, out_num, + sizeof(struct fuse_in_header) + + sizeof(struct fuse_write_in), + &iovindex, &iov_bytes_skip)) { + fuse_log(FUSE_LOG_ERR, "%s: skip failed\n", + __func__); + goto out; + } + + for (; iovindex < out_num; iovindex++, pbufvindex++) { + pbufv->count++; + pbufv->buf[pbufvindex].pos = ~0; /* Dummy */ + pbufv->buf[pbufvindex].flags = 0; + pbufv->buf[pbufvindex].mem = out_sg[iovindex].iov_base; + pbufv->buf[pbufvindex].size = out_sg[iovindex].iov_len; + + if (iov_bytes_skip) { + pbufv->buf[pbufvindex].mem += iov_bytes_skip; + pbufv->buf[pbufvindex].size -= iov_bytes_skip; + iov_bytes_skip = 0; + } + } + } else { + /* Normal (non fast write) path */ + + copy_from_iov(&fbuf, out_num, out_sg, se->bufsize); + /* That copy reread the in_header, make sure we use the original */ + memcpy(fbuf.mem, &inh, sizeof(struct fuse_in_header)); + fbuf.size = out_len; + + /* TODO! Endianness of header */ + + /* TODO: Add checks for fuse_session_exited */ + bufv.buf[0] = fbuf; + bufv.count = 1; + pbufv = &bufv; + } + pbufv->idx = 0; + pbufv->off = 0; + fuse_session_process_buf_int(se, pbufv, &req->ch); + +out: + if (allocated_bufv) { + g_free(pbufv); + } + + /* If the request has no reply, still recycle the virtqueue element */ + if (!req->reply_sent) { + fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__, + elem->index); + vq_send_element(qi, elem, 0); + } + + pthread_mutex_destroy(&req->ch.lock); + g_free(fbuf.mem); + free(req); +} + +/* Thread function for individual queues, created when a queue is 'started' */ +static void *fv_queue_thread(void *opaque) +{ + struct fv_QueueInfo *qi = opaque; + struct VuDev *dev = &qi->virtio_dev->dev; + struct VuVirtq *q = vu_get_queue(dev, qi->qidx); + struct fuse_session *se = qi->virtio_dev->se; + GThreadPool *pool = NULL; + GList *req_list = NULL; + + if (se->thread_pool_size) { + fuse_log(FUSE_LOG_DEBUG, "%s: Creating thread pool for Queue %d\n", + __func__, qi->qidx); + pool = g_thread_pool_new(fv_queue_worker, qi, se->thread_pool_size, + FALSE, NULL); + if (!pool) { + fuse_log(FUSE_LOG_ERR, "%s: g_thread_pool_new failed\n", __func__); + return NULL; + } + } + + fuse_log(FUSE_LOG_INFO, "%s: Start for queue %d kick_fd %d\n", __func__, + qi->qidx, qi->kick_fd); + while (1) { + struct pollfd pf[2]; + + pf[0].fd = qi->kick_fd; + pf[0].events = POLLIN; + pf[0].revents = 0; + pf[1].fd = qi->kill_fd; + pf[1].events = POLLIN; + pf[1].revents = 0; + + fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for Queue %d event\n", __func__, + qi->qidx); + int poll_res = ppoll(pf, 2, NULL, NULL); + + if (poll_res == -1) { + if (errno == EINTR) { + fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n", + __func__); + continue; + } + fuse_log(FUSE_LOG_ERR, "fv_queue_thread ppoll: %m\n"); + break; + } + assert(poll_res >= 1); + if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) { + fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x Queue %d\n", + __func__, pf[0].revents, qi->qidx); + break; + } + if (pf[1].revents & (POLLERR | POLLHUP | POLLNVAL)) { + fuse_log(FUSE_LOG_ERR, + "%s: Unexpected poll revents %x Queue %d killfd\n", + __func__, pf[1].revents, qi->qidx); + break; + } + if (pf[1].revents) { + fuse_log(FUSE_LOG_INFO, "%s: kill event on queue %d - quitting\n", + __func__, qi->qidx); + break; + } + assert(pf[0].revents & POLLIN); + fuse_log(FUSE_LOG_DEBUG, "%s: Got queue event on Queue %d\n", __func__, + qi->qidx); + + eventfd_t evalue; + if (eventfd_read(qi->kick_fd, &evalue)) { + fuse_log(FUSE_LOG_ERR, "Eventfd_read for queue: %m\n"); + break; + } + /* Mutual exclusion with virtio_loop() */ + vu_dispatch_rdlock(qi->virtio_dev); + pthread_mutex_lock(&qi->vq_lock); + /* out is from guest, in is too guest */ + unsigned int in_bytes, out_bytes; + vu_queue_get_avail_bytes(dev, q, &in_bytes, &out_bytes, ~0, ~0); + + fuse_log(FUSE_LOG_DEBUG, + "%s: Queue %d gave evalue: %zx available: in: %u out: %u\n", + __func__, qi->qidx, (size_t)evalue, in_bytes, out_bytes); + + while (1) { + FVRequest *req = vu_queue_pop(dev, q, sizeof(FVRequest)); + if (!req) { + break; + } + + req->reply_sent = false; + + if (!se->thread_pool_size) { + req_list = g_list_prepend(req_list, req); + } else { + g_thread_pool_push(pool, req, NULL); + } + } + + pthread_mutex_unlock(&qi->vq_lock); + vu_dispatch_unlock(qi->virtio_dev); + + /* Process all the requests. */ + if (!se->thread_pool_size && req_list != NULL) { + req_list = g_list_reverse(req_list); + g_list_foreach(req_list, fv_queue_worker, qi); + g_list_free(req_list); + req_list = NULL; + } + } + + if (pool) { + g_thread_pool_free(pool, FALSE, TRUE); + } + + return NULL; +} + +static void fv_queue_cleanup_thread(struct fv_VuDev *vud, int qidx) +{ + int ret; + struct fv_QueueInfo *ourqi; + + assert(qidx < vud->nqueues); + ourqi = vud->qi[qidx]; + + /* Kill the thread */ + if (eventfd_write(ourqi->kill_fd, 1)) { + fuse_log(FUSE_LOG_ERR, "Eventfd_write for queue %d: %s\n", + qidx, strerror(errno)); + } + ret = pthread_join(ourqi->thread, NULL); + if (ret) { + fuse_log(FUSE_LOG_ERR, "%s: Failed to join thread idx %d err %d\n", + __func__, qidx, ret); + } + pthread_mutex_destroy(&ourqi->vq_lock); + close(ourqi->kill_fd); + ourqi->kick_fd = -1; + g_free(vud->qi[qidx]); + vud->qi[qidx] = NULL; +} + +static void stop_all_queues(struct fv_VuDev *vud) +{ + for (int i = 0; i < vud->nqueues; i++) { + if (!vud->qi[i]) { + continue; + } + + fuse_log(FUSE_LOG_INFO, "%s: Stopping queue %d thread\n", __func__, i); + fv_queue_cleanup_thread(vud, i); + } +} + +/* Callback from libvhost-user on start or stop of a queue */ +static void fv_queue_set_started(VuDev *dev, int qidx, bool started) +{ + struct fv_VuDev *vud = container_of(dev, struct fv_VuDev, dev); + struct fv_QueueInfo *ourqi; + + fuse_log(FUSE_LOG_INFO, "%s: qidx=%d started=%d\n", __func__, qidx, + started); + assert(qidx >= 0); + + /* + * Ignore additional request queues for now. passthrough_ll.c must be + * audited for thread-safety issues first. It was written with a + * well-behaved client in mind and may not protect against all types of + * races yet. + */ + if (qidx > 1) { + fuse_log(FUSE_LOG_ERR, + "%s: multiple request queues not yet implemented, please only " + "configure 1 request queue\n", + __func__); + exit(EXIT_FAILURE); + } + + if (started) { + /* Fire up a thread to watch this queue */ + if (qidx >= vud->nqueues) { + vud->qi = g_realloc_n(vud->qi, qidx + 1, sizeof(vud->qi[0])); + memset(vud->qi + vud->nqueues, 0, + sizeof(vud->qi[0]) * (1 + (qidx - vud->nqueues))); + vud->nqueues = qidx + 1; + } + if (!vud->qi[qidx]) { + vud->qi[qidx] = g_new0(struct fv_QueueInfo, 1); + vud->qi[qidx]->virtio_dev = vud; + vud->qi[qidx]->qidx = qidx; + } else { + /* Shouldn't have been started */ + assert(vud->qi[qidx]->kick_fd == -1); + } + ourqi = vud->qi[qidx]; + ourqi->kick_fd = dev->vq[qidx].kick_fd; + + ourqi->kill_fd = eventfd(0, EFD_CLOEXEC | EFD_SEMAPHORE); + assert(ourqi->kill_fd != -1); + pthread_mutex_init(&ourqi->vq_lock, NULL); + + if (pthread_create(&ourqi->thread, NULL, fv_queue_thread, ourqi)) { + fuse_log(FUSE_LOG_ERR, "%s: Failed to create thread for queue %d\n", + __func__, qidx); + assert(0); + } + } else { + /* + * Temporarily drop write-lock taken in virtio_loop() so that + * the queue thread doesn't block in virtio_send_msg(). + */ + vu_dispatch_unlock(vud); + fv_queue_cleanup_thread(vud, qidx); + vu_dispatch_wrlock(vud); + } +} + +static bool fv_queue_order(VuDev *dev, int qidx) +{ + return false; +} + +static const VuDevIface fv_iface = { + .get_features = fv_get_features, + .set_features = fv_set_features, + + /* Don't need process message, we've not got any at vhost-user level */ + .queue_set_started = fv_queue_set_started, + + .queue_is_processed_in_order = fv_queue_order, +}; + +/* + * Main loop; this mostly deals with events on the vhost-user + * socket itself, and not actual fuse data. + */ +int virtio_loop(struct fuse_session *se) +{ + fuse_log(FUSE_LOG_INFO, "%s: Entry\n", __func__); + + while (!fuse_session_exited(se)) { + struct pollfd pf[1]; + bool ok; + pf[0].fd = se->vu_socketfd; + pf[0].events = POLLIN; + pf[0].revents = 0; + + fuse_log(FUSE_LOG_DEBUG, "%s: Waiting for VU event\n", __func__); + int poll_res = ppoll(pf, 1, NULL, NULL); + + if (poll_res == -1) { + if (errno == EINTR) { + fuse_log(FUSE_LOG_INFO, "%s: ppoll interrupted, going around\n", + __func__); + continue; + } + fuse_log(FUSE_LOG_ERR, "virtio_loop ppoll: %m\n"); + break; + } + assert(poll_res == 1); + if (pf[0].revents & (POLLERR | POLLHUP | POLLNVAL)) { + fuse_log(FUSE_LOG_ERR, "%s: Unexpected poll revents %x\n", __func__, + pf[0].revents); + break; + } + assert(pf[0].revents & POLLIN); + fuse_log(FUSE_LOG_DEBUG, "%s: Got VU event\n", __func__); + /* Mutual exclusion with fv_queue_thread() */ + vu_dispatch_wrlock(se->virtio_dev); + + ok = vu_dispatch(&se->virtio_dev->dev); + + vu_dispatch_unlock(se->virtio_dev); + + if (!ok) { + fuse_log(FUSE_LOG_ERR, "%s: vu_dispatch failed\n", __func__); + break; + } + } + + /* + * Make sure all fv_queue_thread()s quit on exit, as we're about to + * free virtio dev and fuse session, no one should access them anymore. + */ + stop_all_queues(se->virtio_dev); + fuse_log(FUSE_LOG_INFO, "%s: Exit\n", __func__); + + return 0; +} + +static void strreplace(char *s, char old, char new) +{ + for (; *s; ++s) { + if (*s == old) { + *s = new; + } + } +} + +static bool fv_socket_lock(struct fuse_session *se) +{ + g_autofree gchar *sk_name = NULL; + g_autofree gchar *pidfile = NULL; + g_autofree gchar *dir = NULL; + Error *local_err = NULL; + + dir = qemu_get_local_state_pathname("run/virtiofsd"); + + if (g_mkdir_with_parents(dir, S_IRWXU) < 0) { + fuse_log(FUSE_LOG_ERR, "%s: Failed to create directory %s: %s\n", + __func__, dir, strerror(errno)); + return false; + } + + sk_name = g_strdup(se->vu_socket_path); + strreplace(sk_name, '/', '.'); + pidfile = g_strdup_printf("%s/%s.pid", dir, sk_name); + + if (!qemu_write_pidfile(pidfile, &local_err)) { + error_report_err(local_err); + return false; + } + + return true; +} + +static int fv_create_listen_socket(struct fuse_session *se) +{ + struct sockaddr_un un; + mode_t old_umask; + + /* Nothing to do if fd is already initialized */ + if (se->vu_listen_fd >= 0) { + return 0; + } + + if (strlen(se->vu_socket_path) >= sizeof(un.sun_path)) { + fuse_log(FUSE_LOG_ERR, "Socket path too long\n"); + return -1; + } + + if (!strlen(se->vu_socket_path)) { + fuse_log(FUSE_LOG_ERR, "Socket path is empty\n"); + return -1; + } + + /* Check the vu_socket_path is already used */ + if (!fv_socket_lock(se)) { + return -1; + } + + /* + * Create the Unix socket to communicate with qemu + * based on QEMU's vhost-user-bridge + */ + unlink(se->vu_socket_path); + strcpy(un.sun_path, se->vu_socket_path); + size_t addr_len = sizeof(un); + + int listen_sock = socket(AF_UNIX, SOCK_STREAM, 0); + if (listen_sock == -1) { + fuse_log(FUSE_LOG_ERR, "vhost socket creation: %m\n"); + return -1; + } + un.sun_family = AF_UNIX; + + /* + * Unfortunately bind doesn't let you set the mask on the socket, + * so set umask appropriately and restore it later. + */ + if (se->vu_socket_group) { + old_umask = umask(S_IROTH | S_IWOTH | S_IXOTH); + } else { + old_umask = umask(S_IRGRP | S_IWGRP | S_IXGRP | + S_IROTH | S_IWOTH | S_IXOTH); + } + if (bind(listen_sock, (struct sockaddr *)&un, addr_len) == -1) { + fuse_log(FUSE_LOG_ERR, "vhost socket bind: %m\n"); + close(listen_sock); + umask(old_umask); + return -1; + } + if (se->vu_socket_group) { + struct group *g = getgrnam(se->vu_socket_group); + if (g) { + if (chown(se->vu_socket_path, -1, g->gr_gid) == -1) { + fuse_log(FUSE_LOG_WARNING, + "vhost socket failed to set group to %s (%d): %m\n", + se->vu_socket_group, g->gr_gid); + } + } else { + fuse_log(FUSE_LOG_ERR, + "vhost socket: unable to find group '%s'\n", + se->vu_socket_group); + close(listen_sock); + umask(old_umask); + return -1; + } + } + umask(old_umask); + + if (listen(listen_sock, 1) == -1) { + fuse_log(FUSE_LOG_ERR, "vhost socket listen: %m\n"); + close(listen_sock); + return -1; + } + + se->vu_listen_fd = listen_sock; + return 0; +} + +int virtio_session_mount(struct fuse_session *se) +{ + int ret; + + /* + * Test that unshare(CLONE_FS) works. fv_queue_worker() will need it. It's + * an unprivileged system call but some Docker/Moby versions are known to + * reject it via seccomp when CAP_SYS_ADMIN is not given. + * + * Note that the program is single-threaded here so this syscall has no + * visible effect and is safe to make. + */ + ret = unshare(CLONE_FS); + if (ret == -1 && errno == EPERM) { + fuse_log(FUSE_LOG_ERR, "unshare(CLONE_FS) failed with EPERM. If " + "running in a container please check that the container " + "runtime seccomp policy allows unshare.\n"); + return -1; + } + + ret = fv_create_listen_socket(se); + if (ret < 0) { + return ret; + } + + se->fd = -1; + + fuse_log(FUSE_LOG_INFO, "%s: Waiting for vhost-user socket connection...\n", + __func__); + int data_sock = accept(se->vu_listen_fd, NULL, NULL); + if (data_sock == -1) { + fuse_log(FUSE_LOG_ERR, "vhost socket accept: %m\n"); + close(se->vu_listen_fd); + return -1; + } + close(se->vu_listen_fd); + se->vu_listen_fd = -1; + fuse_log(FUSE_LOG_INFO, "%s: Received vhost-user socket connection\n", + __func__); + + /* TODO: Some cleanup/deallocation! */ + se->virtio_dev = g_new0(struct fv_VuDev, 1); + + se->vu_socketfd = data_sock; + se->virtio_dev->se = se; + pthread_rwlock_init(&se->virtio_dev->vu_dispatch_rwlock, NULL); + if (!vu_init(&se->virtio_dev->dev, 2, se->vu_socketfd, fv_panic, NULL, + fv_set_watch, fv_remove_watch, &fv_iface)) { + fuse_log(FUSE_LOG_ERR, "%s: vu_init failed\n", __func__); + return -1; + } + + return 0; +} + +void virtio_session_close(struct fuse_session *se) +{ + close(se->vu_socketfd); + + if (!se->virtio_dev) { + return; + } + + g_free(se->virtio_dev->qi); + pthread_rwlock_destroy(&se->virtio_dev->vu_dispatch_rwlock); + g_free(se->virtio_dev); + se->virtio_dev = NULL; +} diff --git a/tools/virtiofsd/fuse_virtio.h b/tools/virtiofsd/fuse_virtio.h new file mode 100644 index 000000000..111684032 --- /dev/null +++ b/tools/virtiofsd/fuse_virtio.h @@ -0,0 +1,33 @@ +/* + * virtio-fs glue for FUSE + * Copyright (C) 2018 Red Hat, Inc. and/or its affiliates + * + * Authors: + * Dave Gilbert <dgilbert@redhat.com> + * + * Implements the glue between libfuse and libvhost-user + * + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB + */ + +#ifndef FUSE_VIRTIO_H +#define FUSE_VIRTIO_H + +#include "fuse_i.h" + +struct fuse_session; + +int virtio_session_mount(struct fuse_session *se); +void virtio_session_close(struct fuse_session *se); +int virtio_loop(struct fuse_session *se); + + +int virtio_send_msg(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count); + +int virtio_send_data_iov(struct fuse_session *se, struct fuse_chan *ch, + struct iovec *iov, int count, + struct fuse_bufvec *buf, size_t len); + +#endif diff --git a/tools/virtiofsd/helper.c b/tools/virtiofsd/helper.c new file mode 100644 index 000000000..a8295d975 --- /dev/null +++ b/tools/virtiofsd/helper.c @@ -0,0 +1,405 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * Helper functions to create (simple) standalone programs. With the + * aid of these functions it should be possible to create full FUSE + * file system by implementing nothing but the request handlers. + + * This program can be distributed under the terms of the GNU LGPLv2. + * See the file COPYING.LIB. + */ + +#include "qemu/osdep.h" +#include "fuse_i.h" +#include "fuse_lowlevel.h" +#include "fuse_misc.h" +#include "fuse_opt.h" + +#include <sys/param.h> +#include <sys/resource.h> + +#define FUSE_HELPER_OPT(t, p) \ + { \ + t, offsetof(struct fuse_cmdline_opts, p), 1 \ + } +#define FUSE_HELPER_OPT_VALUE(t, p, v) \ + { \ + t, offsetof(struct fuse_cmdline_opts, p), v \ + } + +static const struct fuse_opt fuse_helper_opts[] = { + FUSE_HELPER_OPT("-h", show_help), + FUSE_HELPER_OPT("--help", show_help), + FUSE_HELPER_OPT("-V", show_version), + FUSE_HELPER_OPT("--version", show_version), + FUSE_HELPER_OPT("--print-capabilities", print_capabilities), + FUSE_HELPER_OPT("-d", debug), + FUSE_HELPER_OPT("debug", debug), + FUSE_HELPER_OPT("-d", foreground), + FUSE_HELPER_OPT("debug", foreground), + FUSE_OPT_KEY("-d", FUSE_OPT_KEY_KEEP), + FUSE_OPT_KEY("debug", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("-f", foreground), + FUSE_HELPER_OPT_VALUE("--daemonize", foreground, 0), + FUSE_HELPER_OPT("fsname=", nodefault_subtype), + FUSE_OPT_KEY("fsname=", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("subtype=", nodefault_subtype), + FUSE_OPT_KEY("subtype=", FUSE_OPT_KEY_KEEP), + FUSE_HELPER_OPT("max_idle_threads=%u", max_idle_threads), + FUSE_HELPER_OPT("--rlimit-nofile=%lu", rlimit_nofile), + FUSE_HELPER_OPT("--syslog", syslog), + FUSE_HELPER_OPT_VALUE("log_level=debug", log_level, FUSE_LOG_DEBUG), + FUSE_HELPER_OPT_VALUE("log_level=info", log_level, FUSE_LOG_INFO), + FUSE_HELPER_OPT_VALUE("log_level=warn", log_level, FUSE_LOG_WARNING), + FUSE_HELPER_OPT_VALUE("log_level=err", log_level, FUSE_LOG_ERR), + FUSE_OPT_END +}; + +struct fuse_conn_info_opts { + int atomic_o_trunc; + int no_remote_posix_lock; + int no_remote_flock; + int splice_write; + int splice_move; + int splice_read; + int no_splice_write; + int no_splice_move; + int no_splice_read; + int auto_inval_data; + int no_auto_inval_data; + int no_readdirplus; + int no_readdirplus_auto; + int async_dio; + int no_async_dio; + int writeback_cache; + int no_writeback_cache; + int async_read; + int sync_read; + unsigned max_write; + unsigned max_readahead; + unsigned max_background; + unsigned congestion_threshold; + unsigned time_gran; + int set_max_write; + int set_max_readahead; + int set_max_background; + int set_congestion_threshold; + int set_time_gran; +}; + +#define CONN_OPTION(t, p, v) \ + { \ + t, offsetof(struct fuse_conn_info_opts, p), v \ + } +static const struct fuse_opt conn_info_opt_spec[] = { + CONN_OPTION("max_write=%u", max_write, 0), + CONN_OPTION("max_write=", set_max_write, 1), + CONN_OPTION("max_readahead=%u", max_readahead, 0), + CONN_OPTION("max_readahead=", set_max_readahead, 1), + CONN_OPTION("max_background=%u", max_background, 0), + CONN_OPTION("max_background=", set_max_background, 1), + CONN_OPTION("congestion_threshold=%u", congestion_threshold, 0), + CONN_OPTION("congestion_threshold=", set_congestion_threshold, 1), + CONN_OPTION("sync_read", sync_read, 1), + CONN_OPTION("async_read", async_read, 1), + CONN_OPTION("atomic_o_trunc", atomic_o_trunc, 1), + CONN_OPTION("no_remote_lock", no_remote_posix_lock, 1), + CONN_OPTION("no_remote_lock", no_remote_flock, 1), + CONN_OPTION("no_remote_flock", no_remote_flock, 1), + CONN_OPTION("no_remote_posix_lock", no_remote_posix_lock, 1), + CONN_OPTION("splice_write", splice_write, 1), + CONN_OPTION("no_splice_write", no_splice_write, 1), + CONN_OPTION("splice_move", splice_move, 1), + CONN_OPTION("no_splice_move", no_splice_move, 1), + CONN_OPTION("splice_read", splice_read, 1), + CONN_OPTION("no_splice_read", no_splice_read, 1), + CONN_OPTION("auto_inval_data", auto_inval_data, 1), + CONN_OPTION("no_auto_inval_data", no_auto_inval_data, 1), + CONN_OPTION("readdirplus=no", no_readdirplus, 1), + CONN_OPTION("readdirplus=yes", no_readdirplus, 0), + CONN_OPTION("readdirplus=yes", no_readdirplus_auto, 1), + CONN_OPTION("readdirplus=auto", no_readdirplus, 0), + CONN_OPTION("readdirplus=auto", no_readdirplus_auto, 0), + CONN_OPTION("async_dio", async_dio, 1), + CONN_OPTION("no_async_dio", no_async_dio, 1), + CONN_OPTION("writeback_cache", writeback_cache, 1), + CONN_OPTION("no_writeback_cache", no_writeback_cache, 1), + CONN_OPTION("time_gran=%u", time_gran, 0), + CONN_OPTION("time_gran=", set_time_gran, 1), + FUSE_OPT_END +}; + + +void fuse_cmdline_help(void) +{ + printf(" -h --help print help\n" + " -V --version print version\n" + " --print-capabilities print vhost-user.json\n" + " -d -o debug enable debug output (implies -f)\n" + " --syslog log to syslog (default stderr)\n" + " -f foreground operation\n" + " --daemonize run in background\n" + " -o cache=<mode> cache mode. could be one of \"auto, " + "always, none\"\n" + " default: auto\n" + " -o flock|no_flock enable/disable flock\n" + " default: no_flock\n" + " -o log_level=<level> log level, default to \"info\"\n" + " level could be one of \"debug, " + "info, warn, err\"\n" + " -o max_idle_threads the maximum number of idle worker " + "threads\n" + " allowed (default: 10)\n" + " -o posix_lock|no_posix_lock\n" + " enable/disable remote posix lock\n" + " default: no_posix_lock\n" + " -o readdirplus|no_readdirplus\n" + " enable/disable readirplus\n" + " default: readdirplus except with " + "cache=none\n" + " -o sandbox=namespace|chroot\n" + " sandboxing mode:\n" + " - namespace: mount, pid, and net\n" + " namespaces with pivot_root(2)\n" + " into shared directory\n" + " - chroot: chroot(2) into shared\n" + " directory (use in containers)\n" + " default: namespace\n" + " -o timeout=<number> I/O timeout (seconds)\n" + " default: depends on cache= option.\n" + " -o writeback|no_writeback enable/disable writeback cache\n" + " default: no_writeback\n" + " -o xattr|no_xattr enable/disable xattr\n" + " default: no_xattr\n" + " -o xattrmap=<mapping> Enable xattr mapping (enables xattr)\n" + " <mapping> is a string consists of a series of rules\n" + " e.g. -o xattrmap=:map::user.virtiofs.:\n" + " -o modcaps=CAPLIST Modify the list of capabilities\n" + " e.g. -o modcaps=+sys_admin:-chown\n" + " --rlimit-nofile=<num> set maximum number of file descriptors\n" + " (0 leaves rlimit unchanged)\n" + " default: min(1000000, fs.file-max - 16384)\n" + " if the current rlimit is lower\n" + " -o allow_direct_io|no_allow_direct_io\n" + " retain/discard O_DIRECT flags passed down\n" + " to virtiofsd from guest applications.\n" + " default: no_allow_direct_io\n" + " -o announce_submounts Announce sub-mount points to the guest\n" + " -o posix_acl/no_posix_acl Enable/Disable posix_acl. (default: disabled)\n" + ); +} + +static int fuse_helper_opt_proc(void *data, const char *arg, int key, + struct fuse_args *outargs) +{ + (void)data; + (void)outargs; + + switch (key) { + case FUSE_OPT_KEY_NONOPT: + fuse_log(FUSE_LOG_ERR, "fuse: invalid argument `%s'\n", arg); + return -1; + + default: + /* Pass through unknown options */ + return 1; + } +} + +static unsigned long get_default_rlimit_nofile(void) +{ + g_autofree gchar *file_max_str = NULL; + const rlim_t reserved_fds = 16384; /* leave at least this many fds free */ + rlim_t max_fds = 1000000; /* our default RLIMIT_NOFILE target */ + rlim_t file_max; + struct rlimit rlim; + + /* + * Reduce max_fds below the system-wide maximum, if necessary. This + * ensures there are fds available for other processes so we don't + * cause resource exhaustion. + */ + if (!g_file_get_contents("/proc/sys/fs/file-max", &file_max_str, + NULL, NULL)) { + fuse_log(FUSE_LOG_ERR, "can't read /proc/sys/fs/file-max\n"); + exit(1); + } + file_max = g_ascii_strtoull(file_max_str, NULL, 10); + if (file_max < 2 * reserved_fds) { + fuse_log(FUSE_LOG_ERR, + "The fs.file-max sysctl is too low (%lu) to allow a " + "reasonable number of open files.\n", + (unsigned long)file_max); + exit(1); + } + max_fds = MIN(file_max - reserved_fds, max_fds); + + if (getrlimit(RLIMIT_NOFILE, &rlim) < 0) { + fuse_log(FUSE_LOG_ERR, "getrlimit(RLIMIT_NOFILE): %m\n"); + exit(1); + } + + if (rlim.rlim_cur >= max_fds) { + return 0; /* we have more fds available than required! */ + } + return max_fds; +} + +int fuse_parse_cmdline(struct fuse_args *args, struct fuse_cmdline_opts *opts) +{ + memset(opts, 0, sizeof(struct fuse_cmdline_opts)); + + opts->max_idle_threads = 10; + opts->rlimit_nofile = get_default_rlimit_nofile(); + opts->foreground = 1; + + if (fuse_opt_parse(args, opts, fuse_helper_opts, fuse_helper_opt_proc) == + -1) { + return -1; + } + + return 0; +} + + +int fuse_daemonize(int foreground) +{ + int ret = 0, rett; + if (!foreground) { + int nullfd; + int waiter[2]; + char completed; + + if (pipe(waiter)) { + fuse_log(FUSE_LOG_ERR, "fuse_daemonize: pipe: %s\n", + strerror(errno)); + return -1; + } + + /* + * demonize current process by forking it and killing the + * parent. This makes current process as a child of 'init'. + */ + switch (fork()) { + case -1: + fuse_log(FUSE_LOG_ERR, "fuse_daemonize: fork: %s\n", + strerror(errno)); + return -1; + case 0: + break; + default: + _exit(read(waiter[0], &completed, + sizeof(completed) != sizeof(completed))); + } + + if (setsid() == -1) { + fuse_log(FUSE_LOG_ERR, "fuse_daemonize: setsid: %s\n", + strerror(errno)); + return -1; + } + + ret = chdir("/"); + + nullfd = open("/dev/null", O_RDWR, 0); + if (nullfd != -1) { + rett = dup2(nullfd, 0); + if (!ret) { + ret = rett; + } + rett = dup2(nullfd, 1); + if (!ret) { + ret = rett; + } + rett = dup2(nullfd, 2); + if (!ret) { + ret = rett; + } + if (nullfd > 2) { + close(nullfd); + } + } + + /* Propagate completion of daemon initialization */ + completed = 1; + rett = write(waiter[1], &completed, sizeof(completed)); + if (!ret) { + ret = rett; + } + close(waiter[0]); + close(waiter[1]); + } else { + ret = chdir("/"); + } + return ret; +} + +void fuse_apply_conn_info_opts(struct fuse_conn_info_opts *opts, + struct fuse_conn_info *conn) +{ + if (opts->set_max_write) { + conn->max_write = opts->max_write; + } + if (opts->set_max_background) { + conn->max_background = opts->max_background; + } + if (opts->set_congestion_threshold) { + conn->congestion_threshold = opts->congestion_threshold; + } + if (opts->set_time_gran) { + conn->time_gran = opts->time_gran; + } + if (opts->set_max_readahead) { + conn->max_readahead = opts->max_readahead; + } + +#define LL_ENABLE(cond, cap) \ + if (cond) \ + conn->want |= (cap) +#define LL_DISABLE(cond, cap) \ + if (cond) \ + conn->want &= ~(cap) + + LL_ENABLE(opts->splice_read, FUSE_CAP_SPLICE_READ); + LL_DISABLE(opts->no_splice_read, FUSE_CAP_SPLICE_READ); + + LL_ENABLE(opts->splice_write, FUSE_CAP_SPLICE_WRITE); + LL_DISABLE(opts->no_splice_write, FUSE_CAP_SPLICE_WRITE); + + LL_ENABLE(opts->splice_move, FUSE_CAP_SPLICE_MOVE); + LL_DISABLE(opts->no_splice_move, FUSE_CAP_SPLICE_MOVE); + + LL_ENABLE(opts->auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); + LL_DISABLE(opts->no_auto_inval_data, FUSE_CAP_AUTO_INVAL_DATA); + + LL_DISABLE(opts->no_readdirplus, FUSE_CAP_READDIRPLUS); + LL_DISABLE(opts->no_readdirplus_auto, FUSE_CAP_READDIRPLUS_AUTO); + + LL_ENABLE(opts->async_dio, FUSE_CAP_ASYNC_DIO); + LL_DISABLE(opts->no_async_dio, FUSE_CAP_ASYNC_DIO); + + LL_ENABLE(opts->writeback_cache, FUSE_CAP_WRITEBACK_CACHE); + LL_DISABLE(opts->no_writeback_cache, FUSE_CAP_WRITEBACK_CACHE); + + LL_ENABLE(opts->async_read, FUSE_CAP_ASYNC_READ); + LL_DISABLE(opts->sync_read, FUSE_CAP_ASYNC_READ); + + LL_DISABLE(opts->no_remote_posix_lock, FUSE_CAP_POSIX_LOCKS); + LL_DISABLE(opts->no_remote_flock, FUSE_CAP_FLOCK_LOCKS); +} + +struct fuse_conn_info_opts *fuse_parse_conn_info_opts(struct fuse_args *args) +{ + struct fuse_conn_info_opts *opts; + + opts = calloc(1, sizeof(struct fuse_conn_info_opts)); + if (opts == NULL) { + fuse_log(FUSE_LOG_ERR, "calloc failed\n"); + return NULL; + } + if (fuse_opt_parse(args, opts, conn_info_opt_spec, NULL) == -1) { + free(opts); + return NULL; + } + return opts; +} diff --git a/tools/virtiofsd/meson.build b/tools/virtiofsd/meson.build new file mode 100644 index 000000000..c134ba633 --- /dev/null +++ b/tools/virtiofsd/meson.build @@ -0,0 +1,18 @@ +executable('virtiofsd', files( + 'buffer.c', + 'fuse_opt.c', + 'fuse_log.c', + 'fuse_lowlevel.c', + 'fuse_signals.c', + 'fuse_virtio.c', + 'helper.c', + 'passthrough_ll.c', + 'passthrough_seccomp.c'), + dependencies: [seccomp, qemuutil, libcap_ng, vhost_user], + install: true, + install_dir: get_option('libexecdir')) + +configure_file(input: '50-qemu-virtiofsd.json.in', + output: '50-qemu-virtiofsd.json', + configuration: { 'libexecdir' : get_option('prefix') / get_option('libexecdir') }, + install_dir: qemu_datadir / 'vhost-user') diff --git a/tools/virtiofsd/passthrough_helpers.h b/tools/virtiofsd/passthrough_helpers.h new file mode 100644 index 000000000..0b98275ed --- /dev/null +++ b/tools/virtiofsd/passthrough_helpers.h @@ -0,0 +1,51 @@ +/* + * FUSE: Filesystem in Userspace + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +/* + * Creates files on the underlying file system in response to a FUSE_MKNOD + * operation + */ +static int mknod_wrapper(int dirfd, const char *path, const char *link, + int mode, dev_t rdev) +{ + int res; + + if (S_ISREG(mode)) { + res = openat(dirfd, path, O_CREAT | O_EXCL | O_WRONLY, mode); + if (res >= 0) { + res = close(res); + } + } else if (S_ISDIR(mode)) { + res = mkdirat(dirfd, path, mode); + } else if (S_ISLNK(mode) && link != NULL) { + res = symlinkat(link, dirfd, path); + } else if (S_ISFIFO(mode)) { + res = mkfifoat(dirfd, path, mode); + } else { + res = mknodat(dirfd, path, mode, rdev); + } + + return res; +} diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c new file mode 100644 index 000000000..64b5b4fbb --- /dev/null +++ b/tools/virtiofsd/passthrough_ll.c @@ -0,0 +1,4090 @@ +/* + * FUSE: Filesystem in Userspace + * Copyright (C) 2001-2007 Miklos Szeredi <miklos@szeredi.hu> + * + * This program can be distributed under the terms of the GNU GPLv2. + * See the file COPYING. + */ + +/* + * + * This file system mirrors the existing file system hierarchy of the + * system, starting at the root file system. This is implemented by + * just "passing through" all requests to the corresponding user-space + * libc functions. In contrast to passthrough.c and passthrough_fh.c, + * this implementation uses the low-level API. Its performance should + * be the least bad among the three, but many operations are not + * implemented. In particular, it is not possible to remove files (or + * directories) because the code necessary to defer actual removal + * until the file is not opened anymore would make the example much + * more complicated. + * + * When writeback caching is enabled (-o writeback mount option), it + * is only possible to write to files for which the mounting user has + * read permissions. This is because the writeback cache requires the + * kernel to be able to issue read requests for all files (which the + * passthrough filesystem cannot satisfy if it can't read the file in + * the underlying filesystem). + * + * Compile with: + * + * gcc -Wall passthrough_ll.c `pkg-config fuse3 --cflags --libs` -o + * passthrough_ll + * + * ## Source code ## + * \include passthrough_ll.c + */ + +#include "qemu/osdep.h" +#include "qemu/timer.h" +#include "qemu-version.h" +#include "qemu-common.h" +#include "fuse_virtio.h" +#include "fuse_log.h" +#include "fuse_lowlevel.h" +#include "standard-headers/linux/fuse.h" +#include <cap-ng.h> +#include <dirent.h> +#include <pthread.h> +#include <sys/file.h> +#include <sys/mount.h> +#include <sys/prctl.h> +#include <sys/resource.h> +#include <sys/syscall.h> +#include <sys/wait.h> +#include <sys/xattr.h> +#include <syslog.h> + +#include "qemu/cutils.h" +#include "passthrough_helpers.h" +#include "passthrough_seccomp.h" + +/* Keep track of inode posix locks for each owner. */ +struct lo_inode_plock { + uint64_t lock_owner; + int fd; /* fd for OFD locks */ +}; + +struct lo_map_elem { + union { + struct lo_inode *inode; + struct lo_dirp *dirp; + int fd; + ssize_t freelist; + }; + bool in_use; +}; + +/* Maps FUSE fh or ino values to internal objects */ +struct lo_map { + struct lo_map_elem *elems; + size_t nelems; + ssize_t freelist; +}; + +struct lo_key { + ino_t ino; + dev_t dev; + uint64_t mnt_id; +}; + +struct lo_inode { + int fd; + + /* + * Atomic reference count for this object. The nlookup field holds a + * reference and release it when nlookup reaches 0. + */ + gint refcount; + + struct lo_key key; + + /* + * This counter keeps the inode alive during the FUSE session. + * Incremented when the FUSE inode number is sent in a reply + * (FUSE_LOOKUP, FUSE_READDIRPLUS, etc). Decremented when an inode is + * released by a FUSE_FORGET request. + * + * Note that this value is untrusted because the client can manipulate + * it arbitrarily using FUSE_FORGET requests. + * + * Protected by lo->mutex. + */ + uint64_t nlookup; + + fuse_ino_t fuse_ino; + pthread_mutex_t plock_mutex; + GHashTable *posix_locks; /* protected by lo_inode->plock_mutex */ + + mode_t filetype; +}; + +struct lo_cred { + uid_t euid; + gid_t egid; + mode_t umask; +}; + +enum { + CACHE_NONE, + CACHE_AUTO, + CACHE_ALWAYS, +}; + +enum { + SANDBOX_NAMESPACE, + SANDBOX_CHROOT, +}; + +typedef struct xattr_map_entry { + char *key; + char *prepend; + unsigned int flags; +} XattrMapEntry; + +struct lo_data { + pthread_mutex_t mutex; + int sandbox; + int debug; + int writeback; + int flock; + int posix_lock; + int xattr; + char *xattrmap; + char *xattr_security_capability; + char *source; + char *modcaps; + double timeout; + int cache; + int timeout_set; + int readdirplus_set; + int readdirplus_clear; + int allow_direct_io; + int announce_submounts; + bool use_statx; + struct lo_inode root; + GHashTable *inodes; /* protected by lo->mutex */ + struct lo_map ino_map; /* protected by lo->mutex */ + struct lo_map dirp_map; /* protected by lo->mutex */ + struct lo_map fd_map; /* protected by lo->mutex */ + XattrMapEntry *xattr_map_list; + size_t xattr_map_nentries; + + /* An O_PATH file descriptor to /proc/self/fd/ */ + int proc_self_fd; + int user_killpriv_v2, killpriv_v2; + /* If set, virtiofsd is responsible for setting umask during creation */ + bool change_umask; + int user_posix_acl, posix_acl; +}; + +static const struct fuse_opt lo_opts[] = { + { "sandbox=namespace", + offsetof(struct lo_data, sandbox), + SANDBOX_NAMESPACE }, + { "sandbox=chroot", + offsetof(struct lo_data, sandbox), + SANDBOX_CHROOT }, + { "writeback", offsetof(struct lo_data, writeback), 1 }, + { "no_writeback", offsetof(struct lo_data, writeback), 0 }, + { "source=%s", offsetof(struct lo_data, source), 0 }, + { "flock", offsetof(struct lo_data, flock), 1 }, + { "no_flock", offsetof(struct lo_data, flock), 0 }, + { "posix_lock", offsetof(struct lo_data, posix_lock), 1 }, + { "no_posix_lock", offsetof(struct lo_data, posix_lock), 0 }, + { "xattr", offsetof(struct lo_data, xattr), 1 }, + { "no_xattr", offsetof(struct lo_data, xattr), 0 }, + { "xattrmap=%s", offsetof(struct lo_data, xattrmap), 0 }, + { "modcaps=%s", offsetof(struct lo_data, modcaps), 0 }, + { "timeout=%lf", offsetof(struct lo_data, timeout), 0 }, + { "timeout=", offsetof(struct lo_data, timeout_set), 1 }, + { "cache=none", offsetof(struct lo_data, cache), CACHE_NONE }, + { "cache=auto", offsetof(struct lo_data, cache), CACHE_AUTO }, + { "cache=always", offsetof(struct lo_data, cache), CACHE_ALWAYS }, + { "readdirplus", offsetof(struct lo_data, readdirplus_set), 1 }, + { "no_readdirplus", offsetof(struct lo_data, readdirplus_clear), 1 }, + { "allow_direct_io", offsetof(struct lo_data, allow_direct_io), 1 }, + { "no_allow_direct_io", offsetof(struct lo_data, allow_direct_io), 0 }, + { "announce_submounts", offsetof(struct lo_data, announce_submounts), 1 }, + { "killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 1 }, + { "no_killpriv_v2", offsetof(struct lo_data, user_killpriv_v2), 0 }, + { "posix_acl", offsetof(struct lo_data, user_posix_acl), 1 }, + { "no_posix_acl", offsetof(struct lo_data, user_posix_acl), 0 }, + FUSE_OPT_END +}; +static bool use_syslog = false; +static int current_log_level; +static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + uint64_t n); + +static struct { + pthread_mutex_t mutex; + void *saved; +} cap; +/* That we loaded cap-ng in the current thread from the saved */ +static __thread bool cap_loaded = 0; + +static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st, + uint64_t mnt_id); +static int xattr_map_client(const struct lo_data *lo, const char *client_name, + char **out_name); + +static bool is_dot_or_dotdot(const char *name) +{ + return name[0] == '.' && + (name[1] == '\0' || (name[1] == '.' && name[2] == '\0')); +} + +/* Is `path` a single path component that is not "." or ".."? */ +static bool is_safe_path_component(const char *path) +{ + if (strchr(path, '/')) { + return false; + } + + return !is_dot_or_dotdot(path); +} + +static bool is_empty(const char *name) +{ + return name[0] == '\0'; +} + +static struct lo_data *lo_data(fuse_req_t req) +{ + return (struct lo_data *)fuse_req_userdata(req); +} + +/* + * Load capng's state from our saved state if the current thread + * hadn't previously been loaded. + * returns 0 on success + */ +static int load_capng(void) +{ + if (!cap_loaded) { + pthread_mutex_lock(&cap.mutex); + capng_restore_state(&cap.saved); + /* + * restore_state free's the saved copy + * so make another. + */ + cap.saved = capng_save_state(); + if (!cap.saved) { + pthread_mutex_unlock(&cap.mutex); + fuse_log(FUSE_LOG_ERR, "capng_save_state (thread)\n"); + return -EINVAL; + } + pthread_mutex_unlock(&cap.mutex); + + /* + * We want to use the loaded state for our pid, + * not the original + */ + capng_setpid(syscall(SYS_gettid)); + cap_loaded = true; + } + return 0; +} + +/* + * Helpers for dropping and regaining effective capabilities. Returns 0 + * on success, error otherwise + */ +static int drop_effective_cap(const char *cap_name, bool *cap_dropped) +{ + int cap, ret; + + cap = capng_name_to_capability(cap_name); + if (cap < 0) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n", + cap_name, strerror(errno)); + goto out; + } + + if (load_capng()) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "load_capng() failed\n"); + goto out; + } + + /* We dont have this capability in effective set already. */ + if (!capng_have_capability(CAPNG_EFFECTIVE, cap)) { + ret = 0; + goto out; + } + + if (capng_update(CAPNG_DROP, CAPNG_EFFECTIVE, cap)) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "capng_update(DROP,) failed\n"); + goto out; + } + + if (capng_apply(CAPNG_SELECT_CAPS)) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "drop:capng_apply() failed\n"); + goto out; + } + + ret = 0; + if (cap_dropped) { + *cap_dropped = true; + } + +out: + return ret; +} + +static int gain_effective_cap(const char *cap_name) +{ + int cap; + int ret = 0; + + cap = capng_name_to_capability(cap_name); + if (cap < 0) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "capng_name_to_capability(%s) failed:%s\n", + cap_name, strerror(errno)); + goto out; + } + + if (load_capng()) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "load_capng() failed\n"); + goto out; + } + + if (capng_update(CAPNG_ADD, CAPNG_EFFECTIVE, cap)) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "capng_update(ADD,) failed\n"); + goto out; + } + + if (capng_apply(CAPNG_SELECT_CAPS)) { + ret = errno; + fuse_log(FUSE_LOG_ERR, "gain:capng_apply() failed\n"); + goto out; + } + ret = 0; + +out: + return ret; +} + +/* + * The host kernel normally drops security.capability xattr's on + * any write, however if we're remapping xattr names we need to drop + * whatever the clients security.capability is actually stored as. + */ +static int drop_security_capability(const struct lo_data *lo, int fd) +{ + if (!lo->xattr_security_capability) { + /* We didn't remap the name, let the host kernel do it */ + return 0; + } + if (!fremovexattr(fd, lo->xattr_security_capability)) { + /* All good */ + return 0; + } + + switch (errno) { + case ENODATA: + /* Attribute didn't exist, that's fine */ + return 0; + + case ENOTSUP: + /* FS didn't support attribute anyway, also fine */ + return 0; + + default: + /* Hmm other error */ + return errno; + } +} + +static void lo_map_init(struct lo_map *map) +{ + map->elems = NULL; + map->nelems = 0; + map->freelist = -1; +} + +static void lo_map_destroy(struct lo_map *map) +{ + g_free(map->elems); +} + +static int lo_map_grow(struct lo_map *map, size_t new_nelems) +{ + struct lo_map_elem *new_elems; + size_t i; + + if (new_nelems <= map->nelems) { + return 1; + } + + new_elems = g_try_realloc_n(map->elems, new_nelems, sizeof(map->elems[0])); + if (!new_elems) { + return 0; + } + + for (i = map->nelems; i < new_nelems; i++) { + new_elems[i].freelist = i + 1; + new_elems[i].in_use = false; + } + new_elems[new_nelems - 1].freelist = -1; + + map->elems = new_elems; + map->freelist = map->nelems; + map->nelems = new_nelems; + return 1; +} + +static struct lo_map_elem *lo_map_alloc_elem(struct lo_map *map) +{ + struct lo_map_elem *elem; + + if (map->freelist == -1 && !lo_map_grow(map, map->nelems + 256)) { + return NULL; + } + + elem = &map->elems[map->freelist]; + map->freelist = elem->freelist; + + elem->in_use = true; + + return elem; +} + +static struct lo_map_elem *lo_map_reserve(struct lo_map *map, size_t key) +{ + ssize_t *prev; + + if (!lo_map_grow(map, key + 1)) { + return NULL; + } + + for (prev = &map->freelist; *prev != -1; + prev = &map->elems[*prev].freelist) { + if (*prev == key) { + struct lo_map_elem *elem = &map->elems[key]; + + *prev = elem->freelist; + elem->in_use = true; + return elem; + } + } + return NULL; +} + +static struct lo_map_elem *lo_map_get(struct lo_map *map, size_t key) +{ + if (key >= map->nelems) { + return NULL; + } + if (!map->elems[key].in_use) { + return NULL; + } + return &map->elems[key]; +} + +static void lo_map_remove(struct lo_map *map, size_t key) +{ + struct lo_map_elem *elem; + + if (key >= map->nelems) { + return; + } + + elem = &map->elems[key]; + if (!elem->in_use) { + return; + } + + elem->in_use = false; + + elem->freelist = map->freelist; + map->freelist = key; +} + +/* Assumes lo->mutex is held */ +static ssize_t lo_add_fd_mapping(struct lo_data *lo, int fd) +{ + struct lo_map_elem *elem; + + elem = lo_map_alloc_elem(&lo->fd_map); + if (!elem) { + return -1; + } + + elem->fd = fd; + return elem - lo->fd_map.elems; +} + +/* Assumes lo->mutex is held */ +static ssize_t lo_add_dirp_mapping(fuse_req_t req, struct lo_dirp *dirp) +{ + struct lo_map_elem *elem; + + elem = lo_map_alloc_elem(&lo_data(req)->dirp_map); + if (!elem) { + return -1; + } + + elem->dirp = dirp; + return elem - lo_data(req)->dirp_map.elems; +} + +/* Assumes lo->mutex is held */ +static ssize_t lo_add_inode_mapping(fuse_req_t req, struct lo_inode *inode) +{ + struct lo_map_elem *elem; + + elem = lo_map_alloc_elem(&lo_data(req)->ino_map); + if (!elem) { + return -1; + } + + elem->inode = inode; + return elem - lo_data(req)->ino_map.elems; +} + +static void lo_inode_put(struct lo_data *lo, struct lo_inode **inodep) +{ + struct lo_inode *inode = *inodep; + + if (!inode) { + return; + } + + *inodep = NULL; + + if (g_atomic_int_dec_and_test(&inode->refcount)) { + close(inode->fd); + free(inode); + } +} + +/* Caller must release refcount using lo_inode_put() */ +static struct lo_inode *lo_inode(fuse_req_t req, fuse_ino_t ino) +{ + struct lo_data *lo = lo_data(req); + struct lo_map_elem *elem; + + pthread_mutex_lock(&lo->mutex); + elem = lo_map_get(&lo->ino_map, ino); + if (elem) { + g_atomic_int_inc(&elem->inode->refcount); + } + pthread_mutex_unlock(&lo->mutex); + + if (!elem) { + return NULL; + } + + return elem->inode; +} + +/* + * TODO Remove this helper and force callers to hold an inode refcount until + * they are done with the fd. This will be done in a later patch to make + * review easier. + */ +static int lo_fd(fuse_req_t req, fuse_ino_t ino) +{ + struct lo_inode *inode = lo_inode(req, ino); + int fd; + + if (!inode) { + return -1; + } + + fd = inode->fd; + lo_inode_put(lo_data(req), &inode); + return fd; +} + +/* + * Open a file descriptor for an inode. Returns -EBADF if the inode is not a + * regular file or a directory. + * + * Use this helper function instead of raw openat(2) to prevent security issues + * when a malicious client opens special files such as block device nodes. + * Symlink inodes are also rejected since symlinks must already have been + * traversed on the client side. + */ +static int lo_inode_open(struct lo_data *lo, struct lo_inode *inode, + int open_flags) +{ + g_autofree char *fd_str = g_strdup_printf("%d", inode->fd); + int fd; + + if (!S_ISREG(inode->filetype) && !S_ISDIR(inode->filetype)) { + return -EBADF; + } + + /* + * The file is a symlink so O_NOFOLLOW must be ignored. We checked earlier + * that the inode is not a special file but if an external process races + * with us then symlinks are traversed here. It is not possible to escape + * the shared directory since it is mounted as "/" though. + */ + fd = openat(lo->proc_self_fd, fd_str, open_flags & ~O_NOFOLLOW); + if (fd < 0) { + return -errno; + } + return fd; +} + +static void lo_init(void *userdata, struct fuse_conn_info *conn) +{ + struct lo_data *lo = (struct lo_data *)userdata; + + if (conn->capable & FUSE_CAP_EXPORT_SUPPORT) { + conn->want |= FUSE_CAP_EXPORT_SUPPORT; + } + + if (lo->writeback && conn->capable & FUSE_CAP_WRITEBACK_CACHE) { + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating writeback\n"); + conn->want |= FUSE_CAP_WRITEBACK_CACHE; + } + if (conn->capable & FUSE_CAP_FLOCK_LOCKS) { + if (lo->flock) { + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating flock locks\n"); + conn->want |= FUSE_CAP_FLOCK_LOCKS; + } else { + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling flock locks\n"); + conn->want &= ~FUSE_CAP_FLOCK_LOCKS; + } + } + + if (conn->capable & FUSE_CAP_POSIX_LOCKS) { + if (lo->posix_lock) { + fuse_log(FUSE_LOG_DEBUG, "lo_init: activating posix locks\n"); + conn->want |= FUSE_CAP_POSIX_LOCKS; + } else { + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix locks\n"); + conn->want &= ~FUSE_CAP_POSIX_LOCKS; + } + } + + if ((lo->cache == CACHE_NONE && !lo->readdirplus_set) || + lo->readdirplus_clear) { + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling readdirplus\n"); + conn->want &= ~FUSE_CAP_READDIRPLUS; + } + + if (!(conn->capable & FUSE_CAP_SUBMOUNTS) && lo->announce_submounts) { + fuse_log(FUSE_LOG_WARNING, "lo_init: Cannot announce submounts, client " + "does not support it\n"); + lo->announce_submounts = false; + } + + if (lo->user_killpriv_v2 == 1) { + /* + * User explicitly asked for this option. Enable it unconditionally. + * If connection does not have this capability, it should fail + * in fuse_lowlevel.c + */ + fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n"); + conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2; + lo->killpriv_v2 = 1; + } else if (lo->user_killpriv_v2 == -1 && + conn->capable & FUSE_CAP_HANDLE_KILLPRIV_V2) { + /* + * User did not specify a value for killpriv_v2. By default enable it + * if connection offers this capability + */ + fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling killpriv_v2\n"); + conn->want |= FUSE_CAP_HANDLE_KILLPRIV_V2; + lo->killpriv_v2 = 1; + } else { + /* + * Either user specified to disable killpriv_v2, or connection does + * not offer this capability. Disable killpriv_v2 in both the cases + */ + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling killpriv_v2\n"); + conn->want &= ~FUSE_CAP_HANDLE_KILLPRIV_V2; + lo->killpriv_v2 = 0; + } + + if (lo->user_posix_acl == 1) { + /* + * User explicitly asked for this option. Enable it unconditionally. + * If connection does not have this capability, print error message + * now. It will fail later in fuse_lowlevel.c + */ + if (!(conn->capable & FUSE_CAP_POSIX_ACL) || + !(conn->capable & FUSE_CAP_DONT_MASK) || + !(conn->capable & FUSE_CAP_SETXATTR_EXT)) { + fuse_log(FUSE_LOG_ERR, "lo_init: Can not enable posix acl." + " kernel does not support FUSE_POSIX_ACL, FUSE_DONT_MASK" + " or FUSE_SETXATTR_EXT capability.\n"); + } else { + fuse_log(FUSE_LOG_DEBUG, "lo_init: enabling posix acl\n"); + } + + conn->want |= FUSE_CAP_POSIX_ACL | FUSE_CAP_DONT_MASK | + FUSE_CAP_SETXATTR_EXT; + lo->change_umask = true; + lo->posix_acl = true; + } else { + /* User either did not specify anything or wants it disabled */ + fuse_log(FUSE_LOG_DEBUG, "lo_init: disabling posix_acl\n"); + conn->want &= ~FUSE_CAP_POSIX_ACL; + } +} + +static void lo_getattr(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + int res; + struct stat buf; + struct lo_data *lo = lo_data(req); + + (void)fi; + + res = + fstatat(lo_fd(req, ino), "", &buf, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); + if (res == -1) { + return (void)fuse_reply_err(req, errno); + } + + fuse_reply_attr(req, &buf, lo->timeout); +} + +static int lo_fi_fd(fuse_req_t req, struct fuse_file_info *fi) +{ + struct lo_data *lo = lo_data(req); + struct lo_map_elem *elem; + + pthread_mutex_lock(&lo->mutex); + elem = lo_map_get(&lo->fd_map, fi->fh); + pthread_mutex_unlock(&lo->mutex); + + if (!elem) { + return -1; + } + + return elem->fd; +} + +static void lo_setattr(fuse_req_t req, fuse_ino_t ino, struct stat *attr, + int valid, struct fuse_file_info *fi) +{ + int saverr; + char procname[64]; + struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + int ifd; + int res; + int fd = -1; + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + ifd = inode->fd; + + /* If fi->fh is invalid we'll report EBADF later */ + if (fi) { + fd = lo_fi_fd(req, fi); + } + + if (valid & FUSE_SET_ATTR_MODE) { + if (fi) { + res = fchmod(fd, attr->st_mode); + } else { + sprintf(procname, "%i", ifd); + res = fchmodat(lo->proc_self_fd, procname, attr->st_mode, 0); + } + if (res == -1) { + saverr = errno; + goto out_err; + } + } + if (valid & (FUSE_SET_ATTR_UID | FUSE_SET_ATTR_GID)) { + uid_t uid = (valid & FUSE_SET_ATTR_UID) ? attr->st_uid : (uid_t)-1; + gid_t gid = (valid & FUSE_SET_ATTR_GID) ? attr->st_gid : (gid_t)-1; + + saverr = drop_security_capability(lo, ifd); + if (saverr) { + goto out_err; + } + + res = fchownat(ifd, "", uid, gid, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); + if (res == -1) { + saverr = errno; + goto out_err; + } + } + if (valid & FUSE_SET_ATTR_SIZE) { + int truncfd; + bool kill_suidgid; + bool cap_fsetid_dropped = false; + + kill_suidgid = lo->killpriv_v2 && (valid & FUSE_SET_ATTR_KILL_SUIDGID); + if (fi) { + truncfd = fd; + } else { + truncfd = lo_inode_open(lo, inode, O_RDWR); + if (truncfd < 0) { + saverr = -truncfd; + goto out_err; + } + } + + saverr = drop_security_capability(lo, truncfd); + if (saverr) { + if (!fi) { + close(truncfd); + } + goto out_err; + } + + if (kill_suidgid) { + res = drop_effective_cap("FSETID", &cap_fsetid_dropped); + if (res != 0) { + saverr = res; + if (!fi) { + close(truncfd); + } + goto out_err; + } + } + + res = ftruncate(truncfd, attr->st_size); + saverr = res == -1 ? errno : 0; + + if (cap_fsetid_dropped) { + if (gain_effective_cap("FSETID")) { + fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n"); + } + } + if (!fi) { + close(truncfd); + } + if (res == -1) { + goto out_err; + } + } + if (valid & (FUSE_SET_ATTR_ATIME | FUSE_SET_ATTR_MTIME)) { + struct timespec tv[2]; + + tv[0].tv_sec = 0; + tv[1].tv_sec = 0; + tv[0].tv_nsec = UTIME_OMIT; + tv[1].tv_nsec = UTIME_OMIT; + + if (valid & FUSE_SET_ATTR_ATIME_NOW) { + tv[0].tv_nsec = UTIME_NOW; + } else if (valid & FUSE_SET_ATTR_ATIME) { + tv[0] = attr->st_atim; + } + + if (valid & FUSE_SET_ATTR_MTIME_NOW) { + tv[1].tv_nsec = UTIME_NOW; + } else if (valid & FUSE_SET_ATTR_MTIME) { + tv[1] = attr->st_mtim; + } + + if (fi) { + res = futimens(fd, tv); + } else { + sprintf(procname, "%i", inode->fd); + res = utimensat(lo->proc_self_fd, procname, tv, 0); + } + if (res == -1) { + saverr = errno; + goto out_err; + } + } + lo_inode_put(lo, &inode); + + return lo_getattr(req, ino, fi); + +out_err: + lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); +} + +static struct lo_inode *lo_find(struct lo_data *lo, struct stat *st, + uint64_t mnt_id) +{ + struct lo_inode *p; + struct lo_key key = { + .ino = st->st_ino, + .dev = st->st_dev, + .mnt_id = mnt_id, + }; + + pthread_mutex_lock(&lo->mutex); + p = g_hash_table_lookup(lo->inodes, &key); + if (p) { + assert(p->nlookup > 0); + p->nlookup++; + g_atomic_int_inc(&p->refcount); + } + pthread_mutex_unlock(&lo->mutex); + + return p; +} + +/* value_destroy_func for posix_locks GHashTable */ +static void posix_locks_value_destroy(gpointer data) +{ + struct lo_inode_plock *plock = data; + + /* + * We had used open() for locks and had only one fd. So + * closing this fd should release all OFD locks. + */ + close(plock->fd); + free(plock); +} + +static int do_statx(struct lo_data *lo, int dirfd, const char *pathname, + struct stat *statbuf, int flags, uint64_t *mnt_id) +{ + int res; + +#if defined(CONFIG_STATX) && defined(STATX_MNT_ID) + if (lo->use_statx) { + struct statx statxbuf; + + res = statx(dirfd, pathname, flags, STATX_BASIC_STATS | STATX_MNT_ID, + &statxbuf); + if (!res) { + memset(statbuf, 0, sizeof(*statbuf)); + statbuf->st_dev = makedev(statxbuf.stx_dev_major, + statxbuf.stx_dev_minor); + statbuf->st_ino = statxbuf.stx_ino; + statbuf->st_mode = statxbuf.stx_mode; + statbuf->st_nlink = statxbuf.stx_nlink; + statbuf->st_uid = statxbuf.stx_uid; + statbuf->st_gid = statxbuf.stx_gid; + statbuf->st_rdev = makedev(statxbuf.stx_rdev_major, + statxbuf.stx_rdev_minor); + statbuf->st_size = statxbuf.stx_size; + statbuf->st_blksize = statxbuf.stx_blksize; + statbuf->st_blocks = statxbuf.stx_blocks; + statbuf->st_atim.tv_sec = statxbuf.stx_atime.tv_sec; + statbuf->st_atim.tv_nsec = statxbuf.stx_atime.tv_nsec; + statbuf->st_mtim.tv_sec = statxbuf.stx_mtime.tv_sec; + statbuf->st_mtim.tv_nsec = statxbuf.stx_mtime.tv_nsec; + statbuf->st_ctim.tv_sec = statxbuf.stx_ctime.tv_sec; + statbuf->st_ctim.tv_nsec = statxbuf.stx_ctime.tv_nsec; + + if (statxbuf.stx_mask & STATX_MNT_ID) { + *mnt_id = statxbuf.stx_mnt_id; + } else { + *mnt_id = 0; + } + return 0; + } else if (errno != ENOSYS) { + return -1; + } + lo->use_statx = false; + /* fallback */ + } +#endif + res = fstatat(dirfd, pathname, statbuf, flags); + if (res == -1) { + return -1; + } + *mnt_id = 0; + + return 0; +} + +/* + * Increments nlookup on the inode on success. unref_inode_lolocked() must be + * called eventually to decrement nlookup again. If inodep is non-NULL, the + * inode pointer is stored and the caller must call lo_inode_put(). + */ +static int lo_do_lookup(fuse_req_t req, fuse_ino_t parent, const char *name, + struct fuse_entry_param *e, + struct lo_inode **inodep) +{ + int newfd; + int res; + int saverr; + uint64_t mnt_id; + struct lo_data *lo = lo_data(req); + struct lo_inode *inode = NULL; + struct lo_inode *dir = lo_inode(req, parent); + + if (inodep) { + *inodep = NULL; /* in case there is an error */ + } + + /* + * name_to_handle_at() and open_by_handle_at() can reach here with fuse + * mount point in guest, but we don't have its inode info in the + * ino_map. + */ + if (!dir) { + return ENOENT; + } + + memset(e, 0, sizeof(*e)); + e->attr_timeout = lo->timeout; + e->entry_timeout = lo->timeout; + + /* Do not allow escaping root directory */ + if (dir == &lo->root && strcmp(name, "..") == 0) { + name = "."; + } + + newfd = openat(dir->fd, name, O_PATH | O_NOFOLLOW); + if (newfd == -1) { + goto out_err; + } + + res = do_statx(lo, newfd, "", &e->attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW, + &mnt_id); + if (res == -1) { + goto out_err; + } + + if (S_ISDIR(e->attr.st_mode) && lo->announce_submounts && + (e->attr.st_dev != dir->key.dev || mnt_id != dir->key.mnt_id)) { + e->attr_flags |= FUSE_ATTR_SUBMOUNT; + } + + inode = lo_find(lo, &e->attr, mnt_id); + if (inode) { + close(newfd); + } else { + inode = calloc(1, sizeof(struct lo_inode)); + if (!inode) { + goto out_err; + } + + /* cache only filetype */ + inode->filetype = (e->attr.st_mode & S_IFMT); + + /* + * One for the caller and one for nlookup (released in + * unref_inode_lolocked()) + */ + g_atomic_int_set(&inode->refcount, 2); + + inode->nlookup = 1; + inode->fd = newfd; + inode->key.ino = e->attr.st_ino; + inode->key.dev = e->attr.st_dev; + inode->key.mnt_id = mnt_id; + if (lo->posix_lock) { + pthread_mutex_init(&inode->plock_mutex, NULL); + inode->posix_locks = g_hash_table_new_full( + g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy); + } + pthread_mutex_lock(&lo->mutex); + inode->fuse_ino = lo_add_inode_mapping(req, inode); + g_hash_table_insert(lo->inodes, &inode->key, inode); + pthread_mutex_unlock(&lo->mutex); + } + e->ino = inode->fuse_ino; + + /* Transfer ownership of inode pointer to caller or drop it */ + if (inodep) { + *inodep = inode; + } else { + lo_inode_put(lo, &inode); + } + + lo_inode_put(lo, &dir); + + fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, + name, (unsigned long long)e->ino); + + return 0; + +out_err: + saverr = errno; + if (newfd != -1) { + close(newfd); + } + lo_inode_put(lo, &inode); + lo_inode_put(lo, &dir); + return saverr; +} + +static void lo_lookup(fuse_req_t req, fuse_ino_t parent, const char *name) +{ + struct fuse_entry_param e; + int err; + + fuse_log(FUSE_LOG_DEBUG, "lo_lookup(parent=%" PRIu64 ", name=%s)\n", parent, + name); + + if (is_empty(name)) { + fuse_reply_err(req, ENOENT); + return; + } + + /* + * Don't use is_safe_path_component(), allow "." and ".." for NFS export + * support. + */ + if (strchr(name, '/')) { + fuse_reply_err(req, EINVAL); + return; + } + + err = lo_do_lookup(req, parent, name, &e, NULL); + if (err) { + fuse_reply_err(req, err); + } else { + fuse_reply_entry(req, &e); + } +} + +/* + * On some archs, setres*id is limited to 2^16 but they + * provide setres*id32 variants that allow 2^32. + * Others just let setres*id do 2^32 anyway. + */ +#ifdef SYS_setresgid32 +#define OURSYS_setresgid SYS_setresgid32 +#else +#define OURSYS_setresgid SYS_setresgid +#endif + +#ifdef SYS_setresuid32 +#define OURSYS_setresuid SYS_setresuid32 +#else +#define OURSYS_setresuid SYS_setresuid +#endif + +/* + * Change to uid/gid of caller so that file is created with + * ownership of caller. + * TODO: What about selinux context? + */ +static int lo_change_cred(fuse_req_t req, struct lo_cred *old, + bool change_umask) +{ + int res; + + old->euid = geteuid(); + old->egid = getegid(); + + res = syscall(OURSYS_setresgid, -1, fuse_req_ctx(req)->gid, -1); + if (res == -1) { + return errno; + } + + res = syscall(OURSYS_setresuid, -1, fuse_req_ctx(req)->uid, -1); + if (res == -1) { + int errno_save = errno; + + syscall(OURSYS_setresgid, -1, old->egid, -1); + return errno_save; + } + + if (change_umask) { + old->umask = umask(req->ctx.umask); + } + return 0; +} + +/* Regain Privileges */ +static void lo_restore_cred(struct lo_cred *old, bool restore_umask) +{ + int res; + + res = syscall(OURSYS_setresuid, -1, old->euid, -1); + if (res == -1) { + fuse_log(FUSE_LOG_ERR, "seteuid(%u): %m\n", old->euid); + exit(1); + } + + res = syscall(OURSYS_setresgid, -1, old->egid, -1); + if (res == -1) { + fuse_log(FUSE_LOG_ERR, "setegid(%u): %m\n", old->egid); + exit(1); + } + + if (restore_umask) + umask(old->umask); +} + +/* + * A helper to change cred and drop capability. Returns 0 on success and + * errno on error + */ +static int lo_drop_cap_change_cred(fuse_req_t req, struct lo_cred *old, + bool change_umask, const char *cap_name, + bool *cap_dropped) +{ + int ret; + bool __cap_dropped; + + assert(cap_name); + + ret = drop_effective_cap(cap_name, &__cap_dropped); + if (ret) { + return ret; + } + + ret = lo_change_cred(req, old, change_umask); + if (ret) { + if (__cap_dropped) { + if (gain_effective_cap(cap_name)) { + fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name); + } + } + } + + if (cap_dropped) { + *cap_dropped = __cap_dropped; + } + return ret; +} + +static void lo_restore_cred_gain_cap(struct lo_cred *old, bool restore_umask, + const char *cap_name) +{ + assert(cap_name); + + lo_restore_cred(old, restore_umask); + + if (gain_effective_cap(cap_name)) { + fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_%s\n", cap_name); + } +} + +static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t parent, + const char *name, mode_t mode, dev_t rdev, + const char *link) +{ + int res; + int saverr; + struct lo_data *lo = lo_data(req); + struct lo_inode *dir; + struct fuse_entry_param e; + struct lo_cred old = {}; + + if (is_empty(name)) { + fuse_reply_err(req, ENOENT); + return; + } + + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); + return; + } + + dir = lo_inode(req, parent); + if (!dir) { + fuse_reply_err(req, EBADF); + return; + } + + saverr = lo_change_cred(req, &old, lo->change_umask && !S_ISLNK(mode)); + if (saverr) { + goto out; + } + + res = mknod_wrapper(dir->fd, name, link, mode, rdev); + + saverr = errno; + + lo_restore_cred(&old, lo->change_umask && !S_ISLNK(mode)); + + if (res == -1) { + goto out; + } + + saverr = lo_do_lookup(req, parent, name, &e, NULL); + if (saverr) { + goto out; + } + + fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, + name, (unsigned long long)e.ino); + + fuse_reply_entry(req, &e); + lo_inode_put(lo, &dir); + return; + +out: + lo_inode_put(lo, &dir); + fuse_reply_err(req, saverr); +} + +static void lo_mknod(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, dev_t rdev) +{ + lo_mknod_symlink(req, parent, name, mode, rdev, NULL); +} + +static void lo_mkdir(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode) +{ + lo_mknod_symlink(req, parent, name, S_IFDIR | mode, 0, NULL); +} + +static void lo_symlink(fuse_req_t req, const char *link, fuse_ino_t parent, + const char *name) +{ + lo_mknod_symlink(req, parent, name, S_IFLNK, 0, link); +} + +static void lo_link(fuse_req_t req, fuse_ino_t ino, fuse_ino_t parent, + const char *name) +{ + int res; + struct lo_data *lo = lo_data(req); + struct lo_inode *parent_inode; + struct lo_inode *inode; + struct fuse_entry_param e; + char procname[64]; + int saverr; + + if (is_empty(name)) { + fuse_reply_err(req, ENOENT); + return; + } + + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); + return; + } + + parent_inode = lo_inode(req, parent); + inode = lo_inode(req, ino); + if (!parent_inode || !inode) { + errno = EBADF; + goto out_err; + } + + memset(&e, 0, sizeof(struct fuse_entry_param)); + e.attr_timeout = lo->timeout; + e.entry_timeout = lo->timeout; + + sprintf(procname, "%i", inode->fd); + res = linkat(lo->proc_self_fd, procname, parent_inode->fd, name, + AT_SYMLINK_FOLLOW); + if (res == -1) { + goto out_err; + } + + res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW); + if (res == -1) { + goto out_err; + } + + pthread_mutex_lock(&lo->mutex); + inode->nlookup++; + pthread_mutex_unlock(&lo->mutex); + e.ino = inode->fuse_ino; + + fuse_log(FUSE_LOG_DEBUG, " %lli/%s -> %lli\n", (unsigned long long)parent, + name, (unsigned long long)e.ino); + + fuse_reply_entry(req, &e); + lo_inode_put(lo, &parent_inode); + lo_inode_put(lo, &inode); + return; + +out_err: + saverr = errno; + lo_inode_put(lo, &parent_inode); + lo_inode_put(lo, &inode); + fuse_reply_err(req, saverr); +} + +/* Increments nlookup and caller must release refcount using lo_inode_put() */ +static struct lo_inode *lookup_name(fuse_req_t req, fuse_ino_t parent, + const char *name) +{ + int res; + uint64_t mnt_id; + struct stat attr; + struct lo_data *lo = lo_data(req); + struct lo_inode *dir = lo_inode(req, parent); + + if (!dir) { + return NULL; + } + + res = do_statx(lo, dir->fd, name, &attr, AT_SYMLINK_NOFOLLOW, &mnt_id); + lo_inode_put(lo, &dir); + if (res == -1) { + return NULL; + } + + return lo_find(lo, &attr, mnt_id); +} + +static void lo_rmdir(fuse_req_t req, fuse_ino_t parent, const char *name) +{ + int res; + struct lo_inode *inode; + struct lo_data *lo = lo_data(req); + + if (is_empty(name)) { + fuse_reply_err(req, ENOENT); + return; + } + + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); + return; + } + + inode = lookup_name(req, parent, name); + if (!inode) { + fuse_reply_err(req, EIO); + return; + } + + res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR); + + fuse_reply_err(req, res == -1 ? errno : 0); + unref_inode_lolocked(lo, inode, 1); + lo_inode_put(lo, &inode); +} + +static void lo_rename(fuse_req_t req, fuse_ino_t parent, const char *name, + fuse_ino_t newparent, const char *newname, + unsigned int flags) +{ + int res; + struct lo_inode *parent_inode; + struct lo_inode *newparent_inode; + struct lo_inode *oldinode = NULL; + struct lo_inode *newinode = NULL; + struct lo_data *lo = lo_data(req); + + if (is_empty(name) || is_empty(newname)) { + fuse_reply_err(req, ENOENT); + return; + } + + if (!is_safe_path_component(name) || !is_safe_path_component(newname)) { + fuse_reply_err(req, EINVAL); + return; + } + + parent_inode = lo_inode(req, parent); + newparent_inode = lo_inode(req, newparent); + if (!parent_inode || !newparent_inode) { + fuse_reply_err(req, EBADF); + goto out; + } + + oldinode = lookup_name(req, parent, name); + newinode = lookup_name(req, newparent, newname); + + if (!oldinode) { + fuse_reply_err(req, EIO); + goto out; + } + + if (flags) { +#ifndef SYS_renameat2 + fuse_reply_err(req, EINVAL); +#else + res = syscall(SYS_renameat2, parent_inode->fd, name, + newparent_inode->fd, newname, flags); + if (res == -1 && errno == ENOSYS) { + fuse_reply_err(req, EINVAL); + } else { + fuse_reply_err(req, res == -1 ? errno : 0); + } +#endif + goto out; + } + + res = renameat(parent_inode->fd, name, newparent_inode->fd, newname); + + fuse_reply_err(req, res == -1 ? errno : 0); +out: + unref_inode_lolocked(lo, oldinode, 1); + unref_inode_lolocked(lo, newinode, 1); + lo_inode_put(lo, &oldinode); + lo_inode_put(lo, &newinode); + lo_inode_put(lo, &parent_inode); + lo_inode_put(lo, &newparent_inode); +} + +static void lo_unlink(fuse_req_t req, fuse_ino_t parent, const char *name) +{ + int res; + struct lo_inode *inode; + struct lo_data *lo = lo_data(req); + + if (is_empty(name)) { + fuse_reply_err(req, ENOENT); + return; + } + + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); + return; + } + + inode = lookup_name(req, parent, name); + if (!inode) { + fuse_reply_err(req, EIO); + return; + } + + res = unlinkat(lo_fd(req, parent), name, 0); + + fuse_reply_err(req, res == -1 ? errno : 0); + unref_inode_lolocked(lo, inode, 1); + lo_inode_put(lo, &inode); +} + +/* To be called with lo->mutex held */ +static void unref_inode(struct lo_data *lo, struct lo_inode *inode, uint64_t n) +{ + if (!inode) { + return; + } + + assert(inode->nlookup >= n); + inode->nlookup -= n; + if (!inode->nlookup) { + lo_map_remove(&lo->ino_map, inode->fuse_ino); + g_hash_table_remove(lo->inodes, &inode->key); + if (lo->posix_lock) { + if (g_hash_table_size(inode->posix_locks)) { + fuse_log(FUSE_LOG_WARNING, "Hash table is not empty\n"); + } + g_hash_table_destroy(inode->posix_locks); + pthread_mutex_destroy(&inode->plock_mutex); + } + /* Drop our refcount from lo_do_lookup() */ + lo_inode_put(lo, &inode); + } +} + +static void unref_inode_lolocked(struct lo_data *lo, struct lo_inode *inode, + uint64_t n) +{ + if (!inode) { + return; + } + + pthread_mutex_lock(&lo->mutex); + unref_inode(lo, inode, n); + pthread_mutex_unlock(&lo->mutex); +} + +static void lo_forget_one(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) +{ + struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + + inode = lo_inode(req, ino); + if (!inode) { + return; + } + + fuse_log(FUSE_LOG_DEBUG, " forget %lli %lli -%lli\n", + (unsigned long long)ino, (unsigned long long)inode->nlookup, + (unsigned long long)nlookup); + + unref_inode_lolocked(lo, inode, nlookup); + lo_inode_put(lo, &inode); +} + +static void lo_forget(fuse_req_t req, fuse_ino_t ino, uint64_t nlookup) +{ + lo_forget_one(req, ino, nlookup); + fuse_reply_none(req); +} + +static void lo_forget_multi(fuse_req_t req, size_t count, + struct fuse_forget_data *forgets) +{ + int i; + + for (i = 0; i < count; i++) { + lo_forget_one(req, forgets[i].ino, forgets[i].nlookup); + } + fuse_reply_none(req); +} + +static void lo_readlink(fuse_req_t req, fuse_ino_t ino) +{ + char buf[PATH_MAX + 1]; + int res; + + res = readlinkat(lo_fd(req, ino), "", buf, sizeof(buf)); + if (res == -1) { + return (void)fuse_reply_err(req, errno); + } + + if (res == sizeof(buf)) { + return (void)fuse_reply_err(req, ENAMETOOLONG); + } + + buf[res] = '\0'; + + fuse_reply_readlink(req, buf); +} + +struct lo_dirp { + gint refcount; + DIR *dp; + struct dirent *entry; + off_t offset; +}; + +static void lo_dirp_put(struct lo_dirp **dp) +{ + struct lo_dirp *d = *dp; + + if (!d) { + return; + } + *dp = NULL; + + if (g_atomic_int_dec_and_test(&d->refcount)) { + closedir(d->dp); + free(d); + } +} + +/* Call lo_dirp_put() on the return value when no longer needed */ +static struct lo_dirp *lo_dirp(fuse_req_t req, struct fuse_file_info *fi) +{ + struct lo_data *lo = lo_data(req); + struct lo_map_elem *elem; + + pthread_mutex_lock(&lo->mutex); + elem = lo_map_get(&lo->dirp_map, fi->fh); + if (elem) { + g_atomic_int_inc(&elem->dirp->refcount); + } + pthread_mutex_unlock(&lo->mutex); + if (!elem) { + return NULL; + } + + return elem->dirp; +} + +static void lo_opendir(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + int error = ENOMEM; + struct lo_data *lo = lo_data(req); + struct lo_dirp *d; + int fd; + ssize_t fh; + + d = calloc(1, sizeof(struct lo_dirp)); + if (d == NULL) { + goto out_err; + } + + fd = openat(lo_fd(req, ino), ".", O_RDONLY); + if (fd == -1) { + goto out_errno; + } + + d->dp = fdopendir(fd); + if (d->dp == NULL) { + goto out_errno; + } + + d->offset = 0; + d->entry = NULL; + + g_atomic_int_set(&d->refcount, 1); /* paired with lo_releasedir() */ + pthread_mutex_lock(&lo->mutex); + fh = lo_add_dirp_mapping(req, d); + pthread_mutex_unlock(&lo->mutex); + if (fh == -1) { + goto out_err; + } + + fi->fh = fh; + if (lo->cache == CACHE_ALWAYS) { + fi->cache_readdir = 1; + } + fuse_reply_open(req, fi); + return; + +out_errno: + error = errno; +out_err: + if (d) { + if (d->dp) { + closedir(d->dp); + } else if (fd != -1) { + close(fd); + } + free(d); + } + fuse_reply_err(req, error); +} + +static void lo_do_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t offset, struct fuse_file_info *fi, int plus) +{ + struct lo_data *lo = lo_data(req); + struct lo_dirp *d = NULL; + struct lo_inode *dinode; + g_autofree char *buf = NULL; + char *p; + size_t rem = size; + int err = EBADF; + + dinode = lo_inode(req, ino); + if (!dinode) { + goto error; + } + + d = lo_dirp(req, fi); + if (!d) { + goto error; + } + + err = ENOMEM; + buf = g_try_malloc0(size); + if (!buf) { + goto error; + } + p = buf; + + if (offset != d->offset) { + seekdir(d->dp, offset); + d->entry = NULL; + d->offset = offset; + } + while (1) { + size_t entsize; + off_t nextoff; + const char *name; + + if (!d->entry) { + errno = 0; + d->entry = readdir(d->dp); + if (!d->entry) { + if (errno) { /* Error */ + err = errno; + goto error; + } else { /* End of stream */ + break; + } + } + } + nextoff = d->entry->d_off; + name = d->entry->d_name; + + fuse_ino_t entry_ino = 0; + struct fuse_entry_param e = (struct fuse_entry_param){ + .attr.st_ino = d->entry->d_ino, + .attr.st_mode = d->entry->d_type << 12, + }; + + /* Hide root's parent directory */ + if (dinode == &lo->root && strcmp(name, "..") == 0) { + e.attr.st_ino = lo->root.key.ino; + e.attr.st_mode = DT_DIR << 12; + } + + if (plus) { + if (!is_dot_or_dotdot(name)) { + err = lo_do_lookup(req, ino, name, &e, NULL); + if (err) { + goto error; + } + entry_ino = e.ino; + } + + entsize = fuse_add_direntry_plus(req, p, rem, name, &e, nextoff); + } else { + entsize = fuse_add_direntry(req, p, rem, name, &e.attr, nextoff); + } + if (entsize > rem) { + if (entry_ino != 0) { + lo_forget_one(req, entry_ino, 1); + } + break; + } + + p += entsize; + rem -= entsize; + + d->entry = NULL; + d->offset = nextoff; + } + + err = 0; +error: + lo_dirp_put(&d); + lo_inode_put(lo, &dinode); + + /* + * If there's an error, we can only signal it if we haven't stored + * any entries yet - otherwise we'd end up with wrong lookup + * counts for the entries that are already in the buffer. So we + * return what we've collected until that point. + */ + if (err && rem == size) { + fuse_reply_err(req, err); + } else { + fuse_reply_buf(req, buf, size - rem); + } +} + +static void lo_readdir(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t offset, struct fuse_file_info *fi) +{ + lo_do_readdir(req, ino, size, offset, fi, 0); +} + +static void lo_readdirplus(fuse_req_t req, fuse_ino_t ino, size_t size, + off_t offset, struct fuse_file_info *fi) +{ + lo_do_readdir(req, ino, size, offset, fi, 1); +} + +static void lo_releasedir(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + struct lo_data *lo = lo_data(req); + struct lo_map_elem *elem; + struct lo_dirp *d; + + (void)ino; + + pthread_mutex_lock(&lo->mutex); + elem = lo_map_get(&lo->dirp_map, fi->fh); + if (!elem) { + pthread_mutex_unlock(&lo->mutex); + fuse_reply_err(req, EBADF); + return; + } + + d = elem->dirp; + lo_map_remove(&lo->dirp_map, fi->fh); + pthread_mutex_unlock(&lo->mutex); + + lo_dirp_put(&d); /* paired with lo_opendir() */ + + fuse_reply_err(req, 0); +} + +static void update_open_flags(int writeback, int allow_direct_io, + struct fuse_file_info *fi) +{ + /* + * With writeback cache, kernel may send read requests even + * when userspace opened write-only + */ + if (writeback && (fi->flags & O_ACCMODE) == O_WRONLY) { + fi->flags &= ~O_ACCMODE; + fi->flags |= O_RDWR; + } + + /* + * With writeback cache, O_APPEND is handled by the kernel. + * This breaks atomicity (since the file may change in the + * underlying filesystem, so that the kernel's idea of the + * end of the file isn't accurate anymore). In this example, + * we just accept that. A more rigorous filesystem may want + * to return an error here + */ + if (writeback && (fi->flags & O_APPEND)) { + fi->flags &= ~O_APPEND; + } + + /* + * O_DIRECT in guest should not necessarily mean bypassing page + * cache on host as well. Therefore, we discard it by default + * ('-o no_allow_direct_io'). If somebody needs that behavior, + * the '-o allow_direct_io' option should be set. + */ + if (!allow_direct_io) { + fi->flags &= ~O_DIRECT; + } +} + +/* + * Open a regular file, set up an fd mapping, and fill out the struct + * fuse_file_info for it. If existing_fd is not negative, use that fd instead + * opening a new one. Takes ownership of existing_fd. + * + * Returns 0 on success or a positive errno. + */ +static int lo_do_open(struct lo_data *lo, struct lo_inode *inode, + int existing_fd, struct fuse_file_info *fi) +{ + ssize_t fh; + int fd = existing_fd; + int err; + bool cap_fsetid_dropped = false; + bool kill_suidgid = lo->killpriv_v2 && fi->kill_priv; + + update_open_flags(lo->writeback, lo->allow_direct_io, fi); + + if (fd < 0) { + if (kill_suidgid) { + err = drop_effective_cap("FSETID", &cap_fsetid_dropped); + if (err) { + return err; + } + } + + fd = lo_inode_open(lo, inode, fi->flags); + + if (cap_fsetid_dropped) { + if (gain_effective_cap("FSETID")) { + fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n"); + } + } + if (fd < 0) { + return -fd; + } + if (fi->flags & (O_TRUNC)) { + int err = drop_security_capability(lo, fd); + if (err) { + close(fd); + return err; + } + } + } + + pthread_mutex_lock(&lo->mutex); + fh = lo_add_fd_mapping(lo, fd); + pthread_mutex_unlock(&lo->mutex); + if (fh == -1) { + close(fd); + return ENOMEM; + } + + fi->fh = fh; + if (lo->cache == CACHE_NONE) { + fi->direct_io = 1; + } else if (lo->cache == CACHE_ALWAYS) { + fi->keep_cache = 1; + } + return 0; +} + +static void lo_create(fuse_req_t req, fuse_ino_t parent, const char *name, + mode_t mode, struct fuse_file_info *fi) +{ + int fd = -1; + struct lo_data *lo = lo_data(req); + struct lo_inode *parent_inode; + struct lo_inode *inode = NULL; + struct fuse_entry_param e; + int err; + struct lo_cred old = {}; + + fuse_log(FUSE_LOG_DEBUG, "lo_create(parent=%" PRIu64 ", name=%s)" + " kill_priv=%d\n", parent, name, fi->kill_priv); + + if (!is_safe_path_component(name)) { + fuse_reply_err(req, EINVAL); + return; + } + + parent_inode = lo_inode(req, parent); + if (!parent_inode) { + fuse_reply_err(req, EBADF); + return; + } + + err = lo_change_cred(req, &old, lo->change_umask); + if (err) { + goto out; + } + + update_open_flags(lo->writeback, lo->allow_direct_io, fi); + + /* Try to create a new file but don't open existing files */ + fd = openat(parent_inode->fd, name, fi->flags | O_CREAT | O_EXCL, mode); + err = fd == -1 ? errno : 0; + + lo_restore_cred(&old, lo->change_umask); + + /* Ignore the error if file exists and O_EXCL was not given */ + if (err && (err != EEXIST || (fi->flags & O_EXCL))) { + goto out; + } + + err = lo_do_lookup(req, parent, name, &e, &inode); + if (err) { + goto out; + } + + err = lo_do_open(lo, inode, fd, fi); + fd = -1; /* lo_do_open() takes ownership of fd */ + if (err) { + /* Undo lo_do_lookup() nlookup ref */ + unref_inode_lolocked(lo, inode, 1); + } + +out: + lo_inode_put(lo, &inode); + lo_inode_put(lo, &parent_inode); + + if (err) { + if (fd >= 0) { + close(fd); + } + + fuse_reply_err(req, err); + } else { + fuse_reply_create(req, &e, fi); + } +} + +/* Should be called with inode->plock_mutex held */ +static struct lo_inode_plock *lookup_create_plock_ctx(struct lo_data *lo, + struct lo_inode *inode, + uint64_t lock_owner, + pid_t pid, int *err) +{ + struct lo_inode_plock *plock; + int fd; + + plock = + g_hash_table_lookup(inode->posix_locks, GUINT_TO_POINTER(lock_owner)); + + if (plock) { + return plock; + } + + plock = malloc(sizeof(struct lo_inode_plock)); + if (!plock) { + *err = ENOMEM; + return NULL; + } + + /* Open another instance of file which can be used for ofd locks. */ + /* TODO: What if file is not writable? */ + fd = lo_inode_open(lo, inode, O_RDWR); + if (fd < 0) { + *err = -fd; + free(plock); + return NULL; + } + + plock->lock_owner = lock_owner; + plock->fd = fd; + g_hash_table_insert(inode->posix_locks, GUINT_TO_POINTER(plock->lock_owner), + plock); + return plock; +} + +static void lo_getlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + struct flock *lock) +{ + struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + struct lo_inode_plock *plock; + int ret, saverr = 0; + + fuse_log(FUSE_LOG_DEBUG, + "lo_getlk(ino=%" PRIu64 ", flags=%d)" + " owner=0x%" PRIx64 ", l_type=%d l_start=0x%" PRIx64 + " l_len=0x%" PRIx64 "\n", + ino, fi->flags, fi->lock_owner, lock->l_type, + (uint64_t)lock->l_start, (uint64_t)lock->l_len); + + if (!lo->posix_lock) { + fuse_reply_err(req, ENOSYS); + return; + } + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + pthread_mutex_lock(&inode->plock_mutex); + plock = + lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); + if (!plock) { + saverr = ret; + goto out; + } + + ret = fcntl(plock->fd, F_OFD_GETLK, lock); + if (ret == -1) { + saverr = errno; + } + +out: + pthread_mutex_unlock(&inode->plock_mutex); + lo_inode_put(lo, &inode); + + if (saverr) { + fuse_reply_err(req, saverr); + } else { + fuse_reply_lock(req, lock); + } +} + +static void lo_setlk(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + struct flock *lock, int sleep) +{ + struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + struct lo_inode_plock *plock; + int ret, saverr = 0; + + fuse_log(FUSE_LOG_DEBUG, + "lo_setlk(ino=%" PRIu64 ", flags=%d)" + " cmd=%d pid=%d owner=0x%" PRIx64 " sleep=%d l_whence=%d" + " l_start=0x%" PRIx64 " l_len=0x%" PRIx64 "\n", + ino, fi->flags, lock->l_type, lock->l_pid, fi->lock_owner, sleep, + lock->l_whence, (uint64_t)lock->l_start, (uint64_t)lock->l_len); + + if (!lo->posix_lock) { + fuse_reply_err(req, ENOSYS); + return; + } + + if (sleep) { + fuse_reply_err(req, EOPNOTSUPP); + return; + } + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + pthread_mutex_lock(&inode->plock_mutex); + plock = + lookup_create_plock_ctx(lo, inode, fi->lock_owner, lock->l_pid, &ret); + + if (!plock) { + saverr = ret; + goto out; + } + + /* TODO: Is it alright to modify flock? */ + lock->l_pid = 0; + ret = fcntl(plock->fd, F_OFD_SETLK, lock); + if (ret == -1) { + saverr = errno; + } + +out: + pthread_mutex_unlock(&inode->plock_mutex); + lo_inode_put(lo, &inode); + + fuse_reply_err(req, saverr); +} + +static void lo_fsyncdir(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) +{ + int res; + struct lo_dirp *d; + int fd; + + (void)ino; + + d = lo_dirp(req, fi); + if (!d) { + fuse_reply_err(req, EBADF); + return; + } + + fd = dirfd(d->dp); + if (datasync) { + res = fdatasync(fd); + } else { + res = fsync(fd); + } + + lo_dirp_put(&d); + + fuse_reply_err(req, res == -1 ? errno : 0); +} + +static void lo_open(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) +{ + struct lo_data *lo = lo_data(req); + struct lo_inode *inode = lo_inode(req, ino); + int err; + + fuse_log(FUSE_LOG_DEBUG, "lo_open(ino=%" PRIu64 ", flags=%d, kill_priv=%d)" + "\n", ino, fi->flags, fi->kill_priv); + + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + err = lo_do_open(lo, inode, -1, fi); + lo_inode_put(lo, &inode); + if (err) { + fuse_reply_err(req, err); + } else { + fuse_reply_open(req, fi); + } +} + +static void lo_release(fuse_req_t req, fuse_ino_t ino, + struct fuse_file_info *fi) +{ + struct lo_data *lo = lo_data(req); + struct lo_map_elem *elem; + int fd = -1; + + (void)ino; + + pthread_mutex_lock(&lo->mutex); + elem = lo_map_get(&lo->fd_map, fi->fh); + if (elem) { + fd = elem->fd; + elem = NULL; + lo_map_remove(&lo->fd_map, fi->fh); + } + pthread_mutex_unlock(&lo->mutex); + + close(fd); + fuse_reply_err(req, 0); +} + +static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi) +{ + int res; + (void)ino; + struct lo_inode *inode; + struct lo_data *lo = lo_data(req); + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + if (!S_ISREG(inode->filetype)) { + lo_inode_put(lo, &inode); + fuse_reply_err(req, EBADF); + return; + } + + /* An fd is going away. Cleanup associated posix locks */ + if (lo->posix_lock) { + pthread_mutex_lock(&inode->plock_mutex); + g_hash_table_remove(inode->posix_locks, + GUINT_TO_POINTER(fi->lock_owner)); + pthread_mutex_unlock(&inode->plock_mutex); + } + res = close(dup(lo_fi_fd(req, fi))); + lo_inode_put(lo, &inode); + fuse_reply_err(req, res == -1 ? errno : 0); +} + +static void lo_fsync(fuse_req_t req, fuse_ino_t ino, int datasync, + struct fuse_file_info *fi) +{ + struct lo_inode *inode = lo_inode(req, ino); + struct lo_data *lo = lo_data(req); + int res; + int fd; + + fuse_log(FUSE_LOG_DEBUG, "lo_fsync(ino=%" PRIu64 ", fi=0x%p)\n", ino, + (void *)fi); + + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + if (!fi) { + fd = lo_inode_open(lo, inode, O_RDWR); + if (fd < 0) { + res = -fd; + goto out; + } + } else { + fd = lo_fi_fd(req, fi); + } + + if (datasync) { + res = fdatasync(fd) == -1 ? errno : 0; + } else { + res = fsync(fd) == -1 ? errno : 0; + } + if (!fi) { + close(fd); + } +out: + lo_inode_put(lo, &inode); + fuse_reply_err(req, res); +} + +static void lo_read(fuse_req_t req, fuse_ino_t ino, size_t size, off_t offset, + struct fuse_file_info *fi) +{ + struct fuse_bufvec buf = FUSE_BUFVEC_INIT(size); + + fuse_log(FUSE_LOG_DEBUG, + "lo_read(ino=%" PRIu64 ", size=%zd, " + "off=%lu)\n", + ino, size, (unsigned long)offset); + + buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; + buf.buf[0].fd = lo_fi_fd(req, fi); + buf.buf[0].pos = offset; + + fuse_reply_data(req, &buf); +} + +static void lo_write_buf(fuse_req_t req, fuse_ino_t ino, + struct fuse_bufvec *in_buf, off_t off, + struct fuse_file_info *fi) +{ + (void)ino; + ssize_t res; + struct fuse_bufvec out_buf = FUSE_BUFVEC_INIT(fuse_buf_size(in_buf)); + bool cap_fsetid_dropped = false; + + out_buf.buf[0].flags = FUSE_BUF_IS_FD | FUSE_BUF_FD_SEEK; + out_buf.buf[0].fd = lo_fi_fd(req, fi); + out_buf.buf[0].pos = off; + + fuse_log(FUSE_LOG_DEBUG, + "lo_write_buf(ino=%" PRIu64 ", size=%zd, off=%lu kill_priv=%d)\n", + ino, out_buf.buf[0].size, (unsigned long)off, fi->kill_priv); + + res = drop_security_capability(lo_data(req), out_buf.buf[0].fd); + if (res) { + fuse_reply_err(req, res); + return; + } + + /* + * If kill_priv is set, drop CAP_FSETID which should lead to kernel + * clearing setuid/setgid on file. Note, for WRITE, we need to do + * this even if killpriv_v2 is not enabled. fuse direct write path + * relies on this. + */ + if (fi->kill_priv) { + res = drop_effective_cap("FSETID", &cap_fsetid_dropped); + if (res != 0) { + fuse_reply_err(req, res); + return; + } + } + + res = fuse_buf_copy(&out_buf, in_buf); + if (res < 0) { + fuse_reply_err(req, -res); + } else { + fuse_reply_write(req, (size_t)res); + } + + if (cap_fsetid_dropped) { + res = gain_effective_cap("FSETID"); + if (res) { + fuse_log(FUSE_LOG_ERR, "Failed to gain CAP_FSETID\n"); + } + } +} + +static void lo_statfs(fuse_req_t req, fuse_ino_t ino) +{ + int res; + struct statvfs stbuf; + + res = fstatvfs(lo_fd(req, ino), &stbuf); + if (res == -1) { + fuse_reply_err(req, errno); + } else { + fuse_reply_statfs(req, &stbuf); + } +} + +static void lo_fallocate(fuse_req_t req, fuse_ino_t ino, int mode, off_t offset, + off_t length, struct fuse_file_info *fi) +{ + int err = EOPNOTSUPP; + (void)ino; + +#ifdef CONFIG_FALLOCATE + err = fallocate(lo_fi_fd(req, fi), mode, offset, length); + if (err < 0) { + err = errno; + } + +#elif defined(CONFIG_POSIX_FALLOCATE) + if (mode) { + fuse_reply_err(req, EOPNOTSUPP); + return; + } + + err = posix_fallocate(lo_fi_fd(req, fi), offset, length); +#endif + + fuse_reply_err(req, err); +} + +static void lo_flock(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi, + int op) +{ + int res; + (void)ino; + + res = flock(lo_fi_fd(req, fi), op); + + fuse_reply_err(req, res == -1 ? errno : 0); +} + +/* types */ +/* + * Exit; process attribute unmodified if matched. + * An empty key applies to all. + */ +#define XATTR_MAP_FLAG_OK (1 << 0) +/* + * The attribute is unwanted; + * EPERM on write, hidden on read. + */ +#define XATTR_MAP_FLAG_BAD (1 << 1) +/* + * For attr that start with 'key' prepend 'prepend' + * 'key' may be empty to prepend for all attrs + * key is defined from set/remove point of view. + * Automatically reversed on read + */ +#define XATTR_MAP_FLAG_PREFIX (1 << 2) +/* + * The attribute is unsupported; + * ENOTSUP on write, hidden on read. + */ +#define XATTR_MAP_FLAG_UNSUPPORTED (1 << 3) + +/* scopes */ +/* Apply rule to get/set/remove */ +#define XATTR_MAP_FLAG_CLIENT (1 << 16) +/* Apply rule to list */ +#define XATTR_MAP_FLAG_SERVER (1 << 17) +/* Apply rule to all */ +#define XATTR_MAP_FLAG_ALL (XATTR_MAP_FLAG_SERVER | XATTR_MAP_FLAG_CLIENT) + +static void add_xattrmap_entry(struct lo_data *lo, + const XattrMapEntry *new_entry) +{ + XattrMapEntry *res = g_realloc_n(lo->xattr_map_list, + lo->xattr_map_nentries + 1, + sizeof(XattrMapEntry)); + res[lo->xattr_map_nentries++] = *new_entry; + + lo->xattr_map_list = res; +} + +static void free_xattrmap(struct lo_data *lo) +{ + XattrMapEntry *map = lo->xattr_map_list; + size_t i; + + if (!map) { + return; + } + + for (i = 0; i < lo->xattr_map_nentries; i++) { + g_free(map[i].key); + g_free(map[i].prepend); + }; + + g_free(map); + lo->xattr_map_list = NULL; + lo->xattr_map_nentries = -1; +} + +/* + * Handle the 'map' type, which is sugar for a set of commands + * for the common case of prefixing a subset or everything, + * and allowing anything not prefixed through. + * It must be the last entry in the stream, although there + * can be other entries before it. + * The form is: + * :map:key:prefix: + * + * key maybe empty in which case all entries are prefixed. + */ +static void parse_xattrmap_map(struct lo_data *lo, + const char *rule, char sep) +{ + const char *tmp; + char *key; + char *prefix; + XattrMapEntry tmp_entry; + + if (*rule != sep) { + fuse_log(FUSE_LOG_ERR, + "%s: Expecting '%c' after 'map' keyword, found '%c'\n", + __func__, sep, *rule); + exit(1); + } + + rule++; + + /* At start of 'key' field */ + tmp = strchr(rule, sep); + if (!tmp) { + fuse_log(FUSE_LOG_ERR, + "%s: Missing '%c' at end of key field in map rule\n", + __func__, sep); + exit(1); + } + + key = g_strndup(rule, tmp - rule); + rule = tmp + 1; + + /* At start of prefix field */ + tmp = strchr(rule, sep); + if (!tmp) { + fuse_log(FUSE_LOG_ERR, + "%s: Missing '%c' at end of prefix field in map rule\n", + __func__, sep); + exit(1); + } + + prefix = g_strndup(rule, tmp - rule); + rule = tmp + 1; + + /* + * This should be the end of the string, we don't allow + * any more commands after 'map'. + */ + if (*rule) { + fuse_log(FUSE_LOG_ERR, + "%s: Expecting end of command after map, found '%c'\n", + __func__, *rule); + exit(1); + } + + /* 1st: Prefix matches/everything */ + tmp_entry.flags = XATTR_MAP_FLAG_PREFIX | XATTR_MAP_FLAG_ALL; + tmp_entry.key = g_strdup(key); + tmp_entry.prepend = g_strdup(prefix); + add_xattrmap_entry(lo, &tmp_entry); + + if (!*key) { + /* Prefix all case */ + + /* 2nd: Hide any non-prefixed entries on the host */ + tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_ALL; + tmp_entry.key = g_strdup(""); + tmp_entry.prepend = g_strdup(""); + add_xattrmap_entry(lo, &tmp_entry); + } else { + /* Prefix matching case */ + + /* 2nd: Hide non-prefixed but matching entries on the host */ + tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_SERVER; + tmp_entry.key = g_strdup(""); /* Not used */ + tmp_entry.prepend = g_strdup(key); + add_xattrmap_entry(lo, &tmp_entry); + + /* 3rd: Stop the client accessing prefixed attributes directly */ + tmp_entry.flags = XATTR_MAP_FLAG_BAD | XATTR_MAP_FLAG_CLIENT; + tmp_entry.key = g_strdup(prefix); + tmp_entry.prepend = g_strdup(""); /* Not used */ + add_xattrmap_entry(lo, &tmp_entry); + + /* 4th: Everything else is OK */ + tmp_entry.flags = XATTR_MAP_FLAG_OK | XATTR_MAP_FLAG_ALL; + tmp_entry.key = g_strdup(""); + tmp_entry.prepend = g_strdup(""); + add_xattrmap_entry(lo, &tmp_entry); + } + + g_free(key); + g_free(prefix); +} + +static void parse_xattrmap(struct lo_data *lo) +{ + const char *map = lo->xattrmap; + const char *tmp; + int ret; + + lo->xattr_map_nentries = 0; + while (*map) { + XattrMapEntry tmp_entry; + char sep; + + if (isspace(*map)) { + map++; + continue; + } + /* The separator is the first non-space of the rule */ + sep = *map++; + if (!sep) { + break; + } + + tmp_entry.flags = 0; + /* Start of 'type' */ + if (strstart(map, "prefix", &map)) { + tmp_entry.flags |= XATTR_MAP_FLAG_PREFIX; + } else if (strstart(map, "ok", &map)) { + tmp_entry.flags |= XATTR_MAP_FLAG_OK; + } else if (strstart(map, "bad", &map)) { + tmp_entry.flags |= XATTR_MAP_FLAG_BAD; + } else if (strstart(map, "unsupported", &map)) { + tmp_entry.flags |= XATTR_MAP_FLAG_UNSUPPORTED; + } else if (strstart(map, "map", &map)) { + /* + * map is sugar that adds a number of rules, and must be + * the last entry. + */ + parse_xattrmap_map(lo, map, sep); + break; + } else { + fuse_log(FUSE_LOG_ERR, + "%s: Unexpected type;" + "Expecting 'prefix', 'ok', 'bad', 'unsupported' or 'map'" + " in rule %zu\n", __func__, lo->xattr_map_nentries); + exit(1); + } + + if (*map++ != sep) { + fuse_log(FUSE_LOG_ERR, + "%s: Missing '%c' at end of type field of rule %zu\n", + __func__, sep, lo->xattr_map_nentries); + exit(1); + } + + /* Start of 'scope' */ + if (strstart(map, "client", &map)) { + tmp_entry.flags |= XATTR_MAP_FLAG_CLIENT; + } else if (strstart(map, "server", &map)) { + tmp_entry.flags |= XATTR_MAP_FLAG_SERVER; + } else if (strstart(map, "all", &map)) { + tmp_entry.flags |= XATTR_MAP_FLAG_ALL; + } else { + fuse_log(FUSE_LOG_ERR, + "%s: Unexpected scope;" + " Expecting 'client', 'server', or 'all', in rule %zu\n", + __func__, lo->xattr_map_nentries); + exit(1); + } + + if (*map++ != sep) { + fuse_log(FUSE_LOG_ERR, + "%s: Expecting '%c' found '%c'" + " after scope in rule %zu\n", + __func__, sep, *map, lo->xattr_map_nentries); + exit(1); + } + + /* At start of 'key' field */ + tmp = strchr(map, sep); + if (!tmp) { + fuse_log(FUSE_LOG_ERR, + "%s: Missing '%c' at end of key field of rule %zu", + __func__, sep, lo->xattr_map_nentries); + exit(1); + } + tmp_entry.key = g_strndup(map, tmp - map); + map = tmp + 1; + + /* At start of 'prepend' field */ + tmp = strchr(map, sep); + if (!tmp) { + fuse_log(FUSE_LOG_ERR, + "%s: Missing '%c' at end of prepend field of rule %zu", + __func__, sep, lo->xattr_map_nentries); + exit(1); + } + tmp_entry.prepend = g_strndup(map, tmp - map); + map = tmp + 1; + + add_xattrmap_entry(lo, &tmp_entry); + /* End of rule - go around again for another rule */ + } + + if (!lo->xattr_map_nentries) { + fuse_log(FUSE_LOG_ERR, "Empty xattr map\n"); + exit(1); + } + + ret = xattr_map_client(lo, "security.capability", + &lo->xattr_security_capability); + if (ret) { + fuse_log(FUSE_LOG_ERR, "Failed to map security.capability: %s\n", + strerror(ret)); + exit(1); + } + if (!lo->xattr_security_capability || + !strcmp(lo->xattr_security_capability, "security.capability")) { + /* 1-1 mapping, don't need to do anything */ + free(lo->xattr_security_capability); + lo->xattr_security_capability = NULL; + } +} + +/* + * For use with getxattr/setxattr/removexattr, where the client + * gives us a name and we may need to choose a different one. + * Allocates a buffer for the result placing it in *out_name. + * If there's no change then *out_name is not set. + * Returns 0 on success + * Can return -EPERM to indicate we block a given attribute + * (in which case out_name is not allocated) + * Can return -ENOMEM to indicate out_name couldn't be allocated. + */ +static int xattr_map_client(const struct lo_data *lo, const char *client_name, + char **out_name) +{ + size_t i; + for (i = 0; i < lo->xattr_map_nentries; i++) { + const XattrMapEntry *cur_entry = lo->xattr_map_list + i; + + if ((cur_entry->flags & XATTR_MAP_FLAG_CLIENT) && + (strstart(client_name, cur_entry->key, NULL))) { + if (cur_entry->flags & XATTR_MAP_FLAG_BAD) { + return -EPERM; + } + if (cur_entry->flags & XATTR_MAP_FLAG_UNSUPPORTED) { + return -ENOTSUP; + } + if (cur_entry->flags & XATTR_MAP_FLAG_OK) { + /* Unmodified name */ + return 0; + } + if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) { + *out_name = g_try_malloc(strlen(client_name) + + strlen(cur_entry->prepend) + 1); + if (!*out_name) { + return -ENOMEM; + } + sprintf(*out_name, "%s%s", cur_entry->prepend, client_name); + return 0; + } + } + } + + return -EPERM; +} + +/* + * For use with listxattr where the server fs gives us a name and we may need + * to sanitize this for the client. + * Returns a pointer to the result in *out_name + * This is always the original string or the current string with some prefix + * removed; no reallocation is done. + * Returns 0 on success + * Can return -ENODATA to indicate the name should be dropped from the list. + */ +static int xattr_map_server(const struct lo_data *lo, const char *server_name, + const char **out_name) +{ + size_t i; + const char *end; + + for (i = 0; i < lo->xattr_map_nentries; i++) { + const XattrMapEntry *cur_entry = lo->xattr_map_list + i; + + if ((cur_entry->flags & XATTR_MAP_FLAG_SERVER) && + (strstart(server_name, cur_entry->prepend, &end))) { + if (cur_entry->flags & XATTR_MAP_FLAG_BAD || + cur_entry->flags & XATTR_MAP_FLAG_UNSUPPORTED) { + return -ENODATA; + } + if (cur_entry->flags & XATTR_MAP_FLAG_OK) { + *out_name = server_name; + return 0; + } + if (cur_entry->flags & XATTR_MAP_FLAG_PREFIX) { + /* Remove prefix */ + *out_name = end; + return 0; + } + } + } + + return -ENODATA; +} + +#define FCHDIR_NOFAIL(fd) do { \ + int fchdir_res = fchdir(fd); \ + assert(fchdir_res == 0); \ + } while (0) + +static bool block_xattr(struct lo_data *lo, const char *name) +{ + /* + * If user explicitly enabled posix_acl or did not provide any option, + * do not block acl. Otherwise block system.posix_acl_access and + * system.posix_acl_default xattrs. + */ + if (lo->user_posix_acl) { + return false; + } + if (!strcmp(name, "system.posix_acl_access") || + !strcmp(name, "system.posix_acl_default")) + return true; + + return false; +} + +/* + * Returns number of bytes in xattr_list after filtering on success. This + * could be zero as well if nothing is left after filtering. + * + * Returns negative error code on failure. + * xattr_list is modified in place. + */ +static int remove_blocked_xattrs(struct lo_data *lo, char *xattr_list, + unsigned in_size) +{ + size_t out_index, in_index; + + /* + * As of now we only filter out acl xattrs. If acls are enabled or + * they have not been explicitly disabled, there is nothing to + * filter. + */ + if (lo->user_posix_acl) { + return in_size; + } + + out_index = 0; + in_index = 0; + while (in_index < in_size) { + char *in_ptr = xattr_list + in_index; + + /* Length of current attribute name */ + size_t in_len = strlen(xattr_list + in_index) + 1; + + if (!block_xattr(lo, in_ptr)) { + if (in_index != out_index) { + memmove(xattr_list + out_index, xattr_list + in_index, in_len); + } + out_index += in_len; + } + in_index += in_len; + } + return out_index; +} + +static void lo_getxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name, + size_t size) +{ + struct lo_data *lo = lo_data(req); + g_autofree char *value = NULL; + char procname[64]; + const char *name; + char *mapped_name; + struct lo_inode *inode; + ssize_t ret; + int saverr; + int fd = -1; + + if (block_xattr(lo, in_name)) { + fuse_reply_err(req, EOPNOTSUPP); + return; + } + + mapped_name = NULL; + name = in_name; + if (lo->xattrmap) { + ret = xattr_map_client(lo, in_name, &mapped_name); + if (ret < 0) { + if (ret == -EPERM) { + ret = -ENODATA; + } + fuse_reply_err(req, -ret); + return; + } + if (mapped_name) { + name = mapped_name; + } + } + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + g_free(mapped_name); + return; + } + + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; + } + + fuse_log(FUSE_LOG_DEBUG, "lo_getxattr(ino=%" PRIu64 ", name=%s size=%zd)\n", + ino, name, size); + + if (size) { + value = g_try_malloc(size); + if (!value) { + goto out_err; + } + } + + sprintf(procname, "%i", inode->fd); + /* + * It is not safe to open() non-regular/non-dir files in file server + * unless O_PATH is used, so use that method for regular files/dir + * only (as it seems giving less performance overhead). + * Otherwise, call fchdir() to avoid open(). + */ + if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { + fd = openat(lo->proc_self_fd, procname, O_RDONLY); + if (fd < 0) { + goto out_err; + } + ret = fgetxattr(fd, name, value, size); + saverr = ret == -1 ? errno : 0; + } else { + /* fchdir should not fail here */ + FCHDIR_NOFAIL(lo->proc_self_fd); + ret = getxattr(procname, name, value, size); + saverr = ret == -1 ? errno : 0; + FCHDIR_NOFAIL(lo->root.fd); + } + + if (ret == -1) { + goto out; + } + if (size) { + saverr = 0; + if (ret == 0) { + goto out; + } + fuse_reply_buf(req, value, ret); + } else { + fuse_reply_xattr(req, ret); + } +out_free: + if (fd >= 0) { + close(fd); + } + + lo_inode_put(lo, &inode); + return; + +out_err: + saverr = errno; +out: + fuse_reply_err(req, saverr); + g_free(mapped_name); + goto out_free; +} + +static void lo_listxattr(fuse_req_t req, fuse_ino_t ino, size_t size) +{ + struct lo_data *lo = lo_data(req); + g_autofree char *value = NULL; + char procname[64]; + struct lo_inode *inode; + ssize_t ret; + int saverr; + int fd = -1; + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + return; + } + + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; + } + + fuse_log(FUSE_LOG_DEBUG, "lo_listxattr(ino=%" PRIu64 ", size=%zd)\n", ino, + size); + + if (size) { + value = g_try_malloc(size); + if (!value) { + goto out_err; + } + } + + sprintf(procname, "%i", inode->fd); + if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { + fd = openat(lo->proc_self_fd, procname, O_RDONLY); + if (fd < 0) { + goto out_err; + } + ret = flistxattr(fd, value, size); + saverr = ret == -1 ? errno : 0; + } else { + /* fchdir should not fail here */ + FCHDIR_NOFAIL(lo->proc_self_fd); + ret = listxattr(procname, value, size); + saverr = ret == -1 ? errno : 0; + FCHDIR_NOFAIL(lo->root.fd); + } + + if (ret == -1) { + goto out; + } + if (size) { + saverr = 0; + if (ret == 0) { + goto out; + } + + if (lo->xattr_map_list) { + /* + * Map the names back, some attributes might be dropped, + * some shortened, but not increased, so we shouldn't + * run out of room. + */ + size_t out_index, in_index; + out_index = 0; + in_index = 0; + while (in_index < ret) { + const char *map_out; + char *in_ptr = value + in_index; + /* Length of current attribute name */ + size_t in_len = strlen(value + in_index) + 1; + + int mapret = xattr_map_server(lo, in_ptr, &map_out); + if (mapret != -ENODATA && mapret != 0) { + /* Shouldn't happen */ + saverr = -mapret; + goto out; + } + if (mapret == 0) { + /* Either unchanged, or truncated */ + size_t out_len; + if (map_out != in_ptr) { + /* +1 copies the NIL */ + out_len = strlen(map_out) + 1; + } else { + /* No change */ + out_len = in_len; + } + /* + * Move result along, may still be needed for an unchanged + * entry if a previous entry was changed. + */ + memmove(value + out_index, map_out, out_len); + + out_index += out_len; + } + in_index += in_len; + } + ret = out_index; + if (ret == 0) { + goto out; + } + } + + ret = remove_blocked_xattrs(lo, value, ret); + if (ret <= 0) { + saverr = -ret; + goto out; + } + fuse_reply_buf(req, value, ret); + } else { + /* + * xattrmap only ever shortens the result, + * so we don't need to do anything clever with the + * allocation length here. + */ + fuse_reply_xattr(req, ret); + } +out_free: + if (fd >= 0) { + close(fd); + } + + lo_inode_put(lo, &inode); + return; + +out_err: + saverr = errno; +out: + fuse_reply_err(req, saverr); + goto out_free; +} + +static void lo_setxattr(fuse_req_t req, fuse_ino_t ino, const char *in_name, + const char *value, size_t size, int flags, + uint32_t extra_flags) +{ + char procname[64]; + const char *name; + char *mapped_name; + struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + ssize_t ret; + int saverr; + int fd = -1; + bool switched_creds = false; + bool cap_fsetid_dropped = false; + struct lo_cred old = {}; + + if (block_xattr(lo, in_name)) { + fuse_reply_err(req, EOPNOTSUPP); + return; + } + + mapped_name = NULL; + name = in_name; + if (lo->xattrmap) { + ret = xattr_map_client(lo, in_name, &mapped_name); + if (ret < 0) { + fuse_reply_err(req, -ret); + return; + } + if (mapped_name) { + name = mapped_name; + } + } + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + g_free(mapped_name); + return; + } + + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; + } + + fuse_log(FUSE_LOG_DEBUG, "lo_setxattr(ino=%" PRIu64 + ", name=%s value=%s size=%zd)\n", ino, name, value, size); + + sprintf(procname, "%i", inode->fd); + /* + * If we are setting posix access acl and if SGID needs to be + * cleared, then switch to caller's gid and drop CAP_FSETID + * and that should make sure host kernel clears SGID. + * + * This probably will not work when we support idmapped mounts. + * In that case we will need to find a non-root gid and switch + * to it. (Instead of gid in request). Fix it when we support + * idmapped mounts. + */ + if (lo->posix_acl && !strcmp(name, "system.posix_acl_access") + && (extra_flags & FUSE_SETXATTR_ACL_KILL_SGID)) { + ret = lo_drop_cap_change_cred(req, &old, false, "FSETID", + &cap_fsetid_dropped); + if (ret) { + saverr = ret; + goto out; + } + switched_creds = true; + } + if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { + fd = openat(lo->proc_self_fd, procname, O_RDONLY); + if (fd < 0) { + saverr = errno; + goto out; + } + ret = fsetxattr(fd, name, value, size, flags); + saverr = ret == -1 ? errno : 0; + } else { + /* fchdir should not fail here */ + FCHDIR_NOFAIL(lo->proc_self_fd); + ret = setxattr(procname, name, value, size, flags); + saverr = ret == -1 ? errno : 0; + FCHDIR_NOFAIL(lo->root.fd); + } + if (switched_creds) { + if (cap_fsetid_dropped) + lo_restore_cred_gain_cap(&old, false, "FSETID"); + else + lo_restore_cred(&old, false); + } + +out: + if (fd >= 0) { + close(fd); + } + + lo_inode_put(lo, &inode); + g_free(mapped_name); + fuse_reply_err(req, saverr); +} + +static void lo_removexattr(fuse_req_t req, fuse_ino_t ino, const char *in_name) +{ + char procname[64]; + const char *name; + char *mapped_name; + struct lo_data *lo = lo_data(req); + struct lo_inode *inode; + ssize_t ret; + int saverr; + int fd = -1; + + if (block_xattr(lo, in_name)) { + fuse_reply_err(req, EOPNOTSUPP); + return; + } + + mapped_name = NULL; + name = in_name; + if (lo->xattrmap) { + ret = xattr_map_client(lo, in_name, &mapped_name); + if (ret < 0) { + fuse_reply_err(req, -ret); + return; + } + if (mapped_name) { + name = mapped_name; + } + } + + inode = lo_inode(req, ino); + if (!inode) { + fuse_reply_err(req, EBADF); + g_free(mapped_name); + return; + } + + saverr = ENOSYS; + if (!lo_data(req)->xattr) { + goto out; + } + + fuse_log(FUSE_LOG_DEBUG, "lo_removexattr(ino=%" PRIu64 ", name=%s)\n", ino, + name); + + sprintf(procname, "%i", inode->fd); + if (S_ISREG(inode->filetype) || S_ISDIR(inode->filetype)) { + fd = openat(lo->proc_self_fd, procname, O_RDONLY); + if (fd < 0) { + saverr = errno; + goto out; + } + ret = fremovexattr(fd, name); + saverr = ret == -1 ? errno : 0; + } else { + /* fchdir should not fail here */ + FCHDIR_NOFAIL(lo->proc_self_fd); + ret = removexattr(procname, name); + saverr = ret == -1 ? errno : 0; + FCHDIR_NOFAIL(lo->root.fd); + } + +out: + if (fd >= 0) { + close(fd); + } + + lo_inode_put(lo, &inode); + g_free(mapped_name); + fuse_reply_err(req, saverr); +} + +#ifdef HAVE_COPY_FILE_RANGE +static void lo_copy_file_range(fuse_req_t req, fuse_ino_t ino_in, off_t off_in, + struct fuse_file_info *fi_in, fuse_ino_t ino_out, + off_t off_out, struct fuse_file_info *fi_out, + size_t len, int flags) +{ + int in_fd, out_fd; + ssize_t res; + + in_fd = lo_fi_fd(req, fi_in); + out_fd = lo_fi_fd(req, fi_out); + + fuse_log(FUSE_LOG_DEBUG, + "lo_copy_file_range(ino=%" PRIu64 "/fd=%d, " + "off=%ju, ino=%" PRIu64 "/fd=%d, " + "off=%ju, size=%zd, flags=0x%x)\n", + ino_in, in_fd, (intmax_t)off_in, + ino_out, out_fd, (intmax_t)off_out, len, flags); + + res = copy_file_range(in_fd, &off_in, out_fd, &off_out, len, flags); + if (res < 0) { + fuse_reply_err(req, errno); + } else { + fuse_reply_write(req, res); + } +} +#endif + +static void lo_lseek(fuse_req_t req, fuse_ino_t ino, off_t off, int whence, + struct fuse_file_info *fi) +{ + off_t res; + + (void)ino; + res = lseek(lo_fi_fd(req, fi), off, whence); + if (res != -1) { + fuse_reply_lseek(req, res); + } else { + fuse_reply_err(req, errno); + } +} + +static void lo_destroy(void *userdata) +{ + struct lo_data *lo = (struct lo_data *)userdata; + + pthread_mutex_lock(&lo->mutex); + while (true) { + GHashTableIter iter; + gpointer key, value; + + g_hash_table_iter_init(&iter, lo->inodes); + if (!g_hash_table_iter_next(&iter, &key, &value)) { + break; + } + + struct lo_inode *inode = value; + unref_inode(lo, inode, inode->nlookup); + } + pthread_mutex_unlock(&lo->mutex); +} + +static struct fuse_lowlevel_ops lo_oper = { + .init = lo_init, + .lookup = lo_lookup, + .mkdir = lo_mkdir, + .mknod = lo_mknod, + .symlink = lo_symlink, + .link = lo_link, + .unlink = lo_unlink, + .rmdir = lo_rmdir, + .rename = lo_rename, + .forget = lo_forget, + .forget_multi = lo_forget_multi, + .getattr = lo_getattr, + .setattr = lo_setattr, + .readlink = lo_readlink, + .opendir = lo_opendir, + .readdir = lo_readdir, + .readdirplus = lo_readdirplus, + .releasedir = lo_releasedir, + .fsyncdir = lo_fsyncdir, + .create = lo_create, + .getlk = lo_getlk, + .setlk = lo_setlk, + .open = lo_open, + .release = lo_release, + .flush = lo_flush, + .fsync = lo_fsync, + .read = lo_read, + .write_buf = lo_write_buf, + .statfs = lo_statfs, + .fallocate = lo_fallocate, + .flock = lo_flock, + .getxattr = lo_getxattr, + .listxattr = lo_listxattr, + .setxattr = lo_setxattr, + .removexattr = lo_removexattr, +#ifdef HAVE_COPY_FILE_RANGE + .copy_file_range = lo_copy_file_range, +#endif + .lseek = lo_lseek, + .destroy = lo_destroy, +}; + +/* Print vhost-user.json backend program capabilities */ +static void print_capabilities(void) +{ + printf("{\n"); + printf(" \"type\": \"fs\"\n"); + printf("}\n"); +} + +/* + * Drop all Linux capabilities because the wait parent process only needs to + * sit in waitpid(2) and terminate. + */ +static void setup_wait_parent_capabilities(void) +{ + capng_setpid(syscall(SYS_gettid)); + capng_clear(CAPNG_SELECT_BOTH); + capng_apply(CAPNG_SELECT_BOTH); +} + +/* + * Move to a new mount, net, and pid namespaces to isolate this process. + */ +static void setup_namespaces(struct lo_data *lo, struct fuse_session *se) +{ + pid_t child; + + /* + * Create a new pid namespace for *child* processes. We'll have to + * fork in order to enter the new pid namespace. A new mount namespace + * is also needed so that we can remount /proc for the new pid + * namespace. + * + * Our UNIX domain sockets have been created. Now we can move to + * an empty network namespace to prevent TCP/IP and other network + * activity in case this process is compromised. + */ + if (unshare(CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET) != 0) { + fuse_log(FUSE_LOG_ERR, "unshare(CLONE_NEWPID | CLONE_NEWNS): %m\n"); + exit(1); + } + + child = fork(); + if (child < 0) { + fuse_log(FUSE_LOG_ERR, "fork() failed: %m\n"); + exit(1); + } + if (child > 0) { + pid_t waited; + int wstatus; + + setup_wait_parent_capabilities(); + + /* The parent waits for the child */ + do { + waited = waitpid(child, &wstatus, 0); + } while (waited < 0 && errno == EINTR && !se->exited); + + /* We were terminated by a signal, see fuse_signals.c */ + if (se->exited) { + exit(0); + } + + if (WIFEXITED(wstatus)) { + exit(WEXITSTATUS(wstatus)); + } + + exit(1); + } + + /* Send us SIGTERM when the parent thread terminates, see prctl(2) */ + prctl(PR_SET_PDEATHSIG, SIGTERM); + + /* + * If the mounts have shared propagation then we want to opt out so our + * mount changes don't affect the parent mount namespace. + */ + if (mount(NULL, "/", NULL, MS_REC | MS_SLAVE, NULL) < 0) { + fuse_log(FUSE_LOG_ERR, "mount(/, MS_REC|MS_SLAVE): %m\n"); + exit(1); + } + + /* The child must remount /proc to use the new pid namespace */ + if (mount("proc", "/proc", "proc", + MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RELATIME, NULL) < 0) { + fuse_log(FUSE_LOG_ERR, "mount(/proc): %m\n"); + exit(1); + } + + /* + * We only need /proc/self/fd. Prevent ".." from accessing parent + * directories of /proc/self/fd by bind-mounting it over /proc. Since / was + * previously remounted with MS_REC | MS_SLAVE this mount change only + * affects our process. + */ + if (mount("/proc/self/fd", "/proc", NULL, MS_BIND, NULL) < 0) { + fuse_log(FUSE_LOG_ERR, "mount(/proc/self/fd, MS_BIND): %m\n"); + exit(1); + } + + /* Get the /proc (actually /proc/self/fd, see above) file descriptor */ + lo->proc_self_fd = open("/proc", O_PATH); + if (lo->proc_self_fd == -1) { + fuse_log(FUSE_LOG_ERR, "open(/proc, O_PATH): %m\n"); + exit(1); + } +} + +/* + * Capture the capability state, we'll need to restore this for individual + * threads later; see load_capng. + */ +static void setup_capng(void) +{ + /* Note this accesses /proc so has to happen before the sandbox */ + if (capng_get_caps_process()) { + fuse_log(FUSE_LOG_ERR, "capng_get_caps_process\n"); + exit(1); + } + pthread_mutex_init(&cap.mutex, NULL); + pthread_mutex_lock(&cap.mutex); + cap.saved = capng_save_state(); + if (!cap.saved) { + fuse_log(FUSE_LOG_ERR, "capng_save_state\n"); + exit(1); + } + pthread_mutex_unlock(&cap.mutex); +} + +static void cleanup_capng(void) +{ + free(cap.saved); + cap.saved = NULL; + pthread_mutex_destroy(&cap.mutex); +} + + +/* + * Make the source directory our root so symlinks cannot escape and no other + * files are accessible. Assumes unshare(CLONE_NEWNS) was already called. + */ +static void setup_mounts(const char *source) +{ + int oldroot; + int newroot; + + if (mount(source, source, NULL, MS_BIND | MS_REC, NULL) < 0) { + fuse_log(FUSE_LOG_ERR, "mount(%s, %s, MS_BIND): %m\n", source, source); + exit(1); + } + + /* This magic is based on lxc's lxc_pivot_root() */ + oldroot = open("/", O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (oldroot < 0) { + fuse_log(FUSE_LOG_ERR, "open(/): %m\n"); + exit(1); + } + + newroot = open(source, O_DIRECTORY | O_RDONLY | O_CLOEXEC); + if (newroot < 0) { + fuse_log(FUSE_LOG_ERR, "open(%s): %m\n", source); + exit(1); + } + + if (fchdir(newroot) < 0) { + fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n"); + exit(1); + } + + if (syscall(__NR_pivot_root, ".", ".") < 0) { + fuse_log(FUSE_LOG_ERR, "pivot_root(., .): %m\n"); + exit(1); + } + + if (fchdir(oldroot) < 0) { + fuse_log(FUSE_LOG_ERR, "fchdir(oldroot): %m\n"); + exit(1); + } + + if (mount("", ".", "", MS_SLAVE | MS_REC, NULL) < 0) { + fuse_log(FUSE_LOG_ERR, "mount(., MS_SLAVE | MS_REC): %m\n"); + exit(1); + } + + if (umount2(".", MNT_DETACH) < 0) { + fuse_log(FUSE_LOG_ERR, "umount2(., MNT_DETACH): %m\n"); + exit(1); + } + + if (fchdir(newroot) < 0) { + fuse_log(FUSE_LOG_ERR, "fchdir(newroot): %m\n"); + exit(1); + } + + close(newroot); + close(oldroot); +} + +/* + * Only keep capabilities in allowlist that are needed for file system operation + * The (possibly NULL) modcaps_in string passed in is free'd before exit. + */ +static void setup_capabilities(char *modcaps_in) +{ + char *modcaps = modcaps_in; + pthread_mutex_lock(&cap.mutex); + capng_restore_state(&cap.saved); + + /* + * Add to allowlist file system-related capabilities that are needed for a + * file server to act like root. Drop everything else like networking and + * sysadmin capabilities. + * + * Exclusions: + * 1. CAP_LINUX_IMMUTABLE is not included because it's only used via ioctl + * and we don't support that. + * 2. CAP_MAC_OVERRIDE is not included because it only seems to be + * used by the Smack LSM. Omit it until there is demand for it. + */ + capng_setpid(syscall(SYS_gettid)); + capng_clear(CAPNG_SELECT_BOTH); + if (capng_updatev(CAPNG_ADD, CAPNG_PERMITTED | CAPNG_EFFECTIVE, + CAP_CHOWN, + CAP_DAC_OVERRIDE, + CAP_FOWNER, + CAP_FSETID, + CAP_SETGID, + CAP_SETUID, + CAP_MKNOD, + CAP_SETFCAP, + -1)) { + fuse_log(FUSE_LOG_ERR, "%s: capng_updatev failed\n", __func__); + exit(1); + } + + /* + * The modcaps option is a colon separated list of caps, + * each preceded by either + or -. + */ + while (modcaps) { + capng_act_t action; + int cap; + + char *next = strchr(modcaps, ':'); + if (next) { + *next = '\0'; + next++; + } + + switch (modcaps[0]) { + case '+': + action = CAPNG_ADD; + break; + + case '-': + action = CAPNG_DROP; + break; + + default: + fuse_log(FUSE_LOG_ERR, + "%s: Expecting '+'/'-' in modcaps but found '%c'\n", + __func__, modcaps[0]); + exit(1); + } + cap = capng_name_to_capability(modcaps + 1); + if (cap < 0) { + fuse_log(FUSE_LOG_ERR, "%s: Unknown capability '%s'\n", __func__, + modcaps); + exit(1); + } + if (capng_update(action, CAPNG_PERMITTED | CAPNG_EFFECTIVE, cap)) { + fuse_log(FUSE_LOG_ERR, "%s: capng_update failed for '%s'\n", + __func__, modcaps); + exit(1); + } + + modcaps = next; + } + g_free(modcaps_in); + + if (capng_apply(CAPNG_SELECT_BOTH)) { + fuse_log(FUSE_LOG_ERR, "%s: capng_apply failed\n", __func__); + exit(1); + } + + cap.saved = capng_save_state(); + if (!cap.saved) { + fuse_log(FUSE_LOG_ERR, "%s: capng_save_state failed\n", __func__); + exit(1); + } + pthread_mutex_unlock(&cap.mutex); +} + +/* + * Use chroot as a weaker sandbox for environments where the process is + * launched without CAP_SYS_ADMIN. + */ +static void setup_chroot(struct lo_data *lo) +{ + lo->proc_self_fd = open("/proc/self/fd", O_PATH); + if (lo->proc_self_fd == -1) { + fuse_log(FUSE_LOG_ERR, "open(\"/proc/self/fd\", O_PATH): %m\n"); + exit(1); + } + + /* + * Make the shared directory the file system root so that FUSE_OPEN + * (lo_open()) cannot escape the shared directory by opening a symlink. + * + * The chroot(2) syscall is later disabled by seccomp and the + * CAP_SYS_CHROOT capability is dropped so that tampering with the chroot + * is not possible. + * + * However, it's still possible to escape the chroot via lo->proc_self_fd + * but that requires first gaining control of the process. + */ + if (chroot(lo->source) != 0) { + fuse_log(FUSE_LOG_ERR, "chroot(\"%s\"): %m\n", lo->source); + exit(1); + } + + /* Move into the chroot */ + if (chdir("/") != 0) { + fuse_log(FUSE_LOG_ERR, "chdir(\"/\"): %m\n"); + exit(1); + } +} + +/* + * Lock down this process to prevent access to other processes or files outside + * source directory. This reduces the impact of arbitrary code execution bugs. + */ +static void setup_sandbox(struct lo_data *lo, struct fuse_session *se, + bool enable_syslog) +{ + if (lo->sandbox == SANDBOX_NAMESPACE) { + setup_namespaces(lo, se); + setup_mounts(lo->source); + } else { + setup_chroot(lo); + } + + setup_seccomp(enable_syslog); + setup_capabilities(g_strdup(lo->modcaps)); +} + +/* Set the maximum number of open file descriptors */ +static void setup_nofile_rlimit(unsigned long rlimit_nofile) +{ + struct rlimit rlim = { + .rlim_cur = rlimit_nofile, + .rlim_max = rlimit_nofile, + }; + + if (rlimit_nofile == 0) { + return; /* nothing to do */ + } + + if (setrlimit(RLIMIT_NOFILE, &rlim) < 0) { + /* Ignore SELinux denials */ + if (errno == EPERM) { + return; + } + + fuse_log(FUSE_LOG_ERR, "setrlimit(RLIMIT_NOFILE): %m\n"); + exit(1); + } +} + +static void log_func(enum fuse_log_level level, const char *fmt, va_list ap) +{ + g_autofree char *localfmt = NULL; + + if (current_log_level < level) { + return; + } + + if (current_log_level == FUSE_LOG_DEBUG) { + if (use_syslog) { + /* no timestamp needed */ + localfmt = g_strdup_printf("[ID: %08ld] %s", syscall(__NR_gettid), + fmt); + } else { + g_autoptr(GDateTime) now = g_date_time_new_now_utc(); + g_autofree char *nowstr = g_date_time_format(now, "%Y-%m-%d %H:%M:%S.%f%z"); + localfmt = g_strdup_printf("[%s] [ID: %08ld] %s", + nowstr, syscall(__NR_gettid), fmt); + } + fmt = localfmt; + } + + if (use_syslog) { + int priority = LOG_ERR; + switch (level) { + case FUSE_LOG_EMERG: + priority = LOG_EMERG; + break; + case FUSE_LOG_ALERT: + priority = LOG_ALERT; + break; + case FUSE_LOG_CRIT: + priority = LOG_CRIT; + break; + case FUSE_LOG_ERR: + priority = LOG_ERR; + break; + case FUSE_LOG_WARNING: + priority = LOG_WARNING; + break; + case FUSE_LOG_NOTICE: + priority = LOG_NOTICE; + break; + case FUSE_LOG_INFO: + priority = LOG_INFO; + break; + case FUSE_LOG_DEBUG: + priority = LOG_DEBUG; + break; + } + vsyslog(priority, fmt, ap); + } else { + vfprintf(stderr, fmt, ap); + } +} + +static void setup_root(struct lo_data *lo, struct lo_inode *root) +{ + int fd, res; + struct stat stat; + uint64_t mnt_id; + + fd = open("/", O_PATH); + if (fd == -1) { + fuse_log(FUSE_LOG_ERR, "open(%s, O_PATH): %m\n", lo->source); + exit(1); + } + + res = do_statx(lo, fd, "", &stat, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW, + &mnt_id); + if (res == -1) { + fuse_log(FUSE_LOG_ERR, "fstatat(%s): %m\n", lo->source); + exit(1); + } + + root->filetype = S_IFDIR; + root->fd = fd; + root->key.ino = stat.st_ino; + root->key.dev = stat.st_dev; + root->key.mnt_id = mnt_id; + root->nlookup = 2; + g_atomic_int_set(&root->refcount, 2); + if (lo->posix_lock) { + pthread_mutex_init(&root->plock_mutex, NULL); + root->posix_locks = g_hash_table_new_full( + g_direct_hash, g_direct_equal, NULL, posix_locks_value_destroy); + } +} + +static guint lo_key_hash(gconstpointer key) +{ + const struct lo_key *lkey = key; + + return (guint)lkey->ino + (guint)lkey->dev + (guint)lkey->mnt_id; +} + +static gboolean lo_key_equal(gconstpointer a, gconstpointer b) +{ + const struct lo_key *la = a; + const struct lo_key *lb = b; + + return la->ino == lb->ino && la->dev == lb->dev && la->mnt_id == lb->mnt_id; +} + +static void fuse_lo_data_cleanup(struct lo_data *lo) +{ + if (lo->inodes) { + g_hash_table_destroy(lo->inodes); + } + + if (lo->root.posix_locks) { + g_hash_table_destroy(lo->root.posix_locks); + } + lo_map_destroy(&lo->fd_map); + lo_map_destroy(&lo->dirp_map); + lo_map_destroy(&lo->ino_map); + + if (lo->proc_self_fd >= 0) { + close(lo->proc_self_fd); + } + + if (lo->root.fd >= 0) { + close(lo->root.fd); + } + + free(lo->xattrmap); + free_xattrmap(lo); + free(lo->xattr_security_capability); + free(lo->source); +} + +static void qemu_version(void) +{ + printf("virtiofsd version " QEMU_FULL_VERSION "\n" QEMU_COPYRIGHT "\n"); +} + +int main(int argc, char *argv[]) +{ + struct fuse_args args = FUSE_ARGS_INIT(argc, argv); + struct fuse_session *se; + struct fuse_cmdline_opts opts; + struct lo_data lo = { + .sandbox = SANDBOX_NAMESPACE, + .debug = 0, + .writeback = 0, + .posix_lock = 0, + .allow_direct_io = 0, + .proc_self_fd = -1, + .user_killpriv_v2 = -1, + .user_posix_acl = -1, + }; + struct lo_map_elem *root_elem; + struct lo_map_elem *reserve_elem; + int ret = -1; + + /* Initialize time conversion information for localtime_r(). */ + tzset(); + + /* Don't mask creation mode, kernel already did that */ + umask(0); + + qemu_init_exec_dir(argv[0]); + + pthread_mutex_init(&lo.mutex, NULL); + lo.inodes = g_hash_table_new(lo_key_hash, lo_key_equal); + lo.root.fd = -1; + lo.root.fuse_ino = FUSE_ROOT_ID; + lo.cache = CACHE_AUTO; + + /* + * Set up the ino map like this: + * [0] Reserved (will not be used) + * [1] Root inode + */ + lo_map_init(&lo.ino_map); + reserve_elem = lo_map_reserve(&lo.ino_map, 0); + if (!reserve_elem) { + fuse_log(FUSE_LOG_ERR, "failed to alloc reserve_elem.\n"); + goto err_out1; + } + reserve_elem->in_use = false; + root_elem = lo_map_reserve(&lo.ino_map, lo.root.fuse_ino); + if (!root_elem) { + fuse_log(FUSE_LOG_ERR, "failed to alloc root_elem.\n"); + goto err_out1; + } + root_elem->inode = &lo.root; + + lo_map_init(&lo.dirp_map); + lo_map_init(&lo.fd_map); + + if (fuse_parse_cmdline(&args, &opts) != 0) { + goto err_out1; + } + fuse_set_log_func(log_func); + use_syslog = opts.syslog; + if (use_syslog) { + openlog("virtiofsd", LOG_PID, LOG_DAEMON); + } + + if (opts.show_help) { + printf("usage: %s [options]\n\n", argv[0]); + fuse_cmdline_help(); + printf(" -o source=PATH shared directory tree\n"); + fuse_lowlevel_help(); + ret = 0; + goto err_out1; + } else if (opts.show_version) { + qemu_version(); + fuse_lowlevel_version(); + ret = 0; + goto err_out1; + } else if (opts.print_capabilities) { + print_capabilities(); + ret = 0; + goto err_out1; + } + + if (fuse_opt_parse(&args, &lo, lo_opts, NULL) == -1) { + goto err_out1; + } + + if (opts.log_level != 0) { + current_log_level = opts.log_level; + } else { + /* default log level is INFO */ + current_log_level = FUSE_LOG_INFO; + } + lo.debug = opts.debug; + if (lo.debug) { + current_log_level = FUSE_LOG_DEBUG; + } + if (lo.source) { + struct stat stat; + int res; + + res = lstat(lo.source, &stat); + if (res == -1) { + fuse_log(FUSE_LOG_ERR, "failed to stat source (\"%s\"): %m\n", + lo.source); + exit(1); + } + if (!S_ISDIR(stat.st_mode)) { + fuse_log(FUSE_LOG_ERR, "source is not a directory\n"); + exit(1); + } + } else { + lo.source = strdup("/"); + if (!lo.source) { + fuse_log(FUSE_LOG_ERR, "failed to strdup source\n"); + goto err_out1; + } + } + + if (lo.xattrmap) { + lo.xattr = 1; + parse_xattrmap(&lo); + } + + if (!lo.timeout_set) { + switch (lo.cache) { + case CACHE_NONE: + lo.timeout = 0.0; + break; + + case CACHE_AUTO: + lo.timeout = 1.0; + break; + + case CACHE_ALWAYS: + lo.timeout = 86400.0; + break; + } + } else if (lo.timeout < 0) { + fuse_log(FUSE_LOG_ERR, "timeout is negative (%lf)\n", lo.timeout); + exit(1); + } + + if (lo.user_posix_acl == 1 && !lo.xattr) { + fuse_log(FUSE_LOG_ERR, "Can't enable posix ACLs. xattrs are disabled." + "\n"); + exit(1); + } + + lo.use_statx = true; + + se = fuse_session_new(&args, &lo_oper, sizeof(lo_oper), &lo); + if (se == NULL) { + goto err_out1; + } + + if (fuse_set_signal_handlers(se) != 0) { + goto err_out2; + } + + if (fuse_session_mount(se) != 0) { + goto err_out3; + } + + fuse_daemonize(opts.foreground); + + setup_nofile_rlimit(opts.rlimit_nofile); + + /* Must be before sandbox since it wants /proc */ + setup_capng(); + + setup_sandbox(&lo, se, opts.syslog); + + setup_root(&lo, &lo.root); + /* Block until ctrl+c or fusermount -u */ + ret = virtio_loop(se); + + fuse_session_unmount(se); + cleanup_capng(); +err_out3: + fuse_remove_signal_handlers(se); +err_out2: + fuse_session_destroy(se); +err_out1: + fuse_opt_free_args(&args); + + fuse_lo_data_cleanup(&lo); + + return ret ? 1 : 0; +} diff --git a/tools/virtiofsd/passthrough_seccomp.c b/tools/virtiofsd/passthrough_seccomp.c new file mode 100644 index 000000000..a3ce9f898 --- /dev/null +++ b/tools/virtiofsd/passthrough_seccomp.c @@ -0,0 +1,177 @@ +/* + * Seccomp sandboxing for virtiofsd + * + * Copyright (C) 2019 Red Hat, Inc. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#include "qemu/osdep.h" +#include "passthrough_seccomp.h" +#include "fuse_i.h" +#include "fuse_log.h" +#include <seccomp.h> + +/* Bodge for libseccomp 2.4.2 which broke ppoll */ +#if !defined(__SNR_ppoll) && defined(__SNR_brk) +#ifdef __NR_ppoll +#define __SNR_ppoll __NR_ppoll +#else +#define __SNR_ppoll __PNR_ppoll +#endif +#endif + +static const int syscall_allowlist[] = { + /* TODO ireg sem*() syscalls */ + SCMP_SYS(brk), + SCMP_SYS(capget), /* For CAP_FSETID */ + SCMP_SYS(capset), + SCMP_SYS(clock_gettime), + SCMP_SYS(clone), +#ifdef __NR_clone3 + SCMP_SYS(clone3), +#endif + SCMP_SYS(close), + SCMP_SYS(copy_file_range), + SCMP_SYS(dup), + SCMP_SYS(eventfd2), + SCMP_SYS(exit), + SCMP_SYS(exit_group), + SCMP_SYS(fallocate), + SCMP_SYS(fchdir), + SCMP_SYS(fchmod), + SCMP_SYS(fchmodat), + SCMP_SYS(fchownat), + SCMP_SYS(fcntl), + SCMP_SYS(fdatasync), + SCMP_SYS(fgetxattr), + SCMP_SYS(flistxattr), + SCMP_SYS(flock), + SCMP_SYS(fremovexattr), + SCMP_SYS(fsetxattr), + SCMP_SYS(fstat), + SCMP_SYS(fstatfs), + SCMP_SYS(fstatfs64), + SCMP_SYS(fsync), + SCMP_SYS(ftruncate), + SCMP_SYS(futex), + SCMP_SYS(getdents), + SCMP_SYS(getdents64), + SCMP_SYS(getegid), + SCMP_SYS(geteuid), + SCMP_SYS(getpid), + SCMP_SYS(gettid), + SCMP_SYS(gettimeofday), + SCMP_SYS(getxattr), + SCMP_SYS(linkat), + SCMP_SYS(listxattr), + SCMP_SYS(lseek), + SCMP_SYS(_llseek), /* For POWER */ + SCMP_SYS(madvise), + SCMP_SYS(mkdirat), + SCMP_SYS(mknodat), + SCMP_SYS(mmap), + SCMP_SYS(mprotect), + SCMP_SYS(mremap), + SCMP_SYS(munmap), + SCMP_SYS(newfstatat), + SCMP_SYS(statx), + SCMP_SYS(open), + SCMP_SYS(openat), + SCMP_SYS(ppoll), + SCMP_SYS(prctl), /* TODO restrict to just PR_SET_NAME? */ + SCMP_SYS(preadv), + SCMP_SYS(pread64), + SCMP_SYS(pwritev), + SCMP_SYS(pwrite64), + SCMP_SYS(read), + SCMP_SYS(readlinkat), + SCMP_SYS(recvmsg), + SCMP_SYS(renameat), + SCMP_SYS(renameat2), + SCMP_SYS(removexattr), + SCMP_SYS(restart_syscall), + SCMP_SYS(rt_sigaction), + SCMP_SYS(rt_sigprocmask), + SCMP_SYS(rt_sigreturn), + SCMP_SYS(sched_getattr), + SCMP_SYS(sched_setattr), + SCMP_SYS(sendmsg), + SCMP_SYS(setresgid), + SCMP_SYS(setresuid), +#ifdef __NR_setresgid32 + SCMP_SYS(setresgid32), +#endif +#ifdef __NR_setresuid32 + SCMP_SYS(setresuid32), +#endif + SCMP_SYS(set_robust_list), + SCMP_SYS(setxattr), + SCMP_SYS(symlinkat), + SCMP_SYS(time), /* Rarely needed, except on static builds */ + SCMP_SYS(tgkill), + SCMP_SYS(unlinkat), + SCMP_SYS(unshare), + SCMP_SYS(utimensat), + SCMP_SYS(write), + SCMP_SYS(writev), + SCMP_SYS(umask), +}; + +/* Syscalls used when --syslog is enabled */ +static const int syscall_allowlist_syslog[] = { + SCMP_SYS(send), + SCMP_SYS(sendto), +}; + +static void add_allowlist(scmp_filter_ctx ctx, const int syscalls[], size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscalls[i], 0) != 0) { + fuse_log(FUSE_LOG_ERR, "seccomp_rule_add syscall %d failed\n", + syscalls[i]); + exit(1); + } + } +} + +void setup_seccomp(bool enable_syslog) +{ + scmp_filter_ctx ctx; + +#ifdef SCMP_ACT_KILL_PROCESS + ctx = seccomp_init(SCMP_ACT_KILL_PROCESS); + /* Handle a newer libseccomp but an older kernel */ + if (!ctx && errno == EOPNOTSUPP) { + ctx = seccomp_init(SCMP_ACT_TRAP); + } +#else + ctx = seccomp_init(SCMP_ACT_TRAP); +#endif + if (!ctx) { + fuse_log(FUSE_LOG_ERR, "seccomp_init() failed\n"); + exit(1); + } + + add_allowlist(ctx, syscall_allowlist, G_N_ELEMENTS(syscall_allowlist)); + if (enable_syslog) { + add_allowlist(ctx, syscall_allowlist_syslog, + G_N_ELEMENTS(syscall_allowlist_syslog)); + } + + /* libvhost-user calls this for post-copy migration, we don't need it */ + if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOSYS), + SCMP_SYS(userfaultfd), 0) != 0) { + fuse_log(FUSE_LOG_ERR, "seccomp_rule_add userfaultfd failed\n"); + exit(1); + } + + if (seccomp_load(ctx) < 0) { + fuse_log(FUSE_LOG_ERR, "seccomp_load() failed\n"); + exit(1); + } + + seccomp_release(ctx); +} diff --git a/tools/virtiofsd/passthrough_seccomp.h b/tools/virtiofsd/passthrough_seccomp.h new file mode 100644 index 000000000..a3ab073f0 --- /dev/null +++ b/tools/virtiofsd/passthrough_seccomp.h @@ -0,0 +1,15 @@ +/* + * Seccomp sandboxing for virtiofsd + * + * Copyright (C) 2019 Red Hat, Inc. + * + * SPDX-License-Identifier: GPL-2.0-or-later + */ + +#ifndef VIRTIOFSD_SECCOMP_H +#define VIRTIOFSD_SECCOMP_H + + +void setup_seccomp(bool enable_syslog); + +#endif /* VIRTIOFSD_SECCOMP_H */ |