Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fuse: never unkillably wait on request
fuse has the concept of "forcing" a request, which means (among other things) that it does an unkillable wait in request_wait_answer(). fuse flushes files when they are closed with this unkillable wait: $ sudo cat /proc/1544574/stack [<0>] request_wait_answer+0x12f/0x210 [<0>] fuse_simple_request+0x109/0x2c0 [<0>] fuse_flush+0x16f/0x1b0 [<0>] filp_close+0x27/0x70 [<0>] put_files_struct+0x6b/0xc0 [<0>] do_exit+0x360/0xb80 [<0>] do_group_exit+0x3a/0xa0 [<0>] get_signal+0x140/0x870 [<0>] arch_do_signal_or_restart+0xae/0x7c0 [<0>] exit_to_user_mode_prepare+0x10f/0x1c0 [<0>] syscall_exit_to_user_mode+0x26/0x40 [<0>] do_syscall_64+0x46/0xb0 [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xae Generally, this is OK, since the fuse_dev_release() -> fuse_abort_conn() wakes up this code when a fuse dev goes away (i.e. a fuse daemon is killed or unmounted or whatever). However, there's a problem when the fuse daemon itself spawns a thread that does a flush: since the thread has a copy of the fd table with an fd pointing to the same fuse device, the reference count isn't decremented to zero in fuse_dev_release(), and the task hangs forever. Tasks can be aborted via fusectl's abort file, so all is not lost. However, this does wreak havoc in containers which mounted a fuse filesystem with this state. If the init pid exits (or crashes), the kernel tries to clean up the pidns: $ sudo cat /proc/1528591/stack [<0>] do_wait+0x156/0x2f0 [<0>] kernel_wait4+0x8d/0x140 [<0>] zap_pid_ns_processes+0x104/0x180 [<0>] do_exit+0xa41/0xb80 [<0>] do_group_exit+0x3a/0xa0 [<0>] __x64_sys_exit_group+0x14/0x20 [<0>] do_syscall_64+0x37/0xb0 [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xae but hangs forever. Generally, container runtimes use "is init still alive" as a heuristic for whether the container is still running, so this leaves the container in a confusing state where it is not running, but runtimes think it is. It's not clear to me why we need an unkillable wait here... if we invalidate the request with INTERRUPTED as we do in the interruptible case, presumably (heh) it will be cleaned up. So, let's killably wait in the FORCE path. I've included a reproducer below, which works without any scary warnings with this patch applied. libfuse's test suite also passes without any issue with this patch applied. (git tree for reproducer is here: https://github.com/tych0/kernel-utils/tree/master/fuse2 root@(none):/tmp/fuse2# cat Makefile CFLAGS=-D_FILE_OFFSET_BITS=64 -Werror -I/usr/include/fuse3 LDLIBS=-lfuse3 build: fuse main .PHONY: check check: fuse main modprobe fuse mkdir -p mount unshare -Urpm --fork ./main clean: -rm -rf fuse main mount root@(none):/tmp/fuse2# cat fuse.c #define FUSE_USE_VERSION 31 #include <stdio.h> #include <unistd.h> #include <stdlib.h> #include <fuse.h> #include <errno.h> #include <string.h> #include <pthread.h> #include <sys/types.h> static int sleepy_open(const char *path, struct fuse_file_info *fi) { return 0; } static void *nested_fsync(void *) { int fd; fd = open("/home/tycho/packages/kernel-utils/fuse2/mount/b", O_RDONLY); write(fd, "foo", 3); printf("doing nested fsync\n"); fsync(fd); printf("nested fsync completed\n"); return NULL; } static int sleepy_write(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { int fd; pthread_t t; pthread_create(&t, NULL, nested_fsync, NULL); return 0; } static int sleepy_flush(const char *path, struct fuse_file_info *fi) { printf("eating a flush(%s)\n", path); while (1) sleep(100); return 0; } static int sleepy_opendir(const char *path, struct fuse_file_info *fi) { printf("opendir for %s\n", path); return 0; } static int sleepy_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi, enum fuse_readdir_flags) { if (strcmp(path, "/") != 0) return -ENOENT; if (filler(buf, ".", NULL, 0, FUSE_FILL_DIR_PLUS)) return -ENOMEM; if (filler(buf, "..", NULL, 0, FUSE_FILL_DIR_PLUS)) return -ENOMEM; if (filler(buf, "a", NULL, 0, FUSE_FILL_DIR_PLUS)) return -ENOMEM; if (filler(buf, "b", NULL, 0, FUSE_FILL_DIR_PLUS)) return -ENOMEM; return 0; } static int sleepy_getattr(const char *path, struct stat *sb, struct fuse_file_info *fi) { struct timespec now; if (clock_gettime(CLOCK_REALTIME, &now) < 0) return -EINVAL; sb->st_uid = sb->st_gid = 0; sb->st_atim = sb->st_mtim = sb->st_ctim = now; sb->st_size = 0; if (strcmp(path, "/") == 0) { sb->st_mode = S_IFDIR | 00755; sb->st_nlink = 2; } else if (strcmp(path, "/a") == 0 || strcmp(path, "/b") == 0) { sb->st_nlink = 1; sb->st_mode = S_IFREG | 00755; } else { return -ENOENT; } return 0; } static void *sleepy_init(struct fuse_conn_info *info, struct fuse_config *config) { config->intr = 0; return NULL; } const struct fuse_operations sleepy_fuse = { .open = sleepy_open, .opendir = sleepy_opendir, .readdir = sleepy_readdir, .getattr = sleepy_getattr, .write = sleepy_write, .flush = sleepy_flush, .init = sleepy_init, }; int main(int argc, char *argv[]) { if (fuse_main(argc, argv, &sleepy_fuse, NULL)) { printf("fuse_main failed\n"); exit(1); } printf("fuse main exited\n"); exit(1); } root@(none):/tmp/fuse2# cat main.c #include <stdlib.h> #include <stdio.h> #include <unistd.h> #include <sys/stat.h> #include <sys/types.h> #include <fcntl.h> #include <errno.h> #include <string.h> #include <signal.h> static void do_write(void) { int fd; fd = open("mount/a", O_CREAT|O_WRONLY, 0600); if (fd < 0) { perror("open fuse file"); exit(1); } errno = 0; write(fd, "hello", 5); printf("write returned, doing a flush to hang...\n"); fsync(fd); printf("flush didn't hang?\n"); exit(0); } int main(void) { pid_t fuse_pid, write_pid; fuse_pid = fork(); if (fuse_pid < 0) { perror("fuse fork"); exit(1); } if (fuse_pid == 0) { execlp("./fuse", "./fuse", "-s", "-f", "mount", NULL); perror("exec"); exit(1); } // lazy... let fuse get mounted sleep(2); write_pid = fork(); if (write_pid < 0) { perror("getattr fork"); exit(1); } if (write_pid == 0) do_write(); // lazy let the write() and nested fsync() from fuse happen sleep(2); printf("killing pid ns...\n"); return 0; } Signed-off-by: Tycho Andersen <[email protected]>
- Loading branch information