From 3aa07284cb42b8b443bc3e823a9a0acf54629ef5 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 15 Jun 2022 15:16:22 -0600 Subject: [PATCH] fuse: never unkillably wait on request fuse has the concept of "forcing" a request, which means (among other things) that it does an unkillable wait in request_wait_answer(). fuse flushes files when they are closed with this unkillable wait: $ sudo cat /proc/1544574/stack [<0>] request_wait_answer+0x12f/0x210 [<0>] fuse_simple_request+0x109/0x2c0 [<0>] fuse_flush+0x16f/0x1b0 [<0>] filp_close+0x27/0x70 [<0>] put_files_struct+0x6b/0xc0 [<0>] do_exit+0x360/0xb80 [<0>] do_group_exit+0x3a/0xa0 [<0>] get_signal+0x140/0x870 [<0>] arch_do_signal_or_restart+0xae/0x7c0 [<0>] exit_to_user_mode_prepare+0x10f/0x1c0 [<0>] syscall_exit_to_user_mode+0x26/0x40 [<0>] do_syscall_64+0x46/0xb0 [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xae Generally, this is OK, since the fuse_dev_release() -> fuse_abort_conn() wakes up this code when a fuse dev goes away (i.e. a fuse daemon is killed or unmounted or whatever). However, there's a problem when the fuse daemon itself spawns a thread that does a flush: since the thread has a copy of the fd table with an fd pointing to the same fuse device, the reference count isn't decremented to zero in fuse_dev_release(), and the task hangs forever. Tasks can be aborted via fusectl's abort file, so all is not lost. However, this does wreak havoc in containers which mounted a fuse filesystem with this state. If the init pid exits (or crashes), the kernel tries to clean up the pidns: $ sudo cat /proc/1528591/stack [<0>] do_wait+0x156/0x2f0 [<0>] kernel_wait4+0x8d/0x140 [<0>] zap_pid_ns_processes+0x104/0x180 [<0>] do_exit+0xa41/0xb80 [<0>] do_group_exit+0x3a/0xa0 [<0>] __x64_sys_exit_group+0x14/0x20 [<0>] do_syscall_64+0x37/0xb0 [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xae but hangs forever. Generally, container runtimes use "is init still alive" as a heuristic for whether the container is still running, so this leaves the container in a confusing state where it is not running, but runtimes think it is. It's not clear to me why we need an unkillable wait here... if we invalidate the request with INTERRUPTED as we do in the interruptible case, presumably (heh) it will be cleaned up. So, let's killably wait in the FORCE path. I've included a reproducer below, which works without any scary warnings with this patch applied. libfuse's test suite also passes without any issue with this patch applied. (git tree for reproducer is here: https://github.com/tych0/kernel-utils/tree/master/fuse2 root@(none):/tmp/fuse2# cat Makefile CFLAGS=-D_FILE_OFFSET_BITS=64 -Werror -I/usr/include/fuse3 LDLIBS=-lfuse3 build: fuse main .PHONY: check check: fuse main modprobe fuse mkdir -p mount unshare -Urpm --fork ./main clean: -rm -rf fuse main mount root@(none):/tmp/fuse2# cat fuse.c #define FUSE_USE_VERSION 31 #include #include #include #include #include #include #include #include static int sleepy_open(const char *path, struct fuse_file_info *fi) { return 0; } static void *nested_fsync(void *) { int fd; fd = open("/home/tycho/packages/kernel-utils/fuse2/mount/b", O_RDONLY); write(fd, "foo", 3); printf("doing nested fsync\n"); fsync(fd); printf("nested fsync completed\n"); return NULL; } static int sleepy_write(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { int fd; pthread_t t; pthread_create(&t, NULL, nested_fsync, NULL); return 0; } static int sleepy_flush(const char *path, struct fuse_file_info *fi) { printf("eating a flush(%s)\n", path); while (1) sleep(100); return 0; } static int sleepy_opendir(const char *path, struct fuse_file_info *fi) { printf("opendir for %s\n", path); return 0; } static int sleepy_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi, enum fuse_readdir_flags) { if (strcmp(path, "/") != 0) return -ENOENT; if (filler(buf, ".", NULL, 0, FUSE_FILL_DIR_PLUS)) return -ENOMEM; if (filler(buf, "..", NULL, 0, FUSE_FILL_DIR_PLUS)) return -ENOMEM; if (filler(buf, "a", NULL, 0, FUSE_FILL_DIR_PLUS)) return -ENOMEM; if (filler(buf, "b", NULL, 0, FUSE_FILL_DIR_PLUS)) return -ENOMEM; return 0; } static int sleepy_getattr(const char *path, struct stat *sb, struct fuse_file_info *fi) { struct timespec now; if (clock_gettime(CLOCK_REALTIME, &now) < 0) return -EINVAL; sb->st_uid = sb->st_gid = 0; sb->st_atim = sb->st_mtim = sb->st_ctim = now; sb->st_size = 0; if (strcmp(path, "/") == 0) { sb->st_mode = S_IFDIR | 00755; sb->st_nlink = 2; } else if (strcmp(path, "/a") == 0 || strcmp(path, "/b") == 0) { sb->st_nlink = 1; sb->st_mode = S_IFREG | 00755; } else { return -ENOENT; } return 0; } static void *sleepy_init(struct fuse_conn_info *info, struct fuse_config *config) { config->intr = 0; return NULL; } const struct fuse_operations sleepy_fuse = { .open = sleepy_open, .opendir = sleepy_opendir, .readdir = sleepy_readdir, .getattr = sleepy_getattr, .write = sleepy_write, .flush = sleepy_flush, .init = sleepy_init, }; int main(int argc, char *argv[]) { if (fuse_main(argc, argv, &sleepy_fuse, NULL)) { printf("fuse_main failed\n"); exit(1); } printf("fuse main exited\n"); exit(1); } root@(none):/tmp/fuse2# cat main.c #include #include #include #include #include #include #include #include #include static void do_write(void) { int fd; fd = open("mount/a", O_CREAT|O_WRONLY, 0600); if (fd < 0) { perror("open fuse file"); exit(1); } errno = 0; write(fd, "hello", 5); printf("write returned, doing a flush to hang...\n"); fsync(fd); printf("flush didn't hang?\n"); exit(0); } int main(void) { pid_t fuse_pid, write_pid; fuse_pid = fork(); if (fuse_pid < 0) { perror("fuse fork"); exit(1); } if (fuse_pid == 0) { execlp("./fuse", "./fuse", "-s", "-f", "mount", NULL); perror("exec"); exit(1); } // lazy... let fuse get mounted sleep(2); write_pid = fork(); if (write_pid < 0) { perror("getattr fork"); exit(1); } if (write_pid == 0) do_write(); // lazy let the write() and nested fsync() from fuse happen sleep(2); printf("killing pid ns...\n"); return 0; } Signed-off-by: Tycho Andersen --- fs/fuse/dev.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d6b5339c56e2ca..3886ddcca1263d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -297,7 +297,6 @@ void fuse_request_end(struct fuse_req *req) spin_unlock(&fiq->lock); } WARN_ON(test_bit(FR_PENDING, &req->flags)); - WARN_ON(test_bit(FR_SENT, &req->flags)); if (test_bit(FR_BACKGROUND, &req->flags)) { spin_lock(&fc->bg_lock); clear_bit(FR_BACKGROUND, &req->flags); @@ -385,30 +384,33 @@ static void request_wait_answer(struct fuse_req *req) queue_interrupt(req); } - if (!test_bit(FR_FORCE, &req->flags)) { - /* Only fatal signals may interrupt this */ - err = wait_event_killable(req->waitq, - test_bit(FR_FINISHED, &req->flags)); - if (!err) - return; + /* Only fatal signals may interrupt this */ + err = wait_event_killable(req->waitq, + test_bit(FR_FINISHED, &req->flags)); + if (!err) + return; - spin_lock(&fiq->lock); - /* Request is not yet in userspace, bail out */ - if (test_bit(FR_PENDING, &req->flags)) { - list_del(&req->list); - spin_unlock(&fiq->lock); - __fuse_put_request(req); - req->out.h.error = -EINTR; - return; - } + spin_lock(&fiq->lock); + /* Request is not yet in userspace, bail out */ + if (test_bit(FR_PENDING, &req->flags)) { + list_del(&req->list); spin_unlock(&fiq->lock); + __fuse_put_request(req); + req->out.h.error = -EINTR; + return; } + spin_unlock(&fiq->lock); /* - * Either request is already in userspace, or it was forced. - * Wait it out. + * Womp womp. We sent a request to userspace and now we're getting + * killed. */ - wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags)); + set_bit(FR_INTERRUPTED, &req->flags); + /* matches barrier in fuse_dev_do_read() */ + smp_mb__after_atomic(); + /* request *must* be FR_SENT here, because we ignored FR_PENDING before */ + WARN_ON(!test_bit(FR_SENT, &req->flags)); + queue_interrupt(req); } static void __fuse_request_send(struct fuse_req *req)