Skip to content

Commit

Permalink
fuse: never unkillably wait on request
Browse files Browse the repository at this point in the history
fuse has the concept of "forcing" a request, which means (among other
things) that it does an unkillable wait in request_wait_answer(). fuse
flushes files when they are closed with this unkillable wait:

$ sudo cat /proc/1544574/stack
[<0>] request_wait_answer+0x12f/0x210
[<0>] fuse_simple_request+0x109/0x2c0
[<0>] fuse_flush+0x16f/0x1b0
[<0>] filp_close+0x27/0x70
[<0>] put_files_struct+0x6b/0xc0
[<0>] do_exit+0x360/0xb80
[<0>] do_group_exit+0x3a/0xa0
[<0>] get_signal+0x140/0x870
[<0>] arch_do_signal_or_restart+0xae/0x7c0
[<0>] exit_to_user_mode_prepare+0x10f/0x1c0
[<0>] syscall_exit_to_user_mode+0x26/0x40
[<0>] do_syscall_64+0x46/0xb0
[<0>] entry_SYSCALL_64_after_hwframe+0x44/0xae

Generally, this is OK, since the fuse_dev_release() -> fuse_abort_conn()
wakes up this code when a fuse dev goes away (i.e. a fuse daemon is killed
or unmounted or whatever). However, there's a problem when the fuse daemon
itself spawns a thread that does a flush: since the thread has a copy of
the fd table with an fd pointing to the same fuse device, the reference
count isn't decremented to zero in fuse_dev_release(), and the task hangs
forever.

Tasks can be aborted via fusectl's abort file, so all is not lost. However,
this does wreak havoc in containers which mounted a fuse filesystem with
this state. If the init pid exits (or crashes), the kernel tries to clean
up the pidns:

$ sudo cat /proc/1528591/stack
[<0>] do_wait+0x156/0x2f0
[<0>] kernel_wait4+0x8d/0x140
[<0>] zap_pid_ns_processes+0x104/0x180
[<0>] do_exit+0xa41/0xb80
[<0>] do_group_exit+0x3a/0xa0
[<0>] __x64_sys_exit_group+0x14/0x20
[<0>] do_syscall_64+0x37/0xb0
[<0>] entry_SYSCALL_64_after_hwframe+0x44/0xae

but hangs forever. Generally, container runtimes use "is init still alive"
as a heuristic for whether the container is still running, so this leaves
the container in a confusing state where it is not running, but runtimes
think it is.

It's not clear to me why we need an unkillable wait here... if we
invalidate the request with INTERRUPTED as we do in the interruptible case,
presumably (heh) it will be cleaned up.

So, let's killably wait in the FORCE path.

I've included a reproducer below, which works without any scary warnings
with this patch applied. libfuse's test suite also passes without any issue
with this patch applied. (git tree for reproducer is here:
https://github.com/tych0/kernel-utils/tree/master/fuse2

    root@(none):/tmp/fuse2# cat Makefile
    CFLAGS=-D_FILE_OFFSET_BITS=64 -Werror -I/usr/include/fuse3
    LDLIBS=-lfuse3

    build: fuse main

    .PHONY: check
    check: fuse main
            modprobe fuse
            mkdir -p mount
            unshare -Urpm --fork ./main

    clean:
            -rm -rf fuse main mount
    root@(none):/tmp/fuse2# cat fuse.c
    #define FUSE_USE_VERSION 31
    #include <stdio.h>
    #include <unistd.h>
    #include <stdlib.h>
    #include <fuse.h>
    #include <errno.h>
    #include <string.h>
    #include <pthread.h>
    #include <sys/types.h>

    static int sleepy_open(const char *path, struct fuse_file_info *fi)
    {
            return 0;
    }

    static void *nested_fsync(void *)
    {
            int fd;

            fd = open("/home/tycho/packages/kernel-utils/fuse2/mount/b", O_RDONLY);
            write(fd, "foo", 3);
            printf("doing nested fsync\n");
            fsync(fd);
            printf("nested fsync completed\n");
            return NULL;
    }

    static int sleepy_write(const char *path, const char *buf, size_t size,
                            off_t offset, struct fuse_file_info *fi)
    {
            int fd;
            pthread_t t;

            pthread_create(&t, NULL, nested_fsync, NULL);

            return 0;
    }

    static int sleepy_flush(const char *path, struct fuse_file_info *fi)
    {
            printf("eating a flush(%s)\n", path);
            while (1) sleep(100);
            return 0;
    }

    static int sleepy_opendir(const char *path, struct fuse_file_info *fi)
    {
            printf("opendir for %s\n", path);
            return 0;
    }

    static int sleepy_readdir(const char *path, void *buf, fuse_fill_dir_t filler,
                              off_t offset, struct fuse_file_info *fi,
                              enum fuse_readdir_flags)

    {
            if (strcmp(path, "/") != 0)
                    return -ENOENT;

            if (filler(buf, ".", NULL, 0, FUSE_FILL_DIR_PLUS))
                    return -ENOMEM;
            if (filler(buf, "..", NULL, 0, FUSE_FILL_DIR_PLUS))
                    return -ENOMEM;
            if (filler(buf, "a", NULL, 0, FUSE_FILL_DIR_PLUS))
                    return -ENOMEM;
            if (filler(buf, "b", NULL, 0, FUSE_FILL_DIR_PLUS))
                    return -ENOMEM;

            return 0;
    }

    static int sleepy_getattr(const char *path, struct stat *sb, struct fuse_file_info *fi)
    {
            struct timespec now;

            if (clock_gettime(CLOCK_REALTIME, &now) < 0)
                    return -EINVAL;
            sb->st_uid = sb->st_gid = 0;
            sb->st_atim = sb->st_mtim = sb->st_ctim = now;
            sb->st_size = 0;

            if (strcmp(path, "/") == 0) {
                    sb->st_mode = S_IFDIR | 00755;
                    sb->st_nlink = 2;
            } else if (strcmp(path, "/a") == 0 || strcmp(path, "/b") == 0) {
                    sb->st_nlink = 1;
                    sb->st_mode = S_IFREG | 00755;
            } else {
                    return -ENOENT;
            }

            return 0;
    }

    static void *sleepy_init(struct fuse_conn_info *info, struct fuse_config *config)
    {
            config->intr = 0;
            return NULL;
    }

    const struct fuse_operations sleepy_fuse = {
            .open = sleepy_open,
            .opendir = sleepy_opendir,
            .readdir = sleepy_readdir,
            .getattr = sleepy_getattr,
            .write = sleepy_write,
            .flush = sleepy_flush,
            .init = sleepy_init,
    };

    int main(int argc, char *argv[])
    {
            if (fuse_main(argc, argv, &sleepy_fuse, NULL)) {
                    printf("fuse_main failed\n");
                    exit(1);
            }

            printf("fuse main exited\n");
            exit(1);
    }
    root@(none):/tmp/fuse2# cat main.c
    #include <stdlib.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/stat.h>
    #include <sys/types.h>
    #include <fcntl.h>
    #include <errno.h>
    #include <string.h>
    #include <signal.h>

    static void do_write(void)
    {
            int fd;

            fd = open("mount/a", O_CREAT|O_WRONLY, 0600);
            if (fd < 0) {
                    perror("open fuse file");
                    exit(1);
            }

            errno = 0;
            write(fd, "hello", 5);
            printf("write returned, doing a flush to hang...\n");
            fsync(fd);
            printf("flush didn't hang?\n");
            exit(0);
    }

    int main(void)
    {
            pid_t fuse_pid, write_pid;

            fuse_pid = fork();
            if (fuse_pid < 0) {
                    perror("fuse fork");
                    exit(1);
            }

            if (fuse_pid == 0) {
                    execlp("./fuse", "./fuse", "-s", "-f", "mount", NULL);
                    perror("exec");
                    exit(1);
            }

            // lazy... let fuse get mounted
            sleep(2);

            write_pid = fork();
            if (write_pid < 0) {
                    perror("getattr fork");
                    exit(1);
            }

            if (write_pid == 0)
                    do_write();

            // lazy let the write() and nested fsync() from fuse happen
            sleep(2);

            printf("killing pid ns...\n");
            return 0;
    }

Signed-off-by: Tycho Andersen <[email protected]>
  • Loading branch information
tych0 committed Jun 20, 2022
1 parent 81d8d30 commit 3aa0728
Showing 1 changed file with 21 additions and 19 deletions.
40 changes: 21 additions & 19 deletions fs/fuse/dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,6 @@ void fuse_request_end(struct fuse_req *req)
spin_unlock(&fiq->lock);
}
WARN_ON(test_bit(FR_PENDING, &req->flags));
WARN_ON(test_bit(FR_SENT, &req->flags));
if (test_bit(FR_BACKGROUND, &req->flags)) {
spin_lock(&fc->bg_lock);
clear_bit(FR_BACKGROUND, &req->flags);
Expand Down Expand Up @@ -385,30 +384,33 @@ static void request_wait_answer(struct fuse_req *req)
queue_interrupt(req);
}

if (!test_bit(FR_FORCE, &req->flags)) {
/* Only fatal signals may interrupt this */
err = wait_event_killable(req->waitq,
test_bit(FR_FINISHED, &req->flags));
if (!err)
return;
/* Only fatal signals may interrupt this */
err = wait_event_killable(req->waitq,
test_bit(FR_FINISHED, &req->flags));
if (!err)
return;

spin_lock(&fiq->lock);
/* Request is not yet in userspace, bail out */
if (test_bit(FR_PENDING, &req->flags)) {
list_del(&req->list);
spin_unlock(&fiq->lock);
__fuse_put_request(req);
req->out.h.error = -EINTR;
return;
}
spin_lock(&fiq->lock);
/* Request is not yet in userspace, bail out */
if (test_bit(FR_PENDING, &req->flags)) {
list_del(&req->list);
spin_unlock(&fiq->lock);
__fuse_put_request(req);
req->out.h.error = -EINTR;
return;
}
spin_unlock(&fiq->lock);

/*
* Either request is already in userspace, or it was forced.
* Wait it out.
* Womp womp. We sent a request to userspace and now we're getting
* killed.
*/
wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
set_bit(FR_INTERRUPTED, &req->flags);
/* matches barrier in fuse_dev_do_read() */
smp_mb__after_atomic();
/* request *must* be FR_SENT here, because we ignored FR_PENDING before */
WARN_ON(!test_bit(FR_SENT, &req->flags));
queue_interrupt(req);
}

static void __fuse_request_send(struct fuse_req *req)
Expand Down

0 comments on commit 3aa0728

Please sign in to comment.