diff --git a/kernel/include/onyx/block.h b/kernel/include/onyx/block.h index 0d5a85a3d..1483f20db 100644 --- a/kernel/include/onyx/block.h +++ b/kernel/include/onyx/block.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016 - 2022 Pedro Falcato + * Copyright (c) 2016 - 2023 Pedro Falcato * This file is part of Onyx, and is released under the terms of the MIT License * check LICENSE at the root directory for more information * @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -89,6 +90,8 @@ struct blockdev // An optional partition prefix, like the 'p' in nvme0n1p1 cul::string partition_prefix; + unique_ptr wbdev{nullptr}; + constexpr blockdev() = default; }; @@ -178,7 +181,8 @@ unique_ptr blkdev_create_scsi_like_dev(); // Read-write user and group, no permissions to others #define BLOCK_DEVICE_PERMISSIONS 0660 -struct blockdev; void partition_setup_disk(struct blockdev *dev); +flush::writeback_dev *bdev_get_wbdev(struct inode *ino); + #endif diff --git a/kernel/include/onyx/buffer.h b/kernel/include/onyx/buffer.h index 476d5ea02..e07c60917 100644 --- a/kernel/include/onyx/buffer.h +++ b/kernel/include/onyx/buffer.h @@ -1,7 +1,9 @@ /* - * Copyright (c) 2020 Pedro Falcato + * Copyright (c) 2020 - 2023 Pedro Falcato * This file is part of Onyx, and is released under the terms of the MIT License * check LICENSE at the root directory for more information + * + * SPDX-License-Identifier: MIT */ #ifndef _ONYX_BUFFER_H #define _ONYX_BUFFER_H @@ -16,6 +18,8 @@ * It's supposed to be used by filesystems only, for metadata. */ +struct vm_object; + struct block_buf { /* This block's refcount */ @@ -34,11 +38,36 @@ struct block_buf sector_t block_nr; /* The block size */ unsigned int block_size; + struct list_head assoc_buffers_node; + struct vm_object *assoc_buffers_obj; }; #define BLOCKBUF_FLAG_DIRTY (1 << 0) #define BLOCKBUF_FLAG_UNDER_WB (1 << 1) +static inline bool bb_test_and_set(struct block_buf *buf, unsigned int flag) +{ + unsigned int old; + do + { + old = __atomic_load_n(&buf->flags, __ATOMIC_ACQUIRE); + if (old & flag) + return false; + } while (!__atomic_compare_exchange_n(&buf->flags, &old, old | flag, false, __ATOMIC_RELEASE, + __ATOMIC_RELAXED)); + return true; +} + +static inline bool bb_test_and_clear(struct block_buf *buf, unsigned int flag) +{ + return __atomic_fetch_and(&buf->flags, ~flag, __ATOMIC_RELEASE) & flag; +} + +static inline void bb_clear_flag(struct block_buf *buf, unsigned int flag) +{ + __atomic_and_fetch(&buf->flags, ~flag, __ATOMIC_RELEASE); +} + #define MAX_BLOCK_SIZE PAGE_SIZE struct superblock; @@ -46,7 +75,7 @@ struct superblock; struct block_buf *page_add_blockbuf(struct page *page, unsigned int page_off); struct block_buf *sb_read_block(const struct superblock *sb, unsigned long block); void block_buf_free(struct block_buf *buf); -void block_buf_writeback(struct block_buf *buf); +void block_buf_sync(struct block_buf *buf); void block_buf_dirty(struct block_buf *buf); struct block_buf *block_buf_from_page(struct page *p); void page_destroy_block_bufs(struct page *page); @@ -71,6 +100,40 @@ static inline void *block_buf_data(struct block_buf *b) return (void *) (((unsigned long) PAGE_TO_VIRT(b->this_page)) + b->page_off); } +/** + * @brief Associate a block_buf with a vm_object + * This is used for e.g indirect blocks that want to be written back + * when doing fsync. The vm_object does *not* need to be the block device's. + * + * @param buf Block buf + * @param object Object + */ +void block_buf_associate(struct block_buf *buf, struct vm_object *object); + +/** + * @brief Sync all the associated buffers to this vm_object + * + * @param object VM object (of probably an fs's inode) + */ +void block_buf_sync_assoc(struct vm_object *object); + +/** + * @brief Dirty a block buffer and associate it with an inode + * The association will allow us to write this buffer back when syncing + * the inode's data. + * + * @param buf Buffer to dirty + * @param inode Inode to add it to + */ +void block_buf_dirty_inode(struct block_buf *buf, struct inode *inode); + +/** + * @brief Tear down a vm object's assoc list + * + * @param object Object to tear down + */ +void block_buf_tear_down_assoc(struct vm_object *object); + #ifdef __cplusplus void page_remove_block_buf(struct page *page, size_t offset, size_t end); @@ -177,22 +240,32 @@ class buf_dirty_trigger { private: auto_block_buf &buf; + struct inode *inode{nullptr}; bool dont_dirty; + void do_dirty() + { + if (inode) + block_buf_dirty_inode(buf, inode); + else + block_buf_dirty(buf); + } + public: - buf_dirty_trigger(auto_block_buf &b) : buf{b}, dont_dirty{false} + buf_dirty_trigger(auto_block_buf &b, struct inode *inode = nullptr) + : buf{b}, inode{inode}, dont_dirty{false} { } ~buf_dirty_trigger() { if (!dont_dirty) - block_buf_dirty(buf); + do_dirty(); } void explicit_dirty() { - block_buf_dirty(buf); + do_dirty(); dont_dirty = true; } diff --git a/kernel/include/onyx/filemap.h b/kernel/include/onyx/filemap.h index 4ab345298..83d0547ab 100644 --- a/kernel/include/onyx/filemap.h +++ b/kernel/include/onyx/filemap.h @@ -12,9 +12,9 @@ #include #include +#include struct file; -struct inode; struct page; /** @@ -46,4 +46,27 @@ ssize_t filemap_write_iter(struct file *filp, size_t off, iovec_iter *iter, unsi int filemap_find_page(struct inode *ino, size_t pgoff, unsigned int flags, struct page **outp); +void page_start_writeback(struct page *page, struct inode *inode) + EXCLUDES(inode->i_pages->page_lock) REQUIRES(page); + +void page_end_writeback(struct page *page, struct inode *inode) EXCLUDES(inode->i_pages->page_lock) + REQUIRES(page); + +/** + * @brief Marks a page dirty in the filemap + * + * @param ino Inode to mark dirty + * @param page Page to mark dirty + * @param pgoff Page offset + * @invariant page is locked + */ +void filemap_mark_dirty(struct inode *ino, struct page *page, size_t pgoff) REQUIRES(page); + +struct writepages_info; + +int filemap_writepages(struct inode *inode, struct writepages_info *wpinfo); + +#define FILEMAP_MARK_DIRTY RA_MARK_0 +#define FILEMAP_MARK_WRITEBACK RA_MARK_1 + #endif diff --git a/kernel/include/onyx/list.h b/kernel/include/onyx/list.h index 47f1c410d..24e79af25 100644 --- a/kernel/include/onyx/list.h +++ b/kernel/include/onyx/list.h @@ -158,7 +158,8 @@ static inline void list_copy(struct list_head *dest, const struct list_head *src static inline void list_move(struct list_head *dest, struct list_head *src) { - list_copy(dest, src); + if (!list_is_empty(src)) + list_copy(dest, src); list_reset(src); } diff --git a/kernel/include/onyx/mm/flush.h b/kernel/include/onyx/mm/flush.h index d422ea1a5..c73857f31 100644 --- a/kernel/include/onyx/mm/flush.h +++ b/kernel/include/onyx/mm/flush.h @@ -1,11 +1,13 @@ /* - * Copyright (c) 2019 Pedro Falcato + * Copyright (c) 2019 - 2023 Pedro Falcato * This file is part of Onyx, and is released under the terms of the MIT License * check LICENSE at the root directory for more information + * + * SPDX-License-Identifier: MIT */ -#ifndef _ONYX_MM_FLUSH_H -#define _ONYX_MM_FLUSH_H +#ifndef _ONYX_MM_WRITEBACK_H +#define _ONYX_MM_WRITEBACK_H #include #include @@ -13,36 +15,18 @@ #include #include -/* TODO: This file started as mm specific but it's quite fs now, no? */ - struct inode; - -struct flush_object; -/* Implemented by users of the flush subsystem */ -struct flush_ops -{ - ssize_t (*flush)(struct flush_object *fmd); - bool (*is_dirty)(struct flush_object *fmd); - void (*set_dirty)(bool value, struct flush_object *fmd); -}; - -struct flush_object -{ - struct list_head dirty_list; - void *blk_list; - const struct flush_ops *ops; -}; +struct blockdev; /* Keep C APIs here */ void flush_init(void); -void flush_add_buf(struct flush_object *blk); -void flush_remove_buf(struct flush_object *blk); void flush_add_inode(struct inode *ino); void flush_remove_inode(struct inode *ino); -ssize_t flush_sync_one(struct flush_object *obj); void flush_do_sync(void); +#define WB_FLAG_SYNC (1 << 0) + #ifdef __cplusplus #include @@ -50,57 +34,48 @@ void flush_do_sync(void); namespace flush { -class flush_dev +class writeback_dev { private: - /* Each flush dev has a list of dirty bufs that need flushing. */ - struct list_head dirty_bufs; + /* Each writeback dev has a list of dirty inodes that need flushing. */ struct list_head dirty_inodes; - atomic block_load; - struct mutex __lock; + struct spinlock __lock; /* Each flush dev also is associated with a thread that runs every x seconds */ struct thread *thread; struct semaphore thread_sem; + struct blockdev *bdev; + struct list_head wbdev_list_node; public: - static constexpr unsigned long wb_run_delta_ms = 10000; - constexpr flush_dev() - : dirty_bufs{}, dirty_inodes{}, block_load{0}, __lock{}, thread{}, thread_sem{} + constexpr writeback_dev(struct blockdev *bdev) + : dirty_inodes{}, thread{}, thread_sem{}, bdev{bdev} { - mutex_init(&__lock); - INIT_LIST_HEAD(&dirty_bufs); INIT_LIST_HEAD(&dirty_inodes); } - ~flush_dev() - { - } - - unsigned long get_load() - { - return block_load; - } + ~writeback_dev() = default; - void lock() ACQUIRE(__lock) + void lock() { - mutex_lock(&__lock); + spin_lock(&__lock); } - void unlock() RELEASE(__lock) + void unlock() { - mutex_unlock(&__lock); + spin_unlock(&__lock); } - bool called_from_sync(); - void init(); void run(); - bool add_buf(struct flush_object *buf); - void remove_buf(struct flush_object *buf); void add_inode(struct inode *ino); void remove_inode(struct inode *ino); - void sync(); - ssize_t sync_one(struct flush_object *obj); + void sync(unsigned int flags); + void end_inode_writeback(struct inode *ino); + + static writeback_dev *from_list_head(struct list_head *l) + { + return container_of(l, writeback_dev, wbdev_list_node); + } }; }; // namespace flush diff --git a/kernel/include/onyx/mm/vm_object.h b/kernel/include/onyx/mm/vm_object.h index 26f8fe57b..43ffec368 100644 --- a/kernel/include/onyx/mm/vm_object.h +++ b/kernel/include/onyx/mm/vm_object.h @@ -90,6 +90,10 @@ struct vm_object struct vm_object *prev_private{nullptr}, *next_private{nullptr}; + /* See fs/buffer.cpp for example usage of these struct members */ + struct spinlock private_lock; + struct list_head private_list; + vm_object(); ~vm_object(); diff --git a/kernel/include/onyx/page.h b/kernel/include/onyx/page.h index 5413588bf..8f44159cf 100644 --- a/kernel/include/onyx/page.h +++ b/kernel/include/onyx/page.h @@ -62,10 +62,11 @@ */ #define PAGE_BUDDY (1 << 3) #define PAGE_FLAG_BUFFER (1 << 4) /* Used by the filesystem code */ -#define PAGE_FLAG_FLUSHING (1 << 5) +/* bit 5 is unused */ #define PAGE_FLAG_FILESYSTEM1 (1 << 6) /* Filesystem private flag */ #define PAGE_FLAG_WAITERS (1 << 7) #define PAGE_FLAG_UPTODATE (1 << 8) +#define PAGE_FLAG_WRITEBACK (1 << 9) struct vm_object; @@ -297,6 +298,16 @@ __always_inline bool page_locked(struct page *p) return page_flag_set(p, PAGE_FLAG_LOCKED); } +__always_inline void page_set_writeback(struct page *p) +{ + __atomic_fetch_or(&p->flags, PAGE_FLAG_WRITEBACK, __ATOMIC_RELEASE); +} + +__always_inline void page_clear_writeback(struct page *p) +{ + __atomic_fetch_and(&p->flags, ~PAGE_FLAG_WRITEBACK, __ATOMIC_RELEASE); +} + void __reclaim_page(struct page *new_page); void reclaim_pages(unsigned long start, unsigned long end); void page_allocate_pagemap(unsigned long __maxpfn); diff --git a/kernel/include/onyx/radix.h b/kernel/include/onyx/radix.h index e3812cee1..28e97ab65 100644 --- a/kernel/include/onyx/radix.h +++ b/kernel/include/onyx/radix.h @@ -298,6 +298,19 @@ class radix_tree * @param mark The mark to clear */ void clear_mark(unsigned long index, unsigned int mark); + + /** + * @brief Check if a given mark is set (in the whole tree) + * + * @param mark Mark to check + * @return True if set, else false + */ + bool mark_is_set(unsigned int mark) + { + if (!tree) + return false; + return !tree->mark_empty(mark); + } }; static inline bool radix_err(unsigned long ret) diff --git a/kernel/include/onyx/superblock.h b/kernel/include/onyx/superblock.h index 09e83842d..45b39b918 100644 --- a/kernel/include/onyx/superblock.h +++ b/kernel/include/onyx/superblock.h @@ -10,11 +10,11 @@ #define _ONYX_SUPERBLOCK_H #include -#include #include #include #include +#include struct file; @@ -29,7 +29,7 @@ struct superblock struct spinlock s_ilock; unsigned long s_ref; void *s_helper; - int (*flush_inode)(struct inode *inode); + int (*flush_inode)(struct inode *inode, bool in_sync); int (*kill_inode)(struct inode *inode); int (*statfs)(struct statfs *buf, superblock *sb); unsigned int s_block_size; diff --git a/kernel/include/onyx/trace/trace_base.h b/kernel/include/onyx/trace/trace_base.h index b08cc7ea7..4d43e743c 100644 --- a/kernel/include/onyx/trace/trace_base.h +++ b/kernel/include/onyx/trace/trace_base.h @@ -63,13 +63,13 @@ struct scope_guard } }; -#define TRACE_EVENT_DURATION(name, ...) \ - u64 __trace_timestamp = trace_##name##_enabled() ? clocksource_get_time() : 0; \ - scope_guard __PASTE(__trace_scope_guard, \ - __COUNTER__){[__trace_timestamp __VA_OPT__(, ) __VA_ARGS__]() { \ - if (__trace_timestamp) \ - trace_##name(__trace_timestamp __VA_OPT__(, ) __VA_ARGS__); \ - }}; +#define TRACE_EVENT_DURATION(name, ...) \ + u64 __trace_timestamp##__COUNTER__ = trace_##name##_enabled() ? clocksource_get_time() : 0; \ + scope_guard __PASTE(__trace_scope_guard, __COUNTER__){ \ + [__trace_timestamp##__COUNTER__ __VA_OPT__(, ) __VA_ARGS__]() { \ + if (__trace_timestamp##__COUNTER__) \ + trace_##name(__trace_timestamp##__COUNTER__ __VA_OPT__(, ) __VA_ARGS__); \ + }}; #define TRACE_EVENT(name, ...) \ do \ diff --git a/kernel/include/onyx/vfs.h b/kernel/include/onyx/vfs.h index c90e82fa8..d11b0f11c 100644 --- a/kernel/include/onyx/vfs.h +++ b/kernel/include/onyx/vfs.h @@ -49,6 +49,17 @@ typedef int (*__stat)(struct stat *buf, struct file *node); typedef struct inode *(*__symlink)(const char *name, const char *linkpath, struct dentry *dir); typedef unsigned int (*putdir_t)(struct dirent *, struct dirent *ubuf, unsigned int count); +struct writepages_info +{ + /* Start and end (inclusive) of writepages */ + unsigned long start; + unsigned long end; + unsigned int flags; +}; + +/* Wait for writeback to complete (this is part of sync or fsync) */ +#define WRITEPAGES_SYNC (1 << 0) + struct file_ops { __read read; @@ -78,6 +89,8 @@ struct file_ops void (*release)(struct file *filp); ssize_t (*read_iter)(struct file *filp, size_t offset, iovec_iter *iter, unsigned int flags); ssize_t (*write_iter)(struct file *filp, size_t offset, iovec_iter *iter, unsigned int flags); + int (*writepages)(struct inode *ino, struct writepages_info *wpinfo); + int (*fsyncdata)(struct inode *ino, struct writepages_info *wpinfo); }; struct getdents_ret @@ -91,21 +104,18 @@ int inode_init(struct inode *ino, bool is_reg); class pipe; #define INODE_FLAG_DONT_CACHE (1 << 0) -#define INODE_FLAG_DIRTY (1 << 1) #define INODE_FLAG_NO_SEEK (1 << 2) -#define INODE_FLAG_WB (1 << 3) -#define INODE_FLAG_FREEING (1 << 4) +#define I_FREEING (1 << 4) #define I_DATADIRTY (1 << 5) #define I_DIRTY (1 << 1) #define I_WRITEBACK (1 << 3) #define I_HASHED (1 << 7) +#define I_DIRTYALL (I_DIRTY | I_DATADIRTY) + struct inode { unsigned long i_refc; - /* TODO: We could use a lock here to protect i_flags to have - * thread-safe dirties, etc... - */ unsigned int i_flags; ino_t i_inode; gid_t i_gid; @@ -221,8 +231,6 @@ struct file *get_fs_base(const char *file, struct file *rel_base); void inode_mark_dirty(struct inode *ino, unsigned int flags = I_DIRTY); -int inode_flush(struct inode *ino); - int inode_special_init(struct inode *ino); /** @@ -388,4 +396,8 @@ ssize_t read_iter_vfs(struct file *filp, size_t off, iovec_iter *iter, unsigned int noop_prepare_write(struct inode *ino, struct page *page, size_t page_off, size_t offset, size_t len); + +void inode_wait_writeback(struct inode *ino); +bool inode_no_dirty(struct inode *ino, unsigned int flags); + #endif diff --git a/kernel/kernel/fs/anon_inode.cpp b/kernel/kernel/fs/anon_inode.cpp index ab9f96fac..b65370991 100644 --- a/kernel/kernel/fs/anon_inode.cpp +++ b/kernel/kernel/fs/anon_inode.cpp @@ -29,14 +29,9 @@ struct inode *anon_inode_alloc(mode_t file_type) { DCHECK(anonsb != nullptr); - struct inode *ino = new inode; + struct inode *ino = inode_create(true); if (!ino) - return ino; - if (inode_init(ino, true) < 0) - { - delete ino; return nullptr; - } ino->i_atime = ino->i_mtime = clock_get_posix_time(); ino->i_sb = anonsb; diff --git a/kernel/kernel/fs/block.cpp b/kernel/kernel/fs/block.cpp index 442532483..a8c2ced6c 100644 --- a/kernel/kernel/fs/block.cpp +++ b/kernel/kernel/fs/block.cpp @@ -72,6 +72,7 @@ struct superblock *bdev_sb; struct block_inode { struct inode b_inode; + flush::writeback_dev *b_wbdev; /** * @brief Create a new blockdev inode @@ -79,7 +80,7 @@ struct block_inode * @arg dev Block device * @return Properly set up blockdev inode, or NULL */ - static unique_ptr create(const struct blockdev *dev); + static unique_ptr create(const struct blockdev *dev, flush::writeback_dev *wbdev); }; /** @@ -88,7 +89,7 @@ struct block_inode * @arg dev Block device * @return Properly set up blockdev inode, or NULL */ -unique_ptr block_inode::create(const struct blockdev *dev) +unique_ptr block_inode::create(const struct blockdev *dev, flush::writeback_dev *wbdev) { unique_ptr ino = make_unique(); if (!ino) @@ -99,6 +100,7 @@ unique_ptr block_inode::create(const struct blockdev *dev) return nullptr; inode.i_dev = dev->dev->dev(); inode.i_sb = bdev_sb; + ino->b_wbdev = wbdev; superblock_add_inode(bdev_sb, &inode); return ino; @@ -140,7 +142,16 @@ int blkdev_init(struct blockdev *blk) blk->dev = dev; - auto ino = block_inode::create(blk); + blk->wbdev = make_unique(blk); + if (!blk->wbdev) + { + dev_unregister_dev(dev, true); + return -ENOMEM; + } + + blk->wbdev->init(); + + auto ino = block_inode::create(blk, blk->wbdev.get()); if (!ino) { dev_unregister_dev(dev, true); @@ -274,3 +285,24 @@ unique_ptr blkdev_create_scsi_like_dev() return cul::move(dev); } + +flush::writeback_dev *bdev_get_wbdev(struct inode *ino) +{ + flush::writeback_dev *dev; + DCHECK(ino->i_sb != nullptr); + + if (ino->i_sb == bdev_sb) + dev = ((struct block_inode *) ino)->b_wbdev; + else + { + /* Find the block device, get the wbdev that way */ + blockdev *bdev = ino->i_sb->s_bdev; + if (S_ISBLK(ino->i_mode)) + bdev = (blockdev *) ino->i_helper; + DCHECK(bdev != nullptr); + dev = bdev->wbdev.get(); + } + + DCHECK(dev != nullptr); + return dev; +} diff --git a/kernel/kernel/fs/buffer.cpp b/kernel/kernel/fs/buffer.cpp index 3e3bb9a51..c23a62ba7 100644 --- a/kernel/kernel/fs/buffer.cpp +++ b/kernel/kernel/fs/buffer.cpp @@ -13,12 +13,17 @@ #include #include #include +#include -#include +static struct slab_cache *buffer_cache = nullptr; -memory_pool block_buf_pool; +__init static void buffer_cache_init() +{ + buffer_cache = kmem_cache_create("block_buf", sizeof(block_buf), 0, 0, nullptr); + CHECK(buffer_cache != nullptr); +} -ssize_t buffer_writepage(struct page *page, size_t offset, struct inode *ino) +ssize_t buffer_writepage(struct page *page, size_t offset, struct inode *ino) REQUIRES(page) { auto blkdev = reinterpret_cast(ino->i_helper); DCHECK(blkdev != nullptr); @@ -30,6 +35,18 @@ ssize_t buffer_writepage(struct page *page, size_t offset, struct inode *ino) for (block_buf *it = bufs; it != nullptr; it = it->next) { if (it->flags & BLOCKBUF_FLAG_DIRTY) + { + if (!first_dirty) + first_dirty = it; + last_dirty = it; + bb_clear_flag(it, BLOCKBUF_FLAG_DIRTY); + } + } + + if (!first_dirty) + { + // HACK! Take the first and last buffers of the page + for (block_buf *it = bufs; it != nullptr; it = it->next) { if (!first_dirty) first_dirty = it; @@ -38,6 +55,7 @@ ssize_t buffer_writepage(struct page *page, size_t offset, struct inode *ino) } DCHECK(first_dirty != nullptr); + DCHECK(last_dirty != nullptr); sector_t disk_sect = (first_dirty->block_nr * first_dirty->block_size) / blkdev->sector_size; @@ -53,10 +71,13 @@ ssize_t buffer_writepage(struct page *page, size_t offset, struct inode *ino) r.vec = &vec; - __atomic_fetch_or(&vec.page->flags, PAGE_FLAG_FLUSHING, __ATOMIC_RELAXED); + page_start_writeback(page, ino); + __atomic_fetch_or(&vec.page->flags, PAGE_FLAG_WRITEBACK, __ATOMIC_RELAXED); if (bio_submit_request(blkdev, &r) < 0) return -EIO; + page_end_writeback(page, ino); + #if 0 printk("Flushed #%lu[sector %lu].\n", buf->block_nr, disk_sect); #endif @@ -92,7 +113,7 @@ struct block_buf *page_add_blockbuf(struct page *page, unsigned int page_off) { assert(page->flags & PAGE_FLAG_BUFFER); - auto buf = block_buf_pool.allocate(); + auto buf = (struct block_buf *) kmem_cache_alloc(buffer_cache, GFP_KERNEL); if (!buf) { return nullptr; @@ -103,6 +124,7 @@ struct block_buf *page_add_blockbuf(struct page *page, unsigned int page_off) buf->next = nullptr; buf->refc = 1; buf->flags = 0; + buf->assoc_buffers_obj = nullptr; /* It's better to do this naively using O(n) as to keep memory usage per-struct page low. */ /* We're not likely to hit substancial n's anyway */ @@ -135,19 +157,39 @@ void block_buf_remove(struct block_buf *buf) } } -void block_buf_writeback(struct block_buf *buf) +void block_buf_sync(struct block_buf *buf) { - // flush_sync_one(&buf->flush_obj); + /* TODO: Only write *this* buffer, instead of the whole page */ + struct page *page = buf->this_page; + lock_page(page); + buffer_writepage(page, page->pageoff << PAGE_SHIFT, buf->dev->b_ino); + unlock_page(page); + /* TODO: This will need to be adapted for async... */ } void block_buf_free(struct block_buf *buf) { if (buf->flags & BLOCKBUF_FLAG_DIRTY) - block_buf_writeback(buf); + block_buf_sync(buf); + + /* TODO: I'm not sure if this is totally safe... think through it a bit more, once this is + * actually a likely case (when page reclamation becomes a thing). + */ + while (buf->assoc_buffers_obj) + { + struct vm_object *obj = buf->assoc_buffers_obj; + scoped_lock g{obj->private_lock}; + + if (buf->assoc_buffers_obj == obj) + { + list_remove(&buf->assoc_buffers_node); + break; + } + } block_buf_remove(buf); - block_buf_pool.free(buf); + kmem_cache_free(buffer_cache, buf); } void page_destroy_block_bufs(struct page *page) @@ -187,9 +229,11 @@ ssize_t bbuffer_readpage(struct page *p, size_t off, struct inode *ino) return -EIO; } + auto block_size = blkdev->sector_size; auto sb = blkdev->sb; - assert(sb != nullptr); + if (sb) + block_size = sb->s_block_size; struct page_iov vec; vec.length = PAGE_SIZE; @@ -202,7 +246,6 @@ ssize_t bbuffer_readpage(struct page *p, size_t off, struct inode *ino) r.sector_number = sec_nr; r.flags = BIO_REQ_READ_OP; - auto block_size = sb->s_block_size; auto nr_blocks = PAGE_SIZE / block_size; size_t starting_block_nr = off / block_size; @@ -238,6 +281,8 @@ struct file_ops buffer_ops = { .prepare_write = noop_prepare_write, .read_iter = filemap_read_iter, .write_iter = filemap_write_iter, + .writepages = filemap_writepages, + .fsyncdata = filemap_writepages, }; struct block_buf *sb_read_block(const struct superblock *sb, unsigned long block) @@ -290,7 +335,12 @@ struct block_buf *sb_read_block(const struct superblock *sb, unsigned long block void block_buf_dirty(block_buf *buf) { - /* XXX */ + if (!bb_test_and_set(buf, BLOCKBUF_FLAG_DIRTY)) + return; + struct page *page = buf->this_page; + lock_page(page); + filemap_mark_dirty(buf->dev->b_ino, page, buf->this_page->pageoff); + unlock_page(page); } void page_remove_block_buf(struct page *page, size_t offset, size_t end) @@ -309,3 +359,82 @@ void page_remove_block_buf(struct page *page, size_t offset, size_t end) pp = &(*pp)->next; } } + +/** + * @brief Associate a block_buf with a vm_object + * This is used for e.g indirect blocks that want to be written back + * when doing fsync. The vm_object does *not* need to be the block device's. + * + * @param buf Block buf + * @param object Object + */ +void block_buf_associate(struct block_buf *buf, struct vm_object *object) +{ + scoped_lock g{object->private_lock}; + DCHECK(buf->assoc_buffers_obj == object || buf->assoc_buffers_obj == nullptr); + + if (!buf->assoc_buffers_obj) + { + buf->assoc_buffers_obj = object; + list_add_tail(&buf->assoc_buffers_node, &object->private_list); + } +} + +/** + * @brief Sync all the associated buffers to this vm_object + * + * @param object VM object (of probably an fs's inode) + */ +void block_buf_sync_assoc(struct vm_object *object) +{ + spin_lock(&object->private_lock); + // Progressively pop the head of the list, grab a ref (so we can't be moved away) and remove it + // from the assoc buffers list, release the lock, sync the buffer, and do it all again. + while (!list_is_empty(&object->private_list)) + { + struct block_buf *bb = container_of(list_first_element(&object->private_list), + struct block_buf, assoc_buffers_node); + block_buf_get(bb); + list_remove(&bb->assoc_buffers_node); + bb->assoc_buffers_obj = nullptr; + spin_unlock(&object->private_lock); + + if (bb->flags & BLOCKBUF_FLAG_DIRTY) + block_buf_sync(bb); + block_buf_put(bb); + + spin_lock(&object->private_lock); + } + + spin_unlock(&object->private_lock); +} + +/** + * @brief Dirty a block buffer and associate it with an inode + * The association will allow us to write this buffer back when syncing + * the inode's data. + * + * @param buf Buffer to dirty + * @param inode Inode to add it to + */ +void block_buf_dirty_inode(struct block_buf *buf, struct inode *inode) +{ + block_buf_dirty(buf); + block_buf_associate(buf, inode->i_pages); +} + +/** + * @brief Tear down a vm object's assoc list + * + * @param object Object to tear down + */ +void block_buf_tear_down_assoc(struct vm_object *object) +{ + scoped_lock g{object->private_lock}; + list_for_every_safe (&object->private_list) + { + struct block_buf *bb = container_of(l, struct block_buf, assoc_buffers_node); + bb->assoc_buffers_obj = nullptr; + list_remove(&bb->assoc_buffers_node); + } +} diff --git a/kernel/kernel/fs/dev.cpp b/kernel/kernel/fs/dev.cpp index 8f88b6217..27d9dd8d6 100644 --- a/kernel/kernel/fs/dev.cpp +++ b/kernel/kernel/fs/dev.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -23,6 +22,7 @@ #include #include #include +#include #include #include @@ -487,9 +487,8 @@ inode *devfs_mount(blockdev *dev) fops->getdirent = devfs_getdirent; new_fs->override_file_ops(fops.release()); - - superblock_init(new_fs.get()); new_fs->s_devnr = ex.value()->dev(); + new_fs->s_flags &= ~SB_FLAG_IN_MEMORY; auto node = new_fs->create_inode(S_IFDIR | S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); if (!node) diff --git a/kernel/kernel/fs/ext2/alloc.cpp b/kernel/kernel/fs/ext2/alloc.cpp index 8416fb2d7..437f34018 100644 --- a/kernel/kernel/fs/ext2/alloc.cpp +++ b/kernel/kernel/fs/ext2/alloc.cpp @@ -49,7 +49,7 @@ expected, int> ext2_superblock::allocate_ memset(ino, 0, inode_size); - update_inode(ino, res.value()); + update_inode(ino, res.value(), false); return cul::pair{res.value(), ino}; } diff --git a/kernel/kernel/fs/ext2/ext2.cpp b/kernel/kernel/fs/ext2/ext2.cpp index 8b7523575..def47fdbf 100644 --- a/kernel/kernel/fs/ext2/ext2.cpp +++ b/kernel/kernel/fs/ext2/ext2.cpp @@ -45,6 +45,7 @@ ssize_t ext2_writepage(struct page *page, size_t off, struct inode *ino); int ext2_prepare_write(inode *ino, struct page *page, size_t page_off, size_t offset, size_t len); int ext2_link(struct inode *target, const char *name, struct inode *dir); inode *ext2_symlink(const char *name, const char *dest, dentry *dir); +static int ext2_fsyncdata(struct inode *ino, struct writepages_info *wpinfo); struct file_ops ext2_ops = {.open = ext2_open, .close = ext2_close, @@ -62,7 +63,9 @@ struct file_ops ext2_ops = {.open = ext2_open, .writepage = ext2_writepage, .prepare_write = ext2_prepare_write, .read_iter = filemap_read_iter, - .write_iter = filemap_write_iter}; + .write_iter = filemap_write_iter, + .writepages = filemap_writepages, + .fsyncdata = ext2_fsyncdata}; void ext2_delete_inode(struct inode *inode_, uint32_t inum, struct ext2_superblock *fs) { @@ -95,12 +98,13 @@ void ext2_close(struct inode *vfs_ino) free(inode); } -ssize_t ext2_writepage(page *page, size_t off, inode *ino) +ssize_t ext2_writepage(page *page, size_t off, inode *ino) REQUIRES(page) { auto buf = block_buf_from_page(page); auto sb = ext2_superblock_from_inode(ino); + DCHECK(buf != nullptr); - assert(buf != nullptr); + page_start_writeback(page, ino); while (buf) { @@ -115,6 +119,7 @@ ssize_t ext2_writepage(page *page, size_t off, inode *ino) if (sb_write_bio(sb, v, 1, buf->block_nr) < 0) { + page_end_writeback(page, ino); sb->error("Error writing back page"); return -EIO; } @@ -122,6 +127,8 @@ ssize_t ext2_writepage(page *page, size_t off, inode *ino) buf = buf->next; } + page_end_writeback(page, ino); + return PAGE_SIZE; } @@ -457,7 +464,7 @@ struct inode *ext2_creat(const char *name, int mode, struct dentry *dir) return i; } -int ext2_flush_inode(struct inode *inode) +int ext2_flush_inode(struct inode *inode, bool in_sync) { struct ext2_inode *ino = ext2_get_inode_from_node(inode); struct ext2_superblock *fs = ext2_superblock_from_inode(inode); @@ -475,7 +482,7 @@ int ext2_flush_inode(struct inode *inode) ino->i_mode = inode->i_mode; ino->i_uid = inode->i_uid; - fs->update_inode(ino, (ext2_inode_no) inode->i_inode); + fs->update_inode(ino, (ext2_inode_no) inode->i_inode, in_sync); return 0; } @@ -714,7 +721,7 @@ void ext2_superblock::error(const char *str) const sb->s_state = EXT2_ERROR_FS; block_buf_dirty(sb_bb); - block_buf_writeback(sb_bb); + block_buf_sync(sb_bb); if (sb->s_errors == EXT2_ERRORS_CONTINUE) return; @@ -742,3 +749,14 @@ int ext2_superblock::stat_fs(struct statfs *buf) return 0; } + +static int ext2_fsyncdata(struct inode *ino, struct writepages_info *wpinfo) +{ + /* Sync the actual pages, then writeback indirect blocks */ + if (int st = filemap_writepages(ino, wpinfo); st < 0) + return st; + /* If not a block device, sync indirect blocks (that have been associated with the vm object) */ + if (!S_ISBLK(ino->i_mode)) + block_buf_sync_assoc(ino->i_pages); + return 0; +} diff --git a/kernel/kernel/fs/ext2/ext2.h b/kernel/kernel/fs/ext2/ext2.h index f35b0ef43..b6c1c76c9 100644 --- a/kernel/kernel/fs/ext2/ext2.h +++ b/kernel/kernel/fs/ext2/ext2.h @@ -10,7 +10,6 @@ #include #include -#include #include #include @@ -21,6 +20,8 @@ #include #include +#include + #include #include @@ -467,8 +468,10 @@ struct ext2_superblock : public superblock * * @param ino Pointer to ext2_inode * @param inode_no Inode number + * @param in_sync If this is part of a sync/fsync call (i.e do we need to write the buffer back + * immediately.) */ - void update_inode(const ext2_inode *ino, ext2_inode_no inode_no); + void update_inode(const ext2_inode *ino, ext2_inode_no inode_no, bool in_sync = false); /** * @brief Reads metadata blocks from the filesystem using sb_read_block diff --git a/kernel/kernel/fs/ext2/ext2_ll.cpp b/kernel/kernel/fs/ext2/ext2_ll.cpp index 5d40e3a55..6ded875b8 100644 --- a/kernel/kernel/fs/ext2/ext2_ll.cpp +++ b/kernel/kernel/fs/ext2/ext2_ll.cpp @@ -101,8 +101,10 @@ ext2_inode *ext2_superblock::get_inode(ext2_inode_no inode) const * * @param ino Pointer to ext2_inode * @param inode_no Inode number + * @param in_sync If this is part of a sync/fsync call (i.e do we need to write the buffer back + * immediately.) */ -void ext2_superblock::update_inode(const ext2_inode *ino, ext2_inode_no inode_no) +void ext2_superblock::update_inode(const ext2_inode *ino, ext2_inode_no inode_no, bool in_sync) { assert(inode_no != 0); uint32_t bg_no = ext2_inode_number_to_bg(inode_no, this); @@ -128,6 +130,9 @@ void ext2_superblock::update_inode(const ext2_inode *ino, ext2_inode_no inode_no memcpy(on_disk, ino, inode_size); block_buf_dirty(buf); + + if (in_sync) + block_buf_sync(buf); } void ext2_dirty_sb(ext2_superblock *fs) diff --git a/kernel/kernel/fs/ext2/inode.cpp b/kernel/kernel/fs/ext2/inode.cpp index 9c5b51730..22ba7dac0 100644 --- a/kernel/kernel/fs/ext2/inode.cpp +++ b/kernel/kernel/fs/ext2/inode.cpp @@ -162,7 +162,7 @@ expected ext2_create_path(struct inode *ino, ext2_block_no b ino->i_blocks += sb->block_size >> 9; if (buf) - block_buf_dirty(buf); + block_buf_dirty_inode(buf, ino); else { inode_update_ctime(ino); @@ -179,7 +179,7 @@ expected ext2_create_path(struct inode *ino, ext2_block_no b if (should_zero_block) [[unlikely]] { memset(curr_block, 0, sb->block_size); - block_buf_dirty(buf); + block_buf_dirty_inode(buf, ino); } } else @@ -367,7 +367,7 @@ expected ext2_trunc_indirect_block(ext2_block_no block, return unexpected{-EIO}; } - buf_dirty_trigger dirty_trig{buf}; + buf_dirty_trigger dirty_trig{buf, ino}; uint32_t *blockbuf = (uint32_t *) block_buf_data(buf); diff --git a/kernel/kernel/fs/filemap.cpp b/kernel/kernel/fs/filemap.cpp index 16664c0c4..0c4873028 100644 --- a/kernel/kernel/fs/filemap.cpp +++ b/kernel/kernel/fs/filemap.cpp @@ -14,8 +14,6 @@ #include #include -#define FILEMAP_MARK_DIRTY RA_MARK_0 - int filemap_find_page(struct inode *ino, size_t pgoff, unsigned int flags, struct page **outp) NO_THREAD_SAFETY_ANALYSIS { @@ -205,11 +203,11 @@ ssize_t filemap_read_iter(struct file *filp, size_t off, iovec_iter *iter, unsig * @param pgoff Page offset * @invariant page is locked */ -static void filemap_mark_dirty(struct inode *ino, struct page *page, size_t pgoff) REQUIRES(page) +void filemap_mark_dirty(struct inode *ino, struct page *page, size_t pgoff) REQUIRES(page) { DCHECK(page_locked(page)); - if (ino->i_sb && ino->i_sb->s_flags & SB_FLAG_NODIRTY) - return; + // if (ino->i_sb && ino->i_sb->s_flags & SB_FLAG_NODIRTY) + // return; if (!page_test_set_flag(page, PAGE_FLAG_DIRTY)) return; /* Already marked as dirty, not our problem! */ @@ -220,6 +218,13 @@ static void filemap_mark_dirty(struct inode *ino, struct page *page, size_t pgof ino->i_pages->vm_pages.set_mark(pgoff, FILEMAP_MARK_DIRTY); } + /* TODO: This is horribly leaky and horrible and awful but it stops NR_DIRTY from leaking on + * tmpfs filesystems. I'll refrain from making a proper interface for this, because this really + * needs the axe. + */ + if (!inode_no_dirty(ino, I_DATADIRTY)) + inc_page_stat(page, NR_DIRTY); + inode_mark_dirty(ino, I_DATADIRTY); } @@ -354,6 +359,102 @@ ssize_t filemap_write_iter(struct file *filp, size_t off, iovec_iter *iter, unsi return st; } +static int filemap_get_tagged_pages(struct inode *inode, unsigned int mark, unsigned long start, + unsigned long end, struct page **batch, unsigned int batchlen) + EXCLUDES(inode->i_pages->page_lock) +{ + int batchidx = 0; + scoped_mutex g{inode->i_pages->page_lock}; + radix_tree::cursor cursor = + radix_tree::cursor::from_range_on_marks(&inode->i_pages->vm_pages, mark, start, end); + + while (!cursor.is_end()) + { + if (!batchlen--) + break; + struct page *page = (struct page *) cursor.get(); + batch[batchidx++] = page; + page_ref(page); + } + + return batchidx; +} + +void page_start_writeback(struct page *page, struct inode *inode) + EXCLUDES(inode->i_pages->page_lock) REQUIRES(page) +{ + struct vm_object *obj = inode->i_pages; + scoped_mutex g{obj->page_lock}; + obj->vm_pages.set_mark(page->pageoff, FILEMAP_MARK_WRITEBACK); + page_set_writeback(page); + inc_page_stat(page, NR_WRITEBACK); +} + +void page_end_writeback(struct page *page, struct inode *inode) EXCLUDES(inode->i_pages->page_lock) + REQUIRES(page) +{ + struct vm_object *obj = inode->i_pages; + scoped_mutex g{obj->page_lock}; + obj->vm_pages.clear_mark(page->pageoff, FILEMAP_MARK_WRITEBACK); + page_clear_writeback(page); + dec_page_stat(page, NR_WRITEBACK); +} + +static void page_clear_dirty(struct page *page) REQUIRES(page) +{ + /* Clear the dirty flag for IO */ + /* TODO: Add mmap walking and write-protect those mappings */ + struct vm_object *obj = page->owner; + __atomic_and_fetch(&page->flags, ~PAGE_FLAG_DIRTY, __ATOMIC_RELEASE); + scoped_mutex g{obj->page_lock}; + obj->vm_pages.clear_mark(page->pageoff, FILEMAP_MARK_DIRTY); + /* TODO: I don't know if this (clearing the dirty mark *here*) is safe with regards to potential + * sync()'s running at the same time. + */ + dec_page_stat(page, NR_DIRTY); +} + +int filemap_writepages(struct inode *inode, struct writepages_info *wpinfo) +{ + const ino_t ino = inode->i_inode; + const dev_t dev = inode->i_dev; + TRACE_EVENT_DURATION(filemap_writepages, ino, dev); + unsigned long start = wpinfo->start; + struct page *page; + int found = 0; + + /* TODO: When writepage turns async, handle WRITEPAGES_SYNC. Until then, it's a noop. */ + (void) wpinfo->flags; + + while ((found = filemap_get_tagged_pages(inode, FILEMAP_MARK_DIRTY, start, wpinfo->end, &page, + 1)) > 0) + { + const unsigned long pageoff = page->pageoff; + /* Start the next iteration from the following page */ + start = pageoff + 1; + + TRACE_EVENT_DURATION(filemap_writepage, ino, dev, pageoff); + /* TODO: Make writepages asynchronous! This is a huge performance PITA */ + lock_page(page); + + page_clear_dirty(page); + + ssize_t st = inode->i_fops->writepage(page, pageoff << PAGE_SHIFT, inode); + unlock_page(page); + if (st < 0) + { + /* Error! */ + page_unref(page); + return st; + } + + page_unref(page); + page = nullptr; + } + + return 0; +} + int filemap_private_fault(struct vm_pf_context *ctx) { struct vm_area_struct *region = ctx->entry; diff --git a/kernel/kernel/fs/inode.cpp b/kernel/kernel/fs/inode.cpp index 364bc9d3b..9e41f9c29 100644 --- a/kernel/kernel/fs/inode.cpp +++ b/kernel/kernel/fs/inode.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 - 2022 Pedro Falcato + * Copyright (c) 2020 - 2023 Pedro Falcato * This file is part of Onyx, and is released under the terms of the MIT License * check LICENSE at the root directory for more information * @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,29 @@ static struct spinlock inode_hashtable_locks[inode_hashtable_size]; int pipe_do_fifo(inode *ino); +struct slab_cache *inode_cache; + +__init static void inode_cache_init() +{ + inode_cache = kmem_cache_create("inode", sizeof(struct inode), 0, KMEM_CACHE_HWALIGN, nullptr); + CHECK(inode_cache != nullptr); +} + +struct inode *inode_create(bool is_cached) +{ + struct inode *inode = (struct inode *) kmem_cache_alloc(inode_cache, GFP_KERNEL); + if (!inode) + return nullptr; + + if (inode_init(inode, is_cached) < 0) + { + kmem_cache_free(inode_cache, inode); + return nullptr; + } + + return inode; +} + int inode_special_init(struct inode *ino) { if (S_ISBLK(ino->i_mode) || S_ISCHR(ino->i_mode)) @@ -56,6 +80,12 @@ int inode_special_init(struct inode *ino) ino->i_fops = const_cast(dev->fops()); ino->i_helper = dev->private_; + if (S_ISBLK(ino->i_mode)) + { + struct blockdev *bdev = (struct blockdev *) dev->private_; + ino->i_pages = bdev->b_ino->i_pages; + vmo_ref(ino->i_pages); + } } else if (S_ISFIFO(ino->i_mode)) { @@ -84,27 +114,88 @@ ssize_t inode_sync(struct inode *inode) { if (!inode->i_pages) return 0; - scoped_mutex g{inode->i_pages->page_lock}; -#if 0 - // TODO: This sucks - inode->i_pages->for_every_page([&](struct page *page, unsigned long off) -> bool { - struct page_cache_block *b = page->cache; + unsigned int flags; - if (page->flags & PAGE_FLAG_DIRTY) - { - flush_sync_one(&b->fobj); - } + { + scoped_lock g{inode->i_lock}; + flags = inode->i_flags; + } - return true; - }); -#endif + if (flags & I_DATADIRTY) + { + struct writepages_info info; + info.start = 0; + info.end = ULONG_MAX; + info.flags = WRITEPAGES_SYNC; + int st = inode->i_fops->fsyncdata ? inode->i_fops->fsyncdata(inode, &info) : 0; + if (st < 0) + return st; + } + + if (flags & I_DIRTY) + { + int st = 0; + if (inode->i_sb && inode->i_sb->flush_inode) + st = inode->i_sb->flush_inode(inode, true); + if (st < 0) + return st; + } return 0; } bool inode_is_cacheable(struct inode *file); +/** + * @brief Attempt to remove ourselves from wbdev IO queues. + * This function sleeps if I_WRITEBACK is set. The inode *may* be dirty after the function + * completes. + * + * @param inode Inode to remove + */ +static void inode_wait_for_wb_and_remove(struct inode *inode) +{ + /* Attempt to remove ourselves from wbdev IO queues. This function sleeps if I_WRITEBACK. */ + spin_lock(&inode->i_lock); + for (;;) + { + if (!(inode->i_flags & (I_WRITEBACK | I_DIRTYALL))) + break; + if (inode->i_flags & I_WRITEBACK) + { + spin_unlock(&inode->i_lock); + /* Sleep and try again */ + inode_wait_writeback(inode); + spin_lock(&inode->i_lock); + continue; + } + + if (inode->i_flags & I_DIRTYALL) + { + /* Drop the inode lock, lock the wbdev, lock the inode again */ + flush::writeback_dev *wbdev = bdev_get_wbdev(inode); + unsigned int old_flags = inode->i_flags; + spin_unlock(&inode->i_lock); + wbdev->lock(); + spin_lock(&inode->i_lock); + + if (inode->i_flags != old_flags) + { + /* Drop the wbdev lock and spin again */ + wbdev->unlock(); + continue; + } + + wbdev->remove_inode(inode); + wbdev->unlock(); + break; + } + } + + spin_unlock(&inode->i_lock); +} + void inode_release(struct inode *inode) { bool should_die = inode_get_nlink(inode) == 0; @@ -117,8 +208,9 @@ void inode_release(struct inode *inode) superblock_remove_inode(inode->i_sb, inode); } - /*if (inode->i_flags & INODE_FLAG_DIRTY) - flush_remove_inode(inode);*/ + inode->set_evicting(); + + inode_wait_for_wb_and_remove(inode); if (inode_is_cacheable(inode)) inode_sync(inode); @@ -135,13 +227,18 @@ void inode_release(struct inode *inode) sb->kill_inode(inode); } + DCHECK((inode->i_flags & (I_DIRTYALL | I_WRITEBACK)) == 0); + /* Destroy the page cache *after* kill inode, since kill_inode might need to access the vmo */ inode_destroy_page_caches(inode); if (inode->i_fops->close != nullptr) inode->i_fops->close(inode); - free(inode); + /* Note: We use kfree here, and not kmem_cache_free, because in some filesystems is not + * allocated by inode_create. + */ + kfree(inode); } void inode_unref(struct inode *ino) @@ -173,7 +270,7 @@ struct inode *superblock_find_inode(struct superblock *sb, ino_t ino_nr) if (ino->i_dev == sb->s_devnr && ino->i_inode == ino_nr) { - if (ino->i_flags & INODE_FLAG_FREEING) + if (ino->i_flags & I_FREEING) { g.unlock(); wait_for( @@ -375,7 +472,7 @@ void inode_trim_cache() { scoped_lock g2{ino->i_lock}; - if (ino->i_flags & INODE_FLAG_FREEING) + if (ino->i_flags & I_FREEING) continue; // Already being freed // Evictable, so evict @@ -401,7 +498,8 @@ void inode_trim_cache() void inode::set_evicting() { - i_flags |= INODE_FLAG_FREEING; + scoped_lock g{i_lock}; + i_flags |= I_FREEING; } int noop_prepare_write(struct inode *ino, struct page *page, size_t page_off, size_t offset, @@ -409,3 +507,24 @@ int noop_prepare_write(struct inode *ino, struct page *page, size_t page_off, si { return 0; } + +void inode_wait_writeback(struct inode *ino) +{ + spin_lock(&ino->i_lock); + if (!(ino->i_flags & I_WRITEBACK)) + { + spin_unlock(&ino->i_lock); + return; + } + + spin_unlock(&ino->i_lock); + + wait_for( + ino, + [](void *_ino) -> bool { + struct inode *ino_ = (struct inode *) _ino; + scoped_lock g{ino_->i_lock}; + return !(ino_->i_flags & I_WRITEBACK); + }, + WAIT_FOR_FOREVER, 0); +} diff --git a/kernel/kernel/fs/tmpfs.cpp b/kernel/kernel/fs/tmpfs.cpp index 7c0d58842..e359f0632 100644 --- a/kernel/kernel/fs/tmpfs.cpp +++ b/kernel/kernel/fs/tmpfs.cpp @@ -112,7 +112,7 @@ ssize_t tmpfs_readpage(struct page *page, size_t offset, struct inode *ino) return PAGE_SIZE; } -ssize_t tmpfs_writepage(struct page *page, size_t offset, struct inode *ino) +ssize_t tmpfs_writepage(struct page *page, size_t offset, struct inode *ino) REQUIRES(page) { return PAGE_SIZE; } diff --git a/kernel/kernel/fs/vfs.cpp b/kernel/kernel/fs/vfs.cpp index 3956c2f49..3daafcb1d 100644 --- a/kernel/kernel/fs/vfs.cpp +++ b/kernel/kernel/fs/vfs.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -734,54 +735,46 @@ int inode_init(struct inode *inode, bool is_cached) return 0; } -struct inode *inode_create(bool is_cached) +bool inode_no_dirty(struct inode *ino, unsigned int flags) { - struct inode *inode = (struct inode *) zalloc(sizeof(*inode)); - - if (!inode) - return nullptr; - - if (inode_init(inode, is_cached) < 0) - { - free(inode); - return nullptr; - } + if (!ino->i_sb) + return true; + if (!(ino->i_sb->s_flags & SB_FLAG_NODIRTY)) + return false; - return inode; + /* If NODIRTY, check if we are a block device, and that we are dirtying pages */ + if (S_ISBLK(ino->i_mode)) + return !(flags & I_DATADIRTY); + return true; } void inode_mark_dirty(struct inode *ino, unsigned int flags) { - if (ino->i_sb && ino->i_sb->s_flags & SB_FLAG_NODIRTY) + /* FIXME: Ugh, leaky abstractions... */ + if (inode_no_dirty(ino, flags)) return; + DCHECK(flags & I_DIRTYALL); + /* Already dirty */ if ((ino->i_flags & flags) == flags) return; + auto dev = bdev_get_wbdev(ino); + dev->lock(); spin_lock(&ino->i_lock); + unsigned int old_flags = ino->i_flags; + ino->i_flags |= flags; trace_wb_dirty_inode(ino->i_inode, ino->i_dev); - /* TODO: queue this somewhere */ - spin_unlock(&ino->i_lock); -} - -int inode_flush(struct inode *ino) -{ - struct superblock *sb = ino->i_sb; - - if (!sb || !sb->flush_inode) - return 0; + /* The writeback code will take care of redirtying if need be */ + if (!(old_flags & (I_WRITEBACK | I_DIRTYALL))) + dev->add_inode(ino); - __sync_fetch_and_or(&ino->i_flags, INODE_FLAG_WB); - - int st = sb->flush_inode(ino); - - __sync_fetch_and_and(&ino->i_flags, ~(INODE_FLAG_WB | INODE_FLAG_DIRTY)); - - return st; + spin_unlock(&ino->i_lock); + dev->unlock(); } struct file *inode_to_file(struct inode *ino) diff --git a/kernel/kernel/fs/writeback.cpp b/kernel/kernel/fs/writeback.cpp index 972daf7d7..9d6d6f322 100644 --- a/kernel/kernel/fs/writeback.cpp +++ b/kernel/kernel/fs/writeback.cpp @@ -1,264 +1,234 @@ /* - * Copyright (c) 2019 Pedro Falcato + * Copyright (c) 2019 - 2023 Pedro Falcato * This file is part of Onyx, and is released under the terms of the MIT License * check LICENSE at the root directory for more information + * + * SPDX-License-Identifier: MIT */ #include #include -#include +#include +#include +#include #include #include #include +/* Brief comment on lock ordering in this file: + * Lock ordering goes like this: + * wbdev -> inode + * Any attempt to grab the wbdev with the inode lock must drop the inode lock beforehand. + */ + static void flush_thr_init(void *arg); namespace flush { -static constexpr unsigned long nr_wb_threads = 4UL; -array thread_list; +struct rwlock wbdev_list_lock; +DEFINE_LIST(wbdev_list); -void flush_dev::init() +/* Run the writeback thread every 10s, if needed */ +static constexpr unsigned long wb_run_delta_ms = 10000; + +void writeback_dev::init() { + { + scoped_rwlock g{wbdev_list_lock}; + list_add_tail(&wbdev_list_node, &wbdev_list); + } + thread = sched_create_thread(flush_thr_init, THREAD_KERNEL, (void *) this); assert(thread != nullptr); sched_start_thread(thread); } -void flush_dev::sync() +static int writeback_inode(struct inode *inode, unsigned int sync_flags) { - lock(); + const ino_t inum = inode->i_inode; + const dev_t dev = inode->i_dev; + struct writepages_info winfo; + winfo.start = 0; + winfo.end = ULONG_MAX; + winfo.flags = 0; - // printk("Syncing\n"); - /* We have to use list_for_every_safe because between clearing the dirty - * flag and going to the next buf some other cpu can see the flag is clear, - * and queue it up for another flush in another flush_dev (which isn't locked) */ - list_for_every_safe (&dirty_bufs) - { - flush_object *buf = container_of(l, flush_object, dirty_list); - /*printk("writeback file %p, size %lu, off %lu\n", blk->node, - blk->size, blk->offset);*/ - - buf->ops->flush(buf); + if (sync_flags & WB_FLAG_SYNC) + winfo.flags |= WRITEPAGES_SYNC; - buf->ops->set_dirty(false, buf); + CHECK(inode->i_fops->writepages != nullptr); + unsigned int flags; - block_load--; - } + DCHECK(inode->i_flags & I_WRITEBACK); - list_for_every_safe (&dirty_inodes) { - struct inode *ino = container_of(l, struct inode, i_dirty_inode_node); - - __sync_fetch_and_and(&ino->i_flags, ~INODE_FLAG_DIRTY); - __sync_synchronize(); - - inode_flush(ino); - block_load--; + scoped_lock g{inode->i_lock}; + flags = inode->i_flags & I_DIRTYALL; + // Note: We clear I_DIRTY here. Any posterior dirty will re-dirty the inode. + inode->i_flags &= ~I_DIRTY; } - /* reset the list */ - list_reset(&dirty_bufs); - list_reset(&dirty_inodes); - assert(block_load == 0); - - unlock(); -} - -ssize_t flush_dev::sync_one(struct flush_object *obj) -{ - lock(); - - size_t res = obj->ops->flush(obj); - - obj->ops->set_dirty(false, obj); - - list_remove(&obj->dirty_list); - block_load--; - - unlock(); - - return res; -} - -void flush_dev::run() -{ - while (true) + if (flags & I_DATADIRTY) { - while (this->get_load()) - { - sched_sleep_ms(flush_dev::wb_run_delta_ms); - - // printk("Flushing data to disk\n"); - sync(); - } - - sem_wait(&thread_sem); + int st; + if (sync_flags & WB_FLAG_SYNC) + st = inode->i_fops->fsyncdata(inode, &winfo); + else + st = inode->i_fops->writepages(inode, &winfo); + if (st < 0) + return st; } -} - -bool flush_dev::called_from_sync() -{ - /* We detect this by testing if the current thread holds this lock */ - return mutex_holds_lock(&__lock); -} -bool flush_dev::add_buf(struct flush_object *obj) -{ - /* It's very possible the flush code is calling us from sync and trying to - * lock the flush dev would cause a deadlock. Therefore, we want to sync it ourselves right now. - */ - if (called_from_sync()) + if (flags & I_DIRTY) { - obj->ops->flush(obj); - obj->ops->set_dirty(false, obj); - return false; + TRACE_EVENT_DURATION(wb_write_inode, inum, dev); + int st = 0; + if (inode->i_sb && inode->i_sb->flush_inode) + st = inode->i_sb->flush_inode(inode, sync_flags & WB_FLAG_SYNC); + if (st < 0) + return st; } - lock(); - - list_add_tail(&obj->dirty_list, &dirty_bufs); - if (block_load++ == 0) { - sem_signal(&thread_sem); + scoped_lock g{inode->i_lock}; + // Note: we cleared I_DIRTY before, so don't do it again; that would lead to data loss. + inode->i_flags &= (~flags | I_DIRTY); + + /* Re-dirty if FILEMAP_MARK_DIRTY is set */ + /* XXX We're skipping the lock here, because we can't grab it as it is a mutex */ + if (inode->i_pages->vm_pages.mark_is_set(FILEMAP_MARK_DIRTY)) + inode->i_flags |= I_DATADIRTY; + // Ok, now we have the proper I_DIRTY flags set. end_inode_writeback will deal with + // requeuing it if need be. } - unlock(); - - return true; + return 0; } -void flush_dev::remove_buf(struct flush_object *obj) +void writeback_dev::end_inode_writeback(struct inode *ino) { lock(); + spin_lock(&ino->i_lock); + DCHECK(ino->i_flags & I_WRITEBACK); + /* Unset I_WRITEBACK and re-queue the inode if need be */ + ino->i_flags &= ~I_WRITEBACK; - /* We do a last check here inside the lock to be sure it's actually still dirty */ - if (obj->ops->is_dirty(obj)) - { - /* TODO: I'm not sure this is 100% safe, because it might've gotten dirtied again - * to a different flushdev(but in that case, should we be removing it anyways?). - * This also applies to remove_inode(). - */ - block_load--; - list_remove(&obj->dirty_list); - } + if (ino->i_flags & I_DIRTYALL) + add_inode(ino); + wake_address(ino); + spin_unlock(&ino->i_lock); unlock(); } -void flush_dev::add_inode(struct inode *ino) +void writeback_dev::sync(unsigned int flags) { + DEFINE_LIST(io_list); + TRACE_EVENT_DURATION(wb_wbdev_run); lock(); - list_add_tail(&ino->i_dirty_inode_node, &dirty_inodes); - - if (block_load++ == 0) + /* Go through the dirty inodes list, set I_WRITEBACK and then splice the list into io_list. + * We'll then work with that *without the lock*. Dirtying inode code will avoid + * putting I_WRITEBACK inodes into the dirty_inodes list, which saves our bacon here. + */ + list_for_every (&dirty_inodes) { - sem_signal(&thread_sem); + struct inode *ino = container_of(l, struct inode, i_dirty_inode_node); + scoped_lock g{ino->i_lock}; + DCHECK(!(ino->i_flags & I_WRITEBACK)); + DCHECK(ino->i_flags & (I_DIRTY | I_DATADIRTY)); + ino->i_flags |= I_WRITEBACK; } + list_move(&io_list, &dirty_inodes); + DCHECK(list_is_empty(&dirty_inodes)); + /* dirty_inodes is now empty, all inodes are I_WRITEBACK. I_WRITEBACK inodes will not go away. + */ unlock(); -} - -void flush_dev::remove_inode(struct inode *ino) -{ - lock(); - /* We do a last check here inside the lock to be sure it's actually still dirty */ - if (ino->i_flags & INODE_FLAG_DIRTY) + /* Now do writeback */ + list_for_every_safe (&io_list) { - block_load--; + struct inode *ino = container_of(l, struct inode, i_dirty_inode_node); + DCHECK(ino->i_flags & I_WRITEBACK); + list_remove(&ino->i_dirty_inode_node); + writeback_inode(ino, flags); + end_inode_writeback(ino); } - - unlock(); } -} // namespace flush - -void flush_thr_init(void *arg) -{ - flush::flush_dev *b = reinterpret_cast(arg); - b->run(); -} - -flush::flush_dev *flush_allocate_dev() +void writeback_dev::run() { - flush::flush_dev *blk = nullptr; - unsigned long load = ~0UL; - - for (auto &b : flush::thread_list) + trace_wb_wbdev_create(); + while (true) { - if (b.get_load() < load) + while (!list_is_empty(&dirty_inodes)) { - load = b.get_load(); - blk = &b; + sched_sleep_ms(wb_run_delta_ms); + sync(0); } - } - return blk; + sem_wait(&thread_sem); + } } -void flush_add_buf(struct flush_object *f) +void writeback_dev::add_inode(struct inode *ino) { - flush::flush_dev *blk = flush_allocate_dev(); - - /* wat */ - assert(blk != nullptr); - - /* If we were flushed right away, we're going to want to avoid setting f->blk_list */ - if (!blk->add_buf(f)) - return; - - f->blk_list = (void *) blk; + DCHECK(!(ino->i_flags & I_WRITEBACK)); + bool should_wake = list_is_empty(&dirty_inodes); + list_add_tail(&ino->i_dirty_inode_node, &dirty_inodes); + if (should_wake) + sem_signal(&thread_sem); } -void flush_remove_buf(struct flush_object *blk) +void writeback_dev::remove_inode(struct inode *ino) { - flush::flush_dev *b = (flush::flush_dev *) blk->blk_list; + list_remove(&ino->i_dirty_inode_node); +} + +} // namespace flush - b->remove_buf(blk); +void flush_thr_init(void *arg) +{ + flush::writeback_dev *b = reinterpret_cast(arg); + b->run(); } -void flush_init(void) +void flush_init() { - for (auto &b : flush::thread_list) - { - b.init(); - } } void flush_add_inode(struct inode *ino) { - auto dev = flush_allocate_dev(); - + // HACK! the inode - file - vm_object scheme we have is currently so screwed up, that some + // inodes can end up with no i_sb. This is bad. Let's ignore this problem for the time being. + // One cannot writeback those inodes. Oh no! They were not supposed to be written back anyway. + if (!ino->i_sb) + return; + auto dev = bdev_get_wbdev(ino); ino->i_flush_dev = dev; - dev->add_inode(ino); } void flush_remove_inode(struct inode *ino) { - auto dev = reinterpret_cast(ino->i_flush_dev); - + // HACK! See flush_add_inode()'s comment. + if (!ino->i_sb) + return; + auto dev = bdev_get_wbdev(ino); dev->remove_inode(ino); - ino->i_flush_dev = nullptr; } -ssize_t flush_sync_one(struct flush_object *obj) -{ - flush::flush_dev *b = (flush::flush_dev *) obj->blk_list; - - return b->sync_one(obj); -} - void flush_do_sync() { - for (auto &w : flush::thread_list) + /* TODO: This sub-optimal and will need to be changed when writeback becomes async */ + scoped_rwlock g{flush::wbdev_list_lock}; + list_for_every (&flush::wbdev_list) { - w.sync(); + flush::writeback_dev *wbdev = flush::writeback_dev::from_list_head(l); + wbdev->sync(WB_FLAG_SYNC); } } diff --git a/kernel/kernel/mm/vm_object.cpp b/kernel/kernel/mm/vm_object.cpp index a69014c72..077c7e4aa 100644 --- a/kernel/kernel/mm/vm_object.cpp +++ b/kernel/kernel/mm/vm_object.cpp @@ -25,6 +25,7 @@ vm_object::vm_object() INIT_LIST_HEAD(&mappings); mutex_init(&page_lock); mutex_init(&mapping_lock); + INIT_LIST_HEAD(&private_list); } /** diff --git a/kernel/kernel/process.cpp b/kernel/kernel/process.cpp index 60bb036d5..722e04676 100644 --- a/kernel/kernel/process.cpp +++ b/kernel/kernel/process.cpp @@ -576,7 +576,7 @@ pid_t sys_wait4(pid_t pid, int *wstatus, int options, rusage *usage) wait_for_event_interruptible(¤t->wait_child_event, wait_handle_processes(current, w)); #if 0 - printk("st %d w.status %d\n", st, w.status); + printk("st %d w.status %d\n", st, w.status); #endif if (st < 0) diff --git a/kernel/trace/trace_filemap.json b/kernel/trace/trace_filemap.json index 6a0cca3ee..ca12318eb 100644 --- a/kernel/trace/trace_filemap.json +++ b/kernel/trace/trace_filemap.json @@ -5,7 +5,7 @@ "args": [ {"type": "ino_t", "name": "ino"}, {"type": "dev_t", "name": "dev"}, - {"type": "u64", "name": "page_off"} + {"type": "u64", "name": "page_off"} ], "format": { @@ -13,5 +13,37 @@ "dev": {"type": "u32"}, "page_off": {"type": "u64"} } + }, + { + "name": "writepage", + "category": "filemap", + "args": [ + {"type": "u64", "name": "ts"}, + {"type": "ino_t", "name": "ino"}, + {"type": "dev_t", "name": "dev"}, + {"type": "u64", "name": "page_off"} + ], + + "format": { + "ino": {"type": "u64"}, + "dev": {"type": "u32"}, + "page_off": {"type": "u64"}, + "end_ts": {"type": "u64", "cond": "TIME"} + } + }, + { + "name": "writepages", + "category": "filemap", + "args": [ + {"type": "u64", "name": "ts"}, + {"type": "ino_t", "name": "ino"}, + {"type": "dev_t", "name": "dev"} + ], + + "format": { + "ino": {"type": "u64"}, + "dev": {"type": "u32"}, + "end_ts": {"type": "u64", "cond": "TIME"} + } } ] diff --git a/kernel/trace/trace_writeback.json b/kernel/trace/trace_writeback.json index aa739dc93..a82bc0e56 100644 --- a/kernel/trace/trace_writeback.json +++ b/kernel/trace/trace_writeback.json @@ -11,5 +11,39 @@ "ino": {"type": "u64"}, "dev": {"type": "u32"} } + }, + { + "name": "wbdev_create", + "category": "wb", + "args": [ + ], + + "format": { + } + }, + { + "name": "wbdev_run", + "category": "wb", + "args": [ + {"type": "u64", "name": "ts"} + ], + + "format": { + "end_ts": {"type": "u64", "cond": "TIME"} + } + }, + { + "name": "write_inode", + "category": "wb", + "args": [ + {"type": "u64", "name": "ts"}, + {"type": "ino_t", "name": "ino"}, + {"type": "dev_t", "name": "dev"} + ], + "format": { + "end_ts": {"type": "u64", "cond": "TIME"}, + "ino": {"type": "u64"}, + "dev": {"type": "u32"} + } } ]