From e5234d1a6c40565c63b528908d8724443bb467b8 Mon Sep 17 00:00:00 2001 From: Zac Wen Date: Fri, 1 Nov 2024 21:55:44 -0700 Subject: [PATCH] Use fallocate for file size extension when supported (#11403) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/11403 When Copy-on-Write (COW) is disabled on Btrfs, automatic relocation creates snapshots of files, ignoring the noCOW setting. This results in increased disk usage and can lead to "no space left" errors in production. One possible enhancement we can make is to use `fallocate` to reserve space immediately after file creation. This helps ensure the allocated space is as contiguous as possible. Reviewed By: xiaoxmeng Differential Revision: D65316028 fbshipit-source-id: f690fb454d40b45c005358c85d5389b75e7ebe77 --- velox/common/caching/SsdFile.cpp | 21 ++++++++++++--------- velox/common/caching/SsdFile.h | 4 ++-- velox/common/caching/tests/SsdFileTest.cpp | 2 +- velox/common/file/File.cpp | 15 +++++++++++++++ 4 files changed, 30 insertions(+), 12 deletions(-) diff --git a/velox/common/caching/SsdFile.cpp b/velox/common/caching/SsdFile.cpp index 68859306e338..c1297d485c66 100644 --- a/velox/common/caching/SsdFile.cpp +++ b/velox/common/caching/SsdFile.cpp @@ -169,11 +169,15 @@ SsdFile::SsdFile(const Config& config) writeFile_ = fs_->openFileForWrite(fileName_, fileOptions); readFile_ = fs_->openFileForRead(fileName_); - const uint64_t size = writeFile_->size(); - numRegions_ = std::min(size / kRegionSize, maxRegions_); - fileSize_ = numRegions_ * kRegionSize; - if ((size % kRegionSize > 0) || (size > numRegions_ * kRegionSize)) { - writeFile_->truncate(fileSize_); + // NOTE: checkpoint recovery will set 'numRegions_' and 'dataSize_' + // accordingly. + numRegions_ = 0; + dataSize_ = 0; + + const auto maxFileSize = kRegionSize * maxRegions_; + if (writeFile_->size() != maxFileSize) { + // Initialize and pre-allocate (if possible) the data file with fixed space. 
+ writeFile_->truncate(static_cast(maxFileSize)); } // The existing regions in the file are writable. writableRegions_.resize(numRegions_); @@ -334,10 +338,8 @@ std::optional> SsdFile::getSpace( bool SsdFile::growOrEvictLocked() { process::TraceContext trace("SsdFile::growOrEvictLocked"); if (numRegions_ < maxRegions_) { - const auto newSize = (numRegions_ + 1) * kRegionSize; try { - writeFile_->truncate(newSize); - fileSize_ = newSize; + dataSize_ = (numRegions_ + 1) * kRegionSize; writableRegions_.push_back(numRegions_); regionSizes_[numRegions_] = 0; erasedRegionSizes_[numRegions_] = 0; @@ -448,7 +450,7 @@ void SsdFile::write(std::vector& pins) { writeOffset += writeLength; writeLength = 0; } - VELOX_CHECK_GE(fileSize_, writeOffset); + VELOX_CHECK_GE(dataSize_, writeOffset); { std::lock_guard l(mutex_); @@ -1007,6 +1009,7 @@ void SsdFile::readCheckpoint(std::ifstream& state) { maxRegions_, "Trying to start from checkpoint with a different capacity"); numRegions_ = readNumber(state); + dataSize_ = numRegions_ * kRegionSize; std::vector scores(maxRegions); state.read(asChar(scores.data()), maxRegions_ * sizeof(double)); std::unordered_map idMap; diff --git a/velox/common/caching/SsdFile.h b/velox/common/caching/SsdFile.h index 31300a274a78..660be3f3d884 100644 --- a/velox/common/caching/SsdFile.h +++ b/velox/common/caching/SsdFile.h @@ -563,8 +563,8 @@ class SsdFile { // File system. std::shared_ptr fs_; - // Size of the backing file in bytes. Must be multiple of kRegionSize. - uint64_t fileSize_{0}; + // The size of actual cached data in bytes. Must be multiple of kRegionSize. + uint64_t dataSize_{0}; // ReadFile for cache data file. 
std::unique_ptr readFile_; diff --git a/velox/common/caching/tests/SsdFileTest.cpp b/velox/common/caching/tests/SsdFileTest.cpp index 0b0bef1ca842..532f7855eb82 100644 --- a/velox/common/caching/tests/SsdFileTest.cpp +++ b/velox/common/caching/tests/SsdFileTest.cpp @@ -647,7 +647,7 @@ TEST_F(SsdFileTest, recoverFromCheckpointWithChecksum) { ASSERT_EQ(statsAfterRecover.entriesCached, stats.entriesCached); } else { ASSERT_EQ(statsAfterRecover.bytesCached, 0); - ASSERT_EQ(statsAfterRecover.regionsCached, stats.regionsCached); + ASSERT_EQ(statsAfterRecover.regionsCached, 0); ASSERT_EQ(statsAfterRecover.entriesCached, 0); } diff --git a/velox/common/file/File.cpp b/velox/common/file/File.cpp index 6a30f0a26159..f4b7f4b5838d 100644 --- a/velox/common/file/File.cpp +++ b/velox/common/file/File.cpp @@ -377,6 +377,21 @@ void LocalWriteFile::write( void LocalWriteFile::truncate(int64_t newSize) { checkNotClosed(closed_); VELOX_CHECK_GE(newSize, 0, "New size cannot be negative."); +#ifdef linux + if (newSize > size_) { + // Use fallocate to extend the file. + const auto ret = ::fallocate(fd_, 0, 0, newSize); + VELOX_CHECK_EQ( + ret, + 0, + "fallocate failed in LocalWriteFile::truncate: {}.", + folly::errnoStr(errno)); + size_ = newSize; + return; + } +#endif // linux + + // Fallback to ftruncate. const auto ret = ::ftruncate(fd_, newSize); VELOX_CHECK_EQ( ret,