From a870b1e634a18417c8c58e6dcb0362ff8413f7b6 Mon Sep 17 00:00:00 2001 From: Matt Keeter Date: Mon, 22 Apr 2024 17:53:42 -0400 Subject: [PATCH] Avoid writes that span ZFS records --- downstairs/src/extent_inner_raw_v2.rs | 115 +++++++++++++++++++++++--- 1 file changed, 102 insertions(+), 13 deletions(-) diff --git a/downstairs/src/extent_inner_raw_v2.rs b/downstairs/src/extent_inner_raw_v2.rs index 16cf0af01..11f5a1fcb 100644 --- a/downstairs/src/extent_inner_raw_v2.rs +++ b/downstairs/src/extent_inner_raw_v2.rs @@ -42,6 +42,7 @@ struct PackedBlockContext { const ENCRYPTED_BLOCK_CONTEXT: u32 = 1; const UNENCRYPTED_BLOCK_CONTEXT: u32 = 2; +const ZFS_RECORDSIZE: u64 = 128 * 1024; const BLOCK_CONTEXT_SIZE_BYTES: u64 = std::mem::size_of::<PackedBlockContext>() as u64; @@ -511,6 +512,8 @@ impl RawInnerV2 { let (start_pos, mut parity) = self.layout.block_pos(writes[0].offset.value); + let start_block = writes[0].offset; + let mut block = start_block.value; // TODO we're overestimating capacity here, because we can write // contexts in pairs. @@ -518,10 +521,12 @@ impl RawInnerV2 { let mut ctx_slice = ctxs.as_slice(); let mut write_slice = writes; + let padding = vec![0u8; self.layout.padding_size() as usize]; while !write_slice.is_empty() || !ctx_slice.is_empty() { match parity { Parity::FirstBlock => { iovecs.push(IoSlice::new(&write_slice[0].data)); + block += 1; write_slice = &write_slice[1..]; parity = Parity::FirstContext; } @@ -547,13 +552,15 @@ impl RawInnerV2 { // contexts). 
iovecs.push(IoSlice::new(&write_slice[0].data)); write_slice = &write_slice[1..]; + if self.layout.has_padding_after(block) { + iovecs.push(IoSlice::new(&padding)); + } parity = Parity::FirstBlock; + block += 1; } } } - let start_block = writes[0].offset; - let expected_bytes = n_blocks as u64 * (block_size as u64 + BLOCK_CONTEXT_SIZE_BYTES); @@ -637,52 +644,88 @@ impl RawInnerV2 { let mut buf_slice = &mut buf[..]; let mut ctx_slice = &mut ctxs[..]; + // This is awkward: we know how many blocks and contexts we're reading, + // and have pre-allocated data for them. However, we don't know how + // many chunks of padding we may need to read! As such, we'll store a + // `Vec<Option<IoSliceMut>>`, and use `None` to represent padding reads; + // then, we'll go through and splice them in once we know their total + // size. + let mut block = start_block.value; + let mut padding_count = 0; while !ctx_slice.is_empty() || !buf_slice.is_empty() { match parity { Parity::FirstBlock => { let (b, next) = buf_slice.split_at_mut(block_size); - iovecs.push(IoSliceMut::new(b)); + iovecs.push(Some(IoSliceMut::new(b))); buf_slice = next; parity = Parity::FirstContext; + block += 1; } Parity::FirstContext => { if ctx_slice.len() > 1 { let (b, next) = ctx_slice.split_at_mut(2); - iovecs.push(IoSliceMut::new(b.as_bytes_mut())); + iovecs.push(Some(IoSliceMut::new(b.as_bytes_mut()))); ctx_slice = next; parity = Parity::SecondBlock; } else { let (b, next) = ctx_slice.split_at_mut(1); - iovecs.push(IoSliceMut::new(b.as_bytes_mut())); + iovecs.push(Some(IoSliceMut::new(b.as_bytes_mut()))); ctx_slice = next; parity = Parity::SecondContext; } } Parity::SecondContext => { let (b, next) = ctx_slice.split_at_mut(1); - iovecs.push(IoSliceMut::new(b.as_bytes_mut())); + iovecs.push(Some(IoSliceMut::new(b.as_bytes_mut()))); ctx_slice = next; parity = Parity::SecondBlock; } Parity::SecondBlock => { - if buf_slice.len() > block_size { + let has_padding = self.layout.has_padding_after(block); + if buf_slice.len() > 
block_size && !has_padding { let (b, next) = buf_slice.split_at_mut(block_size * 2); - iovecs.push(IoSliceMut::new(b)); + iovecs.push(Some(IoSliceMut::new(b))); buf_slice = next; parity = Parity::FirstContext; + block += 2; } else { let (b, next) = buf_slice.split_at_mut(block_size); - iovecs.push(IoSliceMut::new(b)); + iovecs.push(Some(IoSliceMut::new(b))); buf_slice = next; + if has_padding { + iovecs.push(None); + padding_count += 1; + } parity = Parity::FirstBlock; + block += 1; } } } } - let expected_bytes = + // How many bytes do we expect `preadv` to return? + let mut expected_bytes = n_blocks as u64 * (block_size as u64 + BLOCK_CONTEXT_SIZE_BYTES); + // Now that we know the total number of padded reads, replace the `None` + // with borrowed chunks of a dummy array (`padding`) and unwrap all of + // the IoVecs. + let mut padding = vec![]; + if padding_count > 0 { + let padding_size = self.layout.padding_size() as usize; + padding.resize(padding_size * padding_count, 0u8); + expected_bytes += padding.len() as u64; + for (iov, p) in iovecs + .iter_mut() + .filter(|b| b.is_none()) + .zip(padding.chunks_mut(padding_size)) + { + *iov = Some(IoSliceMut::new(p)); + } + } + let mut iovecs: Vec<_> = + iovecs.into_iter().map(Option::unwrap).collect(); + // Finally we get to read the actual data. 
That's why we're here cdt::extent__read__file__start!(|| { (job_id.0, self.extent_number, n_blocks as u64) @@ -778,7 +821,19 @@ impl RawLayout { /// Returns the byte offset of the `block_written` bitpacked array fn block_written_array_offset(&self) -> u64 { - self.block_count() * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES) + let bpr = self.blocks_per_record(); + let bc = self.block_count(); + + if bc % bpr == 0 { + (bc / bpr) * ZFS_RECORDSIZE + } else { + let record_count = bc / bpr; + let trailing_blocks = bc - record_count * bpr; + + record_count * ZFS_RECORDSIZE + + trailing_blocks + * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES) + } } /// Returns the size of the `block_written` bitpacked array, in bytes @@ -885,7 +940,11 @@ impl RawLayout { /// /// This offset could either be block data or context, depending on parity! fn block_pos(&self, block: u64) -> (u64, Parity) { - let pos = block * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES); + let bpr = self.blocks_per_record(); + let record = block / bpr; + let block = block % bpr; + let pos = record * ZFS_RECORDSIZE + + block * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES); let parity = match block % 2 { 0 => Parity::FirstBlock, 1 => Parity::SecondContext, @@ -896,13 +955,43 @@ impl RawLayout { /// Returns the position of the given block's context fn context_slot(&self, block: u64) -> u64 { - let pos = block * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES); + let bpr = self.blocks_per_record(); + let record = block / bpr; + let block = block % bpr; + let pos = record * ZFS_RECORDSIZE + + block * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES); match block % 2 { 0 => pos + self.block_size(), 1 => pos, _ => unreachable!(), } } + + /// Returns the number of blocks that fit into a ZFS recordsize + fn blocks_per_record(&self) -> u64 { + // Each block contains data and a single context slot + let bytes_per_block = self.block_size() + BLOCK_CONTEXT_SIZE_BYTES; + // We guarantee that there are an even number of blocks 
per record, for + // simplicity (so that padding always comes after `Parity::SecondBlock`) + 2 * (ZFS_RECORDSIZE / (2 * bytes_per_block)) + } + + /// Checks whether there is padding after the given block + fn has_padding_after(&self, block: u64) -> bool { + // No padding at the end of the file + if block == self.block_count() - 1 { + return false; + } + // Otherwise, there's padding at the end of each block-pair-group + let bpr = self.blocks_per_record(); + (block % bpr) == bpr - 1 + } + + /// Returns the size of `recordsize` padding + fn padding_size(&self) -> u64 { + let bpr = self.blocks_per_record(); + ZFS_RECORDSIZE - bpr * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES) + } } /// Represents position in a block-context pair.