Skip to content

Commit

Permalink
Avoid writes that span ZFS records
Browse files Browse the repository at this point in the history
  • Loading branch information
mkeeter committed Apr 22, 2024
1 parent 449393a commit a870b1e
Showing 1 changed file with 102 additions and 13 deletions.
115 changes: 102 additions & 13 deletions downstairs/src/extent_inner_raw_v2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ struct PackedBlockContext {

const ENCRYPTED_BLOCK_CONTEXT: u32 = 1;
const UNENCRYPTED_BLOCK_CONTEXT: u32 = 2;
// Size of one ZFS record in bytes (128 KiB).  The layout code uses this as
// the stride between groups of blocks, so that no block or context write
// spans two records.
// NOTE(review): assumes the backing dataset really uses a 128 KiB
// `recordsize` -- confirm against the deployment configuration.
const ZFS_RECORDSIZE: u64 = 128 * 1024;

// Number of bytes occupied by a single `PackedBlockContext`.
const BLOCK_CONTEXT_SIZE_BYTES: u64 =
std::mem::size_of::<PackedBlockContext>() as u64;
Expand Down Expand Up @@ -511,17 +512,21 @@ impl RawInnerV2 {

let (start_pos, mut parity) =
self.layout.block_pos(writes[0].offset.value);
let start_block = writes[0].offset;
let mut block = start_block.value;

// TODO we're overestimating capacity here, because we can write
// contexts in pairs.
let mut iovecs = Vec::with_capacity(n_blocks * 2);

let mut ctx_slice = ctxs.as_slice();
let mut write_slice = writes;
let padding = vec![0u8; self.layout.padding_size() as usize];
while !write_slice.is_empty() || !ctx_slice.is_empty() {
match parity {
Parity::FirstBlock => {
iovecs.push(IoSlice::new(&write_slice[0].data));
block += 1;
write_slice = &write_slice[1..];
parity = Parity::FirstContext;
}
Expand All @@ -547,13 +552,15 @@ impl RawInnerV2 {
// contexts).
iovecs.push(IoSlice::new(&write_slice[0].data));
write_slice = &write_slice[1..];
if self.layout.has_padding_after(block) {
iovecs.push(IoSlice::new(&padding));
}
parity = Parity::FirstBlock;
block += 1;
}
}
}

let start_block = writes[0].offset;

let expected_bytes =
n_blocks as u64 * (block_size as u64 + BLOCK_CONTEXT_SIZE_BYTES);

Expand Down Expand Up @@ -637,52 +644,88 @@ impl RawInnerV2 {
let mut buf_slice = &mut buf[..];
let mut ctx_slice = &mut ctxs[..];

// This is awkward: we know how many blocks and contexts we're reading,
// and have pre-allocated data for them. However, we don't know how
// many chunks of padding we may need to read! As such, we'll store a
// `Vec<Option<IoSliceMut>>`, and use `None` to represent padding reads;
// then, we'll go through and splice them in once we know their total
// size.
let mut block = start_block.value;
let mut padding_count = 0;
while !ctx_slice.is_empty() || !buf_slice.is_empty() {
match parity {
Parity::FirstBlock => {
let (b, next) = buf_slice.split_at_mut(block_size);
iovecs.push(IoSliceMut::new(b));
iovecs.push(Some(IoSliceMut::new(b)));
buf_slice = next;
parity = Parity::FirstContext;
block += 1;
}
Parity::FirstContext => {
if ctx_slice.len() > 1 {
let (b, next) = ctx_slice.split_at_mut(2);
iovecs.push(IoSliceMut::new(b.as_bytes_mut()));
iovecs.push(Some(IoSliceMut::new(b.as_bytes_mut())));
ctx_slice = next;
parity = Parity::SecondBlock;
} else {
let (b, next) = ctx_slice.split_at_mut(1);
iovecs.push(IoSliceMut::new(b.as_bytes_mut()));
iovecs.push(Some(IoSliceMut::new(b.as_bytes_mut())));
ctx_slice = next;
parity = Parity::SecondContext;
}
}
Parity::SecondContext => {
let (b, next) = ctx_slice.split_at_mut(1);
iovecs.push(IoSliceMut::new(b.as_bytes_mut()));
iovecs.push(Some(IoSliceMut::new(b.as_bytes_mut())));
ctx_slice = next;
parity = Parity::SecondBlock;
}
Parity::SecondBlock => {
if buf_slice.len() > block_size {
let has_padding = self.layout.has_padding_after(block);
if buf_slice.len() > block_size && !has_padding {
let (b, next) = buf_slice.split_at_mut(block_size * 2);
iovecs.push(IoSliceMut::new(b));
iovecs.push(Some(IoSliceMut::new(b)));
buf_slice = next;
parity = Parity::FirstContext;
block += 2;
} else {
let (b, next) = buf_slice.split_at_mut(block_size);
iovecs.push(IoSliceMut::new(b));
iovecs.push(Some(IoSliceMut::new(b)));
buf_slice = next;
if has_padding {
iovecs.push(None);
padding_count += 1;
}
parity = Parity::FirstBlock;
block += 1;
}
}
}
}

let expected_bytes =
// How many bytes do we expect `preadv` to return?
let mut expected_bytes =
n_blocks as u64 * (block_size as u64 + BLOCK_CONTEXT_SIZE_BYTES);

// Now that we know the total number of padded reads, replace the `None`
// with borrowed chunks of a dummy array (`padding`) and unwrap all of
// the IoVecs.
let mut padding = vec![];
if padding_count > 0 {
let padding_size = self.layout.padding_size() as usize;
padding.resize(padding_size * padding_count, 0u8);
expected_bytes += padding.len() as u64;
for (iov, p) in iovecs
.iter_mut()
.filter(|b| b.is_none())
.zip(padding.chunks_mut(padding_size))
{
*iov = Some(IoSliceMut::new(p));
}
}
let mut iovecs: Vec<_> =
iovecs.into_iter().map(Option::unwrap).collect();

// Finally we get to read the actual data. That's why we're here
cdt::extent__read__file__start!(|| {
(job_id.0, self.extent_number, n_blocks as u64)
Expand Down Expand Up @@ -778,7 +821,19 @@ impl RawLayout {

/// Returns the byte offset of the `block_written` bitpacked array
///
/// The array starts right after the block / context region.  Every complete
/// group of `blocks_per_record()` blocks occupies a full `ZFS_RECORDSIZE`
/// (padding included); any blocks left over in a final, partial group are
/// packed back-to-back with their contexts and are not padded.
fn block_written_array_offset(&self) -> u64 {
    let bpr = self.blocks_per_record();
    let bc = self.block_count();

    // When `bc` is an exact multiple of `bpr` the second term is zero, so
    // no separate branch is needed for that case.
    let full_records = bc / bpr;
    let trailing_blocks = bc % bpr;

    full_records * ZFS_RECORDSIZE
        + trailing_blocks * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES)
}

/// Returns the size of the `block_written` bitpacked array, in bytes
Expand Down Expand Up @@ -885,7 +940,11 @@ impl RawLayout {
///
/// This offset could either be block data or context, depending on parity!
fn block_pos(&self, block: u64) -> (u64, Parity) {
let pos = block * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES);
let bpr = self.blocks_per_record();
let record = block / bpr;
let block = block % bpr;
let pos = record * ZFS_RECORDSIZE
+ block * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES);
let parity = match block % 2 {
0 => Parity::FirstBlock,
1 => Parity::SecondContext,
Expand All @@ -896,13 +955,43 @@ impl RawLayout {

/// Returns the position of the given block's context
///
/// Blocks are grouped into records of `blocks_per_record()` blocks, each
/// record starting at a multiple of `ZFS_RECORDSIZE`.  Within a record,
/// blocks and contexts are stored in pairs: an even-indexed block has its
/// data first and its context immediately after, while an odd-indexed block
/// has its context first.
fn context_slot(&self, block: u64) -> u64 {
    let bpr = self.blocks_per_record();
    let record = block / bpr;
    // Index of the block within its record
    let index = block % bpr;
    let pos = record * ZFS_RECORDSIZE
        + index * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES);
    if index % 2 == 0 {
        // Even block: the context follows the block data
        pos + self.block_size()
    } else {
        // Odd block: the context comes before the block data
        pos
    }
}

/// Returns how many blocks (each with its context slot) are stored in one
/// ZFS record
///
/// The result is always even: blocks are counted in pairs so that padding
/// only ever follows a `Parity::SecondBlock`.  If not even a single pair
/// fits within `ZFS_RECORDSIZE`, the result is 0.
fn blocks_per_record(&self) -> u64 {
    // A pair is two blocks of data plus their two context slots
    let bytes_per_pair = 2 * (self.block_size() + BLOCK_CONTEXT_SIZE_BYTES);
    let pairs_per_record = ZFS_RECORDSIZE / bytes_per_pair;
    pairs_per_record * 2
}

/// Checks whether there is padding after the given block
///
/// Padding follows the last block of each record-sized group of
/// `blocks_per_record()` blocks, except after the very last block of the
/// layout (index `block_count() - 1`), which is never followed by padding.
fn has_padding_after(&self, block: u64) -> bool {
    // No padding at the end of the file.  `checked_sub` avoids an
    // arithmetic underflow if the layout contains zero blocks.
    if self.block_count().checked_sub(1) == Some(block) {
        return false;
    }
    // Otherwise, there's padding after the final block of each group
    let bpr = self.blocks_per_record();
    block % bpr == bpr - 1
}

/// Returns the number of padding bytes at the end of a full record
///
/// This is whatever remains of `ZFS_RECORDSIZE` once `blocks_per_record()`
/// blocks and their contexts have been laid out.
fn padding_size(&self) -> u64 {
    let blocks = self.blocks_per_record();
    let bytes_per_block = self.block_size() + BLOCK_CONTEXT_SIZE_BYTES;
    let used_bytes = blocks * bytes_per_block;
    ZFS_RECORDSIZE - used_bytes
}
}

/// Represents position in a block-context pair.
Expand Down

0 comments on commit a870b1e

Please sign in to comment.