Skip to content

Commit

Permalink
Rebase doc_ids to min_doc_id to reduce space
Browse files (browse the repository at this point in the history)
  • Loading branch information
lalinsky committed Dec 7, 2024
1 parent 54e022d commit 9b90a3e
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 14 deletions.
4 changes: 2 additions & 2 deletions src/FileSegment.zig
Original file line number | Diff line number | Diff line change
Expand Up @@ -86,7 +86,7 @@ pub fn search(self: Self, sorted_hashes: []const u32, results: *SearchResults) !
if (block_no != prev_block_no) {
prev_block_no = block_no;
const block_data = self.getBlockData(block_no);
try filefmt.readBlock(block_data, &block_items);
try filefmt.readBlock(block_data, &block_items, self.min_doc_id);
}
const matches = std.sort.equalRange(Item, Item{ .hash = hash, .id = 0 }, block_items.items, {}, Item.cmpByHash);
for (matches[0]..matches[1]) |i| {
Expand Down Expand Up @@ -197,7 +197,7 @@ pub const Reader = struct {
self.index = 0;
const block_data = self.segment.getBlockData(self.block_no);
self.block_no += 1;
try filefmt.readBlock(block_data, &self.items);
try filefmt.readBlock(block_data, &self.items, self.segment.min_doc_id);
}
return self.items.items[self.index];
}
Expand Down
24 changes: 12 additions & 12 deletions src/filefmt.zig
Original file line number | Diff line number | Diff line change
Expand Up @@ -103,7 +103,7 @@ const BlockHeader = struct {
first_item: Item,
};

pub fn decodeBlockHeader(data: []const u8) !BlockHeader {
pub fn decodeBlockHeader(data: []const u8, min_doc_id: u32) !BlockHeader {
assert(data.len >= min_block_size);

const num_items = std.mem.readInt(u16, data[0..2], .little);
Expand All @@ -119,11 +119,11 @@ pub fn decodeBlockHeader(data: []const u8) !BlockHeader {

return .{
.num_items = num_items,
.first_item = Item{ .hash = hash.value, .id = id.value },
.first_item = Item{ .hash = hash.value, .id = id.value + min_doc_id },
};
}

pub fn readBlock(data: []const u8, items: *std.ArrayList(Item)) !void {
pub fn readBlock(data: []const u8, items: *std.ArrayList(Item), min_doc_id: u32) !void {
var ptr: usize = 0;

if (data.len < 2) {
Expand All @@ -149,7 +149,7 @@ pub fn readBlock(data: []const u8, items: *std.ArrayList(Item)) !void {
ptr += diff_doc_id.size;

last_hash += diff_hash.value;
last_doc_id = if (diff_hash.value > 0) diff_doc_id.value else last_doc_id + diff_doc_id.value;
last_doc_id = if (diff_hash.value > 0) diff_doc_id.value + min_doc_id else last_doc_id + diff_doc_id.value;

const item = items.addOneAssumeCapacity();
item.* = .{ .hash = last_hash, .id = last_doc_id };
Expand All @@ -161,7 +161,7 @@ pub fn readBlock(data: []const u8, items: *std.ArrayList(Item)) !void {
}
}

pub fn encodeBlock(data: []u8, reader: anytype) !u16 {
pub fn encodeBlock(data: []u8, reader: anytype, min_doc_id: u32) !u16 {
assert(data.len >= 2);

var ptr: usize = 2;
Expand All @@ -174,7 +174,7 @@ pub fn encodeBlock(data: []u8, reader: anytype) !u16 {
assert(item.hash > last_hash or (item.hash == last_hash and item.id >= last_doc_id));

const diff_hash = item.hash - last_hash;
const diff_doc_id = if (diff_hash > 0) item.id else item.id - last_doc_id;
const diff_doc_id = if (diff_hash > 0) item.id - min_doc_id else item.id - last_doc_id;

if (ptr + varint32Size(diff_hash) + varint32Size(diff_doc_id) > data.len) {
break;
Expand Down Expand Up @@ -211,13 +211,13 @@ test "writeBlock/readBlock/readFirstItemFromBlock" {
var block_data: [block_size]u8 = undefined;

var reader = segment.reader();
const num_items = try encodeBlock(block_data[0..], &reader);
const num_items = try encodeBlock(block_data[0..], &reader, 0);
try testing.expectEqual(segment.items.items.len, num_items);

var items = std.ArrayList(Item).init(std.testing.allocator);
defer items.deinit();

try readBlock(block_data[0..], &items);
try readBlock(block_data[0..], &items, 0);
try testing.expectEqualSlices(
Item,
&[_]Item{
Expand All @@ -230,7 +230,7 @@ test "writeBlock/readBlock/readFirstItemFromBlock" {
items.items,
);

const header = try decodeBlockHeader(block_data[0..]);
const header = try decodeBlockHeader(block_data[0..], 0);
try testing.expectEqual(items.items.len, header.num_items);
try testing.expectEqual(items.items[0], header.first_item);
}
Expand Down Expand Up @@ -343,7 +343,7 @@ pub fn writeSegmentFile(dir: std.fs.Dir, reader: anytype) !void {

var block_data: [block_size]u8 = undefined;
while (true) {
const n = try encodeBlock(block_data[0..], reader);
const n = try encodeBlock(block_data[0..], reader, segment.min_doc_id);
try writer.writeAll(block_data[0..]);
if (n == 0) {
break;
Expand Down Expand Up @@ -466,7 +466,7 @@ pub fn readSegmentFile(dir: fs.Dir, info: SegmentInfo, segment: *FileSegment) !v
var block_data = block_data_buffer[0..block_size];
while (true) {
try reader.readNoEof(block_data);
const block_header = try decodeBlockHeader(block_data);
const block_header = try decodeBlockHeader(block_data, segment.min_doc_id);
if (block_header.num_items == 0) {
break;
}
Expand Down Expand Up @@ -534,7 +534,7 @@ test "writeFile/readFile" {
var items = std.ArrayList(Item).init(testing.allocator);
defer items.deinit();

try readBlock(segment.getBlockData(0), &items);
try readBlock(segment.getBlockData(0), &items, segment.min_doc_id);
try std.testing.expectEqualSlices(Item, &[_]Item{
Item{ .hash = 1, .id = 1 },
Item{ .hash = 2, .id = 1 },
Expand Down

0 comments on commit 9b90a3e

Please sign in to comment.