-
Notifications
You must be signed in to change notification settings - Fork 339
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
(core): Primary key index scans and single-column secondary index scans #350
Changes from 27 commits
f02da18
3d56fbd
e118b70
99871bb
ff236c7
ed19f47
3a11887
02d6fa3
fe90aac
9169f6e
43015f6
d8a695a
db0e2ea
d2233d6
3826d4e
d22dbe9
d3e797f
47534cb
dde10d2
bb1c8b6
37f8771
1ae8d28
af9a751
6e7db36
15a66ea
e5cf052
fc71f2b
8563d62
93a8110
572db69
43038cb
556f4b7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,13 +5,13 @@ use crate::storage::sqlite3_ondisk::{ | |
read_btree_cell, read_varint, write_varint, BTreeCell, DatabaseHeader, PageContent, PageType, | ||
TableInteriorCell, TableLeafCell, | ||
}; | ||
use crate::types::{Cursor, CursorResult, OwnedRecord, OwnedValue}; | ||
use crate::types::{Cursor, CursorResult, OwnedRecord, OwnedValue, SeekKey, SeekOp}; | ||
use crate::Result; | ||
|
||
use std::cell::{Ref, RefCell}; | ||
use std::rc::Rc; | ||
|
||
use super::sqlite3_ondisk::{write_varint_to_vec, OverflowCell}; | ||
use super::sqlite3_ondisk::{write_varint_to_vec, IndexInteriorCell, IndexLeafCell, OverflowCell}; | ||
|
||
/* | ||
These are offsets of fields in the header of a b-tree page. | ||
|
@@ -23,6 +23,7 @@ const BTREE_HEADER_OFFSET_CELL_CONTENT: usize = 5; /* pointer to first byte of c | |
const BTREE_HEADER_OFFSET_FRAGMENTED: usize = 7; /* number of fragmented bytes -> u8 */ | ||
const BTREE_HEADER_OFFSET_RIGHTMOST: usize = 8; /* if internalnode, pointer right most pointer (saved separately from cells) -> u32 */ | ||
|
||
#[derive(Debug)] | ||
pub struct MemPage { | ||
parent: Option<Rc<MemPage>>, | ||
page_idx: usize, | ||
|
@@ -56,6 +57,7 @@ pub struct BTreeCursor { | |
record: RefCell<Option<OwnedRecord>>, | ||
null_flag: bool, | ||
database_header: Rc<RefCell<DatabaseHeader>>, | ||
going_upwards: bool, | ||
} | ||
|
||
impl BTreeCursor { | ||
|
@@ -72,6 +74,7 @@ impl BTreeCursor { | |
record: RefCell::new(None), | ||
null_flag: false, | ||
database_header, | ||
going_upwards: false, | ||
} | ||
} | ||
|
||
|
@@ -109,6 +112,7 @@ impl BTreeCursor { | |
} | ||
None => match parent { | ||
Some(ref parent) => { | ||
self.going_upwards = true; | ||
self.page.replace(Some(parent.clone())); | ||
continue; | ||
} | ||
|
@@ -145,57 +149,114 @@ impl BTreeCursor { | |
let record = crate::storage::sqlite3_ondisk::read_record(_payload)?; | ||
return Ok(CursorResult::Ok((Some(*_rowid), Some(record)))); | ||
} | ||
BTreeCell::IndexInteriorCell(_) => { | ||
unimplemented!(); | ||
BTreeCell::IndexInteriorCell(IndexInteriorCell { | ||
payload, | ||
left_child_page, | ||
.. | ||
}) => { | ||
if self.going_upwards { | ||
self.going_upwards = false; | ||
mem_page.advance(); | ||
let record = crate::storage::sqlite3_ondisk::read_record(payload)?; | ||
let rowid = match record.values.last() { | ||
Some(OwnedValue::Integer(rowid)) => *rowid as u64, | ||
_ => unreachable!("index cells should have an integer rowid"), | ||
}; | ||
return Ok(CursorResult::Ok((Some(rowid), Some(record)))); | ||
} else { | ||
let mem_page = | ||
MemPage::new(Some(mem_page.clone()), *left_child_page as usize, 0); | ||
self.page.replace(Some(Rc::new(mem_page))); | ||
continue; | ||
} | ||
} | ||
BTreeCell::IndexLeafCell(_) => { | ||
unimplemented!(); | ||
BTreeCell::IndexLeafCell(IndexLeafCell { payload, .. }) => { | ||
mem_page.advance(); | ||
let record = crate::storage::sqlite3_ondisk::read_record(payload)?; | ||
let rowid = match record.values.last() { | ||
Some(OwnedValue::Integer(rowid)) => *rowid as u64, | ||
_ => unreachable!("index cells should have an integer rowid"), | ||
}; | ||
|
||
return Ok(CursorResult::Ok((Some(rowid), Some(record)))); | ||
} | ||
} | ||
} | ||
} | ||
|
||
fn btree_seek_rowid( | ||
fn seek<'a>( | ||
&mut self, | ||
rowid: u64, | ||
key: SeekKey<'a>, | ||
op: SeekOp, | ||
) -> Result<CursorResult<(Option<u64>, Option<OwnedRecord>)>> { | ||
self.move_to(rowid)?; | ||
match self.move_to(key.clone(), op.clone())? { | ||
CursorResult::Ok(_) => {} | ||
CursorResult::IO => return Ok(CursorResult::IO), | ||
}; | ||
|
||
let mem_page = self.get_mem_page(); | ||
|
||
let page_idx = mem_page.page_idx; | ||
let page = self.pager.read_page(page_idx)?; | ||
let page = RefCell::borrow(&page); | ||
if page.is_locked() { | ||
return Ok(CursorResult::IO); | ||
} | ||
|
||
let page = page.contents.read().unwrap(); | ||
let page = page.as_ref().unwrap(); | ||
|
||
for cell_idx in 0..page.cell_count() { | ||
match &page.cell_get( | ||
let cell = page.cell_get( | ||
cell_idx, | ||
self.pager.clone(), | ||
self.max_local(page.page_type()), | ||
self.min_local(page.page_type()), | ||
self.usable_space(), | ||
)? { | ||
)?; | ||
match &cell { | ||
BTreeCell::TableLeafCell(TableLeafCell { | ||
_rowid: cell_rowid, | ||
_payload: p, | ||
_payload: payload, | ||
first_overflow_page: _, | ||
}) => { | ||
if *cell_rowid == rowid { | ||
let record = crate::storage::sqlite3_ondisk::read_record(p)?; | ||
let SeekKey::TableRowId(rowid_key) = key else { | ||
unreachable!("table seek key should be a rowid"); | ||
}; | ||
mem_page.advance(); | ||
let comparison = match op { | ||
SeekOp::GT => *cell_rowid > rowid_key, | ||
SeekOp::GE => *cell_rowid >= rowid_key, | ||
SeekOp::EQ => *cell_rowid == rowid_key, | ||
}; | ||
if comparison { | ||
let record = crate::storage::sqlite3_ondisk::read_record(payload)?; | ||
return Ok(CursorResult::Ok((Some(*cell_rowid), Some(record)))); | ||
} | ||
} | ||
BTreeCell::IndexLeafCell(IndexLeafCell { payload, .. }) => { | ||
let SeekKey::IndexKey(index_key) = key else { | ||
unreachable!("index seek key should be a record"); | ||
}; | ||
mem_page.advance(); | ||
let record = crate::storage::sqlite3_ondisk::read_record(payload)?; | ||
let comparison = match op { | ||
SeekOp::GT => record > *index_key, | ||
SeekOp::GE => record >= *index_key, | ||
SeekOp::EQ => record == *index_key, | ||
}; | ||
if comparison { | ||
let rowid = match record.values.get(1) { | ||
Some(OwnedValue::Integer(rowid)) => *rowid as u64, | ||
_ => unreachable!("index cells should have an integer rowid"), | ||
}; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't get this part. We assume the rowid is always the second value (`record.values.get(1)`)? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That's a bug I forgot to refactor; it should be `record.values.last()`. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed in penberg@93a8110. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I left the rowid naming; copy-pasted from Discord: I think I will leave the rowid naming because it reflects current reality better. Doesn't our code pretty much assume we don't have WITHOUT ROWID tables right now? And I think WITHOUT ROWID tables are pretty much B-tree indexes in that they store everything in the key area? https://www.sqlite.org/withoutrowid.html#:~:text=A%20WITHOUT%20ROWID%20table%20uses,binary%20search%20on%20the%20rowid. |
||
return Ok(CursorResult::Ok((Some(rowid), Some(record)))); | ||
} | ||
} | ||
cell_type => { | ||
unreachable!("unexpected cell type: {:?}", cell_type); | ||
} | ||
} | ||
} | ||
|
||
Ok(CursorResult::Ok((None, None))) | ||
} | ||
|
||
|
@@ -240,7 +301,7 @@ impl BTreeCursor { | |
} | ||
} | ||
|
||
pub fn move_to(&mut self, key: u64) -> Result<CursorResult<()>> { | ||
pub fn move_to<'a>(&mut self, key: SeekKey<'a>, cmp: SeekOp) -> Result<CursorResult<()>> { | ||
// For a table with N rows, we can find any row by row id in O(log(N)) time by starting at the root page and following the B-tree pointers. | ||
// B-trees consist of interior pages and leaf pages. Interior pages contain pointers to other pages, while leaf pages contain the actual row data. | ||
// | ||
|
@@ -294,8 +355,16 @@ impl BTreeCursor { | |
_left_child_page, | ||
_rowid, | ||
}) => { | ||
if key < *_rowid { | ||
mem_page.advance(); | ||
let SeekKey::TableRowId(rowid_key) = key else { | ||
unreachable!("table seek key should be a rowid"); | ||
}; | ||
mem_page.advance(); | ||
let comparison = match cmp { | ||
SeekOp::GT => rowid_key < *_rowid, | ||
SeekOp::GE => rowid_key <= *_rowid, | ||
SeekOp::EQ => rowid_key <= *_rowid, | ||
}; | ||
if comparison { | ||
let mem_page = | ||
MemPage::new(Some(mem_page.clone()), *_left_child_page as usize, 0); | ||
self.page.replace(Some(Rc::new(mem_page))); | ||
|
@@ -312,20 +381,43 @@ impl BTreeCursor { | |
"we don't iterate leaf cells while trying to move to a leaf cell" | ||
); | ||
} | ||
BTreeCell::IndexInteriorCell(_) => { | ||
unimplemented!(); | ||
BTreeCell::IndexInteriorCell(IndexInteriorCell { | ||
left_child_page, | ||
payload, | ||
.. | ||
}) => { | ||
let SeekKey::IndexKey(index_key) = key else { | ||
unreachable!("index seek key should be a record"); | ||
}; | ||
let record = crate::storage::sqlite3_ondisk::read_record(payload)?; | ||
let comparison = match cmp { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nit: can we rename `comparison` to `found`? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Renamed to |
||
SeekOp::GT => index_key < &record, | ||
SeekOp::GE => index_key <= &record, | ||
SeekOp::EQ => index_key <= &record, | ||
}; | ||
if comparison { | ||
let mem_page = | ||
MemPage::new(Some(mem_page.clone()), *left_child_page as usize, 0); | ||
self.page.replace(Some(Rc::new(mem_page))); | ||
found_cell = true; | ||
break; | ||
} else { | ||
mem_page.advance(); | ||
} | ||
} | ||
BTreeCell::IndexLeafCell(_) => { | ||
unimplemented!(); | ||
unreachable!( | ||
"we don't iterate leaf cells while trying to move to a leaf cell" | ||
); | ||
} | ||
} | ||
} | ||
|
||
if !found_cell { | ||
let parent = mem_page.clone(); | ||
let parent = mem_page.parent.clone(); | ||
match page.rightmost_pointer() { | ||
Some(right_most_pointer) => { | ||
let mem_page = MemPage::new(Some(parent), right_most_pointer as usize, 0); | ||
let mem_page = MemPage::new(parent, right_most_pointer as usize, 0); | ||
self.page.replace(Some(Rc::new(mem_page))); | ||
continue; | ||
} | ||
|
@@ -1285,8 +1377,8 @@ impl Cursor for BTreeCursor { | |
Ok(*self.rowid.borrow()) | ||
} | ||
|
||
fn seek_rowid(&mut self, rowid: u64) -> Result<CursorResult<bool>> { | ||
match self.btree_seek_rowid(rowid)? { | ||
fn seek(&mut self, key: SeekKey<'_>, op: SeekOp) -> Result<CursorResult<bool>> { | ||
match self.seek(key, op)? { | ||
CursorResult::Ok((rowid, record)) => { | ||
self.rowid.replace(rowid); | ||
self.record.replace(record); | ||
|
@@ -1311,7 +1403,7 @@ impl Cursor for BTreeCursor { | |
_ => unreachable!("btree tables are indexed by integers!"), | ||
}; | ||
if !moved_before { | ||
match self.move_to(*int_key as u64)? { | ||
match self.move_to(SeekKey::TableRowId(*int_key as u64), SeekOp::EQ)? { | ||
CursorResult::Ok(_) => {} | ||
CursorResult::IO => return Ok(CursorResult::IO), | ||
}; | ||
|
@@ -1336,7 +1428,7 @@ impl Cursor for BTreeCursor { | |
OwnedValue::Integer(i) => i, | ||
_ => unreachable!("btree tables are indexed by integers!"), | ||
}; | ||
match self.move_to(*int_key as u64)? { | ||
match self.move_to(SeekKey::TableRowId(*int_key as u64), SeekOp::EQ)? { | ||
CursorResult::Ok(_) => {} | ||
CursorResult::IO => return Ok(CursorResult::IO), | ||
}; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Aren't index keys also blobs/strings/whatever?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We are getting the rowid here and AFAIK it's always the last value in the payload record